In [2]:
%%bash --out h

h="$(hostname).local"
echo "$h"

In [3]:
%load_ext sql

In [4]:
# followed guide here: https://medium.com/analytics-vidhya/postgresql-integration-with-jupyter-notebook-deb97579a38d
from sqlalchemy import create_engine
import json
from typing import Dict

def read_config() -> Dict[str, str]:
    f = open('config.json', "r")
    config = json.loads(f.read())
    f.close()
    return config

config = read_config()
config['host'] = h.strip()

sql_address = f"postgresql://{config['user']}:{config['password']}@{config['host']}/{config['database']}"
%sql $sql_address

'Connected: postgres@dblp'

In [4]:
%%sql
SELECT COUNT(*) AS q1a FROM inproceedings;

 * postgresql://postgres:***@LAPTOP-RF68TAU0.local/dblp
1 rows affected.


q1a
2956396


In [15]:
%%sql
SELECT COUNT(*) AS q1b FROM article;

 * postgresql://postgres:***@LAPTOP-RF68TAU0.local/dblp
1 rows affected.


q2b
2738932


In [16]:
%%sql
SELECT COUNT(*) AS q1c FROM authorship;

 * postgresql://postgres:***@LAPTOP-RF68TAU0.local/dblp
1 rows affected.


q1c
18128940


In [17]:
%%sql
ALTER TABLE inproceedings ADD COLUMN area text;

 * postgresql://postgres:***@LAPTOP-RF68TAU0.local/dblp
Done.


[]

In [11]:
%%sql
UPDATE inproceedings SET area = 'Database' WHERE booktitle IN ('SIGMOD Conference', 'VLDB', 'ICDE', 'PODS');
UPDATE inproceedings SET area = 'Theory' WHERE booktitle IN ('STOC', 'FOCS', 'SODA', 'ICALP');
UPDATE inproceedings SET area = 'Systems' WHERE booktitle IN ('SIGCOMM', 'ISCA', 'HPCA', 'PLDI');
UPDATE inproceedings SET area = 'ML-AI' WHERE booktitle IN ('ICML', 'NIPS', 'NeurIPS', 'AAAI', 'IJCAI');
UPDATE inproceedings SET area = 'UNKNOWN' WHERE area IS NULL;

 * postgresql://postgres:***@LAPTOP-RF68TAU0.local/dblp
13126 rows affected.
13758 rows affected.
6289 rows affected.
43475 rows affected.
2879748 rows affected.


[]

In [30]:
%%sql
-- q3a
SELECT area, COUNT(author) AS cnt FROM inproceedings, authorship 
WHERE 
    inproceedings.pubkey = authorship.pubkey 
    AND area != 'UNKNOWN'
GROUP BY area;

 * postgresql://postgres:***@LAPTOP-RF68TAU0.local/dblp
4 rows affected.


area,cnt
Database,43836
ML-AI,143124
Systems,23744
Theory,33626


In [18]:
%%sql
-- q3b
WITH database_authors AS (
   SELECT authorship.pubkey, author, area FROM inproceedings, authorship
   WHERE
      inproceedings.pubkey = authorship.pubkey
      AND area = 'Database'
)

SELECT author, COUNT(*) as cnt 
FROM database_authors 
GROUP BY author
ORDER BY cnt DESC
LIMIT 10; 

 * postgresql://postgres:***@LAPTOP-RF68TAU0.local/dblp
10 rows affected.


author,cnt
Divesh Srivastava,150
H. V. Jagadish,127
Surajit Chaudhuri,127
Jiawei Han 0001,110
Philip S. Yu,110
Xuemin Lin 0001,109
Jeffrey F. Naughton,108
Beng Chin Ooi,105
Hector Garcia-Molina,104
Michael Stonebraker,100


In [29]:
%%sql
-- q3c
WITH area_authors AS (
    SELECT authorship.pubkey, author, area FROM inproceedings, authorship
    WHERE 
        inproceedings.pubkey = authorship.pubkey
        AND area != 'UNKNOWN'
),
area_counts AS (
    SELECT author, COUNT(area) AS a_cnt FROM area_authors GROUP BY author
)

SELECT COUNT(*) as cnt FROM area_counts WHERE a_cnt = 2;

 * postgresql://postgres:***@LAPTOP-RF68TAU0.local/dblp
1 rows affected.


cnt
13573


In [11]:
%%sql
-- q3d
WITH conf_count AS (
    SELECT authorship.author, COUNT(*) AS ccnt
    FROM inproceedings, authorship
    WHERE inproceedings.pubkey = authorship.pubkey
    GROUP BY authorship.author
),
journal_count AS (
    SELECT authorship.author, COUNT(*) AS jcnt
    FROM article, authorship
    WHERE article.pubkey = authorship.pubkey
    GROUP BY authorship.author 
),
combined AS (
    SELECT journal_count.author, COALESCE(ccnt, 0) AS ccnt, jcnt
    FROM journal_count
    LEFT JOIN conf_count ON journal_count.author = conf_count.author
)

SELECT COUNT(*) 
FROM combined
WHERE jcnt > ccnt

 * postgresql://postgres:***@LAPTOP-RF68TAU0.local/dblp
1 rows affected.


count
1301165
