In [1]:
%%bash --out h

# Print "<hostname>.local"; --out stores the cell's stdout in the Python variable h.
echo "$(hostname).local"

In [2]:
# Register the %sql / %%sql magics used by the query cells in this notebook.
%load_ext sql

In [3]:
# followed guide here: https://medium.com/analytics-vidhya/postgresql-integration-with-jupyter-notebook-deb97579a38d
from sqlalchemy import create_engine
import json
from typing import Dict

def read_config() -> Dict[str, str]:
    """Load connection settings from ``config.json`` in the working directory.

    Returns:
        The parsed JSON document. The notebook reads the ``user``,
        ``password`` and ``database`` entries from it.

    Raises:
        OSError: if ``config.json`` cannot be opened.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # The context manager closes the handle even when reading or parsing
    # raises; JSON text is UTF-8, so do not rely on the platform default.
    with open('config.json', "r", encoding="utf-8") as f:
        return json.load(f)

config = read_config()
config['host'] = h.strip()

sql_address = f"postgresql://{config['user']}:{config['password']}@{config['host']}/{config['database']}"
%sql $sql_address

'Connected: postgres@dblp'

In [None]:
%%sql
SELECT COUNT(*) AS q1a  -- q1a: number of rows in inproceedings
FROM inproceedings;

In [None]:
%%sql
SELECT COUNT(*) AS q1b  -- q1b: number of rows in article
FROM article;

In [None]:
%%sql
SELECT COUNT(*) AS q1c  -- q1c: number of rows in authorship
FROM authorship;

In [None]:
%%sql
ALTER TABLE inproceedings
    -- IF NOT EXISTS keeps this cell re-runnable once the column has been added.
    ADD COLUMN IF NOT EXISTS area text;

In [None]:
%%sql
UPDATE inproceedings
SET area = CASE
    -- A single pass derives area purely from booktitle, so re-running the cell
    -- always yields the same labels regardless of what area held before.
    WHEN booktitle IN ('SIGMOD Conference', 'VLDB', 'ICDE', 'PODS') THEN 'Database'
    WHEN booktitle IN ('STOC', 'FOCS', 'SODA', 'ICALP') THEN 'Theory'
    WHEN booktitle IN ('SIGCOMM', 'ISCA', 'HPCA', 'PLDI') THEN 'Systems'
    WHEN booktitle IN ('ICML', 'NIPS', 'NeurIPS', 'AAAI', 'IJCAI') THEN 'ML-AI'
    -- Every other row, including rows with a NULL booktitle, is labelled UNKNOWN.
    ELSE 'UNKNOWN'
END;

In [None]:
%%sql
-- q3a
SELECT
    area,
    COUNT(author) AS cnt  -- authorship rows with a non-NULL author, per area
FROM inproceedings
JOIN authorship ON authorship.pubkey = inproceedings.pubkey
WHERE area <> 'UNKNOWN'
-- NOTE(review): an author is counted once per paper here. Switch to
-- COUNT(DISTINCT author) if the question asks for distinct authors per area.
GROUP BY area;

In [None]:
%%sql
-- q3b
WITH database_authors AS (
    -- One row per (paper, author) pair for papers labelled with the Database area.
    SELECT authorship.pubkey, author, area
    FROM inproceedings
    JOIN authorship ON authorship.pubkey = inproceedings.pubkey
    WHERE area = 'Database'
)
SELECT
    author,
    COUNT(*) AS cnt
FROM database_authors
GROUP BY author
-- NOTE(review): there is no tie-breaker, so authors tied at the cut-off may
-- differ between runs.
ORDER BY cnt DESC
LIMIT 10;

In [None]:
%%sql
-- q3c
WITH area_authors AS (
    -- One row per (paper, author) pair for papers whose area is a named area.
    SELECT authorship.pubkey, author, area
    FROM inproceedings, authorship
    WHERE
        inproceedings.pubkey = authorship.pubkey
        AND area != 'UNKNOWN'
),
area_counts AS (
    -- DISTINCT makes a_cnt the number of different areas per author. A plain
    -- COUNT(area) counts papers, so two papers in one area would pass the
    -- a_cnt = 2 filter below.
    SELECT author, COUNT(DISTINCT area) AS a_cnt
    FROM area_authors
    GROUP BY author
)

SELECT COUNT(*) as cnt FROM area_counts WHERE a_cnt = 2;

In [None]:
%%sql
-- q3d
WITH conf_count AS (
    -- Number of inproceedings authorship rows per author.
    SELECT authorship.author, COUNT(*) AS ccnt
    FROM inproceedings
    JOIN authorship ON authorship.pubkey = inproceedings.pubkey
    GROUP BY authorship.author
),
journal_count AS (
    -- Number of article authorship rows per author.
    SELECT authorship.author, COUNT(*) AS jcnt
    FROM article
    JOIN authorship ON authorship.pubkey = article.pubkey
    GROUP BY authorship.author
),
combined AS (
    -- Driven from journal_count: an author without any article can never have
    -- jcnt above ccnt, so those authors can be left out. Authors without
    -- inproceedings rows get ccnt = 0 through COALESCE.
    SELECT
        journal_count.author,
        COALESCE(conf_count.ccnt, 0) AS ccnt,
        journal_count.jcnt
    FROM journal_count
    LEFT JOIN conf_count ON conf_count.author = journal_count.author
)

SELECT COUNT(*)
FROM combined
WHERE ccnt < jcnt;

In [None]:
%%sql
-- q3e
WITH db_conf_authors AS (
    -- One row per authorship of a Database-area inproceedings paper from 2000 onwards.
    SELECT authorship.author
    FROM inproceedings, authorship
    WHERE inproceedings.pubkey = authorship.pubkey
        AND area = 'Database'
        AND inproceedings.year >= 2000
),
db_paper_authors AS (
    -- One row per authorship of an article from 2000 onwards.
    SELECT authorship.author
    FROM article, authorship
    WHERE article.pubkey = authorship.pubkey
        AND year >= 2000
),
db_only_papers AS (
    -- Keep article authorships only for authors who also appear in db_conf_authors.
    -- The subquery must not be followed by a statement terminator, because that
    -- would end the statement in the middle of the WITH list.
    SELECT author FROM db_paper_authors
    WHERE author IN (SELECT author FROM db_conf_authors)
),
db_union AS (
    -- UNION ALL keeps duplicates, so every authorship row counts as one paper.
    SELECT * FROM db_only_papers
    UNION ALL
    SELECT * FROM db_conf_authors
)

-- NOTE(review): only Database-area inproceedings and articles are counted, both
-- restricted to 2000 onwards. Confirm this matches the question being answered.
SELECT author, COUNT(*) AS cnt
FROM db_union
GROUP BY author
ORDER BY cnt DESC, author ASC
LIMIT 5;

In [4]:
%%sql
decade_counts << WITH inproceedings_decade AS (
    -- (year / 10) * 10 maps a year to the start of its decade; this assumes year
    -- is an integer column so the division truncates -- TODO confirm.
    -- NOTE(review): despite its name, num_journals counts inproceedings rows.
    SELECT (year / 10) * 10 AS decade, COUNT(*) AS num_journals
    FROM inproceedings
    WHERE year IS NOT NULL
    GROUP BY decade
),
article_decade AS (
    SELECT (year / 10) * 10 AS decade, COUNT(*) AS num_articles
    FROM article
    WHERE year IS NOT NULL
    GROUP BY decade
)

-- Inner join: a decade that appears in only one of the two tables is dropped.
SELECT article_decade.decade, num_articles, num_journals
FROM inproceedings_decade
JOIN article_decade ON article_decade.decade = inproceedings_decade.decade

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
8 rows affected.
Returning data to local variable decade_counts


In [5]:
import altair as alt
import pandas as pd
from typing import List

def sql_to_df(sql, cols: List[str]) -> pd.DataFrame:
    """Build a DataFrame from row-like query results.

    Args:
        sql: Sequence of rows, e.g. the result set that a ``name << ...``
            ``%%sql`` cell binds to a variable.
        cols: Column labels, applied to the row values by position.

    Returns:
        A new DataFrame with one row per record and ``cols`` as its columns.
    """
    frame = pd.DataFrame(data=sql, columns=cols)
    return frame


In [30]:
# Wide frame: one row per decade, one column per count returned by the query.
decade_wide = sql_to_df(decade_counts, ['decade', 'num_articles', 'num_journals'])

# Long frame: one row per (decade, variable) pair, so the chart can colour the
# two series by the 'variable' column that melt produces.
decade_df = pd.melt(
    decade_wide,
    id_vars=['decade'],
    value_vars=['num_articles', 'num_journals'],
)

# The chart is the cell's last expression, so it is what the notebook renders.
# format 'd' prints decades as plain integers (no thousands separator).
alt.Chart(decade_df, title='Question 4A').mark_line().encode(
    x=alt.X('decade', title="Decade", axis={'format': 'd'}),
    y=alt.Y('value', title="Papers"),
    color='variable',
)