In [1]:
%%bash --out h

h="$(hostname).local"
echo "$h"

In [2]:
%load_ext sql

In [3]:
# followed guide here: https://medium.com/analytics-vidhya/postgresql-integration-with-jupyter-notebook-deb97579a38d
from sqlalchemy import create_engine
import json
from typing import Dict

def read_config() -> Dict[str, str]:
    f = open('config.json', "r")
    config = json.loads(f.read())
    f.close()
    return config

config = read_config()
config['host'] = h.strip()

sql_address = f"postgresql://{config['user']}:{config['password']}@{config['host']}/{config['database']}"
%sql $sql_address

'Connected: postgres@dblp'

In [15]:
%%sql
SELECT COUNT(*) AS q1a FROM inproceedings;

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
1 rows affected.


q1a
2956396


In [14]:
%%sql
SELECT COUNT(*) AS q1b FROM article;

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
1 rows affected.


q1b
2738932


In [13]:
%%sql
SELECT COUNT(*) AS q1c FROM authorship;

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
1 rows affected.


q1c
18128940


In [None]:
%%sql
ALTER TABLE inproceedings ADD COLUMN area text;

In [None]:
%%sql
UPDATE inproceedings SET area = 'Database' WHERE booktitle IN ('SIGMOD Conference', 'VLDB', 'ICDE', 'PODS');
UPDATE inproceedings SET area = 'Theory' WHERE booktitle IN ('STOC', 'FOCS', 'SODA', 'ICALP');
UPDATE inproceedings SET area = 'Systems' WHERE booktitle IN ('SIGCOMM', 'ISCA', 'HPCA', 'PLDI');
UPDATE inproceedings SET area = 'ML-AI' WHERE booktitle IN ('ICML', 'NIPS', 'NeurIPS', 'AAAI', 'IJCAI');
UPDATE inproceedings SET area = 'UNKNOWN' WHERE area IS NULL;

In [12]:
%%sql
-- q3a
SELECT area, COUNT(author) AS cnt FROM inproceedings, authorship 
WHERE 
    inproceedings.pubkey = authorship.pubkey 
    AND area != 'UNKNOWN'
GROUP BY area;

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
4 rows affected.


area,cnt
Database,43836
ML-AI,143124
Systems,23744
Theory,33626


In [6]:
%%sql
-- q3b
WITH database_authors AS (
   SELECT authorship.pubkey, author, area FROM inproceedings, authorship
   WHERE
      inproceedings.pubkey = authorship.pubkey
      AND area = 'Database'
)

SELECT author, COUNT(*) as cnt 
FROM database_authors 
GROUP BY author
ORDER BY cnt DESC
LIMIT 10; 

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
10 rows affected.


author,cnt
Divesh Srivastava,150
H. V. Jagadish,127
Surajit Chaudhuri,127
Jiawei Han 0001,110
Philip S. Yu,110
Xuemin Lin 0001,109
Jeffrey F. Naughton,108
Beng Chin Ooi,105
Hector Garcia-Molina,104
Michael Stonebraker,100


In [11]:
%%sql
-- q3c
WITH area_authors AS (
    SELECT authorship.pubkey, author, area FROM inproceedings, authorship
    WHERE 
        inproceedings.pubkey = authorship.pubkey
        AND area != 'UNKNOWN'
),
area_counts AS (
    SELECT author, COUNT(area) AS a_cnt FROM area_authors GROUP BY author
)

SELECT COUNT(*) as cnt FROM area_counts WHERE a_cnt = 2;

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
1 rows affected.


cnt
13573


In [10]:
%%sql
-- q3d
WITH conf_count AS (
    SELECT authorship.author, COUNT(*) AS ccnt
    FROM inproceedings, authorship
    WHERE inproceedings.pubkey = authorship.pubkey
    GROUP BY authorship.author
),
journal_count AS (
    SELECT authorship.author, COUNT(*) AS jcnt
    FROM article, authorship
    WHERE article.pubkey = authorship.pubkey
    GROUP BY authorship.author 
),
combined AS (
    SELECT journal_count.author, COALESCE(ccnt, 0) AS ccnt, jcnt
    FROM journal_count
    LEFT JOIN conf_count ON journal_count.author = conf_count.author
)

SELECT COUNT(*) 
FROM combined
WHERE jcnt > ccnt;

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
1 rows affected.


count
1301165


In [9]:
%%sql
-- q3e
WITH db_conf_authors AS (
    SELECT authorship.author
    FROM inproceedings, authorship
    WHERE inproceedings.pubkey = authorship.pubkey
        AND area = 'Database'
        AND inproceedings.year >= 2000
),
db_paper_authors AS (
    SELECT authorship.author
    FROM article, authorship
    WHERE article.pubkey = authorship.pubkey
        AND year >= 2000
),
db_only_papers AS (
    SELECT author FROM db_paper_authors
    WHERE author IN (SELECT author FROM db_conf_authors)
),
db_union AS (
    SELECT * FROM db_only_papers 
    UNION ALL 
    SELECT * FROM db_conf_authors
)


SELECT author, COUNT(*) AS cnt 
FROM db_union
GROUP BY author
ORDER BY cnt DESC, author ASC
LIMIT 5

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
5 rows affected.


author,cnt
Dacheng Tao,878
Wei Wang,733
Yu Zhang,722
Philip S. Yu,698
Yang Liu,694


In [4]:
%%sql
decade_counts << WITH journal_decades AS (
    SELECT (year / 10)*10 AS decade, COUNT(*) AS num_confs
    FROM inproceedings
    WHERE (year / 10)*10 IS NOT NULL
    GROUP BY decade
),
article_decade AS (
    SELECT (year / 10)*10 AS decade, COUNT(*) AS num_journals
    FROM article
    WHERE (year / 10)*10 IS NOT NULL
    GROUP BY decade
)

SELECT article_decade.decade, num_journals, num_confs FROM journal_decades, article_decade
WHERE journal_decades.decade = article_decade.decade

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
8 rows affected.


decade,num_articles,num_journals
1950,1731,833
1960,10095,2966
1970,30314,15162
1980,70700,58322
1990,204592,233170
2000,556510,819806
2010,1361637,1541253
2020,503106,284884


In [11]:
import altair as alt
import pandas as pd
from typing import List

def sql_to_df(sql, cols: List[str]) -> pd.DataFrame:
    return pd.DataFrame(sql, columns=cols)


In [30]:
decade_df = sql_to_df(decade_counts, ['decade', 'num_articles', 'num_journals'])

decade_df = pd.melt(decade_df, id_vars=['decade'], value_vars=[
                 'num_articles', 'num_journals'])

decade_df
alt.Chart(decade_df, title='Question 4A').mark_line().encode(
    x=alt.X('decade', title="Decade", axis={'format': 'd'}), 
    y=alt.Y('value', title="Papers"),
    color='variable',
)

In [5]:
%%sql
avg_authors << WITH authors_per_paper AS (
    SELECT inproceedings.pubkey, area, COALESCE(COUNT(author), 0) AS authcount, (year / 10)*10 AS decade
    FROM inproceedings, authorship
    WHERE inproceedings.pubkey = authorship.pubkey
        AND area != 'UNKNOWN'
    GROUP BY inproceedings.pubkey
),
papers_per_area AS (
    SELECT area, COUNT(*) as papercount, (year / 10)*10 AS decade
    FROM inproceedings
    WHERE area != 'UNKNOWN'
    GROUP BY area, decade
),
collated_papers AS (
    SELECT area, SUM(authcount) AS totalauthors, decade
    FROM authors_per_paper
    GROUP BY area, decade
)

SELECT papers_per_area.decade, papers_per_area.area, (totalauthors / papercount) AS avgcoauthor
FROM papers_per_area, collated_papers
WHERE papers_per_area.area = collated_papers.area
    AND papers_per_area.decade = collated_papers.decade
ORDER BY decade ASC, area 

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
26 rows affected.


decade,area,avgcoauthor
1960,ML-AI,1.746031746031746
1960,Theory,1.2903225806451613
1970,Database,1.8484848484848484
1970,ML-AI,1.7921052631578949
1970,Systems,1.91796875
1970,Theory,1.5780189959294435
1980,Database,2.060858424087124
1980,ML-AI,1.8215767634854767
1980,Systems,2.2572877059569074
1980,Theory,1.8742999377722465


In [35]:
avg_authors_df = sql_to_df(avg_authors, ['decade', 'area', 'avgcoauthor'])
avg_authors_df.avgcoauthor = pd.to_numeric(avg_authors_df.avgcoauthor)


alt.Chart(avg_authors_df, title="Question 4C").mark_bar().encode(
    x='area:O',
    y='avgcoauthor:Q',
    color='area:N',
    column='decade:N'
)


In [55]:
%%sql
WITH authors_per_paper AS (
    SELECT inproceedings.pubkey, area, COALESCE(COUNT(author), 0) AS authcount, (year / 10)*10 AS decade
    FROM inproceedings, authorship
    WHERE inproceedings.pubkey = authorship.pubkey
        AND area != 'UNKNOWN'
    GROUP BY inproceedings.pubkey
),
papers_per_area AS (
    SELECT area, COUNT(*) as papercount, (year / 10)*10 AS decade
    FROM inproceedings
    WHERE area != 'UNKNOWN'
    GROUP BY area, decade
),
collated_papers AS (
    SELECT area, SUM(authcount) AS totalauthors, decade
    FROM authors_per_paper
    GROUP BY area, decade
),
avg_authors AS (
    SELECT papers_per_area.decade, papers_per_area.area, (totalauthors / papercount) AS avgcoauthor
    FROM papers_per_area, collated_papers
    WHERE papers_per_area.area = collated_papers.area
        AND papers_per_area.decade = collated_papers.decade
    ORDER BY decade ASC, area 
),
area_count AS (
    SELECT area, COUNT(*) AS n FROM avg_authors GROUP BY area
), 
decade_count AS (
    SELECT (decade % 1960) / 10 AS decade_num, area, avgcoauthor
    FROM avg_authors
),
abcdn_tab AS (
    SELECT area, SUM(decade_num * avgcoauthor) AS a, SUM(decade_num) as b, SUM(avgcoauthor) AS c, SUM(decade_num^2) AS d, COUNT(*) AS n FROM decade_count GROUP BY area
)

-- SELECT area, COUNT(*) FROM decade_count GROUP BY area
SELECT 
    area, ((n * a) - (b * c)) / ((n * d) - b^2) AS slope
FROM abcdn_tab
ORDER BY slope DESC 

 * postgresql://postgres:***@Big-Hat-Logan.local/dblp
4 rows affected.


area,slope
Systems,0.7795931455206031
Database,0.5684403328239886
ML-AI,0.4161163827095369
Theory,0.2847553752782215
