In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [24]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of ``var`` occurs over all the years.

    Parameters
    ----------
    data : str
        Either ``'papers'`` or ``'authors'``. Selects the module-level frame
        (``papers`` / ``authors``) and its matching denominator
        (``total_paper_num`` / ``total_author_num``).
    var : str
        Name of the column in the selected frame to tabulate.

    Returns
    -------
    pandas.DataFrame
        Columns ``[var, 'freq', 'prop']``; ``prop`` is ``freq`` divided by the
        selected total, rounded to three decimals. Rows are ordered by
        descending frequency, and equal frequencies keep the order in which
        the values first appear in the column.

    Raises
    ------
    AssertionError
        If ``data`` is neither ``'papers'`` nor ``'authors'``.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        source, total = authors, total_author_num
    else:
        source, total = papers, total_paper_num
    df = pd.DataFrame(list(Counter(source[var]).items()), columns = [var, 'freq'])
    df['prop'] = round(df['freq'] / total, 3)
    # Sort on the raw count rather than the rounded proportion: different
    # counts can round to the same 3-decimal 'prop', and the default sort is
    # not stable, so sorting on 'prop' could leave such rows in arbitrary order.
    df.sort_values('freq', ascending = False, kind = 'mergesort', inplace = True)
    return df

In [25]:
def get_freq_and_prop(data, var):
    """Build the per-year frequency table (time series) for ``var``.

    ``data`` must be ``'papers'`` or ``'authors'``; it picks the module-level
    frame to group and the matching year -> total lookup
    (``yearly_papernum_dic`` / ``yearly_authornum_dic``).

    Returns a frame with columns ``['year', var, 'freq', 'year total', 'prop']``
    where ``prop`` is ``freq / 'year total'`` rounded to three decimals.
    Raises ``AssertionError`` for any other ``data`` value.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        year_totals, frame = yearly_papernum_dic, papers
    else:
        year_totals, frame = yearly_authornum_dic, authors
    counts = frame.groupby(['year', var]).size().reset_index(name='freq')
    # Direct dict lookup: a year missing from the totals raises KeyError.
    counts['year total'] = [year_totals[yr] for yr in counts['year']]
    counts['prop'] = (counts['freq'] / counts['year total']).round(3)
    return counts

In [26]:
def transform_year(df):
    """Return a copy of ``df`` with its ``year`` column parsed as datetimes.

    Values are parsed with the ``'%Y'`` format. The frame passed in is not
    modified.
    """
    parsed_years = pd.to_datetime(df['year'], format='%Y')
    return df.assign(year=parsed_years)

In [27]:
# Load the processed author-level and paper-level tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# Boolean values in the papers table are recoded to 'Yes'/'No' labels.
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv').replace(
    {True: 'Yes', False: 'No'}
)


In [28]:
# Keep the `top_num` most frequent predicted countries as their own categories
# and pool every other value under 'Other' in a new column.
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic)
authors['countrypred_new'] = authors['countrypred'].map(
    lambda code: code if code in top_country else 'Other'
)

In [29]:
# Replace the full journal titles with their abbreviations in both tables.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
for frame in (papers, authors):
    # In place on purpose: rebinding the loop variable would not update the
    # module-level `papers` / `authors` frames.
    frame.replace(journal_abbreviations, inplace = True)

In [30]:
# Overall row counts; get_simple_prop_df divides by these to get proportions.
total_paper_num, total_author_num = len(papers), len(authors)

In [31]:
# Display the paper and author row counts as a (papers, authors) tuple.
total_paper_num, total_author_num

(5712, 11292)

In [32]:
# Rows per year in each table, as frames with columns ['year', 'freq'] ...
yearly_papernum = papers.groupby('year').size().reset_index(name='freq')
yearly_authornum = authors.groupby('year').size().reset_index(name='freq')
# ... and as year -> count lookups, read by get_freq_and_prop.
yearly_authornum_dic = dict(
    zip(yearly_authornum['year'], yearly_authornum['freq'])
)
yearly_papernum_dic = dict(
    zip(yearly_papernum['year'], yearly_papernum['freq'])
)

## Country

In [33]:
# Collect the distinct values of the raw `countrypred` column (order is
# whatever the set yields) and show how many there are.
all_country = list(set(authors['countrypred']))
len(all_country)

77

In [34]:
# All-years share of authors per (pooled) country. The two-letter codes
# NL, DE, GB and IL are swapped for readable labels; other values are untouched.
country_labels = {
    'NL': 'Netherlands',
    'DE': 'Germany',
    'GB': 'Great Britain',
    'IL': 'Israel',
}
country_prop = get_simple_prop_df('authors', 'countrypred_new').replace(country_labels)
country_prop

Unnamed: 0,countrypred_new,freq,prop
0,US,8467,0.75
2,Other,1661,0.147
5,Netherlands,364,0.032
3,Great Britain,297,0.026
1,Germany,276,0.024
4,Israel,227,0.02


In [35]:
# Per-year author counts and shares by pooled country; show the 'US' rows.
country_ts = get_freq_and_prop(data='authors', var='countrypred_new')
country_ts.loc[country_ts['countrypred_new'] == 'US']

Unnamed: 0,year,countrypred_new,freq,year total,prop
0,1951,US,24,24,1.000
2,1952,US,23,25,0.920
4,1953,US,15,17,0.882
5,1954,US,16,16,1.000
7,1955,US,10,11,0.909
...,...,...,...,...,...
260,2018,US,230,368,0.625
266,2019,US,187,269,0.695
272,2020,US,197,299,0.659
278,2021,US,267,455,0.587


In [36]:
# Yearly rows for the 'DE' category.
country_ts.loc[country_ts['countrypred_new'] == 'DE']

Unnamed: 0,year,countrypred_new,freq,year total,prop
39,1974,DE,1,91,0.011
73,1983,DE,1,144,0.007
94,1988,DE,1,102,0.01
108,1992,DE,2,121,0.017
113,1993,DE,3,152,0.02
122,1995,DE,3,144,0.021
128,1996,DE,2,190,0.011
133,1997,DE,2,164,0.012
139,1998,DE,5,164,0.03
145,1999,DE,3,136,0.022


In [37]:
# Yearly rows for the 'NL' category.
country_ts.loc[country_ts['countrypred_new'] == 'NL']

Unnamed: 0,year,countrypred_new,freq,year total,prop
57,1978,NL,2,181,0.011
76,1983,NL,3,144,0.021
115,1993,NL,1,152,0.007
125,1995,NL,2,144,0.014
130,1996,NL,3,190,0.016
136,1997,NL,2,164,0.012
142,1998,NL,2,164,0.012
148,1999,NL,3,136,0.022
154,2000,NL,8,171,0.047
164,2002,NL,6,263,0.023


In [38]:
# Yearly rows for the pooled 'Other' category.
country_ts.loc[country_ts['countrypred_new'] == 'Other']

Unnamed: 0,year,countrypred_new,freq,year total,prop
1,1952,Other,2,25,0.080
3,1953,Other,2,17,0.118
6,1955,Other,1,11,0.091
10,1958,Other,2,13,0.154
15,1961,Other,1,24,0.042
...,...,...,...,...,...
259,2018,Other,68,368,0.185
265,2019,Other,45,269,0.167
271,2020,Other,66,299,0.221
277,2021,Other,90,455,0.198


In [39]:
# All-years frequency and share of each `with_us_authors` value among papers
# (presumably flags papers with at least one US author; confirm upstream).
with_us_prop = get_simple_prop_df(data='papers', var='with_us_authors')
with_us_prop

Unnamed: 0,with_us_authors,freq,prop
0,Yes,4523,0.792
1,No,1189,0.208


In [40]:
# Per-year frequency and share of each `with_us_authors` value; peek at the
# first rows.
with_us_prop_ts = get_freq_and_prop(data='papers', var='with_us_authors')
with_us_prop_ts.head()

Unnamed: 0,year,with_us_authors,freq,year total,prop
0,1951,Yes,20,20,1.0
1,1952,No,2,24,0.083
2,1952,Yes,22,24,0.917
3,1953,No,2,17,0.118
4,1953,Yes,15,17,0.882


In [41]:
# Last rows of the yearly `with_us_authors` table.
with_us_prop_ts.tail()

Unnamed: 0,year,with_us_authors,freq,year total,prop
129,2020,Yes,95,131,0.725
130,2021,No,56,167,0.335
131,2021,Yes,111,167,0.665
132,2022,No,34,78,0.436
133,2022,Yes,44,78,0.564


In [42]:
# View the yearly table ordered by ascending share; returns a sorted copy and
# leaves `with_us_prop_ts` itself unchanged.
with_us_prop_ts.sort_values('prop')

Unnamed: 0,year,with_us_authors,freq,year total,prop
25,1968,No,1,30,0.033
32,1972,No,1,28,0.036
30,1971,No,1,25,0.040
48,1980,No,7,115,0.061
62,1987,No,3,47,0.064
...,...,...,...,...,...
23,1966,Yes,4,4,1.000
24,1967,Yes,6,6,1.000
27,1969,Yes,25,25,1.000
8,1956,Yes,13,13,1.000


## Number of countries

In [43]:
# All-years frequency and share of each `num_country` value among papers.
num_country_df = get_simple_prop_df(data='papers', var='num_country')
num_country_df

Unnamed: 0,num_country,freq,prop
0,1,5123,0.897
1,2,513,0.09
3,3,61,0.011
2,4,10,0.002
5,9,3,0.001
4,5,1,0.0
6,7,1,0.0


In [44]:
# Re-display the paper total, the denominator behind the 'prop' column for
# paper-level tables.
total_paper_num

5712

## Affiliations