In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [2]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of `var` occurs across all years.

    Parameters
    ----------
    data : str
        Either 'papers' or 'authors'; picks the module-level table to count
        and the matching module-level total used as the denominator.
    var : str
        Column of the chosen table whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns [var, 'freq', 'prop'], where 'prop' is 'freq' divided by
        `total_author_num` / `total_paper_num` and rounded to 3 decimals,
        ordered by 'prop' from largest to smallest.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    # Only the selected pair of globals is looked up.
    if data == 'authors':
        table, denominator = authors, total_author_num
    else:
        table, denominator = papers, total_paper_num
    value_counts = Counter(table[var])
    df = pd.DataFrame(value_counts.items(), columns=[var, 'freq'])
    df['prop'] = (df['freq'] / denominator).round(3)
    return df.sort_values('prop', ascending=False)

In [3]:
def get_freq_and_prop(data, var):
    """Build a per-year frequency/proportion table for the values of `var`.

    Parameters
    ----------
    data : str
        Either 'papers' or 'authors'; picks the module-level table to group
        and the matching year -> total lookup dict.
    var : str
        Column grouped together with 'year'.

    Returns
    -------
    pandas.DataFrame
        Columns ['year', var, 'freq', 'year total', 'prop'], where 'prop' is
        'freq' / 'year total' rounded to 3 decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        table, totals_by_year = papers, yearly_papernum_dic
    else:
        table, totals_by_year = authors, yearly_authornum_dic
    group_sizes = table.groupby(['year', var]).size()
    df = group_sizes.to_frame('freq').reset_index()
    # A year missing from the lookup raises KeyError, as with plain dict access.
    df['year total'] = [totals_by_year[yr] for yr in df['year']]
    df['prop'] = (df['freq'] / df['year total']).round(3)
    return df

In [4]:
def transform_year(df):
    """Return a copy of `df` whose 'year' column is parsed as datetimes.

    The values are parsed with the '%Y' format; the input frame itself is
    left unchanged.
    """
    parsed_year = pd.to_datetime(df['year'], format='%Y')
    return df.assign(year=parsed_year)

In [5]:
# Load the author-level table.
# NOTE(review): other cells treat `countrypred` as two-letter country codes
# ('NL', 'DE', 'GB', 'IL'); with read_csv's default NA parsing a literal 'NA'
# code would be loaded as NaN -- confirm that cannot occur in this file.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')

# Load the paper-level table and relabel boolean cells as 'Yes' / 'No'.
papers = pd.read_csv(
    '../data/processed/papers_to_study_expanded.csv'
).replace({True: 'Yes', False: 'No'})


In [6]:
# Keep the `top_num` most frequent predicted countries as their own groups;
# every other value of `countrypred` is folded into 'Other'.
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic)
authors['countrypred_new'] = authors['countrypred'].apply(
    lambda country: 'Other' if country not in top_country else country
)

In [7]:
# Replace the full journal titles with their abbreviations, in place, in both tables.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
for frame in (papers, authors):
    frame.replace(journal_abbreviations, inplace=True)

In [8]:
# Row counts of the two tables; get_simple_prop_df divides by these.
total_paper_num, total_author_num = papers.shape[0], authors.shape[0]

In [9]:
# Display both row counts as a (papers, authors) pair.
total_paper_num, total_author_num

(3169, 7083)

In [10]:
# Rows per year in the papers table, as a frame and as a year -> count lookup.
yearly_papernum = (
    papers.groupby('year')
    .size()
    .to_frame(name='freq')
    .reset_index()
)
yearly_papernum_dic = dict(zip(yearly_papernum['year'], yearly_papernum['freq']))

# Rows per year in the authors table, in the same two forms.
yearly_authornum = (
    authors.groupby('year')
    .size()
    .to_frame(name='freq')
    .reset_index()
)
yearly_authornum_dic = dict(zip(yearly_authornum['year'], yearly_authornum['freq']))

## Country

In [11]:
# Distinct values of the original (un-grouped) `countrypred` column, and how many there are.
all_country = list(set(authors['countrypred']))
len(all_country)

62

In [13]:
# All-years share of author rows per country group, with the two-letter
# codes of the non-US top countries spelled out for display.
country_prop = get_simple_prop_df('authors', 'countrypred_new').replace({
    'NL': 'Netherlands',
    'DE': 'Germany',
    'GB': 'Great Britain',
    'IL': 'Israel',
})
country_prop

Unnamed: 0,countrypred_new,freq,prop
0,US,4794,0.677
2,Other,1294,0.183
5,Netherlands,345,0.049
1,Germany,253,0.036
3,Great Britain,224,0.032
4,Israel,173,0.024


In [20]:
# Yearly frequency and share of author rows per country group; show the US rows.
country_ts = get_freq_and_prop('authors', 'countrypred_new')
country_ts.loc[country_ts['countrypred_new'] == 'US']

Unnamed: 0,year,countrypred_new,freq,year total,prop
5,2000,US,136,171,0.795
9,2001,US,142,177,0.802
15,2002,US,208,263,0.791
20,2003,US,161,214,0.752
25,2004,US,202,248,0.815
31,2005,US,246,318,0.774
37,2006,US,254,311,0.817
43,2007,US,206,308,0.669
49,2008,US,232,304,0.763
55,2009,US,223,335,0.666


In [16]:
# Yearly rows for the pooled 'Other' country group.
country_ts.loc[country_ts['countrypred_new'] == 'Other']

Unnamed: 0,year,countrypred_new,freq,year total,prop
4,2000,Other,16,171,0.094
8,2001,Other,25,177,0.141
14,2002,Other,41,263,0.156
19,2003,Other,35,214,0.164
24,2004,Other,31,248,0.125
30,2005,Other,53,318,0.167
36,2006,Other,33,311,0.106
42,2007,Other,68,308,0.221
48,2008,Other,42,304,0.138
54,2009,Other,76,335,0.227


In [17]:
# All-years frequency and share of papers by their `with_us_authors` label.
with_us_prop = get_simple_prop_df('papers', 'with_us_authors')
with_us_prop

Unnamed: 0,with_us_authors,freq,prop
0,Yes,2290,0.723
1,No,879,0.277


In [18]:
# Per-year frequency and share of papers by `with_us_authors`; preview the first rows.
with_us_prop_ts = get_freq_and_prop('papers', 'with_us_authors')
with_us_prop_ts.head()

Unnamed: 0,year,with_us_authors,freq,year total,prop
0,2000,No,16,88,0.182
1,2000,Yes,72,88,0.818
2,2001,No,11,92,0.12
3,2001,Yes,81,92,0.88
4,2002,No,27,125,0.216


In [19]:
# Last rows of the per-year table.
with_us_prop_ts.tail()

Unnamed: 0,year,with_us_authors,freq,year total,prop
41,2020,Yes,94,131,0.718
42,2021,No,56,167,0.335
43,2021,Yes,111,167,0.665
44,2022,No,35,78,0.449
45,2022,Yes,43,78,0.551


In [21]:
# Order the year / label rows from the smallest yearly share to the largest.
with_us_prop_ts.sort_values('prop')

Unnamed: 0,year,with_us_authors,freq,year total,prop
2,2001,No,11,92,0.12
8,2004,No,15,109,0.138
12,2006,No,22,137,0.161
22,2011,No,23,138,0.167
0,2000,No,16,88,0.182
10,2005,No,27,143,0.189
6,2003,No,23,112,0.205
4,2002,No,27,125,0.216
16,2008,No,32,148,0.216
20,2010,No,31,139,0.223


## Number of countries

In [22]:
# All-years frequency and share of papers for each value of `num_country`.
num_country_df = get_simple_prop_df('papers', 'num_country')
num_country_df

Unnamed: 0,num_country,freq,prop
0,1,2690,0.849
1,2,408,0.129
3,3,57,0.018
2,4,9,0.003
5,9,3,0.001
4,6,1,0.0
6,7,1,0.0


In [23]:
# Re-display the paper row count that the 'papers' proportions are divided by.
total_paper_num

3169

## Affiliations