In [41]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [42]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of ``var`` occurs over all the years.

    Reads the notebook-level ``papers`` / ``authors`` frames and the
    matching ``total_paper_num`` / ``total_author_num`` counts.

    Input:
        - data: 'papers' or 'authors', selecting which frame is counted
        - var: name of the column to tabulate
    Returns a frame with columns [var, 'freq', 'prop'] sorted by 'prop'
    in descending order, where 'prop' is freq / total rounded to 3 decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    use_authors = data == 'authors'
    source = authors if use_authors else papers
    value_counts = Counter(source[var])
    df = pd.DataFrame(value_counts.items(), columns=[var, 'freq'])
    # The denominator is the overall row count of the selected frame.
    denominator = total_author_num if use_authors else total_paper_num
    df['prop'] = (df['freq'] / denominator).round(3)
    return df.sort_values('prop', ascending=False)

In [43]:
def get_freq_and_prop(data, var):
    """Build the per-year time series of counts and shares for ``var``.

    Reads the notebook-level ``papers`` / ``authors`` frames and the
    ``yearly_papernum_dic`` / ``yearly_authornum_dic`` lookups.

    Input:
        - data: 'papers' or 'authors', selecting which frame is grouped
        - var: name of the column grouped together with 'year'
    Returns a frame with columns ['year', var, 'freq', 'year total', 'prop'],
    where 'prop' is freq / year total rounded to 3 decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        totals_by_year, source = yearly_papernum_dic, papers
    else:
        totals_by_year, source = yearly_authornum_dic, authors
    df = source.groupby(['year', var]).size().to_frame('freq').reset_index()
    # Attach the size of each year so the share can be computed row by row.
    df['year total'] = [totals_by_year[year] for year in df['year']]
    df['prop'] = (df['freq'] / df['year total']).round(3)
    return df

In [44]:
def transform_year(df):
    """Return a copy of ``df`` whose 'year' column is parsed as datetimes.

    The values are parsed with the '%Y' format; the input frame is not modified.
    """
    parsed_year = pd.to_datetime(df['year'], format='%Y')
    result = df.copy()
    result['year'] = parsed_year
    return result

In [45]:
# Load the processed author-level and paper-level tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# Recode True/False values anywhere in the papers frame as 'Yes'/'No'.
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv').replace(
    {True: 'Yes', False: 'No'}
)

In [46]:
# Display label for each country code that is reported on its own,
# plus the catch-all 'Other' bucket.
country_dic = dict(
    US='US',
    NL='Netherlands',
    DE='Germany',
    GB='UK',
    IL='Israel',
    Other='Other',
)

In [47]:
# Keep the `top_num` most frequent values of authors['countrypred'].
top_num = 5
top_country_dic = {
    code: count
    for code, count in Counter(authors['countrypred']).most_common(top_num)
}
top_country = [*top_country_dic]
top_country

['US', 'NL', 'DE', 'GB', 'IL']

In [48]:
# Collapse every code outside top_country into 'Other', then swap the code
# for its display label from country_dic.
authors['countrypred_new'] = [
    country_dic[code if code in top_country else 'Other']
    for code in authors['countrypred']
]

In [49]:
# Swap full journal titles for their abbreviations wherever they occur,
# in place, in both frames.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
for frame in (papers, authors):
    frame.replace(journal_abbreviations, inplace=True)

In [50]:
# Overall row counts, used as denominators by get_simple_prop_df.
total_paper_num, total_author_num = papers.shape[0], authors.shape[0]

In [51]:
# Number of rows per year in each frame.
yearly_papernum = papers.groupby('year').size().reset_index(name='freq')
yearly_authornum = authors.groupby('year').size().reset_index(name='freq')
# year -> row count lookups, used as denominators by get_freq_and_prop.
yearly_authornum_dic = dict(
    zip(yearly_authornum['year'], yearly_authornum['freq'])
)
yearly_papernum_dic = dict(
    zip(yearly_papernum['year'], yearly_papernum['freq'])
)

In [52]:
def get_table(df, var1, var2):
    '''Build a count table: var1 (rows) by var2 (columns), with totals.

    Input:
        - df: authors
        - var1: most likely 'journal'
        - var2: 'racepred', 'genderpred', 'countrypred_new'
    Returns a frame indexed by the values of var1 with one column per value
    of var2, plus a trailing 'Total' column (row sums) and a trailing
    'Total' row (column sums). All cells are integer counts.
    '''
    # unstack(fill_value=0) fills absent (var1, var2) combinations with 0
    # directly, so the counts are not upcast to float the way pivot + fillna
    # does (which made Total show as e.g. 209.0 in some tables only).
    counts = df.groupby([var1, var2]).size().unstack(fill_value=0)
    counts['Total'] = counts.sum(axis=1)
    # Column sums, including the grand total of the 'Total' column itself.
    column_totals = counts.sum(axis=0).tolist()
    counts.loc['Total'] = column_totals
    return counts.astype('int64')

In [53]:
def update_row(row):
    '''Format one table row as 'count (share of row total)' strings.

    The last element of the row is taken as the row total. Every other
    element becomes 'N (P%)' with one decimal place when it is positive,
    and '0' otherwise. The total is appended unchanged at the end.
    '''
    values = row.tolist()
    total = values[-1]
    counts = values[:-1]
    # Shares are computed for every cell up front, including the zero ones.
    shares = [format(count / total, '.1%') for count in counts]
    cells = [
        '{} ({})'.format(int(count), share) if count > 0 else '0'
        for count, share in zip(counts, shares)
    ]
    cells.append(total)
    return cells

In [54]:
def update_df(df):
    '''Render a count table with each cell in the format 'number (percentage)'.

    Input:
        - df: j_race, j_gender, j_country (a table whose last column holds
          the row totals, as consumed by update_row)
    Returns a new frame with a default integer index. The original index
    becomes the first column, labelled 'Journal' when df.index.name is
    'journal' and 'Aff country' otherwise; the remaining columns keep their
    names and hold the values produced by update_row.
    '''
    # Header typo fixed: this label was previously spelled 'Jounral' and
    # ended up verbatim in the rendered tables.
    label = 'Journal' if df.index.name == 'journal' else 'Aff country'
    records = [(index, *update_row(row)) for index, row in df.iterrows()]
    return pd.DataFrame(records, columns=[label, *df.columns.tolist()])

## Country by race

In [55]:
# Counts of 'racepred' within each 'countrypred_new' group, formatted as
# 'count (row %)'.
country_by_race = update_df(
    get_table(df=authors, var1='countrypred_new', var2='racepred')
)
country_by_race

Unnamed: 0,Aff country,Asian,Black,Hispanic,Indigenous,Middle Eastern,White,Total
0,Germany,4 (1.9%),1 (0.5%),0,0,1 (0.5%),203 (97.1%),209.0
1,Israel,5 (4.0%),0,2 (1.6%),0,17 (13.5%),102 (81.0%),126.0
2,Netherlands,5 (1.8%),0,1 (0.4%),1 (0.4%),2 (0.7%),274 (96.8%),283.0
3,Other,256 (29.3%),8 (0.9%),103 (11.8%),0,10 (1.1%),497 (56.9%),874.0
4,UK,11 (7.0%),2 (1.3%),6 (3.8%),0,2 (1.3%),137 (86.7%),158.0
5,US,518 (18.6%),78 (2.8%),110 (4.0%),7 (0.3%),31 (1.1%),2040 (73.3%),2784.0
6,Total,799 (18.0%),89 (2.0%),222 (5.0%),8 (0.2%),63 (1.4%),3253 (73.4%),4434.0


In [56]:
# Emit the table as LaTeX source; index=True keeps the 0..N row index as the
# first (unnamed) column of the tabular.
print(country_by_race.to_latex(index=True)) 

\begin{tabular}{llllllllr}
\toprule
{} &  Aff country &        Asian &      Black &     Hispanic & Indigenous & Middle Eastern &         White &   Total \\
\midrule
0 &      Germany &     4 (1.9\%) &   1 (0.5\%) &            0 &          0 &       1 (0.5\%) &   203 (97.1\%) &   209.0 \\
1 &       Israel &     5 (4.0\%) &          0 &     2 (1.6\%) &          0 &     17 (13.5\%) &   102 (81.0\%) &   126.0 \\
2 &  Netherlands &     5 (1.8\%) &          0 &     1 (0.4\%) &   1 (0.4\%) &       2 (0.7\%) &   274 (96.8\%) &   283.0 \\
3 &        Other &  256 (29.3\%) &   8 (0.9\%) &  103 (11.8\%) &          0 &      10 (1.1\%) &   497 (56.9\%) &   874.0 \\
4 &           UK &    11 (7.0\%) &   2 (1.3\%) &     6 (3.8\%) &          0 &       2 (1.3\%) &   137 (86.7\%) &   158.0 \\
5 &           US &  518 (18.6\%) &  78 (2.8\%) &   110 (4.0\%) &   7 (0.3\%) &      31 (1.1\%) &  2040 (73.3\%) &  2784.0 \\
6 &        Total &  799 (18.0\%) &  89 (2.0\%) &   222 (5.0\%) &   8 (0.2\%) &      63 (1.4\

  print(country_by_race.to_latex(index=True))


## Journal by race

In [57]:
# Counts of 'racepred' within each journal, shown as 'count (row %)'.
j_race = get_table(df=authors, var1='journal', var2='racepred')
update_df(df=j_race)

Unnamed: 0,Jounral,Asian,Black,Hispanic,Indigenous,Middle Eastern,White,Total
0,CCC,97 (17.0%),28 (4.9%),33 (5.8%),2 (0.4%),13 (2.3%),396 (69.6%),569.0
1,CT,50 (10.8%),10 (2.2%),21 (4.5%),0,7 (1.5%),376 (81.0%),464.0
2,HCR,165 (20.0%),10 (1.2%),37 (4.5%),1 (0.1%),11 (1.3%),603 (72.9%),827.0
3,JCMC,231 (24.2%),6 (0.6%),54 (5.7%),3 (0.3%),10 (1.0%),650 (68.1%),954.0
4,JOC,256 (15.8%),35 (2.2%),77 (4.8%),2 (0.1%),22 (1.4%),1228 (75.8%),1620.0
5,Total,799 (18.0%),89 (2.0%),222 (5.0%),8 (0.2%),63 (1.4%),3253 (73.4%),4434.0


## Journal by gender

In [58]:
# Counts of 'genderpred' within each journal, shown as 'count (row %)'.
j_gender = get_table(df=authors, var1='journal', var2='genderpred')
update_df(df=j_gender)

Unnamed: 0,Jounral,F,M,N,Total
0,CCC,323 (56.8%),242 (42.5%),4 (0.7%),569.0
1,CT,188 (40.5%),276 (59.5%),0,464.0
2,HCR,393 (47.5%),432 (52.2%),2 (0.2%),827.0
3,JCMC,451 (47.3%),503 (52.7%),0,954.0
4,JOC,758 (46.8%),861 (53.1%),1 (0.1%),1620.0
5,Total,2113 (47.7%),2314 (52.2%),7 (0.2%),4434.0


## Journal by country

In [59]:
# Counts of 'countrypred_new' within each journal, shown as 'count (row %)'.
j_country = get_table(df=authors, var1='journal', var2='countrypred_new')
update_df(df=j_country)

Unnamed: 0,Jounral,Germany,Israel,Netherlands,Other,UK,US,Total
0,CCC,7 (1.2%),21 (3.7%),6 (1.1%),106 (18.6%),45 (7.9%),384 (67.5%),569
1,CT,26 (5.6%),31 (6.7%),12 (2.6%),108 (23.3%),18 (3.9%),269 (58.0%),464
2,HCR,28 (3.4%),16 (1.9%),68 (8.2%),105 (12.7%),9 (1.1%),601 (72.7%),827
3,JCMC,29 (3.0%),20 (2.1%),91 (9.5%),264 (27.7%),44 (4.6%),506 (53.0%),954
4,JOC,119 (7.3%),38 (2.3%),106 (6.5%),291 (18.0%),42 (2.6%),1024 (63.2%),1620
5,Total,209 (4.7%),126 (2.8%),283 (6.4%),874 (19.7%),158 (3.6%),2784 (62.8%),4434
