In [43]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [44]:
def get_simple_prop_df(data, var):
    # get frequency and proportion over all the years
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        df = pd.DataFrame(Counter(authors[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_author_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    else:
        df = pd.DataFrame(Counter(papers[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_paper_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    return df

In [45]:
def get_freq_and_prop(data, var):
    # get time series data
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        dic = yearly_papernum_dic
        df = papers.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    else:
        dic = yearly_authornum_dic
        df = authors.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [46]:
def transform_year(df):
    dff = df.copy()
    dff['year'] = pd.to_datetime(dff['year'] , format='%Y')
    return dff

In [47]:
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv')
papers.replace({
    True: 'Yes',
    False: 'No'
}, inplace = True)

In [48]:
for i in [papers, authors]:
    i.replace({
        'Communication Theory': 'CT',
        'Communication, Culture and Critique': 'CCC',
        'Human Communication Research': 'HCR',
        'Journal of Communication': 'JOC',
        'Journal of Computer-Mediated Communication': 'JCMC'
                      }, inplace = True)

In [49]:
total_paper_num = len(papers)
total_author_num = len(authors)

In [50]:
yearly_papernum = papers.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum = authors.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum_dic = dict(zip(yearly_authornum.year, yearly_authornum.freq))
yearly_papernum_dic = dict(zip(yearly_papernum.year, yearly_papernum.freq))

## Analysis

In [51]:
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic.keys())
top_country

['US', 'NL', 'DE', 'GB', 'IL']

In [52]:
all_country = list(set(authors['countrypred']))
len(all_country)

51

In [53]:
authors['countrypred_new'] = authors['countrypred'].apply(
        lambda x: x if x in top_country else 'Other'
    )

In [54]:
country_prop = get_simple_prop_df('authors', 'countrypred_new')
country_prop.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'United Kimdom',
                    'IL': 'Israel'}, inplace = True)
country_prop

Unnamed: 0,countrypred_new,freq,prop
0,US,2784,0.628
2,Other,874,0.197
5,Netherlands,283,0.064
1,Germany,209,0.047
3,United Kimdom,158,0.036
4,Israel,126,0.028


In [55]:
countries = country_prop.countrypred_new.tolist()
country_colors = ['grey', 'orange', 'red', 'teal', 'pink', 'purple']

In [56]:
country_dist_chart = alt.Chart(country_prop).mark_bar().encode(
    x = alt.X(
        'countrypred_new',
        title = None,
        sort = '-y',
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors'
    ),
    color = alt.Color(
        'countrypred_new',
        title = 'Country/Region',
        legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    )
).properties(
    title = 'a',
    height=300,
    width=140
)

country_dist_chart

In [57]:
country_ts = get_freq_and_prop('authors', 'countrypred_new')
country_ts.columns = ['year', 'country', 'freq', 'year total', 'prop']
country_ts.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'United Kimdom',
                    'IL': 'Israel'}, inplace = True)
country_ts

Unnamed: 0,year,country,freq,year total,prop
0,2010,Germany,3,295,0.010
1,2010,United Kimdom,3,295,0.010
2,2010,Israel,11,295,0.037
3,2010,Netherlands,9,295,0.031
4,2010,Other,50,295,0.169
...,...,...,...,...,...
73,2022,United Kimdom,9,200,0.045
74,2022,Israel,5,200,0.025
75,2022,Netherlands,20,200,0.100
76,2022,Other,38,200,0.190


In [58]:
source = country_ts

country_stacked_chart = alt.Chart(transform_year(source)).mark_area().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y=alt.Y(
        "freq:Q",
        title = '# of Authors'
    ),
    color=alt.Color(
        "country:N",
        title = 'Country/Region',
        legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    )
).properties(
    title = 'b',
    height=300,
    width=260
)

country_stacked_chart

In [59]:
line = alt.Chart(transform_year(source)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'country',
        legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    ),
)

points = line.mark_point().encode(
    color = alt.Color(
        'country',
        title = 'Country/Region',
#         legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    ),
    shape = alt.Shape(
        'country',
#         legend = None,
        scale = alt.Scale(
            domain = countries
        )
    )
)

country_prop_ts_chart = alt.layer(
    line,
    points,
).resolve_scale(
    color = 'independent',
    shape = 'independent'
).properties(
    title = 'c',
    height=300,
    width=300
)

country_prop_ts_chart

In [60]:
chart1 = alt.hconcat(
    country_dist_chart,
    country_stacked_chart,
    country_prop_ts_chart,
)

chart1

### With US authors

In [61]:
with_us_prop = get_simple_prop_df('papers', 'with_us_authors')
with_us_prop

Unnamed: 0,with_us_authors,freq,prop
0,Yes,1301,0.68
1,No,611,0.32


In [62]:
source = with_us_prop
with_us_chart = alt.Chart(source).mark_bar().encode(
    x = alt.X(
        'with_us_authors',
        title = 'With US authors',
        sort = '-y',
        axis = alt.Axis(labelAngle = 0)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications'
    ),
    color = alt.Color(
        'with_us_authors',
        legend = None,
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    )
).properties(
    title = 'd',
    height=200,
    width=100
)

with_us_chart

In [63]:
with_us_prop_ts = get_freq_and_prop('papers', 'with_us_authors')
with_us_prop_ts.head()

Unnamed: 0,year,with_us_authors,freq,year total,prop
0,2010,No,31,139,0.223
1,2010,Yes,108,139,0.777
2,2011,No,23,138,0.167
3,2011,Yes,115,138,0.833
4,2012,No,57,163,0.35


In [64]:
with_us_ts_chart = alt.Chart(transform_year(with_us_prop_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'with_us_authors',
        title = 'With US authors',
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    )
).properties(
    title = 'e',
    height=200,
    width=200
)

with_us_ts_chart

## Number of countries

In [65]:
num_country_df = get_simple_prop_df('papers', 'num_country')
num_country_df

Unnamed: 0,num_country,freq,prop
0,1,1581,0.827
1,2,278,0.145
3,3,43,0.022
2,4,6,0.003
5,9,3,0.002
4,6,1,0.001


In [66]:
country_num_dist_chart = alt.Chart(num_country_df).mark_bar().encode(
    x = alt.X(
        'num_country:N',
        title = 'Number of countries',
        axis = alt.Axis(labelAngle = 0)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications'
    ),
    color = alt.Color(
        'num_country:N',
        legend = None,
    )
).properties(
    title = 'f',
    height=200,
    width=150
)

country_num_dist_chart

In [67]:
num_country_ts = get_freq_and_prop('papers', 'num_country')
num_country_ts = transform_year(num_country_ts)
num_country_ts.head()

Unnamed: 0,year,num_country,freq,year total,prop
0,2010-01-01,1,120,139,0.863
1,2010-01-01,2,16,139,0.115
2,2010-01-01,3,3,139,0.022
3,2011-01-01,1,120,138,0.87
4,2011-01-01,2,15,138,0.109


In [68]:
alt.Chart(num_country_ts).mark_line().encode(
    x = 'year',
    y = 'prop',
    color = 'num_country:N'
)

## Affiliation types

In [69]:
afftype_prop = get_simple_prop_df('authors', 'afftypepred')
afftype_prop.replace({
    'Education': 'Edu',
    'Non Education': 'Non-Edu'
}, inplace = True)
afftype_prop

Unnamed: 0,afftypepred,freq,prop
0,Edu,4293,0.968
1,Non-Edu,141,0.032


In [70]:
afftype_dist_chart = alt.Chart(afftype_prop).mark_bar().encode(
    x = alt.X(
        'afftypepred',
        title = 'Affiliation type',
        axis = alt.Axis(labelAngle = -45),
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors'
#         scale = alt.Scale(type = 'log')
    ),
    color = alt.Color(
        'afftypepred',
        legend = None,
    )
).properties(
    title = 'g',
    height=200,
    width=100
)

afftype_dist_chart

In [71]:
chart2 = alt.hconcat(
    with_us_chart,
    with_us_ts_chart,
    country_num_dist_chart,
    afftype_dist_chart
).resolve_scale(
    color='independent',
)

chart2

In [72]:
alt.vconcat(
    chart1,
    chart2
).configure_axis(
    labelFontSize=15,
    titleFontSize=20
).configure_legend(
    titleFontSize=16,
    labelFontSize=16
).configure_title(
    anchor='start',
    fontSize=22
).configure_point(
    size=100
)

## Country tier/ north-south analysis

In [73]:
from pycountry_convert import *

country_codes = list(set(authors.countrypred))
country_names = [country_alpha2_to_country_name(x) for x in country_codes]

In [74]:
tier = pd.read_csv('../data/raw/ica-country-tier.csv')

a = [x for x in tier['Tier A'] if str(x) != 'nan']
b = [x for x in tier['Tier B'] if str(x) != 'nan']
c = [x for x in tier['Tier C'] if str(x) != 'nan']
name2tier_dic = dict()
for x in a:
    name2tier_dic[x] = 'Tier A'
for x in b:
    name2tier_dic[x] = 'Tier B'
for x in c:
    name2tier_dic[x] = 'Tier C'

In [75]:
authors['country_name'] = authors['countrypred'].apply(
    lambda x: country_alpha2_to_country_name(x)
)
authors['country_tier'] = authors['country_name'].apply(
    lambda x: name2tier_dic[x]
)
authors['country_position'] = authors['country_tier'].apply(
    lambda x: 'Global North' if x == 'Tier A' else 'Global South'
)

In [76]:
tier_prop = get_simple_prop_df('authors', 'country_tier')
tier_prop

Unnamed: 0,country_tier,freq,prop
0,Tier A,4210,0.949
1,Tier B,156,0.035
2,Tier C,68,0.015


In [77]:
alt.Chart(tier_prop).mark_bar().encode(
    x = alt.X(
        'country_tier',
        title = 'Country Tier',
        axis = alt.Axis(labelAngle = -45),
    ),
    y = alt.Y(
        'freq',
        title = '# of Authros'
    ),
    color = alt.Color(
        'country_tier',
        legend = None,
    )
)

In [78]:
position_prop = get_simple_prop_df('authors', 'country_position')
position_prop

Unnamed: 0,country_position,freq,prop
0,Global North,4210,0.949
1,Global South,224,0.051


In [79]:
alt.Chart(position_prop).mark_bar().encode(
    x = alt.X(
        'country_position',
        title = 'Country Status',
        axis = alt.Axis(labelAngle = -45),
    ),
    y = alt.Y(
        'freq',
        title = '# of Authros'
    ),
    color = alt.Color(
        'country_position',
        legend = None,
    )
)

## Race composition for countries

In [80]:
# top = authors[authors.countrypred.isin(top_country)]
# races = list(set(authors.racepred))
# races.sort()
# races

In [81]:
# tuples = []
# for cgroup in authors.groupby('countrypred_new'):
#     country = cgroup[0]
#     total = len(cgroup[1])
#     for rgroup in cgroup[1].groupby('racepred'):
#         race = rgroup[0]
#         rtotal = len(rgroup[1])
#         rprop = "{0:.1%}".format(rtotal / total)
#         tuples.append((country, total, race, rtotal, rprop))
# dff = pd.DataFrame(tuples, columns = ['country', 'grand total', 'race', 'freq', 'prop'])
# dff['freq_prop'] = dff.apply(
#     lambda x: str(x['freq']) + ' ' +  '(' + x['prop'] + ')',
#     axis = 1
# )
# dff.head()

In [82]:
# dff = dff.pivot(index="country", columns="race", values='freq_prop')
# dff.fillna(0, inplace = True)
# dff

In [83]:
# dff = authors.groupby(['countrypred_new', 'racepred']).size().to_frame('freq').reset_index()
# dff = dff.pivot(index="countrypred_new", columns="racepred", values='freq')
# dff.fillna(0, inplace = True)
# dff['Total'] = dff.sum(axis=1)
# rowsum = dff.sum(axis = 0).tolist()
# dff.loc[len(dff.index)] = rowsum
# dff

In [84]:
# print(dff.to_latex(index=True)) 

In [85]:
# dff.to_latex('../data/plots/race-country.tex')

In [86]:
def get_table(df, var1, var2):
    '''this functin gets a table: var1 (row) by var 2 (col)
    Input:
        - df: authors
        - var1: most likely 'journal'
        - var2: 'racepred', 'genderpred', 'countrypred_new'
    '''
    dff = df.groupby([var1, var2]).size().to_frame('freq').reset_index()
    dff = dff.pivot(index=var1, columns=var2, values='freq')
    dff.fillna(0, inplace = True)
    dff['Total'] = dff.sum(axis=1)
    rowsum = dff.sum(axis = 0).tolist()
    dff.loc['Total'] = rowsum
    return dff

In [87]:
def update_row(row):
    '''this function divide each cell by the row total
    '''
    row_data = row.tolist()
    # get total
    total = row_data[-1]
    # remove total
    row_data = row_data[:-1]
    prop = ["{0:.1%}".format(x / total) for x in row_data]
    lst = []
    for i in range(len(row_data)):
        if row_data[i] > 0:
            string = str(int(row_data[i])) + ' ' +  '(' + prop[i] + ')'
        else:
            string = str(0)
        lst.append(string)
    lst.append(total)
    return lst

In [88]:
def update_df(df):
    '''This function updates df so that each cell is in the format of 'number (percentage)'
    Input:
        - df: j_race, j_gender, j_country
    '''
    tuples = []
    for index, row in df.iterrows():
        lst = update_row(row)
        lst.insert(0, index)
        tuples.append(tuple(lst))
    colnames = df.columns.tolist()
    if df.index.name == 'journal':
        colnames.insert(0, 'Jounral')
    else:
        colnames.insert(0, 'Aff country')
    df = pd.DataFrame(tuples, columns = colnames)
    return df

In [89]:
country_by_race = get_table(authors, 'countrypred_new', 'racepred')
country_by_race = update_df(country_by_race)
country_by_race

Unnamed: 0,Aff country,Asian,Black,Hispanic,Indigenous,Middle Eastern,White,Total
0,DE,4 (1.9%),1 (0.5%),0,0,1 (0.5%),203 (97.1%),209.0
1,GB,11 (7.0%),2 (1.3%),6 (3.8%),0,2 (1.3%),137 (86.7%),158.0
2,IL,5 (4.0%),0,2 (1.6%),0,17 (13.5%),102 (81.0%),126.0
3,NL,5 (1.8%),0,1 (0.4%),1 (0.4%),2 (0.7%),274 (96.8%),283.0
4,Other,256 (29.3%),8 (0.9%),103 (11.8%),0,10 (1.1%),497 (56.9%),874.0
5,US,518 (18.6%),78 (2.8%),110 (4.0%),7 (0.3%),31 (1.1%),2040 (73.3%),2784.0
6,Total,799 (18.0%),89 (2.0%),222 (5.0%),8 (0.2%),63 (1.4%),3253 (73.4%),4434.0


In [90]:
print(country_by_race.to_latex(index=True)) 

\begin{tabular}{llllllllr}
\toprule
{} & Aff country &        Asian &      Black &     Hispanic & Indigenous & Middle Eastern &         White &   Total \\
\midrule
0 &          DE &     4 (1.9\%) &   1 (0.5\%) &            0 &          0 &       1 (0.5\%) &   203 (97.1\%) &   209.0 \\
1 &          GB &    11 (7.0\%) &   2 (1.3\%) &     6 (3.8\%) &          0 &       2 (1.3\%) &   137 (86.7\%) &   158.0 \\
2 &          IL &     5 (4.0\%) &          0 &     2 (1.6\%) &          0 &     17 (13.5\%) &   102 (81.0\%) &   126.0 \\
3 &          NL &     5 (1.8\%) &          0 &     1 (0.4\%) &   1 (0.4\%) &       2 (0.7\%) &   274 (96.8\%) &   283.0 \\
4 &       Other &  256 (29.3\%) &   8 (0.9\%) &  103 (11.8\%) &          0 &      10 (1.1\%) &   497 (56.9\%) &   874.0 \\
5 &          US &  518 (18.6\%) &  78 (2.8\%) &   110 (4.0\%) &   7 (0.3\%) &      31 (1.1\%) &  2040 (73.3\%) &  2784.0 \\
6 &       Total &  799 (18.0\%) &  89 (2.0\%) &   222 (5.0\%) &   8 (0.2\%) &      63 (1.4\%) &  32

  print(country_by_race.to_latex(index=True))


In [91]:
j_race = get_table(authors, 'journal', 'racepred')
update_df(j_race)

Unnamed: 0,Jounral,Asian,Black,Hispanic,Indigenous,Middle Eastern,White,Total
0,CCC,97 (17.0%),28 (4.9%),33 (5.8%),2 (0.4%),13 (2.3%),396 (69.6%),569.0
1,CT,50 (10.8%),10 (2.2%),21 (4.5%),0,7 (1.5%),376 (81.0%),464.0
2,HCR,165 (20.0%),10 (1.2%),37 (4.5%),1 (0.1%),11 (1.3%),603 (72.9%),827.0
3,JCMC,231 (24.2%),6 (0.6%),54 (5.7%),3 (0.3%),10 (1.0%),650 (68.1%),954.0
4,JOC,256 (15.8%),35 (2.2%),77 (4.8%),2 (0.1%),22 (1.4%),1228 (75.8%),1620.0
5,Total,799 (18.0%),89 (2.0%),222 (5.0%),8 (0.2%),63 (1.4%),3253 (73.4%),4434.0


In [92]:
j_gender = get_table(authors, 'journal', 'genderpred')
update_df(j_gender)

Unnamed: 0,Jounral,F,M,N,Total
0,CCC,323 (56.8%),242 (42.5%),4 (0.7%),569.0
1,CT,188 (40.5%),276 (59.5%),0,464.0
2,HCR,393 (47.5%),432 (52.2%),2 (0.2%),827.0
3,JCMC,451 (47.3%),503 (52.7%),0,954.0
4,JOC,758 (46.8%),861 (53.1%),1 (0.1%),1620.0
5,Total,2113 (47.7%),2314 (52.2%),7 (0.2%),4434.0


In [93]:
j_country = get_table(authors, 'journal', 'countrypred_new')
update_df(j_country)

Unnamed: 0,Jounral,DE,GB,IL,NL,Other,US,Total
0,CCC,7 (1.2%),45 (7.9%),21 (3.7%),6 (1.1%),106 (18.6%),384 (67.5%),569
1,CT,26 (5.6%),18 (3.9%),31 (6.7%),12 (2.6%),108 (23.3%),269 (58.0%),464
2,HCR,28 (3.4%),9 (1.1%),16 (1.9%),68 (8.2%),105 (12.7%),601 (72.7%),827
3,JCMC,29 (3.0%),44 (4.6%),20 (2.1%),91 (9.5%),264 (27.7%),506 (53.0%),954
4,JOC,119 (7.3%),42 (2.6%),38 (2.3%),106 (6.5%),291 (18.0%),1024 (63.2%),1620
5,Total,209 (4.7%),158 (3.6%),126 (2.8%),283 (6.4%),874 (19.7%),2784 (62.8%),4434
