In [30]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [31]:
def get_simple_prop_df(data, var):
    # get frequency and proportion over all the years
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        df = pd.DataFrame(Counter(authors[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_author_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    else:
        df = pd.DataFrame(Counter(papers[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_paper_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    return df

In [32]:
def get_freq_and_prop(data, var):
    # get time series data
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        dic = yearly_papernum_dic
        df = papers.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    else:
        dic = yearly_authornum_dic
        df = authors.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [33]:
def transform_year(df):
    dff = df.copy()
    dff['year'] = pd.to_datetime(dff['year'] , format='%Y')
    return dff

In [34]:
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv')
papers.replace({
    True: 'Yes',
    False: 'No'
}, inplace = True)

In [35]:
for i in [papers, authors]:
    i.replace({
        'Communication Theory': 'CT',
        'Communication, Culture and Critique': 'CCC',
        'Human Communication Research': 'HCR',
        'Journal of Communication': 'JOC',
        'Journal of Computer-Mediated Communication': 'JCMC'
                      }, inplace = True)

In [36]:
total_paper_num = len(papers)
total_author_num = len(authors)

In [37]:
yearly_papernum = papers.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum = authors.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum_dic = dict(zip(yearly_authornum.year, yearly_authornum.freq))
yearly_papernum_dic = dict(zip(yearly_papernum.year, yearly_papernum.freq))

## General Trends

### Total number of papers by year

In [38]:
yearly_papernum = transform_year(yearly_papernum)
yearly_papernum

Unnamed: 0,year,freq
0,2010-01-01,139
1,2011-01-01,138
2,2012-01-01,163
3,2013-01-01,154
4,2014-01-01,187
5,2015-01-01,165
6,2016-01-01,159
7,2017-01-01,153
8,2018-01-01,157
9,2019-01-01,121


In [39]:
papernum_by_year_chart = alt.Chart(yearly_papernum).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications'
    )
).properties(
    title = 'a',
    height=300,
    width=260
)
papernum_by_year_chart

### Average number of authors by year / by journal

In [40]:
yearly_ave_authornum = papers.groupby(
    'year')['numberOfAuthors'].mean().to_frame().reset_index()

In [41]:
yearly_ave_authornum = transform_year(yearly_ave_authornum)
yearly_ave_authornum

Unnamed: 0,year,numberOfAuthors
0,2010-01-01,2.122302
1,2011-01-01,2.231884
2,2012-01-01,2.134969
3,2013-01-01,2.305195
4,2014-01-01,2.304813
5,2015-01-01,2.436364
6,2016-01-01,2.213836
7,2017-01-01,2.300654
8,2018-01-01,2.343949
9,2019-01-01,2.22314


In [42]:
yearly_ave_authornum_chart = alt.Chart(yearly_ave_authornum).mark_line().encode(
    
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'numberOfAuthors',
        title = 'Average # of Authors'
    )
).properties(
    title = 'b',
    height=300,
    width=260
)
yearly_ave_authornum_chart

## Gender & Race

In [43]:
gender_ts = get_freq_and_prop(data = 'authors', var = 'genderpred')
gender_ts.head()

Unnamed: 0,year,genderpred,freq,year total,prop
0,2010,F,140,295,0.475
1,2010,M,155,295,0.525
2,2011,F,145,308,0.471
3,2011,M,163,308,0.529
4,2012,F,152,348,0.437


In [44]:
gender_ts[gender_ts.genderpred == 'N'].freq.sum()

7

In [45]:
# gender_ts['year'] = pd.to_datetime(gender_ts['year'], format='%Y')

gender_stack_chart = alt.Chart(gender_ts).mark_bar().encode(
    x = alt.X(
        'year:N',
        axis = alt.Axis(
            tickCount= 4,
            labelAngle = - 90
    )
    ),
    y = alt.Y(
        'sum(freq)',
        title = '# of Authors'
    ),
    color = alt.Color(
        'genderpred',
        title = 'Gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    )
).properties(
    title = 'c',
    height=300,
    width=260
)

gender_stack_chart

In [46]:
race_ts = get_freq_and_prop(data = 'authors', var = 'racepred')
race_ts.head()

Unnamed: 0,year,racepred,freq,year total,prop
0,2010,Asian,55,295,0.186
1,2010,Black,5,295,0.017
2,2010,Hispanic,11,295,0.037
3,2010,Middle Eastern,5,295,0.017
4,2010,White,219,295,0.742


In [47]:
unique_races = list(set(race_ts.racepred))
unique_races.sort()
unique_races

['Asian', 'Black', 'Hispanic', 'Indigenous', 'Middle Eastern', 'White']

In [48]:
race_colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f']

In [49]:
race_ts = transform_year(race_ts)

In [50]:
race_stack_chart = alt.Chart(race_ts).mark_area().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors',
    ),
    color = alt.Color(
        'racepred',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors)
    )
).properties(
    title = 'd',
    height=300,
    width=280
)

race_stack_chart

## Intersection of Gender & Race

In [51]:
race_count_dic_long = dict(Counter(authors.racepred))
race_count_dic_long

{'White': 3253,
 'Hispanic': 222,
 'Asian': 799,
 'Black': 89,
 'Middle Eastern': 63,
 'Indigenous': 8}

In [53]:
gender_count_dic = dict(Counter(authors.genderpred))
gender_count_dic

{'F': 2113, 'M': 2314, 'N': 7}

In [54]:
gender_by_race_long = authors.groupby(
    ['genderpred', 'racepred']).size().to_frame('freq').reset_index()
gender_by_race_long['gender_total'] = [gender_count_dic[x] for x in gender_by_race_long.genderpred]
gender_by_race_long['race_total'] = [race_count_dic_long[x] for x in gender_by_race_long.racepred]
gender_by_race_long['race_in_gender'] = round(
    gender_by_race_long['freq'] / gender_by_race_long['gender_total'], 
    3)
gender_by_race_long['gender_in_race'] = round(
    gender_by_race_long['freq'] / gender_by_race_long['race_total'], 
    3)
gender_by_race_long

Unnamed: 0,genderpred,racepred,freq,gender_total,race_total,race_in_gender,gender_in_race
0,F,Asian,433,2113,799,0.205,0.542
1,F,Black,53,2113,89,0.025,0.596
2,F,Hispanic,122,2113,222,0.058,0.55
3,F,Indigenous,5,2113,8,0.002,0.625
4,F,Middle Eastern,36,2113,63,0.017,0.571
5,F,White,1464,2113,3253,0.693,0.45
6,M,Asian,365,2314,799,0.158,0.457
7,M,Black,36,2314,89,0.016,0.404
8,M,Hispanic,98,2314,222,0.042,0.441
9,M,Indigenous,3,2314,8,0.001,0.375


In [56]:
gender_and_race_3 = alt.Chart(gender_by_race_long).mark_bar().encode(
    x = alt.X(
        'genderpred',
        sort = ["M", "F", "N"],
        axis = None
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors',
#         scale=alt.Scale(type="log")  # Here the scale is applied
    ),
    color = alt.Color(
         'genderpred',
        title = 'Gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    ),
    column = alt.Column('racepred:N', 
                        title = None,
                        header=alt.Header(labelFontSize=13)
    )
).properties(
    title = 'a',
    height=300,
    width=60
)

gender_and_race_3

In [58]:
gender_and_race_4 = alt.Chart(gender_by_race_long.sample(gender_by_race_long.shape[0])).mark_bar().encode(
    x = alt.X(
        'racepred:N',
        axis = None
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors',
#         scale=alt.Scale(type="log")  # Here the scale is applied
    ),
    color = alt.Color(
         'racepred:N',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors)
    ),
    column = alt.Column(
        'genderpred:N', 
        title = None,
        sort = ["M", "F", "N"],
        header=alt.Header(labelFontSize=13)
    )
).properties(
    title = 'b',
    height=300,
    width=100
)

gender_and_race_4

In [59]:
female_prop_by_race = authors.groupby(
    ['year', 'genderpred', 'racepred']).size().to_frame('freq').reset_index()
female_prop_by_race.head()

Unnamed: 0,year,genderpred,racepred,freq
0,2010,F,Asian,26
1,2010,F,Black,2
2,2010,F,Hispanic,8
3,2010,F,Middle Eastern,3
4,2010,F,White,101


In [60]:
tuples = []
for year in list(set(female_prop_by_race.year)):
    for race in list(set(female_prop_by_race.racepred)):
        year_race = female_prop_by_race[
            (female_prop_by_race.year == year) & (female_prop_by_race.racepred == race)]
        total = sum(year_race['freq'])
        try:
            female_num = year_race[year_race.genderpred == 'F'].iloc[0]['freq']
            female_prop = female_num / total
        except:
            female_num = 0
            female_prop = 0
        tuples.append((year, race, total, female_num, female_prop))

In [61]:
female_prop_by_race = pd.DataFrame(tuples, columns = [
    'year', 
    'race', 
    'yearly race total', 
    'female_num', 
    'female_prop'])
female_prop_by_race

Unnamed: 0,year,race,yearly race total,female_num,female_prop
0,2016,Middle Eastern,6,5,0.833333
1,2016,Indigenous,1,1,1.000000
2,2016,Asian,70,35,0.500000
3,2016,White,249,106,0.425703
4,2016,Hispanic,20,9,0.450000
...,...,...,...,...,...
73,2015,Indigenous,1,0,0.000000
74,2015,Asian,65,30,0.461538
75,2015,White,305,128,0.419672
76,2015,Hispanic,22,12,0.545455


In [62]:
# unique_races.remove('Indigenous')
# race_colors.remove(race_colors[3])

line = alt.Chart(transform_year(female_prop_by_race)).mark_line(
).encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'female_prop',
        title = 'Female Proportion'
    ),
    color = alt.Color(
         'race:N',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors),
        legend = None
    ),
)

points = line.mark_point().encode(
    color = alt.Color(
        'race:N',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors),
    ),
    shape=alt.Shape('race', 
                    scale=alt.Scale(
                        domain=unique_races,
                        range=['cross', 'circle', 'square', 'triangle-right', 'diamond'])
    ),
    
)

female_prop_ts_chart = alt.layer(
    line,
    points
).resolve_scale(
    color = 'independent',
    shape = 'independent'
).properties(
    title = 'c',
    height=300,
    width=360
)

female_prop_ts_chart

## Country and affiliations

In [63]:
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic.keys())
top_country

['US', 'NL', 'DE', 'GB', 'IL']

In [64]:
all_country = list(set(authors['countrypred']))
len(all_country)

51

In [65]:
authors['countrypred_new'] = authors['countrypred'].apply(
        lambda x: x if x in top_country else 'Other'
    )

In [66]:
country_prop = get_simple_prop_df('authors', 'countrypred_new')
country_prop.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'United Kimdom',
                    'IL': 'Israel'}, inplace = True)
country_prop

Unnamed: 0,countrypred_new,freq,prop
0,US,2784,0.628
2,Other,874,0.197
5,Netherlands,283,0.064
1,Germany,209,0.047
3,United Kimdom,158,0.036
4,Israel,126,0.028


In [67]:
countries = country_prop.countrypred_new.tolist()
country_colors = ['grey', 'orange', 'red', 'teal', 'pink', 'purple']

In [68]:
country_ts = get_freq_and_prop('authors', 'countrypred_new')
country_ts.columns = ['year', 'country', 'freq', 'year total', 'prop']
country_ts.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'United Kimdom',
                    'IL': 'Israel'}, inplace = True)
country_ts

Unnamed: 0,year,country,freq,year total,prop
0,2010,Germany,3,295,0.010
1,2010,United Kimdom,3,295,0.010
2,2010,Israel,11,295,0.037
3,2010,Netherlands,9,295,0.031
4,2010,Other,50,295,0.169
...,...,...,...,...,...
73,2022,United Kimdom,9,200,0.045
74,2022,Israel,5,200,0.025
75,2022,Netherlands,20,200,0.100
76,2022,Other,38,200,0.190


In [70]:
source = country_ts

country_stacked_chart = alt.Chart(transform_year(source)).mark_area().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y=alt.Y(
        "freq:Q",
        title = '# of Authors'
    ),
    color=alt.Color(
        "country:N",
        title = 'Country/Region',
#         legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    )
).properties(
    title = 'a',
    height=300,
    width=260
)

country_stacked_chart

In [72]:
with_us_ts = get_freq_and_prop('papers', 'with_us_authors')
with_us_ts.head()

Unnamed: 0,year,with_us_authors,freq,year total,prop
0,2010,No,31,139,0.223
1,2010,Yes,108,139,0.777
2,2011,No,23,138,0.167
3,2011,Yes,115,138,0.833
4,2012,No,57,163,0.35


In [76]:
source = with_us_ts

with_us_stacked_chart = alt.Chart(transform_year(source)).mark_area().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y=alt.Y(
        "freq:Q",
        title = '# of Papers'
    ),
    color=alt.Color(
        "with_us_authors:N",
        title = 'Involving US Authors?',
#         legend = None,
        scale = alt.Scale(
            domain = ['No', 'Yes'],
            range = ['orange', 'steelblue'],
        )
    )
).properties(
    title = 'b',
    height=300,
    width=260
)

with_us_stacked_chart

In [77]:
with_us_ts_chart = alt.Chart(transform_year(with_us_prop_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'with_us_authors',
        title = 'With US authors',
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    )
).properties(
    title = 'e',
    height=200,
    width=200
)

with_us_ts_chart

### Affiliation types

In [78]:
afftype_prop = get_simple_prop_df('authors', 'afftypepred')
afftype_prop.replace({
    'Education': 'Edu',
    'Non Education': 'Non-Edu'
}, inplace = True)
afftype_prop

Unnamed: 0,afftypepred,freq,prop
0,Edu,4293,0.968
1,Non-Edu,141,0.032


In [79]:
afftype_dist_chart = alt.Chart(afftype_prop).mark_bar().encode(
    x = alt.X(
        'afftypepred',
        title = 'Affiliation type',
        axis = alt.Axis(labelAngle = -45),
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors'
#         scale = alt.Scale(type = 'log')
    ),
    color = alt.Color(
        'afftypepred',
        legend = None,
    )
).properties(
    title = 'g',
    height=200,
    width=100
)

afftype_dist_chart

## Collaborations

In [80]:
cross_var_idx = [15, 16, 17, 18, 21, 22, 23, 24]
all_paper_cols = papers.columns.tolist()
cross_vars = [all_paper_cols[x] for x in cross_var_idx]
cross_vars

['cross_country',
 'cross_type',
 'cross_gender',
 'cross_race',
 'cross_gender_and_race',
 'cross_gender_and_country',
 'cross_country_and_race',
 'cross_gender_race_and_country']

In [81]:
cross_vars_new = [
    'Cross Country',
    'Cross Type',
    'Cross Gender',
    'Cross Race',
    'Cross Gender & Race',
    'Cross Gender & Country',
    'Cross Country & Race',
    'Cross Gender, Race & Country'
]

In [82]:
rename_cross_vars_dic = dict(zip(cross_vars, cross_vars_new))
rename_cross_vars_dic

{'cross_country': 'Cross Country',
 'cross_type': 'Cross Type',
 'cross_gender': 'Cross Gender',
 'cross_race': 'Cross Race',
 'cross_gender_and_race': 'Cross Gender & Race',
 'cross_gender_and_country': 'Cross Gender & Country',
 'cross_country_and_race': 'Cross Country & Race',
 'cross_gender_race_and_country': 'Cross Gender, Race & Country'}

In [83]:
dfs = []
for var in cross_vars:
    dff = get_simple_prop_df('papers', var)
    dff.rename(columns = {var: 'binary'}, inplace = True)
    dff['collab_type'] = var 
    dfs.append(dff)

In [84]:
collab_df = pd.concat(dfs, axis= 0, ignore_index = True)
collab_df.replace(rename_cross_vars_dic, inplace=True)
sorted_collab = collab_df[collab_df.binary == 'Yes'].sort_values('freq', ascending = False)['collab_type'].tolist()
sorted_collab

['Cross Gender',
 'Cross Race',
 'Cross Country',
 'Cross Gender & Race',
 'Cross Gender & Country',
 'Cross Country & Race',
 'Cross Gender, Race & Country',
 'Cross Type']

In [85]:
collab_df[collab_df.binary == 'Yes'].sort_values('freq', ascending = False)

Unnamed: 0,binary,freq,prop,collab_type
5,Yes,750,0.392,Cross Gender
7,Yes,492,0.257,Cross Race
1,Yes,331,0.173,Cross Country
9,Yes,327,0.171,Cross Gender & Race
11,Yes,229,0.12,Cross Gender & Country
13,Yes,146,0.076,Cross Country & Race
15,Yes,113,0.059,"Cross Gender, Race & Country"
3,Yes,74,0.039,Cross Type


In [86]:
source = collab_df
cross_count_chart = alt.Chart(source).mark_bar().encode(
    x = alt.X(
        'collab_type',
        title = None,
        sort = sorted_collab,
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications',
        sort='-x'
    ),
    color = alt.Color(
        'binary',
        title = 'Binary',
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    ),
#     color = 'binary'
).properties(
    title = 'a',
    height=300,
    width=160
)

cross_count_chart

In [87]:
dfs = []
for var in cross_vars:
    dff = get_freq_and_prop('papers', var)
    dff.rename(columns = {var: 'binary'}, inplace = True)
    dff['collab_type'] = var 
    dff.drop(columns = ['year total'], inplace = True)
    dfs.append(dff)

In [88]:
collab_df_ts = pd.concat(dfs, axis= 0, ignore_index = True)
collab_df_ts.replace(rename_cross_vars_dic, inplace=True)
collab_df_ts = collab_df_ts[collab_df_ts.binary == 'Yes']
collab_df_ts = transform_year(collab_df_ts)
collab_df_ts.head()

Unnamed: 0,year,binary,freq,prop,collab_type
1,2010-01-01,Yes,19,0.137,Cross Country
3,2011-01-01,Yes,18,0.13,Cross Country
5,2012-01-01,Yes,28,0.172,Cross Country
7,2013-01-01,Yes,17,0.11,Cross Country
9,2014-01-01,Yes,39,0.209,Cross Country


In [90]:
source = collab_df_ts

collab_ts_stacked_chart = alt.Chart(transform_year(source)).mark_area().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y=alt.Y(
        "freq:Q",
        title = '# of Papers'
    ),
    color=alt.Color(
        "collab_type:N",
        title = 'Collaboration',
#         sort=alt.EncodingSortField('freq', op='sum', order='descending'),
    )
).properties(
    title = 'b',
    height=300,
    width=260
)

collab_ts_stacked_chart

In [91]:
source = collab_df_ts
all_cross_ts_chart = alt.Chart(source).mark_line(point=alt.OverlayMarkDef()).encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'collab_type',
        title = 'Collaboration',
        sort=alt.EncodingSortField('prop', op='mean', order='descending'),
    )
).properties(
    title = 'b',
    height=300,
    width=300
)

all_cross_ts_chart