In [130]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [131]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of ``var`` occurs over all years.

    Reads the module-level ``papers`` / ``authors`` frames together with
    the matching ``total_paper_num`` / ``total_author_num`` denominator.

    Parameters
    ----------
    data : str
        Either 'papers' or 'authors'; selects which frame is counted.
    var : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``[var, 'freq', 'prop']``, where ``prop`` is ``freq``
        divided by the selected total and rounded to 3 decimals; rows are
        sorted by ``prop`` in descending order.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    # Both data sources share the same pipeline; only the frame and the
    # denominator differ.
    if data == 'authors':
        frame, total = authors, total_author_num
    else:
        frame, total = papers, total_paper_num
    value_counts = Counter(frame[var])
    df = pd.DataFrame(value_counts.items(), columns=[var, 'freq'])
    df['prop'] = (df['freq'] / total).round(3)
    return df.sort_values('prop', ascending=False)

In [132]:
def get_freq_and_prop(data, var):
    """Build per-year frequencies and proportions for each value of ``var``.

    Reads the module-level ``papers`` / ``authors`` frames together with
    the matching ``yearly_papernum_dic`` / ``yearly_authornum_dic`` lookup.

    Parameters
    ----------
    data : str
        Either 'papers' or 'authors'; selects which frame is grouped.
    var : str
        Name of the column grouped together with 'year'.

    Returns
    -------
    pandas.DataFrame
        Columns ``['year', var, 'freq', 'year total', 'prop']``, where
        ``prop`` is ``freq / year total`` rounded to 3 decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    # Only the source frame and its per-year totals depend on `data`.
    if data == 'papers':
        frame, year_totals = papers, yearly_papernum_dic
    else:
        frame, year_totals = authors, yearly_authornum_dic
    df = frame.groupby(['year', var]).size().to_frame('freq').reset_index()
    # Attach the denominator for each row's year.
    df['year total'] = [year_totals[yr] for yr in df['year']]
    df['prop'] = (df['freq'] / df['year total']).round(3)
    return df

In [133]:
def transform_year(df):
    """Return a copy of ``df`` whose 'year' column is parsed as datetimes.

    The values are parsed with the ``'%Y'`` format; the input frame is
    left unmodified.
    """
    parsed_year = pd.to_datetime(df['year'], format='%Y')
    return df.assign(year=parsed_year)

In [134]:
# Load the processed author-level and paper-level tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# Boolean values in the paper table are recoded to 'Yes'/'No' labels;
# later cells filter on the 'Yes' label.
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv').replace(
    {True: 'Yes', False: 'No'}
)

In [135]:
# Replace the full journal names with their abbreviations in both tables.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
for frame in (papers, authors):
    # In place, so the module-level frames themselves are updated.
    frame.replace(journal_abbreviations, inplace=True)

In [136]:
# Overall row counts; get_simple_prop_df divides by these to get proportions.
total_paper_num, total_author_num = len(papers), len(authors)

In [137]:
# Number of rows per year in each table, plus a year -> count lookup that
# get_freq_and_prop uses as the per-year denominator.
yearly_papernum = papers.groupby('year').size().to_frame(name='freq').reset_index()
yearly_papernum_dic = dict(zip(yearly_papernum['year'], yearly_papernum['freq']))

yearly_authornum = authors.groupby('year').size().to_frame(name='freq').reset_index()
yearly_authornum_dic = dict(zip(yearly_authornum['year'], yearly_authornum['freq']))

## Analysis

In [138]:
# Collaboration indicator columns, selected by name rather than by position
# so that a change in the CSV column order cannot silently pick the wrong
# columns.
cross_vars = [
    'cross_country',
    'cross_type',
    'cross_gender',
    'cross_race',
    'cross_gender_and_race',
    'cross_gender_and_country',
    'cross_country_and_race',
    'cross_gender_race_and_country',
]
all_paper_cols = papers.columns.tolist()
missing_cross_vars = [col for col in cross_vars if col not in all_paper_cols]
assert not missing_cross_vars, 'papers is missing columns: ' + ', '.join(missing_cross_vars)
# Positions of the selected columns, derived from the names (kept so any
# code that still reads `cross_var_idx` keeps working).
cross_var_idx = [all_paper_cols.index(col) for col in cross_vars]
cross_vars

['cross_country',
 'cross_type',
 'cross_gender',
 'cross_race',
 'cross_gender_and_race',
 'cross_gender_and_country',
 'cross_country_and_race',
 'cross_gender_race_and_country']

In [139]:
# Display labels for the collaboration indicators; they are paired with
# `cross_vars` by position, so the order here matters.
cross_vars_new = [
    'Cross Country', 'Cross Type', 'Cross Gender', 'Cross Race',
    'Cross Gender & Race', 'Cross Gender & Country',
    'Cross Country & Race', 'Cross Gender, Race & Country',
]

In [140]:
# Column name -> display label, pairing the two lists position by position.
rename_cross_vars_dic = {
    column: label for column, label in zip(cross_vars, cross_vars_new)
}
rename_cross_vars_dic

{'cross_country': 'Cross Country',
 'cross_type': 'Cross Type',
 'cross_gender': 'Cross Gender',
 'cross_race': 'Cross Race',
 'cross_gender_and_race': 'Cross Gender & Race',
 'cross_gender_and_country': 'Cross Gender & Country',
 'cross_country_and_race': 'Cross Country & Race',
 'cross_gender_race_and_country': 'Cross Gender, Race & Country'}

In [141]:
# One overall frequency table per collaboration indicator. The indicator's
# own column is renamed to 'binary' and its name is stored in 'collab_type'
# so the tables can be stacked into one long frame.
dfs = [
    get_simple_prop_df('papers', var)
    .rename(columns={var: 'binary'})
    .assign(collab_type=var)
    for var in cross_vars
]

In [142]:
# Stack the per-indicator tables and swap the raw column names for their
# display labels.
collab_df = pd.concat(dfs, axis=0, ignore_index=True).replace(rename_cross_vars_dic)
# Collaboration types ordered by how many papers are flagged 'Yes'
# (most frequent first); used to order the bars in the count chart.
sorted_collab = (
    collab_df.loc[collab_df['binary'] == 'Yes']
    .sort_values('freq', ascending=False)['collab_type']
    .tolist()
)
sorted_collab

['Cross Gender',
 'Cross Race',
 'Cross Country',
 'Cross Gender & Race',
 'Cross Gender & Country',
 'Cross Country & Race',
 'Cross Gender, Race & Country',
 'Cross Type']

In [143]:
# 'Yes' rows only, most frequent collaboration type first.
collab_df.loc[collab_df['binary'] == 'Yes'].sort_values(by='freq', ascending=False)

Unnamed: 0,binary,freq,prop,collab_type
5,Yes,750,0.392,Cross Gender
7,Yes,492,0.257,Cross Race
1,Yes,331,0.173,Cross Country
9,Yes,327,0.171,Cross Gender & Race
11,Yes,229,0.12,Cross Gender & Country
13,Yes,146,0.076,Cross Country & Race
15,Yes,113,0.059,"Cross Gender, Race & Country"
3,Yes,74,0.039,Cross Type


In [144]:
# Panel 'a': number of publications per collaboration type, with bars
# coloured by the Yes/No indicator.
source = collab_df
cross_count_chart = alt.Chart(source).mark_bar().encode(
    x=alt.X(
        'collab_type:N',
        title=None,
        # Explicit bar order: descending 'Yes' frequency (see sorted_collab).
        sort=sorted_collab,
        axis=alt.Axis(labelAngle=-45),
    ),
    # No `sort` on y: it is a quantitative channel, and sorting by another
    # encoding only applies to discrete fields, so the previous
    # sort='-x' had no effect.
    y=alt.Y(
        'freq:Q',
        title='# of Publications',
    ),
    color=alt.Color(
        'binary:N',
        title='Binary',
        # Fix the legend/colour order to Yes, then No.
        scale=alt.Scale(domain=['Yes', 'No']),
    ),
).properties(
    title='a',
    height=300,
    width=160,
)

cross_count_chart

In [145]:
# One per-year frequency table per collaboration indicator, reshaped to the
# common ['year', 'binary', 'freq', 'prop', 'collab_type'] layout.
dfs = []
for var in cross_vars:
    dff = (
        get_freq_and_prop('papers', var)
        .rename(columns={var: 'binary'})
        .assign(collab_type=var)
        .drop(columns=['year total'])
    )
    dfs.append(dff)

In [146]:
# Stack the per-indicator yearly tables and apply the display labels.
collab_df_ts = pd.concat(dfs, axis=0, ignore_index=True).replace(rename_cross_vars_dic)
# Keep only the 'Yes' rows and convert 'year' to datetimes for plotting.
collab_df_ts = transform_year(collab_df_ts.loc[collab_df_ts['binary'] == 'Yes'])
collab_df_ts.head()

Unnamed: 0,year,binary,freq,prop,collab_type
1,2010-01-01,Yes,19,0.137,Cross Country
3,2011-01-01,Yes,18,0.13,Cross Country
5,2012-01-01,Yes,28,0.172,Cross Country
7,2013-01-01,Yes,17,0.11,Cross Country
9,2014-01-01,Yes,39,0.209,Cross Country


In [147]:
# Panel 'b': yearly proportion of papers flagged 'Yes' for each
# collaboration type, drawn as lines with point markers.
source = collab_df_ts
all_cross_ts_chart = (
    alt.Chart(source)
    .mark_line(point=alt.OverlayMarkDef())
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('prop', title='Proportion'),
        color=alt.Color(
            'collab_type',
            title='Collaboration',
            # Legend entries ordered by mean proportion, largest first.
            sort=alt.EncodingSortField('prop', op='mean', order='descending'),
        ),
    )
    .properties(title='b', height=300, width=300)
)

all_cross_ts_chart

## Cross-race 

In [148]:
# Overall frequency of each cross_race_details category, with the
# 'cross race' label capitalised to match the other category labels.
cross_race_details_df = get_simple_prop_df('papers', 'cross_race_details').replace(
    {'cross race': 'Cross race'}
)
cross_race_details_df

Unnamed: 0,cross_race_details,freq,prop
1,White only,1133,0.593
0,Cross race,492,0.257
2,Asian only,209,0.109
3,Hispanic only,36,0.019
4,Black only,26,0.014
5,Middle Eastern only,15,0.008
6,Indigenous only,1,0.001


In [149]:
# Panel 'c': number of publications per cross_race_details category,
# tallest bar first; the legend is hidden because the x axis already
# names each category.
cross_race_count_chart = (
    alt.Chart(cross_race_details_df)
    .mark_bar()
    .encode(
        x=alt.X(
            'cross_race_details:N',
            title=None,
            sort='-y',
            axis=alt.Axis(labelAngle=-45),
        ),
        y=alt.Y('freq', title='# of Publications'),
        color=alt.Color('cross_race_details:N', legend=None),
    )
    .properties(title='c', height=300, width=160)
)

cross_race_count_chart

In [150]:
# Yearly frequency/proportion of each cross_race_details category, with the
# 'cross race' label capitalised and 'year' converted to datetimes.
cross_race_details_ts = transform_year(
    get_freq_and_prop('papers', 'cross_race_details').replace(
        {'cross race': 'Cross race'}
    )
)
cross_race_details_ts.head()

Unnamed: 0,year,cross_race_details,freq,year total,prop
0,2010-01-01,Asian only,15,139,0.108
1,2010-01-01,Black only,2,139,0.014
2,2010-01-01,Hispanic only,3,139,0.022
3,2010-01-01,Middle Eastern only,1,139,0.007
4,2010-01-01,White only,84,139,0.604


In [151]:
# Yearly proportion of papers in each cross_race_details category.
(
    alt.Chart(cross_race_details_ts)
    .mark_line()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('prop', title='Proportion'),
        color=alt.Color(
            'cross_race_details:N',
            title='Race',
            # Legend entries ordered by mean proportion, largest first.
            sort=alt.EncodingSortField('prop', op='mean', order='descending'),
        ),
    )
)

In [152]:
# Combined figure. The panels are concatenated in the order of their
# 'a', 'b', 'c' titles so the labels read left to right; previously the
# 'c' panel was placed before the 'b' panel.
# NOTE(review): if the a / c / b layout was intentional, swap the panel
# titles in the chart definitions instead of this order.
alt.hconcat(
    cross_count_chart,       # titled 'a'
    all_cross_ts_chart,      # titled 'b'
    cross_race_count_chart,  # titled 'c'
).resolve_scale(
    # Each panel keeps its own colour scale and legend.
    color='independent',
).configure_axis(
    labelFontSize=12,
    titleFontSize=20,
).configure_legend(
    titleFontSize=16,
    labelFontSize=11,
).configure_title(
    anchor='start',
    fontSize=22,
)