In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [37]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of ``var`` occurs over all the years.

    Reads the notebook-level ``authors`` / ``total_author_num`` or
    ``papers`` / ``total_paper_num`` globals, depending on ``data``.

    Parameters
    ----------
    data : str
        Either 'papers' or 'authors'; anything else fails the assertion.
    var : str
        Name of the column to tabulate.

    Returns
    -------
    pandas.DataFrame
        Columns ``[var, 'freq', 'prop']`` where ``prop`` is ``freq`` divided
        by the table's total row count, rounded to 3 decimals. Rows are
        sorted by ``prop`` in descending order.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    # Pick the table and its matching denominator once, then share the rest.
    if data == 'authors':
        source, total = authors, total_author_num
    else:
        source, total = papers, total_paper_num
    value_counts = Counter(source[var])
    table = pd.DataFrame(value_counts.items(), columns=[var, 'freq'])
    table['prop'] = round(table['freq'] / total, 3)
    return table.sort_values('prop', ascending=False)

In [38]:
def get_freq_and_prop(data, var):
    """Build per-year frequencies and proportions for the values of ``var``.

    Reads the notebook-level ``papers`` / ``yearly_papernum_dic`` or
    ``authors`` / ``yearly_authornum_dic`` globals, depending on ``data``.

    Parameters
    ----------
    data : str
        Either 'papers' or 'authors'; anything else fails the assertion.
    var : str
        Name of the column to break down within each year.

    Returns
    -------
    pandas.DataFrame
        Columns ``['year', var, 'freq', 'year total', 'prop']`` where
        ``prop`` is ``freq / year total`` rounded to 3 decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        source, dic = papers, yearly_papernum_dic
    else:
        source, dic = authors, yearly_authornum_dic
    group_sizes = source.groupby(['year', var]).size()
    df = group_sizes.to_frame('freq').reset_index()
    # Look up each row's yearly denominator; a year missing from dic raises KeyError.
    df['year total'] = [dic[yr] for yr in df['year']]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [39]:
def transform_year(df):
    """Return a copy of ``df`` with its 'year' column parsed as datetimes.

    The column is parsed with ``format='%Y'``; the input frame is left
    unmodified.
    """
    parsed_years = pd.to_datetime(df['year'], format='%Y')
    return df.assign(year=parsed_years)

In [40]:
# Load the processed author-level and paper-level tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# Recode booleans in the papers table as 'Yes'/'No' labels
# (later cells filter on == 'Yes').
bool_labels = {True: 'Yes', False: 'No'}
papers = pd.read_csv(
    '../data/processed/papers_to_study_expanded.csv'
).replace(bool_labels)


In [41]:
# Keep the `top_num` most frequent predicted countries as their own categories
# and lump every other value into 'Other'.
top_num = 5
country_counts = Counter(authors['countrypred'])
top_country_dic = dict(country_counts.most_common(top_num))
top_country = list(top_country_dic)
authors['countrypred_new'] = authors['countrypred'].apply(
    lambda country: country if country in top_country else 'Other'
)

In [42]:
# Replace full journal names with their abbreviations in both tables.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
for frame in (papers, authors):
    # In place, so the `papers` / `authors` names keep pointing at the updated frames.
    frame.replace(journal_abbreviations, inplace=True)

In [43]:
# Row counts of each table; used as denominators for the all-years proportions.
total_paper_num, total_author_num = len(papers), len(authors)

In [44]:
# Display both totals (papers, authors) as a quick sanity check.
total_paper_num, total_author_num

(3169, 7083)

In [45]:
# Number of rows per year in each table, as a two-column frame ('year', 'freq').
yearly_papernum = papers.groupby('year').size().reset_index(name='freq')
yearly_authornum = authors.groupby('year').size().reset_index(name='freq')
# year -> row count lookups, used as per-year denominators.
yearly_authornum_dic = dict(
    zip(yearly_authornum['year'], yearly_authornum['freq'])
)
yearly_papernum_dic = dict(
    zip(yearly_papernum['year'], yearly_papernum['freq'])
)

## Collaboration

In [46]:
# Select the collaboration indicator columns by name rather than by position,
# so that a reordered or extended CSV cannot silently pick the wrong columns.
cross_vars = [
    'cross_country',
    'cross_type',
    'cross_gender',
    'cross_race',
    'cross_gender_and_race',
    'cross_gender_and_country',
    'cross_country_and_race',
    'cross_gender_race_and_country',
]
all_paper_cols = papers.columns.tolist()
# Fail loudly, naming the offenders, if the papers table lacks an expected column.
missing_cross_vars = [col for col in cross_vars if col not in all_paper_cols]
assert not missing_cross_vars, f'papers is missing columns: {missing_cross_vars}'
# Positions are derived from the names; kept so any code that still reads
# `cross_var_idx` continues to work.
cross_var_idx = [all_paper_cols.index(col) for col in cross_vars]
cross_vars

['cross_country',
 'cross_type',
 'cross_gender',
 'cross_race',
 'cross_gender_and_race',
 'cross_gender_and_country',
 'cross_country_and_race',
 'cross_gender_race_and_country']

In [47]:
# Display labels for the collaboration indicators, listed in the same order
# as `cross_vars` (the two lists are zipped into a rename mapping).
cross_vars_new = [
    'Cross Country', 'Cross Type', 'Cross Gender', 'Cross Race',
    'Cross Gender & Race', 'Cross Gender & Country',
    'Cross Country & Race', 'Cross Gender, Race & Country',
]

In [48]:
# zip() stops at the shorter input, so a length mismatch would silently drop
# labels; check that the two lists pair up before building the mapping.
assert len(cross_vars) == len(cross_vars_new), \
    'cross_vars and cross_vars_new must have the same length'
# Column name -> display label.
rename_cross_vars_dic = dict(zip(cross_vars, cross_vars_new))
rename_cross_vars_dic

{'cross_country': 'Cross Country',
 'cross_type': 'Cross Type',
 'cross_gender': 'Cross Gender',
 'cross_race': 'Cross Race',
 'cross_gender_and_race': 'Cross Gender & Race',
 'cross_gender_and_country': 'Cross Gender & Country',
 'cross_country_and_race': 'Cross Country & Race',
 'cross_gender_race_and_country': 'Cross Gender, Race & Country'}

In [49]:
# One all-years frequency table per collaboration indicator, reshaped to a
# common layout: the indicator's value goes in 'binary', its name in 'collab_type'.
dfs = []
for var in cross_vars:
    dff = (
        get_simple_prop_df('papers', var)
        .rename(columns={var: 'binary'})
        .assign(collab_type=var)
    )
    dfs.append(dff)

In [50]:
# Stack the per-indicator tables and swap column names for display labels.
collab_df = pd.concat(dfs, axis=0, ignore_index=True).replace(rename_cross_vars_dic)
# Collaboration labels ordered from most to least frequent 'Yes' count.
is_collab = collab_df['binary'] == 'Yes'
sorted_collab = (
    collab_df.loc[is_collab]
    .sort_values('freq', ascending=False)['collab_type']
    .tolist()
)
sorted_collab

['Cross Gender',
 'Cross Race',
 'Cross Gender & Race',
 'Cross Country',
 'Cross Gender & Country',
 'Cross Country & Race',
 'Cross Type',
 'Cross Gender, Race & Country']

In [51]:
# Keep only the 'Yes' rows, most frequent first, and drop the 'binary' column.
# NOTE: this overwrites `collab_df`, so re-running this cell on its own fails
# ('binary' no longer exists); re-run the cell that builds `collab_df` first.
collab_df = (
    collab_df[collab_df['binary'] == 'Yes']
    .sort_values('freq', ascending=False)
    .loc[:, ['freq', 'prop', 'collab_type']]
)
# The rename is for display only; `collab_df` keeps its original column names.
display_names = {
    'freq': 'Count',
    'prop': 'Proportion',
    'collab_type': 'Collaboration',
}
collab_df.rename(columns=display_names)

Unnamed: 0,Count,Proportion,Collaboration
5,1181,0.373,Cross Gender
7,748,0.236,Cross Race
9,487,0.154,Cross Gender & Race
1,479,0.151,Cross Country
11,309,0.098,Cross Gender & Country
13,213,0.067,Cross Country & Race
3,160,0.05,Cross Type
15,151,0.048,"Cross Gender, Race & Country"


In [52]:
# Total number of papers, shown for reference next to the table above.
total_paper_num

3169

In [53]:
# Same reshaping as for the all-years tables, but per year; the yearly
# denominator column is dropped once 'prop' has been computed.
dfs = []
for var in cross_vars:
    dff = (
        get_freq_and_prop('papers', var)
        .rename(columns={var: 'binary'})
        .assign(collab_type=var)
        .drop(columns=['year total'])
    )
    dfs.append(dff)

In [54]:
# Stack the per-indicator yearly tables, apply display labels, keep 'Yes' rows.
collab_df_ts = pd.concat(dfs, axis=0, ignore_index=True).replace(rename_cross_vars_dic)
collab_df_ts = collab_df_ts.loc[collab_df_ts['binary'] == 'Yes']
collab_df_ts.head()

Unnamed: 0,year,binary,freq,prop,collab_type
1,2000,Yes,12,0.136,Cross Country
3,2001,Yes,9,0.098,Cross Country
5,2002,Yes,10,0.08,Cross Country
7,2003,Yes,10,0.089,Cross Country
9,2004,Yes,13,0.119,Cross Country


In [55]:
# Yearly counts and proportions of cross-gender papers.
collab_df_ts.loc[collab_df_ts['collab_type'] == 'Cross Gender']

Unnamed: 0,year,binary,freq,prop,collab_type
93,2000,Yes,21,0.239,Cross Gender
95,2001,Yes,31,0.337,Cross Gender
97,2002,Yes,40,0.32,Cross Gender
99,2003,Yes,32,0.286,Cross Gender
101,2004,Yes,41,0.376,Cross Gender
103,2005,Yes,55,0.385,Cross Gender
105,2006,Yes,62,0.453,Cross Gender
107,2007,Yes,51,0.336,Cross Gender
109,2008,Yes,39,0.264,Cross Gender
111,2009,Yes,59,0.391,Cross Gender


In [56]:
# Yearly counts and proportions of cross-race papers.
collab_df_ts.loc[collab_df_ts['collab_type'] == 'Cross Race']

Unnamed: 0,year,binary,freq,prop,collab_type
139,2000,Yes,15,0.17,Cross Race
141,2001,Yes,7,0.076,Cross Race
143,2002,Yes,23,0.184,Cross Race
145,2003,Yes,22,0.196,Cross Race
147,2004,Yes,28,0.257,Cross Race
149,2005,Yes,34,0.238,Cross Race
151,2006,Yes,32,0.234,Cross Race
153,2007,Yes,33,0.217,Cross Race
155,2008,Yes,30,0.203,Cross Race
157,2009,Yes,33,0.219,Cross Race


## Cross race and gender details

In [57]:
# All-years breakdown of papers by racial composition, with the category
# column renamed to the shared 'cross_details' name.
cross_race_details_df = (
    get_simple_prop_df('papers', 'cross_race_details')
    .replace({'cross race': 'Cross race'})
    .rename(columns={'cross_race_details': 'cross_details'})
)
cross_race_details_df

Unnamed: 0,cross_details,freq,prop
1,White only,1993,0.629
0,Cross race,748,0.236
2,Asian only,318,0.1
3,Hispanic only,50,0.016
4,Black only,40,0.013
5,Middle Eastern only,19,0.006
6,Indigenous only,1,0.0


In [58]:
# Total number of papers, shown for reference next to the table above.
total_paper_num

3169

In [59]:
# All-years breakdown of papers by gender composition, with readable labels
# and the category column renamed to the shared 'cross_details' name.
gender_detail_labels = {
    'cross gender': 'Cross gender',
    'M only': 'Male only',
    'F only': 'Female only',
    'N only': 'Non-binary only',
}
cross_gender_details_df = (
    get_simple_prop_df('papers', 'cross_gender_details')
    .replace(gender_detail_labels)
    .rename(columns={'cross_gender_details': 'cross_details'})
)
cross_gender_details_df

Unnamed: 0,cross_details,freq,prop
1,Cross gender,1181,0.373
2,Male only,1118,0.353
0,Female only,864,0.273
3,Non-binary only,6,0.002


In [60]:
# Per-year breakdown of papers by racial composition.
cross_race_details_ts = (
    get_freq_and_prop('papers', 'cross_race_details')
    .replace({'cross race': 'Cross race'})
    .rename(columns={'cross_race_details': 'cross_details'})
)
cross_race_details_ts.head()

Unnamed: 0,year,cross_details,freq,year total,prop
0,2000,Asian only,3,88,0.034
1,2000,Black only,1,88,0.011
2,2000,Hispanic only,1,88,0.011
3,2000,White only,68,88,0.773
4,2000,Cross race,15,88,0.17


In [61]:
# Per-year breakdown of papers by gender composition, with readable labels.
gender_detail_labels = {
    'cross gender': 'Cross gender',
    'M only': 'Male only',
    'F only': 'Female only',
    'N only': 'Non-binary only',
}
cross_gender_details_ts = (
    get_freq_and_prop('papers', 'cross_gender_details')
    .replace(gender_detail_labels)
    .rename(columns={'cross_gender_details': 'cross_details'})
)
cross_gender_details_ts.head()

Unnamed: 0,year,cross_details,freq,year total,prop
0,2000,Female only,27,88,0.307
1,2000,Male only,40,88,0.455
2,2000,Cross gender,21,88,0.239
3,2001,Female only,25,92,0.272
4,2001,Male only,36,92,0.391


In [62]:
# Stack the race and gender yearly breakdowns (race rows first). Each input's
# index labels are kept as-is, so labels repeat across the two halves.
cross_details_ts = pd.concat(
    (cross_race_details_ts, cross_gender_details_ts),
    axis=0,
)
cross_details_ts

Unnamed: 0,year,cross_details,freq,year total,prop
0,2000,Asian only,3,88,0.034
1,2000,Black only,1,88,0.011
2,2000,Hispanic only,1,88,0.011
3,2000,White only,68,88,0.773
4,2000,Cross race,15,88,0.170
...,...,...,...,...,...
69,2021,Male only,46,167,0.275
70,2021,Cross gender,66,167,0.395
71,2022,Female only,25,78,0.321
72,2022,Male only,23,78,0.295


In [63]:
# Yearly counts and proportions of papers whose authors are all White.
cross_details_ts.loc[cross_details_ts['cross_details'] == 'White only']

Unnamed: 0,year,cross_details,freq,year total,prop
3,2000,White only,68,88,0.773
8,2001,White only,80,92,0.87
14,2002,White only,88,125,0.704
19,2003,White only,80,112,0.714
24,2004,White only,69,109,0.633
29,2005,White only,94,143,0.657
33,2006,White only,91,137,0.664
38,2007,White only,101,152,0.664
44,2008,White only,99,148,0.669
49,2009,White only,89,151,0.589
