In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [2]:
def get_simple_prop_df(data, var):
    # get frequency and proportion over all the years
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        df = pd.DataFrame(Counter(authors[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_author_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    else:
        df = pd.DataFrame(Counter(papers[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_paper_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    return df

In [3]:
def get_freq_and_prop(data, var):
    # get time series data
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        dic = yearly_papernum_dic
        df = papers.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    else:
        dic = yearly_authornum_dic
        df = authors.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [4]:
def transform_year(df):
    dff = df.copy()
    dff['year'] = pd.to_datetime(dff['year'] , format='%Y')
    return dff

In [5]:
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv')
papers.replace({
    True: 'Yes',
    False: 'No'
}, inplace = True)


In [106]:
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic.keys())
authors['countrypred_new'] = authors['countrypred'].apply(
        lambda x: x if x in top_country else 'Other'
    )

In [107]:
for i in [papers, authors]:
    i.replace({
        'Communication Theory': 'CT',
        'Communication, Culture and Critique': 'CCC',
        'Human Communication Research': 'HCR',
        'Journal of Communication': 'JOC',
        'Journal of Computer-Mediated Communication': 'JCMC'
                      }, inplace = True)

In [108]:
total_paper_num = len(papers)
total_author_num = len(authors)

In [109]:
total_paper_num, total_author_num

(5712, 11292)

In [110]:
yearly_papernum = papers.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum = authors.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum_dic = dict(zip(yearly_authornum.year, yearly_authornum.freq))
yearly_papernum_dic = dict(zip(yearly_papernum.year, yearly_papernum.freq))

## How many cross type papers

In [111]:
Counter(papers.cross_type)

Counter({'No': 5451, 'Yes': 261})

In [10]:
papers.shape

(5712, 34)

## How many multi-author papers

In [8]:
Counter(papers.numberOfAuthors)

Counter({1.0: 2658,
         2.0: 1705,
         3.0: 759,
         4.0: 332,
         5.0: 126,
         6.0: 61,
         7.0: 36,
         8.0: 15,
         9.0: 7,
         10.0: 4,
         11.0: 2,
         18.0: 1,
         17.0: 1,
         37.0: 1,
         14.0: 1,
         15.0: 1,
         13.0: 1,
         12.0: 1})

In [13]:
papers.shape[0] - 2568

3144

In [9]:
2568/papers.shape[0]

0.4495798319327731

In [11]:
1 - 2568/papers.shape[0]

0.5504201680672269

## Collaboration

In [112]:
cross_var_idx = [15, 16, 17, 18, 21, 22, 23, 24]
all_paper_cols = papers.columns.tolist()
cross_vars = [all_paper_cols[x] for x in cross_var_idx]
cross_vars

['cross_country',
 'cross_type',
 'cross_gender',
 'cross_race',
 'cross_gender_and_race',
 'cross_gender_and_country',
 'cross_country_and_race',
 'cross_gender_race_and_country']

In [113]:
cross_vars_new = [
    'Cross Country',
    'Cross Type',
    'Cross Gender',
    'Cross Race',
    'Cross Gender & Race',
    'Cross Gender & Country',
    'Cross Country & Race',
    'Cross Gender, Race & Country'
]

In [114]:
rename_cross_vars_dic = dict(zip(cross_vars, cross_vars_new))
rename_cross_vars_dic

{'cross_country': 'Cross Country',
 'cross_type': 'Cross Type',
 'cross_gender': 'Cross Gender',
 'cross_race': 'Cross Race',
 'cross_gender_and_race': 'Cross Gender & Race',
 'cross_gender_and_country': 'Cross Gender & Country',
 'cross_country_and_race': 'Cross Country & Race',
 'cross_gender_race_and_country': 'Cross Gender, Race & Country'}

In [115]:
dfs = []
for var in cross_vars:
    dff = get_simple_prop_df('papers', var)
    dff.rename(columns = {var: 'binary'}, inplace = True)
    dff['collab_type'] = var 
    dfs.append(dff)

In [116]:
collab_df = pd.concat(dfs, axis= 0, ignore_index = True)
collab_df.replace(rename_cross_vars_dic, inplace=True)
sorted_collab = collab_df[collab_df.binary == 'Yes'].sort_values('freq', ascending = False)['collab_type'].tolist()
sorted_collab

['Cross Gender',
 'Cross Race',
 'Cross Country',
 'Cross Gender & Race',
 'Cross Gender & Country',
 'Cross Type',
 'Cross Country & Race',
 'Cross Gender, Race & Country']

In [117]:
def combine_two_cols(row, var1, var2):
    var2 = "{0:.1%}".format(row[var2])
    return str(row[var1]) + ' (' + str(var2) + ')'

In [118]:
collab_df = collab_df[collab_df.binary == 'Yes'].sort_values('freq', ascending = False)
collab_df = collab_df[['freq', 'prop', 'collab_type']]
collab_df['stats'] = collab_df.apply(
    lambda row: combine_two_cols(row, 'freq', 'prop'), axis = 1
)
collab_df.rename(
    columns = {
        'stats': 'Statistics',
        'collab_type': 'Collaboration'
    }, inplace = True)
collab_df = collab_df[['Collaboration', 'Statistics']]
collab_df

Unnamed: 0,Collaboration,Statistics
5,Cross Gender,1663 (29.1%)
7,Cross Race,873 (15.3%)
1,Cross Country,589 (10.3%)
9,Cross Gender & Race,552 (9.7%)
11,Cross Gender & Country,352 (6.2%)
3,Cross Type,261 (4.6%)
13,Cross Country & Race,231 (4.0%)
15,"Cross Gender, Race & Country",155 (2.7%)


In [119]:
print(collab_df.to_latex(index=False)) 

\begin{tabular}{ll}
\toprule
               Collaboration &   Statistics \\
\midrule
                Cross Gender & 1663 (29.1\%) \\
                  Cross Race &  873 (15.3\%) \\
               Cross Country &  589 (10.3\%) \\
         Cross Gender \& Race &   552 (9.7\%) \\
      Cross Gender \& Country &   352 (6.2\%) \\
                  Cross Type &   261 (4.6\%) \\
        Cross Country \& Race &   231 (4.0\%) \\
Cross Gender, Race \& Country &   155 (2.7\%) \\
\bottomrule
\end{tabular}



  print(collab_df.to_latex(index=False))


In [120]:
total_paper_num

5712

In [121]:
dfs = []
for var in cross_vars:
    dff = get_freq_and_prop('papers', var)
    dff.rename(columns = {var: 'binary'}, inplace = True)
    dff['collab_type'] = var 
    dff.drop(columns = ['year total'], inplace = True)
    dfs.append(dff)

In [122]:
collab_df_ts = pd.concat(dfs, axis= 0, ignore_index = True)
collab_df_ts.replace(rename_cross_vars_dic, inplace=True)
collab_df_ts = collab_df_ts[collab_df_ts.binary == 'Yes']
collab_df_ts.head()

Unnamed: 0,year,binary,freq,prop,collab_type
13,1963,Yes,1,0.042,Cross Country
22,1971,Yes,2,0.08,Cross Country
24,1972,Yes,1,0.036,Cross Country
27,1974,Yes,2,0.036,Cross Country
29,1975,Yes,3,0.029,Cross Country


In [123]:
collab_df_ts[collab_df_ts.collab_type == 'Cross Gender'].sort_values('year')

Unnamed: 0,year,binary,freq,prop,collab_type
253,1952,Yes,1,0.042,Cross Gender
259,1957,Yes,1,0.062,Cross Gender
262,1959,Yes,2,0.118,Cross Gender
264,1960,Yes,1,0.062,Cross Gender
266,1961,Yes,1,0.048,Cross Gender
270,1964,Yes,1,0.048,Cross Gender
272,1965,Yes,1,0.059,Cross Gender
275,1967,Yes,1,0.167,Cross Gender
277,1968,Yes,4,0.133,Cross Gender
279,1969,Yes,1,0.04,Cross Gender


In [124]:
collab_df_ts[collab_df_ts.collab_type == 'Cross Gender'].sort_values('prop')

Unnamed: 0,year,binary,freq,prop,collab_type
279,1969,Yes,1,0.04,Cross Gender
253,1952,Yes,1,0.042,Cross Gender
266,1961,Yes,1,0.048,Cross Gender
270,1964,Yes,1,0.048,Cross Gender
272,1965,Yes,1,0.059,Cross Gender
259,1957,Yes,1,0.062,Cross Gender
264,1960,Yes,1,0.062,Cross Gender
287,1973,Yes,1,0.071,Cross Gender
281,1970,Yes,2,0.08,Cross Gender
299,1979,Yes,13,0.114,Cross Gender


In [125]:
collab_df_ts[collab_df_ts.collab_type == 'Cross Race']

Unnamed: 0,year,binary,freq,prop,collab_type
388,1952,Yes,1,0.042,Cross Race
394,1957,Yes,2,0.125,Cross Race
403,1965,Yes,1,0.059,Cross Race
408,1969,Yes,1,0.04,Cross Race
411,1971,Yes,1,0.04,Cross Race
413,1972,Yes,1,0.036,Cross Race
416,1974,Yes,1,0.018,Cross Race
418,1975,Yes,3,0.029,Cross Race
420,1976,Yes,3,0.028,Cross Race
422,1977,Yes,8,0.064,Cross Race


## Cross race and gender details

In [126]:
cross_race_details_df = get_simple_prop_df('papers', 'cross_race_details')
cross_race_details_df.replace({'cross race': 'Cross race'}, inplace=True)
cross_race_details_df.columns = ['cross_details', 'freq', 'prop']
cross_race_details_df

Unnamed: 0,cross_details,freq,prop
1,White only,4319,0.756
0,Cross race,873,0.153
2,Asian only,367,0.064
3,Hispanic only,73,0.013
4,Black only,49,0.009
5,Middle Eastern only,29,0.005
6,Indigenous only,2,0.0


In [127]:
total_paper_num

5712

In [128]:
cross_gender_details_df = get_simple_prop_df('papers', 'cross_gender_details')
cross_gender_details_df.replace({'cross gender': 'Cross gender'}, inplace=True)
cross_gender_details_df.replace({
    'M only': 'Male only',
    'F only': 'Female only',
    'N only': 'Non-binary only'
}, inplace = True)
cross_gender_details_df.columns = ['cross_details', 'freq', 'prop']
cross_gender_details_df

Unnamed: 0,cross_details,freq,prop
2,Male only,2781,0.487
1,Cross gender,1663,0.291
0,Female only,1261,0.221
3,Non-binary only,7,0.001


In [129]:
cross_race_details_ts = get_freq_and_prop('papers', 'cross_race_details')
cross_race_details_ts.replace({'cross race': 'Cross race'}, inplace=True)
cross_race_details_ts.columns = ['year', 'cross_details', 'freq', 'year total', 'prop']
cross_race_details_ts.head()

Unnamed: 0,year,cross_details,freq,year total,prop
0,1951,White only,20,20,1.0
1,1952,Asian only,1,24,0.042
2,1952,White only,22,24,0.917
3,1952,Cross race,1,24,0.042
4,1953,Asian only,1,17,0.059


In [130]:
cross_gender_details_ts = get_freq_and_prop('papers', 'cross_gender_details')
cross_gender_details_ts.replace({'cross gender': 'Cross gender'}, inplace=True)
cross_gender_details_ts.replace({
    'M only': 'Male only',
    'F only': 'Female only',
    'N only': 'Non-binary only'
}, inplace = True)
cross_gender_details_ts.columns = ['year', 'cross_details', 'freq', 'year total', 'prop']
cross_gender_details_ts.head()

Unnamed: 0,year,cross_details,freq,year total,prop
0,1951,Male only,20,20,1.0
1,1952,Female only,4,24,0.167
2,1952,Male only,19,24,0.792
3,1952,Cross gender,1,24,0.042
4,1953,Male only,17,17,1.0


In [131]:
cross_gender_details_ts.tail()

Unnamed: 0,year,cross_details,freq,year total,prop
197,2021,Male only,46,167,0.275
198,2021,Cross gender,66,167,0.395
199,2022,Female only,25,78,0.321
200,2022,Male only,23,78,0.295
201,2022,Cross gender,30,78,0.385


In [132]:
cross_details_ts = pd.concat([cross_race_details_ts, cross_gender_details_ts])
cross_details_ts

Unnamed: 0,year,cross_details,freq,year total,prop
0,1951,White only,20,20,1.0
1,1952,Asian only,1,24,0.042
2,1952,White only,22,24,0.917
3,1952,Cross race,1,24,0.042
4,1953,Asian only,1,17,0.059
5,1953,Black only,1,17,0.059
6,1953,Hispanic only,1,17,0.059
7,1953,White only,14,17,0.824
8,1954,White only,15,15,1.0
9,1955,White only,11,11,1.0


In [133]:
cross_details_ts[cross_details_ts.cross_details == 'White only']

Unnamed: 0,year,cross_details,freq,year total,prop
0,1951,White only,20,20,1.0
2,1952,White only,22,24,0.917
7,1953,White only,14,17,0.824
8,1954,White only,15,15,1.0
9,1955,White only,11,11,1.0
10,1956,White only,13,13,1.0
11,1957,White only,14,16,0.875
13,1958,White only,13,13,1.0
15,1959,White only,16,17,0.941
17,1960,White only,15,16,0.938


In [134]:
cross_details_ts[cross_details_ts.cross_details == 'Female only']

Unnamed: 0,year,cross_details,freq,year total,prop
1,1952,Female only,4,24,0.167
8,1957,Female only,2,16,0.125
12,1959,Female only,3,17,0.176
15,1960,Female only,2,16,0.125
21,1963,Female only,1,24,0.042
23,1964,Female only,2,21,0.095
26,1965,Female only,1,17,0.059
32,1968,Female only,3,30,0.1
35,1969,Female only,2,25,0.08
38,1970,Female only,3,25,0.12


## Looking inside cross race and cross gender

In [19]:
cross_race_papers = papers[papers.cross_race_details == 'cross race']
Counter(cross_race_papers.first_author_race)

Counter({'White': 450,
         'Asian': 295,
         'Hispanic': 76,
         'Middle Eastern': 25,
         'Black': 23,
         'Indigenous': 4})

In [20]:
cross_race_papers.shape

(873, 34)

In [21]:
cross_gender_papers = papers[papers.cross_gender_details == 'cross gender']
Counter(cross_gender_papers.first_author_gender)

Counter({'M': 918, 'F': 742, 'N': 3})

In [22]:
cross_gender_papers.shape

(1663, 34)