In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [2]:
def get_simple_prop_df(data, var):
    """Tabulate the frequency and overall share of each value of ``var``.

    Reads the notebook-level ``authors`` or ``papers`` frame (chosen by
    ``data``) and divides each value's count by the matching notebook-level
    total (``total_author_num`` or ``total_paper_num``).

    Parameters
    ----------
    data : str
        Either ``'papers'`` or ``'authors'``.
    var : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``[var, 'freq', 'prop']``, ordered from the most to the least
        frequent value; ``prop`` is rounded to three decimals.

    Raises
    ------
    AssertionError
        If ``data`` is not one of the two accepted names.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        frame, total = authors, total_author_num
    else:
        frame, total = papers, total_paper_num

    df = pd.DataFrame(Counter(frame[var]).items(), columns=[var, 'freq'])
    df['prop'] = round(df['freq'] / total, 3)
    # Order by the raw count rather than the rounded share: groups whose
    # shares round to the same value would otherwise tie and come out in an
    # arbitrary order. A stable sort keeps first-seen order for equal counts.
    return df.sort_values('freq', ascending=False, kind='stable')

In [3]:
def get_freq_and_prop(data, var):
    """Count values of ``var`` per year and express them as yearly shares.

    Uses the notebook-level ``papers`` / ``authors`` frame together with the
    matching year -> row-count lookup (``yearly_papernum_dic`` /
    ``yearly_authornum_dic``).

    Parameters
    ----------
    data : str
        Either ``'papers'`` or ``'authors'``.
    var : str
        Column to break down within each year.

    Returns
    -------
    pandas.DataFrame
        Columns ``['year', var, 'freq', 'year total', 'prop']`` where ``prop``
        is ``freq / year total`` rounded to three decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        frame, yearly_totals = papers, yearly_papernum_dic
    else:
        frame, yearly_totals = authors, yearly_authornum_dic

    counts = frame.groupby(['year', var]).size().reset_index(name='freq')
    # Attach the denominator for every row's year.
    counts['year total'] = [yearly_totals[year] for year in counts['year']]
    counts['prop'] = (counts['freq'] / counts['year total']).round(3)
    return counts

In [4]:
def transform_year(df):
    """Return a copy of ``df`` whose ``year`` column is parsed as datetimes.

    The values are parsed with the ``'%Y'`` format; the input frame is left
    unmodified.
    """
    return df.assign(year=pd.to_datetime(df['year'], format='%Y'))

In [5]:
# Load the processed author and paper tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# Boolean values anywhere in the papers table are shown as 'Yes' / 'No'.
papers = (
    pd.read_csv('../data/processed/papers_to_study_expanded.csv')
    .replace({True: 'Yes', False: 'No'})
)


In [6]:
# Keep the `top_num` most frequent `countrypred` values as their own
# categories and bucket every other value as 'Other'.
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = [country for country in top_country_dic]
authors['countrypred_new'] = [
    country if country in top_country else 'Other'
    for country in authors['countrypred']
]

In [7]:
# Replace full journal titles with their abbreviations in both tables.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
for frame in (papers, authors):
    frame.replace(journal_abbreviations, inplace=True)

In [8]:
# Row counts of both tables; these are the denominators for overall shares.
total_paper_num, total_author_num = len(papers), len(authors)

In [9]:
# Display the row counts of `papers` and `authors`.
total_paper_num, total_author_num

(5712, 11292)

In [10]:
# Rows per `year` for each table, both as a tidy frame and as a
# year -> count lookup.
yearly_papernum = papers.groupby('year').size().reset_index(name='freq')
yearly_authornum = authors.groupby('year').size().reset_index(name='freq')
yearly_papernum_dic = dict(zip(yearly_papernum['year'], yearly_papernum['freq']))
yearly_authornum_dic = dict(zip(yearly_authornum['year'], yearly_authornum['freq']))

## Intersection

### Race + gender

In [11]:
# Number of author rows for each `racepred` value.
race_count_dic_long = dict(Counter(authors['racepred']))
race_count_dic_long

{'White': 9304,
 'Hispanic': 345,
 'Asian': 1361,
 'Black': 158,
 'Middle Eastern': 106,
 'Indigenous': 18}

In [12]:
# Number of author rows for each `genderpred` value.
gender_count_dic = dict(Counter(authors['genderpred']))
gender_count_dic

{'F': 4405, 'M': 6876, 'N': 11}

In [30]:
# Number of author rows for each bucketed country (`countrypred_new`).
country_count_dic = dict(Counter(authors['countrypred_new']))
country_count_dic

{'US': 8467, 'DE': 276, 'Other': 1661, 'GB': 297, 'IL': 227, 'NL': 364}

In [13]:
# Count author rows for every (gender, race) pair, then express each count
# as a share of its gender total and of its race total.
gender_by_race_long = (
    authors.groupby(['genderpred', 'racepred'])
    .size()
    .reset_index(name='freq')
)
gender_by_race_long['gender_total'] = [
    gender_count_dic[gender] for gender in gender_by_race_long['genderpred']
]
gender_by_race_long['race_total'] = [
    race_count_dic_long[race] for race in gender_by_race_long['racepred']
]
# race_in_gender: share of the gender's rows that belong to this race.
gender_by_race_long['race_in_gender'] = (
    gender_by_race_long['freq'] / gender_by_race_long['gender_total']
).round(3)
# gender_in_race: share of the race's rows that belong to this gender.
gender_by_race_long['gender_in_race'] = (
    gender_by_race_long['freq'] / gender_by_race_long['race_total']
).round(3)
gender_by_race_long

Unnamed: 0,genderpred,racepred,freq,gender_total,race_total,race_in_gender,gender_in_race
0,F,Asian,661,4405,1361,0.15,0.486
1,F,Black,81,4405,158,0.018,0.513
2,F,Hispanic,167,4405,345,0.038,0.484
3,F,Indigenous,14,4405,18,0.003,0.778
4,F,Middle Eastern,52,4405,106,0.012,0.491
5,F,White,3430,4405,9304,0.779,0.369
6,M,Asian,699,6876,1361,0.102,0.514
7,M,Black,77,6876,158,0.011,0.487
8,M,Hispanic,173,6876,345,0.025,0.501
9,M,Indigenous,4,6876,18,0.001,0.222


In [14]:
# Combined "<genderpred>_<racepred>" key for every author row.
authors['gender_and_race'] = authors['genderpred'] + '_' + authors['racepred']

In [15]:
# get_simple_prop_df already returns its rows ordered from the largest to the
# smallest share, so the table is displayed as returned (no second sort).
get_simple_prop_df('authors', 'gender_and_race')

Unnamed: 0,gender_and_race,freq,prop
2,M_White,5869,0.52
0,F_White,3430,0.304
3,M_Asian,699,0.062
4,F_Asian,661,0.059
1,F_Hispanic,167,0.015
7,M_Hispanic,173,0.015
5,F_Black,81,0.007
8,M_Black,77,0.007
6,M_Middle Eastern,54,0.005
9,F_Middle Eastern,52,0.005


### Race + country

In [16]:
# Author rows per (bucketed country, race) pair.
country_by_race_long = (
    authors.groupby(['countrypred_new', 'racepred'])
    .size()
    .reset_index(name='freq')
)

In [17]:
# Display the country-by-race frequency table.
country_by_race_long

Unnamed: 0,countrypred_new,racepred,freq
0,DE,Asian,5
1,DE,Black,1
2,DE,Middle Eastern,1
3,DE,White,269
4,GB,Asian,19
5,GB,Black,4
6,GB,Hispanic,8
7,GB,Middle Eastern,3
8,GB,White,263
9,IL,Asian,8


In [24]:
# Combined "<countrypred_new>_<racepred>" key for every author row.
authors['country_and_race'] = authors['countrypred_new'] + '_' + authors['racepred']
# get_simple_prop_df already returns rows in descending order of share, so an
# additional sort_values on 'prop' would be redundant.
country_and_race_table = get_simple_prop_df('authors', 'country_and_race')

In [25]:
# Display the country/race combinations with their counts and shares.
country_and_race_table

Unnamed: 0,country_and_race,freq,prop
0,US_White,7142,0.632
3,Other_White,1086,0.096
4,US_Asian,912,0.081
9,Other_Asian,412,0.036
12,NL_White,355,0.031
2,DE_White,269,0.024
6,GB_White,263,0.023
1,US_Hispanic,198,0.018
8,IL_White,189,0.017
7,US_Black,144,0.013


In [26]:
# Combined share of the first five rows of the country/race table.
country_and_race_table['prop'].head(5).sum()

0.876

### Country + gender

In [27]:
# Combined "<countrypred_new>_<genderpred>" key for every author row.
authors['country_and_gender'] = authors['countrypred_new'] + '_' + authors['genderpred']
# get_simple_prop_df already returns rows in descending order of share, so an
# additional sort_values on 'prop' would be redundant.
country_and_gender_table = get_simple_prop_df('authors', 'country_and_gender')

In [28]:
# Display the country/gender combinations with their counts and shares.
country_and_gender_table

Unnamed: 0,country_and_gender,freq,prop
4,US_M,5164,0.457
0,US_F,3293,0.292
3,Other_M,1031,0.091
5,Other_F,629,0.056
6,GB_M,188,0.017
9,NL_M,197,0.017
1,DE_M,171,0.015
10,NL_F,167,0.015
8,IL_M,125,0.011
11,GB_F,109,0.01


In [29]:
# Combined share of the first five rows of the country/gender table.
country_and_gender_table['prop'].head(5).sum()

0.913

In [34]:
# Count author rows for every (gender, bucketed country) pair, then express
# each count as a share of its gender total and of its country total.
gender_by_country_long = (
    authors.groupby(['genderpred', 'countrypred_new'])
    .size()
    .reset_index(name='freq')
)
gender_by_country_long['gender_total'] = [
    gender_count_dic[gender] for gender in gender_by_country_long['genderpred']
]
gender_by_country_long['country_total'] = [
    country_count_dic[country] for country in gender_by_country_long['countrypred_new']
]
# country_in_gender: share of the gender's rows that belong to this country.
gender_by_country_long['country_in_gender'] = (
    gender_by_country_long['freq'] / gender_by_country_long['gender_total']
).round(3)
# gender_in_country: share of the country's rows that belong to this gender.
gender_by_country_long['gender_in_country'] = (
    gender_by_country_long['freq'] / gender_by_country_long['country_total']
).round(3)
gender_by_country_long

Unnamed: 0,genderpred,countrypred_new,freq,gender_total,country_total,country_in_gender,gender_in_country
0,F,DE,105,4405,276,0.024,0.38
1,F,GB,109,4405,297,0.025,0.367
2,F,IL,102,4405,227,0.023,0.449
3,F,NL,167,4405,364,0.038,0.459
4,F,Other,629,4405,1661,0.143,0.379
5,F,US,3293,4405,8467,0.748,0.389
6,M,DE,171,6876,276,0.025,0.62
7,M,GB,188,6876,297,0.027,0.633
8,M,IL,125,6876,227,0.018,0.551
9,M,NL,197,6876,364,0.029,0.541
