In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [4]:
def get_simple_prop_df(data, var):
    # get frequency and proportion over all the years
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        df = pd.DataFrame(Counter(authors[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_author_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    else:
        df = pd.DataFrame(Counter(papers[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_paper_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    return df

In [5]:
def get_freq_and_prop(data, var):
    # get time series data
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        dic = yearly_papernum_dic
        df = papers.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    else:
        dic = yearly_authornum_dic
        df = authors.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [6]:
def transform_year(df):
    dff = df.copy()
    dff['year'] = pd.to_datetime(dff['year'] , format='%Y')
    return dff

In [7]:
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv')
papers.replace({
    True: 'Yes',
    False: 'No'
}, inplace = True)


In [8]:
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic.keys())
authors['countrypred_new'] = authors['countrypred'].apply(
        lambda x: x if x in top_country else 'Other'
    )

In [9]:
for i in [papers, authors]:
    i.replace({
        'Communication Theory': 'CT',
        'Communication, Culture and Critique': 'CCC',
        'Human Communication Research': 'HCR',
        'Journal of Communication': 'JOC',
        'Journal of Computer-Mediated Communication': 'JCMC'
                      }, inplace = True)

In [10]:
total_paper_num = len(papers)
total_author_num = len(authors)

In [11]:
total_paper_num, total_author_num

(5712, 11292)

In [12]:
yearly_papernum = papers.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum = authors.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum_dic = dict(zip(yearly_authornum.year, yearly_authornum.freq))
yearly_papernum_dic = dict(zip(yearly_papernum.year, yearly_papernum.freq))

## Intersection

In [13]:
race_count_dic_long = dict(Counter(authors.racepred))
race_count_dic_long

{'White': 9304,
 'Hispanic': 345,
 'Asian': 1361,
 'Black': 158,
 'Middle Eastern': 106,
 'Indigenous': 18}

In [14]:
gender_count_dic = dict(Counter(authors.genderpred))
gender_count_dic

{'F': 4405, 'M': 6876, 'N': 11}

In [15]:
gender_by_race_long = authors.groupby(
    ['genderpred', 'racepred']).size().to_frame('freq').reset_index()
gender_by_race_long['gender_total'] = [gender_count_dic[x] for x in gender_by_race_long.genderpred]
gender_by_race_long['race_total'] = [race_count_dic_long[x] for x in gender_by_race_long.racepred]
gender_by_race_long['race_in_gender'] = round(
    gender_by_race_long['freq'] / gender_by_race_long['gender_total'], 
    3)
gender_by_race_long['gender_in_race'] = round(
    gender_by_race_long['freq'] / gender_by_race_long['race_total'], 
    3)
gender_by_race_long

Unnamed: 0,genderpred,racepred,freq,gender_total,race_total,race_in_gender,gender_in_race
0,F,Asian,661,4405,1361,0.15,0.486
1,F,Black,81,4405,158,0.018,0.513
2,F,Hispanic,167,4405,345,0.038,0.484
3,F,Indigenous,14,4405,18,0.003,0.778
4,F,Middle Eastern,52,4405,106,0.012,0.491
5,F,White,3430,4405,9304,0.779,0.369
6,M,Asian,699,6876,1361,0.102,0.514
7,M,Black,77,6876,158,0.011,0.487
8,M,Hispanic,173,6876,345,0.025,0.501
9,M,Indigenous,4,6876,18,0.001,0.222


In [16]:
authors['gender_and_race'] = authors['genderpred'] + '_' + authors['racepred']

In [17]:
get_simple_prop_df('authors', 'gender_and_race').sort_values('prop', ascending = False)

Unnamed: 0,gender_and_race,freq,prop
2,M_White,5869,0.52
0,F_White,3430,0.304
3,M_Asian,699,0.062
4,F_Asian,661,0.059
1,F_Hispanic,167,0.015
7,M_Hispanic,173,0.015
5,F_Black,81,0.007
8,M_Black,77,0.007
6,M_Middle Eastern,54,0.005
9,F_Middle Eastern,52,0.005
