In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [2]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of ``var`` occurs over all years.

    Parameters
    ----------
    data : {'papers', 'authors'}
        Which module-level frame to summarise (``papers`` or ``authors``).
    var : str
        Column whose distinct values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``var``, ``freq`` (count) and ``prop`` (``freq`` divided by
        ``total_paper_num`` / ``total_author_num``, rounded to 3 decimals),
        ordered from most to least frequent.

    Raises
    ------
    AssertionError
        If ``data`` is neither ``'papers'`` nor ``'authors'``.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    # Resolve the globals lazily so only the requested dataset has to exist.
    if data == 'authors':
        frame, total = authors, total_author_num
    else:
        frame, total = papers, total_paper_num
    df = pd.DataFrame(list(Counter(frame[var]).items()), columns=[var, 'freq'])
    df['prop'] = round(df['freq'] / total, 3)
    # Sort on the exact count rather than the rounded proportion: values whose
    # proportions round to the same number would otherwise come back in an
    # arbitrary order. A stable sort keeps first-seen order for equal counts.
    return df.sort_values('freq', ascending=False, kind='mergesort')

In [3]:
def get_freq_and_prop(data, var):
    """Build a per-year series of counts and shares for each value of ``var``.

    ``data`` selects the module-level frame (``'papers'`` or ``'authors'``)
    together with its matching year -> row-count dictionary. The result has
    one row per (year, ``var``) pair with columns ``year``, ``var``,
    ``freq``, ``year total`` and ``prop`` (``freq / year total`` rounded to
    3 decimals).
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        source, year_totals = papers, yearly_papernum_dic
    else:
        source, year_totals = authors, yearly_authornum_dic
    counts = source.groupby(['year', var]).size().to_frame('freq').reset_index()
    # Attach the size of each year so the share can be computed row-wise.
    counts['year total'] = [year_totals[yr] for yr in counts['year']]
    counts['prop'] = (counts['freq'] / counts['year total']).round(3)
    return counts

In [4]:
def transform_year(df):
    """Return a copy of ``df`` whose ``year`` column is parsed as datetimes.

    Years are parsed with the ``%Y`` format; the input frame is left
    untouched.
    """
    parsed_years = pd.to_datetime(df['year'], format='%Y')
    result = df.copy()
    result['year'] = parsed_years
    return result

In [5]:
# Load the processed author-level and paper-level tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# Recode boolean values in the paper table as the labels 'Yes' / 'No'.
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv').replace(
    {True: 'Yes', False: 'No'}
)


In [6]:
# Keep the `top_num` most frequent `countrypred` values and pool every other
# value under the label 'Other' in a new column.
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic)
authors['countrypred_new'] = authors['countrypred'].map(
    lambda country: country if country in top_country else 'Other'
)

In [7]:
# Swap the full journal titles for their abbreviations wherever they occur
# in either table (the replacement is applied to the whole frame, in place).
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
for frame in (papers, authors):
    frame.replace(journal_abbreviations, inplace=True)

In [8]:
# Row counts of each table; get_simple_prop_df uses them as denominators.
total_paper_num, total_author_num = len(papers), len(authors)

In [9]:
# Display both row counts (papers, authors) as a quick check on the loaded data.
total_paper_num, total_author_num

(3169, 7083)

In [10]:
# Rows per year in each table, as a frame (`year`, `freq`) and as a
# year -> count dictionary used by get_freq_and_prop.
yearly_papernum = papers.groupby('year').size().reset_index(name='freq')
yearly_papernum_dic = dict(zip(yearly_papernum['year'], yearly_papernum['freq']))

yearly_authornum = authors.groupby('year').size().reset_index(name='freq')
yearly_authornum_dic = dict(zip(yearly_authornum['year'], yearly_authornum['freq']))

## Intersection of predicted gender and race

In [11]:
# Number of rows in `authors` for each `racepred` value.
race_count_dic_long = dict(Counter(authors['racepred']))
race_count_dic_long

{'White': 5354,
 'Hispanic': 286,
 'Asian': 1206,
 'Black': 133,
 'Middle Eastern': 88,
 'Indigenous': 16}

In [13]:
# Number of rows in `authors` for each `genderpred` value.
gender_count_dic = dict(Counter(authors['genderpred']))
gender_count_dic

{'F': 3275, 'M': 3799, 'N': 9}

In [15]:
# Cross-tabulate gender by race in long form: one row per
# (genderpred, racepred) pair with its count and both marginal totals.
gender_by_race_long = (
    authors.groupby(['genderpred', 'racepred']).size().reset_index(name='freq')
)
gender_by_race_long['gender_total'] = [
    gender_count_dic[gender] for gender in gender_by_race_long['genderpred']
]
gender_by_race_long['race_total'] = [
    race_count_dic_long[race] for race in gender_by_race_long['racepred']
]
# Share of each race within a gender, and of each gender within a race.
gender_by_race_long['race_in_gender'] = (
    gender_by_race_long['freq'] / gender_by_race_long['gender_total']
).round(3)
gender_by_race_long['gender_in_race'] = (
    gender_by_race_long['freq'] / gender_by_race_long['race_total']
).round(3)
gender_by_race_long

Unnamed: 0,genderpred,racepred,freq,gender_total,race_total,race_in_gender,gender_in_race
0,F,Asian,622,3275,1206,0.19,0.516
1,F,Black,73,3275,133,0.022,0.549
2,F,Hispanic,149,3275,286,0.045,0.521
3,F,Indigenous,13,3275,16,0.004,0.812
4,F,Middle Eastern,47,3275,88,0.014,0.534
5,F,White,2371,3275,5354,0.724,0.443
6,M,Asian,583,3799,1206,0.153,0.483
7,M,Black,60,3799,133,0.016,0.451
8,M,Hispanic,133,3799,286,0.035,0.465
9,M,Indigenous,3,3799,16,0.001,0.188


In [18]:
# Combined label of the form '<genderpred>_<racepred>', e.g. 'F_Asian'.
authors['gender_and_race'] = authors['genderpred'].add('_').add(authors['racepred'])

In [19]:
# Overall frequency and proportion of each gender/race combination among authors.
get_simple_prop_df('authors', 'gender_and_race')

Unnamed: 0,gender_and_race,freq,prop
2,M_White,2979,0.421
0,F_White,2371,0.335
4,F_Asian,622,0.088
3,M_Asian,583,0.082
1,F_Hispanic,149,0.021
7,M_Hispanic,133,0.019
5,F_Black,73,0.01
8,M_Black,60,0.008
9,F_Middle Eastern,47,0.007
6,M_Middle Eastern,41,0.006
