In [434]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [435]:
def get_simple_prop_df(data, var):
    """Tabulate the frequency and proportion of each value of ``var`` over all years.

    Input:
        - data: 'papers' or 'authors'; selects the module-level ``papers`` or
          ``authors`` frame and the matching total (``total_paper_num`` or
          ``total_author_num``) used as the denominator
        - var: name of the column to tabulate
    Output:
        - DataFrame with columns [var, 'freq', 'prop'], sorted by 'prop' in
          descending order; 'prop' is freq / total rounded to 3 decimals
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    use_authors = data == 'authors'
    source = authors if use_authors else papers
    tally = Counter(source[var])
    result = pd.DataFrame(tally.items(), columns = [var, 'freq'])
    denominator = total_author_num if use_authors else total_paper_num
    result['prop'] = round(result['freq'] / denominator, 3)
    return result.sort_values('prop', ascending = False)

In [436]:
def get_freq_and_prop(data, var):
    """Build the per-year time series of frequency and proportion for ``var``.

    Input:
        - data: 'papers' or 'authors'; selects the module-level frame and the
          matching year -> total lookup (``yearly_papernum_dic`` or
          ``yearly_authornum_dic``)
        - var: name of the column to count within each year
    Output:
        - DataFrame with columns ['year', var, 'freq', 'year total', 'prop'];
          'prop' is freq / year total rounded to 3 decimals
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        year_totals, source = yearly_papernum_dic, papers
    else:
        year_totals, source = yearly_authornum_dic, authors
    counts = source.groupby(['year', var]).size().to_frame('freq').reset_index()
    counts['year total'] = [year_totals[yr] for yr in counts.year]
    counts['prop'] = round(counts['freq'] / counts['year total'], 3)
    return counts

In [437]:
def transform_year(df):
    """Return a copy of ``df`` whose 'year' column is parsed into datetimes.

    The years are parsed with the '%Y' format; the input frame is left untouched.
    """
    return df.assign(year = pd.to_datetime(df['year'], format='%Y'))

In [438]:
# Load the processed author-level table.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# Load the processed paper-level table and recode boolean values as 'Yes' / 'No'.
papers = pd.read_csv(
    '../data/processed/papers_to_study_expanded.csv'
).replace({True: 'Yes', False: 'No'})


In [439]:
# Keep the `top_num` most frequent `countrypred` values and pool every other
# value into a single 'Other' category in the new `countrypred_new` column.
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic)
authors['countrypred_new'] = authors['countrypred'].map(
    lambda country: country if country in top_country else 'Other'
)

In [440]:
# Replace full journal titles with their abbreviations in both tables (in place).
for i in (papers, authors):
    i.replace(
        to_replace = {
            'Communication Theory': 'CT',
            'Communication, Culture and Critique': 'CCC',
            'Human Communication Research': 'HCR',
            'Journal of Communication': 'JOC',
            'Journal of Computer-Mediated Communication': 'JCMC',
        },
        inplace = True,
    )

In [441]:
# Row counts of the two tables; used as denominators for overall proportions.
total_paper_num, total_author_num = len(papers), len(authors)

In [442]:
# Display both totals as a (papers, authors) tuple.
total_paper_num, total_author_num

(3169, 7083)

In [443]:
# Number of rows per year in each table, as tidy ['year', 'freq'] frames.
yearly_papernum = papers.groupby('year').size().to_frame(name='freq').reset_index()
yearly_authornum = authors.groupby('year').size().to_frame(name='freq').reset_index()
# year -> count lookups, used by get_freq_and_prop as per-year denominators.
yearly_authornum_dic = dict(zip(yearly_authornum['year'], yearly_authornum['freq']))
yearly_papernum_dic = dict(zip(yearly_papernum['year'], yearly_papernum['freq']))

In [444]:
def get_table(df, var1, var2):
    '''This function gets a count table: var1 (row) by var2 (col).

    Combinations that never occur are filled with 0 while keeping integer
    counts (a pivot followed by fillna would turn the whole table into floats
    whenever a combination is missing, e.g. totals shown as 253.0).

    Input:
        - df: authors
        - var1: most likely 'journal'
        - var2: 'racepred', 'genderpred', 'countrypred_new'
    Output:
        - DataFrame indexed by var1 with one column per var2 value, plus a
          'Total' column (row sums) and a final 'Total' row (column sums,
          including the grand total)
    '''
    counts = df.groupby([var1, var2]).size().unstack(fill_value=0)
    counts['Total'] = counts.sum(axis=1)
    # Column sums are taken after adding 'Total', so the last cell is the grand total.
    counts.loc['Total'] = counts.sum(axis=0)
    return counts

In [445]:
def update_row(row):
    '''This function formats each cell of a row as 'count (share of row total)'.

    Input:
        - row: a Series whose last element is the row total and whose other
          elements are counts
    Output:
        - list with one string per count: 'N (P%)' for positive counts and
          '0' otherwise, followed by the unchanged total

    The percentage is computed only for positive counts, so a row whose counts
    and total are all zero yields '0' cells instead of dividing by zero.
    '''
    values = row.tolist()
    # the last element is the row total; everything before it is a count
    total = values[-1]
    counts = values[:-1]
    cells = []
    for count in counts:
        if count > 0:
            cells.append(str(int(count)) + ' (' + "{0:.1%}".format(count / total) + ')')
        else:
            cells.append(str(0))
    cells.append(total)
    return cells

In [446]:
def update_df(df):
    '''Reformat a count table so that each cell reads 'number (percentage)'.

    Input:
        - df: j_race, j_gender, j_country
    Output:
        - a new DataFrame whose first column holds the original index labels
          (named 'Journal' when the index is named 'journal', otherwise
          'Aff country'), followed by the formatted cells from update_row
    '''
    label_col = 'Journal' if df.index.name == 'journal' else 'Aff country'
    records = [(label, *update_row(row)) for label, row in df.iterrows()]
    return pd.DataFrame(records, columns = [label_col, *df.columns.tolist()])

## Dataset

### Statistics of paper number and author number by journal

In [447]:
# start_year_dic = {}
# for group in papers.groupby('journal'):
#     journal = group[0]
#     all_years = group[1].year.tolist()
#     start_year = min(all_years)
#     start_year_dic[journal] = start_year
# start_year_dic

In [448]:
def combine_two_cols(row, var1, var2):
    """Join two fields of ``row`` into one label such as '966 (30.5%)'.

    ``row[var1]`` is rendered with ``str`` and ``row[var2]`` as a percentage
    with one decimal place, wrapped in parentheses.
    """
    percentage = format(row[var2], '.1%')
    return f"{row[var1]!s} ({percentage})"

In [449]:
# Per-journal author counts, with an 'author' label column formatted as
# 'count (percentage)'; freq/prop are prefixed with 'a_' for the later merge.
author_num_by_j = get_simple_prop_df('authors', 'journal')
author_num_by_j['author'] = author_num_by_j.apply(
    combine_two_cols, axis=1, args=('freq', 'prop')
)
author_num_by_j = author_num_by_j.rename(
    columns = {'freq': 'a_freq', 'prop': 'a_prop'}
)
# author_num_by_j = author_num_by_j[['journal', 'author']]
# author_num_by_j

In [450]:
# Per-journal paper counts, with a 'paper' label column formatted as
# 'count (percentage)'; freq/prop are prefixed with 'p_' for the later merge.
paper_num_by_j = get_simple_prop_df('papers', 'journal')
paper_num_by_j['paper'] = paper_num_by_j.apply(
    combine_two_cols, axis=1, args=('freq', 'prop')
)
paper_num_by_j = paper_num_by_j.rename(
    columns = {'freq': 'p_freq', 'prop': 'p_prop'}
)
# paper_num_by_j = paper_num_by_j[['journal', 'paper']]
# paper_num_by_j

In [451]:
# Combine the per-journal paper and author summaries into one table, add the
# authors-per-paper ratio, and order journals by paper count (largest first).
t1 = (
    paper_num_by_j
    .merge(author_num_by_j, on = 'journal')
    .assign(
        average_author_num_per_paper = lambda d: round(d['a_freq'] / d['p_freq'], 2)
    )
    .sort_values(by = 'p_freq', ascending = False)
)
t1 = t1[['journal', 'paper', 'author', 'average_author_num_per_paper']]
# Presentation headers; the overall totals are embedded in the column names.
t1.columns = [
    'Journal',
    f'# of Papers ({total_paper_num})',
    f'# of Authors ({total_author_num})',
    'Ave. # of Authors Per Paper',
]
# t1.loc[len(t1.index)] = ['Total', total_paper_num, total_author_num, '']
t1

Unnamed: 0,Journal,# of Papers (3169),# of Authors (7083),Ave. # of Authors Per Paper
0,JOC,966 (30.5%),2481 (35.0%),2.57
1,JCMC,733 (23.1%),1791 (25.3%),2.44
2,HCR,517 (16.3%),1386 (19.6%),2.68
3,CT,494 (15.6%),786 (11.1%),1.59
4,CCC,459 (14.5%),639 (9.0%),1.39


In [452]:
# print(t1.to_latex(index=False)) 

### Total number of papers by year

In [453]:
# Display the per-year paper counts table.
yearly_papernum

Unnamed: 0,year,freq
0,2000,88
1,2001,92
2,2002,125
3,2003,112
4,2004,109
5,2005,143
6,2006,137
7,2007,152
8,2008,148
9,2009,151


In [454]:
# Line chart of the number of publications per year (titled 'a').
papernum_by_year_chart = (
    alt.Chart(transform_year(yearly_papernum))
    .mark_line()
    .encode(
        x = alt.X('year', title = 'Year'),
        y = alt.Y('freq', title = '# of Publications'),
    )
    .properties(title = 'a', height=300, width=260)
)
papernum_by_year_chart

## TS: average number of authors

In [455]:
# Join yearly paper and author counts and derive the mean number of authors
# per paper for each year (rounded to 2 decimals).
# NOTE(review): 'authornu' looks like a truncated 'authornum' — kept as-is in
# case later cells rely on the name.
yearly_ave_authornum = (
    yearly_papernum
    .merge(yearly_authornum, on = 'year')
    .rename(columns = {'freq_x': 'papernum', 'freq_y': 'authornu'})
)
yearly_ave_authornum['ave_authornum'] = round(
    yearly_ave_authornum['authornu'] / yearly_ave_authornum['papernum'], 2
)
yearly_ave_authornum

Unnamed: 0,year,papernum,authornu,ave_authornum
0,2000,88,171,1.94
1,2001,92,177,1.92
2,2002,125,263,2.1
3,2003,112,214,1.91
4,2004,109,248,2.28
5,2005,143,318,2.22
6,2006,137,311,2.27
7,2007,152,308,2.03
8,2008,148,304,2.05
9,2009,151,335,2.22


In [456]:
# Line chart of the average number of authors per paper by year (titled 'b').
yearly_ave_authornum_chart = (
    alt.Chart(transform_year(yearly_ave_authornum))
    .mark_line()
    .encode(
        x = alt.X('year', title = 'Year'),
        y = alt.Y('ave_authornum', title = 'Average # of Authors'),
    )
    .properties(title = 'b', height=300, width=260)
)
yearly_ave_authornum_chart

## Race

In [457]:
# Frequency and proportion of each `racepred` value among authors, all years pooled.
get_simple_prop_df('authors', 'racepred')

Unnamed: 0,racepred,freq,prop
0,White,5354,0.756
2,Asian,1206,0.17
1,Hispanic,286,0.04
3,Black,133,0.019
4,Middle Eastern,88,0.012
5,Indigenous,16,0.002


In [458]:
# Cross-tabulate `countrypred_new` (rows) by `racepred` (columns) and format
# each cell as 'number (percentage)'.
country_by_race = update_df(get_table(authors, 'countrypred_new', 'racepred'))
country_by_race

Unnamed: 0,Aff country,Asian,Black,Hispanic,Indigenous,Middle Eastern,White,Total
0,DE,5 (2.0%),1 (0.4%),0,0,1 (0.4%),246 (97.2%),253.0
1,GB,15 (6.7%),2 (0.9%),8 (3.6%),0,3 (1.3%),196 (87.5%),224.0
2,IL,6 (3.5%),0,3 (1.7%),0,22 (12.7%),142 (82.1%),173.0
3,NL,5 (1.4%),0,1 (0.3%),1 (0.3%),2 (0.6%),336 (97.4%),345.0
4,Other,383 (29.6%),10 (0.8%),123 (9.5%),0,17 (1.3%),761 (58.8%),1294.0
5,US,792 (16.5%),120 (2.5%),151 (3.1%),15 (0.3%),43 (0.9%),3673 (76.6%),4794.0
6,Total,1206 (17.0%),133 (1.9%),286 (4.0%),16 (0.2%),88 (1.2%),5354 (75.6%),7083.0


In [459]:
# Cross-tabulate journal (rows) by `racepred` (columns). The raw count table
# stays in j_race; only the 'number (percentage)' formatted version is displayed.
j_race = get_table(authors, 'journal', 'racepred')
update_df(j_race)

Unnamed: 0,Journal,Asian,Black,Hispanic,Indigenous,Middle Eastern,White,Total
0,CCC,107 (16.7%),31 (4.9%),35 (5.5%),3 (0.5%),13 (2.0%),450 (70.4%),639
1,CT,77 (9.8%),15 (1.9%),27 (3.4%),2 (0.3%),9 (1.1%),656 (83.5%),786
2,HCR,235 (17.0%),15 (1.1%),53 (3.8%),5 (0.4%),12 (0.9%),1066 (76.9%),1386
3,JCMC,406 (22.7%),15 (0.8%),77 (4.3%),3 (0.2%),25 (1.4%),1265 (70.6%),1791
4,JOC,381 (15.4%),57 (2.3%),94 (3.8%),3 (0.1%),29 (1.2%),1917 (77.3%),2481
5,Total,1206 (17.0%),133 (1.9%),286 (4.0%),16 (0.2%),88 (1.2%),5354 (75.6%),7083


In [460]:
# Cross-tabulate journal (rows) by `genderpred` (columns). The raw count table
# stays in j_gender; only the 'number (percentage)' formatted version is displayed.
j_gender = get_table(authors, 'journal', 'genderpred')
update_df(j_gender)

Unnamed: 0,Journal,F,M,N,Total
0,CCC,365 (57.1%),270 (42.3%),4 (0.6%),639.0
1,CT,322 (41.0%),464 (59.0%),0,786.0
2,HCR,659 (47.5%),723 (52.2%),4 (0.3%),1386.0
3,JCMC,808 (45.1%),983 (54.9%),0,1791.0
4,JOC,1121 (45.2%),1359 (54.8%),1 (0.0%),2481.0
5,Total,3275 (46.2%),3799 (53.6%),9 (0.1%),7083.0


In [473]:
# Cross-tabulate journal (rows) by `countrypred_new` (columns), format cells as
# 'number (percentage)', spell out the country codes, and fix the column order.
j_country = update_df(get_table(authors, 'journal', 'countrypred_new'))
j_country = j_country.rename(columns = {
    'DE': 'Germany',
    'GB': 'Great Britain',
    'IL': 'Israel',
    'NL': 'Netherlands',
})
j_country = j_country[[
    'Journal',
    'US',
    'Other',
    'Netherlands',
    'Germany',
    'Great Britain',
    'Israel',
    'Total',
]]
j_country

Unnamed: 0,Journal,US,Other,Netherlands,Germany,Great Britain,Israel,Total
0,CCC,437 (68.4%),114 (17.8%),6 (0.9%),7 (1.1%),54 (8.5%),21 (3.3%),639
1,CT,534 (67.9%),143 (18.2%),21 (2.7%),32 (4.1%),19 (2.4%),37 (4.7%),786
2,HCR,1086 (78.4%),149 (10.8%),78 (5.6%),32 (2.3%),19 (1.4%),22 (1.6%),1386
3,JCMC,1022 (57.1%),502 (28.0%),107 (6.0%),51 (2.8%),78 (4.4%),31 (1.7%),1791
4,JOC,1715 (69.1%),386 (15.6%),133 (5.4%),131 (5.3%),54 (2.2%),62 (2.5%),2481
5,Total,4794 (67.7%),1294 (18.3%),345 (4.9%),253 (3.6%),224 (3.2%),173 (2.4%),7083


### Race proportion by year

In [462]:
# Yearly frequency and proportion of each `racepred` value among authors.
race_ts = get_freq_and_prop('authors', 'racepred')
race_ts.head(5)

Unnamed: 0,year,racepred,freq,year total,prop
0,2000,Asian,18,171,0.105
1,2000,Black,1,171,0.006
2,2000,Hispanic,5,171,0.029
3,2000,Indigenous,1,171,0.006
4,2000,White,146,171,0.854


In [463]:
# Yearly rows for 'White', ordered from the lowest to the highest proportion.
race_ts.loc[race_ts['racepred'] == 'White'].sort_values(by="prop")

Unnamed: 0,year,racepred,freq,year total,prop
107,2019,White,180,269,0.669
118,2021,White,305,455,0.67
112,2020,White,210,299,0.702
91,2016,White,249,352,0.707
124,2022,White,145,200,0.725
96,2017,White,256,352,0.727
68,2012,White,256,348,0.736
57,2010,White,219,295,0.742
102,2018,White,275,368,0.747
62,2011,White,232,308,0.753


In [464]:
# Yearly rows for 'Hispanic', ordered from the lowest to the highest proportion.
race_ts.loc[race_ts['racepred'] == 'Hispanic'].sort_values(by="prop")

Unnamed: 0,year,racepred,freq,year total,prop
40,2007,Hispanic,2,308,0.006
7,2001,Hispanic,2,177,0.011
34,2006,Hispanic,5,311,0.016
23,2004,Hispanic,6,248,0.024
45,2008,Hispanic,8,304,0.026
11,2002,Hispanic,7,263,0.027
2,2000,Hispanic,5,171,0.029
50,2009,Hispanic,11,335,0.033
71,2013,Hispanic,12,355,0.034
17,2003,Hispanic,8,214,0.037


In [465]:
# Yearly rows for 'Indigenous', in their original (chronological) row order.
race_ts.loc[race_ts['racepred'] == 'Indigenous']

Unnamed: 0,year,racepred,freq,year total,prop
3,2000,Indigenous,1,171,0.006
12,2002,Indigenous,2,263,0.008
18,2003,Indigenous,1,214,0.005
24,2004,Indigenous,1,248,0.004
35,2006,Indigenous,1,311,0.003
51,2009,Indigenous,1,335,0.003
66,2012,Indigenous,1,348,0.003
77,2014,Indigenous,1,431,0.002
83,2015,Indigenous,1,402,0.002
89,2016,Indigenous,2,352,0.006


In [466]:
# Alphabetically sorted list of the distinct `racepred` values.
unique_races = sorted(set(race_ts.racepred))
unique_races

['Asian', 'Black', 'Hispanic', 'Indigenous', 'Middle Eastern', 'White']

In [467]:
# One hex color per entry of unique_races; the race charts pass unique_races as
# the color scale domain and this list as its range, so the two pair up by position.
race_colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f']

In [468]:
# Parse race_ts['year'] into datetimes via transform_year; this rebinds race_ts
# to the converted copy, which the race charts below read.
race_ts = transform_year(race_ts)

In [469]:
# Stacked area chart of author counts per year, colored by race (titled 'd').
race_stack_chart = (
    alt.Chart(race_ts)
    .mark_area()
    .encode(
        x = alt.X('year', title = 'Year'),
        y = alt.Y('freq', title = '# of Authors'),
        color = alt.Color(
            'racepred',
            title = 'Race',
            # fixed domain/range so each race keeps the same color across charts
            scale=alt.Scale(domain=unique_races, range=race_colors),
        ),
    )
    .properties(title = 'd', height=300, width=280)
)

race_stack_chart

In [470]:
# Line chart of each race's yearly share of authors (titled 'f').
race_prop_chart = alt.Chart(race_ts).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        # axis label shown on the figure (spelling fixed from 'Porportion')
        title = 'Proportion',
    ),
    color = alt.Color(
        'racepred',
        title = 'Race',
        # fixed domain/range so each race keeps the same color across charts
        scale=alt.Scale(domain=unique_races,
                      range=race_colors)
    )
).properties(
    title = 'f',
    height=300,
    width=280
)

race_prop_chart