In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [2]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of ``var`` occurs across all years.

    Parameters
    ----------
    data : str
        Either ``'papers'`` or ``'authors'``; selects which module-level
        frame (``papers`` / ``authors``) and which module-level total
        (``total_paper_num`` / ``total_author_num``) are used.
    var : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``[var, 'freq', 'prop']``, where ``prop`` is ``freq``
        divided by the selected total and rounded to 3 decimals. Rows are
        ordered by ``prop``, largest first.

    Raises
    ------
    AssertionError
        If ``data`` is neither ``'papers'`` nor ``'authors'``.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'

    # Pick the frame and its matching denominator once, so the counting
    # logic below is shared by both cases.
    if data == 'authors':
        source, total = authors, total_author_num
    else:
        source, total = papers, total_paper_num

    counts = Counter(source[var])
    table = pd.DataFrame(counts.items(), columns=[var, 'freq'])
    table['prop'] = (table['freq'] / total).round(3)
    return table.sort_values('prop', ascending=False)

In [3]:
def get_freq_and_prop(data, var):
    """Build a per-year frequency/proportion table for ``var``.

    Parameters
    ----------
    data : str
        Either ``'papers'`` or ``'authors'``; selects the module-level frame
        (``papers`` / ``authors``) and the matching year -> total lookup
        (``yearly_papernum_dic`` / ``yearly_authornum_dic``).
    var : str
        Name of the column that is grouped together with ``year``.

    Returns
    -------
    pandas.DataFrame
        One row per ``(year, var)`` combination with columns
        ``['year', var, 'freq', 'year total', 'prop']``; ``prop`` is
        ``freq / year total`` rounded to 3 decimals.

    Raises
    ------
    AssertionError
        If ``data`` is neither ``'papers'`` nor ``'authors'``.
    KeyError
        If a year in the grouped table is missing from the lookup dict.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'

    # Resolve the frame and its per-year denominators up front.
    if data == 'papers':
        source, totals_by_year = papers, yearly_papernum_dic
    else:
        source, totals_by_year = authors, yearly_authornum_dic

    group_sizes = source.groupby(['year', var]).size()
    table = group_sizes.to_frame('freq').reset_index()
    table['year total'] = [totals_by_year[year] for year in table['year']]
    table['prop'] = (table['freq'] / table['year total']).round(3)
    return table

In [4]:
def transform_year(df):
    """Return a copy of ``df`` whose ``year`` column is parsed as datetimes.

    The ``year`` values are interpreted with the ``'%Y'`` format. The input
    frame itself is not modified.
    """
    parsed_year = pd.to_datetime(df['year'], format='%Y')
    return df.assign(year=parsed_year)

In [5]:
# Load the processed author-level and paper-level tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')

# In the papers table only, boolean values are recoded to 'Yes' / 'No'.
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv').replace(
    {True: 'Yes', False: 'No'}
)


In [6]:
# Keep the `top_num` most frequent predicted countries; every other value of
# `countrypred` is collapsed into 'Other' in the new `countrypred_new` column.
top_num = 5
top_country_dic = dict(
    Counter(authors['countrypred']).most_common(top_num)
)
top_country = list(top_country_dic)
authors['countrypred_new'] = authors['countrypred'].apply(
    lambda country: country if country in top_country else 'Other'
)

In [7]:
# Full journal titles -> short codes, applied in place to both tables.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
for frame in (papers, authors):
    frame.replace(journal_abbreviations, inplace=True)

In [8]:
# Overall row counts; used as denominators by get_simple_prop_df.
total_paper_num, total_author_num = len(papers), len(authors)

In [9]:
# Display both totals as a (papers, authors) tuple.
total_paper_num, total_author_num

(3169, 7083)

In [10]:
# Number of rows per year in each table, plus year -> count lookup dicts
# (read by get_freq_and_prop as the per-year denominators).
yearly_papernum = (
    papers.groupby('year').size().to_frame(name='freq').reset_index()
)
yearly_papernum_dic = dict(
    zip(yearly_papernum['year'], yearly_papernum['freq'])
)

yearly_authornum = (
    authors.groupby('year').size().to_frame(name='freq').reset_index()
)
yearly_authornum_dic = dict(
    zip(yearly_authornum['year'], yearly_authornum['freq'])
)

## Gender

In [15]:
# Overall frequency and proportion of each predicted gender among authors.
get_simple_prop_df(data='authors', var='genderpred')

Unnamed: 0,genderpred,freq,prop
1,M,3799,0.536
0,F,3275,0.462
2,N,9,0.001


In [16]:
# Per-year frequency and proportion of each predicted gender among authors.
gender_ts = get_freq_and_prop('authors', 'genderpred')
gender_ts.head(n=10)

Unnamed: 0,year,genderpred,freq,year total,prop
0,2000,F,74,171,0.433
1,2000,M,97,171,0.567
2,2001,F,80,177,0.452
3,2001,M,97,177,0.548
4,2002,F,118,263,0.449
5,2002,M,145,263,0.551
6,2003,F,97,214,0.453
7,2003,M,117,214,0.547
8,2004,F,109,248,0.44
9,2004,M,138,248,0.556


In [18]:
# Rows for genderpred == 'F', ordered from lowest to highest yearly proportion.
gender_ts.loc[gender_ts['genderpred'] == 'F'].sort_values(by='prop')

Unnamed: 0,year,genderpred,freq,year total,prop
15,2007,F,124,308,0.403
17,2008,F,129,304,0.424
0,2000,F,74,171,0.433
34,2015,F,175,402,0.435
26,2012,F,152,348,0.437
8,2004,F,109,248,0.44
13,2006,F,138,311,0.444
36,2016,F,157,352,0.446
20,2009,F,150,335,0.448
4,2002,F,118,263,0.449


In [19]:
# Year-level rows where genderpred is 'N'.
gender_ts.loc[gender_ts['genderpred'] == 'N']

Unnamed: 0,year,genderpred,freq,year total,prop
10,2004,N,1,248,0.004
19,2008,N,1,304,0.003
30,2013,N,1,355,0.003
33,2014,N,1,431,0.002
40,2017,N,2,352,0.006
43,2018,N,1,368,0.003
48,2020,N,1,299,0.003
53,2022,N,1,200,0.005


In [12]:
# Total number of author rows with genderpred == 'N' across all years.
gender_ts.loc[gender_ts['genderpred'] == 'N', 'freq'].sum()

9

In [13]:
# Stacked bar chart (panel 'a'): author counts per year, coloured by gender.
# `year` is encoded as nominal (`year:N`), so `gender_ts` is passed in without
# any datetime conversion of the year column.
gender_stack_chart = (
    alt.Chart(gender_ts)
    .mark_bar()
    .encode(
        alt.X('year:N', axis=alt.Axis(tickCount=4, labelAngle=-45)),
        alt.Y('sum(freq)', title='# of Authors'),
        alt.Color(
            'genderpred',
            title='Gender',
            scale=alt.Scale(
                domain=['M', 'F', 'N'],
                range=['steelblue', 'orange', 'pink'],
            ),
        ),
    )
    .properties(title='a', height=300, width=260)
)

gender_stack_chart

In [14]:
# Line chart (panel 'b'): yearly proportion of each gender. The year column is
# converted to datetimes by transform_year on a copy before plotting.
gender_ts_prop_chart = (
    alt.Chart(transform_year(gender_ts))
    .mark_line()
    .encode(
        alt.X('year', title='Year'),
        alt.Y('prop', title='Proportion'),
        alt.Color(
            'genderpred',
            title='Gender',
            scale=alt.Scale(
                domain=['M', 'F', 'N'],
                range=['steelblue', 'orange', 'pink'],
            ),
        ),
    )
    .properties(title='b', height=300, width=260)
)

gender_ts_prop_chart