In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [47]:
def get_simple_prop_df(data, var):
    # get frequency and proportion over all the years
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        df = pd.DataFrame(Counter(authors[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_author_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    else:
        df = pd.DataFrame(Counter(papers[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_paper_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    return df

In [3]:
def get_freq_and_prop(data, var):
    # get time series data
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        dic = yearly_papernum_dic
        df = papers.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    else:
        dic = yearly_authornum_dic
        df = authors.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [4]:
def transform_year(df):
    dff = df.copy()
    dff['year'] = pd.to_datetime(dff['year'] , format='%Y')
    return dff

In [5]:
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv')
papers.replace({
    True: 'Yes',
    False: 'No'
}, inplace = True)


In [6]:
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic.keys())
authors['countrypred_new'] = authors['countrypred'].apply(
        lambda x: x if x in top_country else 'Other'
    )

In [7]:
for i in [papers, authors]:
    i.replace({
        'Communication Theory': 'CT',
        'Communication, Culture and Critique': 'CCC',
        'Human Communication Research': 'HCR',
        'Journal of Communication': 'JOC',
        'Journal of Computer-Mediated Communication': 'JCMC'
                      }, inplace = True)

In [8]:
total_paper_num = len(papers)
total_author_num = len(authors)

In [9]:
total_paper_num, total_author_num

(5712, 11292)

In [10]:
yearly_papernum = papers.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum = authors.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum_dic = dict(zip(yearly_authornum.year, yearly_authornum.freq))
yearly_papernum_dic = dict(zip(yearly_papernum.year, yearly_papernum.freq))

## Gender

In [11]:
get_simple_prop_df('authors', 'genderpred')

Unnamed: 0,genderpred,freq,prop
1,M,6876,0.609
0,F,4405,0.39
2,N,11,0.001


In [12]:
gender_ts = get_freq_and_prop(data = 'authors', var = 'genderpred')
gender_ts.head(10)

Unnamed: 0,year,genderpred,freq,year total,prop
0,1951,M,24,24,1.0
1,1952,F,5,25,0.2
2,1952,M,20,25,0.8
3,1953,M,17,17,1.0
4,1954,M,16,16,1.0
5,1955,M,11,11,1.0
6,1956,M,14,14,1.0
7,1957,F,3,19,0.158
8,1957,M,16,19,0.842
9,1958,M,13,13,1.0


In [13]:
gender_ts[gender_ts.genderpred == 'F'].sort_values('year')

Unnamed: 0,year,genderpred,freq,year total,prop
1,1952,F,5,25,0.200
7,1957,F,3,19,0.158
10,1959,F,5,21,0.238
12,1960,F,3,17,0.176
14,1961,F,1,24,0.042
...,...,...,...,...,...
132,2018,F,176,368,0.478
135,2019,F,140,269,0.520
137,2020,F,141,299,0.472
140,2021,F,229,455,0.503


In [14]:
gender_ts[gender_ts.genderpred == 'F'].sort_values('prop')

Unnamed: 0,year,genderpred,freq,year total,prop
17,1963,F,1,30,0.033
14,1961,F,1,24,0.042
36,1973,F,1,18,0.056
19,1964,F,3,29,0.103
28,1969,F,4,39,0.103
...,...,...,...,...,...
142,2022,F,100,200,0.500
140,2021,F,229,455,0.503
119,2013,F,179,355,0.504
129,2017,F,180,352,0.511


In [15]:
gender_ts[gender_ts.genderpred == 'N']

Unnamed: 0,year,genderpred,freq,year total,prop
86,1997,N,1,164,0.006
101,2004,N,1,248,0.004
110,2008,N,1,304,0.003
121,2013,N,1,355,0.003
124,2014,N,1,431,0.002
131,2017,N,2,352,0.006
134,2018,N,1,368,0.003
139,2020,N,1,299,0.003
144,2022,N,2,200,0.01


In [16]:
gender_ts[gender_ts.genderpred == 'N'].freq.sum()

11

In [17]:
# gender_ts['year'] = pd.to_datetime(gender_ts['year'], format='%Y')

gender_stack_chart = alt.Chart(gender_ts).mark_bar().encode(
    x = alt.X(
        'year:N',
        axis = alt.Axis(
            tickCount= 4,
            labelAngle = -45
    )
    ),
    y = alt.Y(
        'sum(freq)',
        title = '# of Authors'
    ),
    color = alt.Color(
        'genderpred',
        title = 'Gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    )
).properties(
    title = 'a',
    height=300,
    width=260
)

gender_stack_chart

In [18]:
gender_ts_prop_chart = alt.Chart(transform_year(gender_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'genderpred',
        title = 'Gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    )
).properties(
    title = 'b',
    height=300,
    width=260
)

gender_ts_prop_chart 

## First author gender

In [30]:
get_simple_prop_df(data = 'papers', var = 'first_author_gender')

Unnamed: 0,first_author_gender,freq,prop
1,M,3699,0.648
0,F,2003,0.351
2,N,10,0.002


In [19]:
first_author_gender_ts = get_freq_and_prop(data = 'papers', var = 'first_author_gender')
first_author_gender_ts

Unnamed: 0,year,first_author_gender,freq,year total,prop
0,1951,M,20,20,1.000
1,1952,F,5,24,0.208
2,1952,M,19,24,0.792
3,1953,M,17,17,1.000
4,1954,M,15,15,1.000
...,...,...,...,...,...
137,2021,F,88,167,0.527
138,2021,M,79,167,0.473
139,2022,F,37,78,0.474
140,2022,M,40,78,0.513


In [20]:
alt.Chart(transform_year(first_author_gender_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'first_author_gender',
        title = 'First author gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    )
).properties(
    title = 'b',
    height=300,
    width=260
)


## Gender parity for white people

In [23]:
white_authors = authors[authors.racepred == 'White']
white_authors.head(2)

Unnamed: 0,authorID,doi,url,year,title,journal,numberOfAuthors,authorPosition,authorFullName,firstName,...,affiliation.2,ROR_AFFNAME,matchMethod,ROR_ID,type,countrypred,genderpred,racepred,afftypepred,countrypred_new
0,10.1093/joc/jqac004+1.0,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/...,2022,The Gender Divide in Wikipedia: Quantifying an...,JOC,2.0,1.0,Isabelle Langrock,Isabelle,...,"Annenberg School for Communication, University...",University of Pennsylvania,Exact,https://ror.org/00b30xv10,R,US,F,White,Education,US
2,10.1093/joc/jqac009+1.0,10.1093/joc/jqac009,https://academic.oup.com/joc/article/72/3/322/...,2022,Mapping Exposure Diversity: The Divergent Effe...,JOC,2.0,1.0,Pascal Jürgens,Pascal,...,"Department of Communication, Jakob-Welder-Weg ...",Johannes Gutenberg University of Mainz,API_QUERY,https://ror.org/023b0x485,R,DE,M,White,Education,DE


In [39]:
Counter(white_authors.genderpred)

Counter({'F': 3430, 'M': 5869, 'N': 5})

In [44]:
3430/white_authors.shape[0]

0.36865864144453997

In [45]:
5869/white_authors.shape[0]

0.6308039552880481

In [27]:
def get_freq_and_prop_for_white_authors(data, var):
    dic = yearly_authornum_dic
    df = data.groupby(['year', var]).size().to_frame('freq').reset_index()
    df['year total'] = [dic[x] for x in df.year]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [28]:
white_gender_ts = get_freq_and_prop_for_white_authors(data = white_authors, var = 'genderpred')
white_gender_ts.head(10)

Unnamed: 0,year,genderpred,freq,year total,prop
0,1951,M,24,24,1.0
1,1952,F,3,25,0.12
2,1952,M,20,25,0.8
3,1953,M,14,17,0.824
4,1954,M,16,16,1.0
5,1955,M,11,11,1.0
6,1956,M,14,14,1.0
7,1957,F,3,19,0.158
8,1957,M,14,19,0.737
9,1958,M,13,13,1.0


In [29]:
alt.Chart(transform_year(white_gender_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'genderpred',
        title = 'White author gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    )
).properties(
    title = 'b',
    height=300,
    width=260
)