In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of ``var`` occurs over all years.

    Parameters
    ----------
    data : {'papers', 'authors'}
        Name of the module-level DataFrame to summarise.
    var : str
        Column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``var``, ``freq`` (count of each value) and ``prop``
        (``freq`` divided by the number of rows in the chosen frame, rounded
        to 3 decimals), sorted by ``prop`` in descending order.

    Raises
    ------
    AssertionError
        If ``data`` is neither ``'papers'`` nor ``'authors'``.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    # Resolve the frame once so both datasets share a single code path.
    source = authors if data == 'authors' else papers
    df = pd.DataFrame(Counter(source[var]).items(), columns=[var, 'freq'])
    # Use the frame's own length as the denominator instead of a separately
    # maintained global total, which may be undefined or out of date when
    # cells are run out of order.
    df['prop'] = round(df['freq'] / len(source), 3)
    df.sort_values('prop', ascending=False, inplace=True)
    return df

In [4]:
def get_freq_and_prop(data, var):
    """Build a per-year frequency/proportion table for ``var``.

    Parameters
    ----------
    data : {'papers', 'authors'}
        Name of the module-level DataFrame to summarise.
    var : str
        Column to break each year down by.

    Returns
    -------
    pandas.DataFrame
        One row per (``year``, ``var``) combination with columns ``year``,
        ``var``, ``freq`` (rows in that combination), ``year total`` (all rows
        of the chosen frame in that year) and ``prop`` (``freq`` divided by
        ``year total``, rounded to 3 decimals).

    Raises
    ------
    AssertionError
        If ``data`` is neither ``'papers'`` nor ``'authors'``.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    # Resolve the frame once so both datasets share a single code path.
    source = papers if data == 'papers' else authors
    df = source.groupby(['year', var]).size().to_frame('freq').reset_index()
    # Derive the yearly totals from the same frame rather than from a global
    # lookup dict built in another cell, so the denominators can never be
    # missing or stale. Every row of the year is counted, including rows whose
    # ``var`` is missing (those are dropped from ``freq`` by the grouping).
    year_totals = source.groupby('year').size()
    df['year total'] = df['year'].map(year_totals)
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [5]:
def transform_year(df):
    """Return a copy of ``df`` whose ``year`` column is parsed as datetimes.

    Years are parsed with the ``'%Y'`` format; the input frame is not modified.
    """
    year_as_datetime = pd.to_datetime(df['year'], format='%Y')
    return df.assign(year=year_as_datetime)

In [6]:
# Load the author-level and paper-level tables. In the paper table every
# True/False value is replaced by the labels 'Yes'/'No'.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv').replace(
    {True: 'Yes', False: 'No'}
)


In [7]:
# Keep the `top_num` most frequent `countrypred` values and collapse every
# other value into the single label 'Other'.
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic)
authors['countrypred_new'] = [
    country if country in top_country else 'Other'
    for country in authors['countrypred']
]

In [8]:
# Replace the full journal names with their abbreviations wherever they occur
# in either table.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
papers = papers.replace(journal_abbreviations)
authors = authors.replace(journal_abbreviations)

In [9]:
# Overall number of rows in each table.
total_paper_num = papers.shape[0]
total_author_num = authors.shape[0]

In [10]:
# Display both totals as a (papers, authors) tuple.
total_paper_num, total_author_num

(5712, 11292)

In [11]:
# Rows per year for each table: first as a (year, freq) frame, then as a
# year -> count lookup dict.
yearly_papernum, yearly_authornum = (
    frame.groupby('year').size().to_frame(name='freq').reset_index()
    for frame in (papers, authors)
)
yearly_authornum_dic = {
    year: freq
    for year, freq in zip(yearly_authornum['year'], yearly_authornum['freq'])
}
yearly_papernum_dic = {
    year: freq
    for year, freq in zip(yearly_papernum['year'], yearly_papernum['freq'])
}

## Gender

In [12]:
# Overall frequency and share of each `genderpred` value among author rows.
get_simple_prop_df(data='authors', var='genderpred')

Unnamed: 0,genderpred,freq,prop
1,M,6876,0.609
0,F,4405,0.39
2,N,11,0.001


In [13]:
# Per-year counts and within-year proportions of each `genderpred` value.
gender_ts = get_freq_and_prop('authors', 'genderpred')
gender_ts.head(n=10)

Unnamed: 0,year,genderpred,freq,year total,prop
0,1951,M,24,24,1.0
1,1952,F,5,25,0.2
2,1952,M,20,25,0.8
3,1953,M,17,17,1.0
4,1954,M,16,16,1.0
5,1955,M,11,11,1.0
6,1956,M,14,14,1.0
7,1957,F,3,19,0.158
8,1957,M,16,19,0.842
9,1958,M,13,13,1.0


In [14]:
# Rows for genderpred == 'F', in chronological order.
gender_ts.query("genderpred == 'F'").sort_values(by='year')

Unnamed: 0,year,genderpred,freq,year total,prop
1,1952,F,5,25,0.2
7,1957,F,3,19,0.158
10,1959,F,5,21,0.238
12,1960,F,3,17,0.176
14,1961,F,1,24,0.042
17,1963,F,1,30,0.033
19,1964,F,3,29,0.103
21,1965,F,2,19,0.105
24,1967,F,1,9,0.111
26,1968,F,7,41,0.171


In [15]:
# The ten most recent years for genderpred == 'F'.
gender_ts.query("genderpred == 'F'").sort_values(by='year').tail(n=10)

Unnamed: 0,year,genderpred,freq,year total,prop
119,2013,F,179,355,0.504
122,2014,F,197,431,0.457
125,2015,F,176,402,0.438
127,2016,F,158,352,0.449
129,2017,F,180,352,0.511
132,2018,F,176,368,0.478
135,2019,F,140,269,0.52
137,2020,F,141,299,0.472
140,2021,F,229,455,0.503
142,2022,F,100,200,0.5


In [16]:
# Rows for genderpred == 'F', ordered from the lowest yearly proportion upwards.
gender_ts.query("genderpred == 'F'").sort_values(by='prop')

Unnamed: 0,year,genderpred,freq,year total,prop
17,1963,F,1,30,0.033
14,1961,F,1,24,0.042
36,1973,F,1,18,0.056
19,1964,F,3,29,0.103
28,1969,F,4,39,0.103
21,1965,F,2,19,0.105
32,1971,F,4,37,0.108
24,1967,F,1,9,0.111
7,1957,F,3,19,0.158
26,1968,F,7,41,0.171


In [17]:
# Years that contain at least one author row with genderpred == 'N'.
gender_ts.query("genderpred == 'N'")

Unnamed: 0,year,genderpred,freq,year total,prop
86,1997,N,1,164,0.006
101,2004,N,1,248,0.004
110,2008,N,1,304,0.003
121,2013,N,1,355,0.003
124,2014,N,1,431,0.002
131,2017,N,2,352,0.006
134,2018,N,1,368,0.003
139,2020,N,1,299,0.003
144,2022,N,2,200,0.01


In [18]:
# Total number of author rows with genderpred == 'N' across all years.
gender_ts.query("genderpred == 'N'")['freq'].sum()

11

In [19]:
# Stacked bars of author counts per year, split by `genderpred`.
# `year:N` encodes year as a nominal field, so no datetime conversion is
# applied for this chart.
gender_stack_chart = (
    alt.Chart(gender_ts)
    .mark_bar()
    .encode(
        x=alt.X('year:N', axis=alt.Axis(tickCount=4, labelAngle=-45)),
        y=alt.Y('sum(freq)', title='# of Authors'),
        color=alt.Color(
            'genderpred',
            title='Gender',
            scale=alt.Scale(
                domain=['M', 'F', 'N'],
                range=['steelblue', 'orange', 'pink'],
            ),
        ),
    )
    .properties(title='a', height=300, width=260)
)

gender_stack_chart

In [20]:
# Line chart of the yearly proportion of each `genderpred` value; `year` is
# converted to datetimes first via transform_year.
gender_ts_prop_chart = (
    alt.Chart(transform_year(gender_ts))
    .mark_line()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('prop', title='Proportion'),
        color=alt.Color(
            'genderpred',
            title='Gender',
            scale=alt.Scale(
                domain=['M', 'F', 'N'],
                range=['steelblue', 'orange', 'pink'],
            ),
        ),
    )
    .properties(title='b', height=300, width=260)
)

gender_ts_prop_chart

## First author gender

In [21]:
# Overall frequency and share of each `first_author_gender` value among papers.
get_simple_prop_df('papers', 'first_author_gender')

Unnamed: 0,first_author_gender,freq,prop
1,M,3699,0.648
0,F,2003,0.351
2,N,10,0.002


In [22]:
# Per-year counts and within-year proportions of each `first_author_gender` value.
first_author_gender_ts = get_freq_and_prop('papers', 'first_author_gender')
first_author_gender_ts

Unnamed: 0,year,first_author_gender,freq,year total,prop
0,1951,M,20,20,1.0
1,1952,F,5,24,0.208
2,1952,M,19,24,0.792
3,1953,M,17,17,1.0
4,1954,M,15,15,1.0
5,1955,M,11,11,1.0
6,1956,M,13,13,1.0
7,1957,F,2,16,0.125
8,1957,M,14,16,0.875
9,1958,M,13,13,1.0


In [23]:
# Line chart of the yearly proportion of papers by `first_author_gender`.
(
    alt.Chart(transform_year(first_author_gender_ts))
    .mark_line()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('prop', title='Proportion'),
        color=alt.Color(
            'first_author_gender',
            title='First author gender',
            scale=alt.Scale(
                domain=['M', 'F', 'N'],
                range=['steelblue', 'orange', 'pink'],
            ),
        ),
    )
    .properties(title='b', height=300, width=260)
)


## Gender parity for white people

In [24]:
# (rows, columns) of the full author table.
authors.shape

(11292, 44)

In [25]:
# Keep only the author rows whose `racepred` equals 'White'.
white_authors = authors.query("racepred == 'White'")
white_authors.head(n=2)

Unnamed: 0,authorID,doi,url,year,title,journal,numberOfAuthors,authorPosition,authorFullName,firstName,lastName,affiliation,gscholarLink,googleSearch,genderize,genderize_prob,genderize_basedon,genderAccuracy,authorFullName.1,firstName.1,lastName.1,affiliation.1,gscholarLink.1,googleSearch.1,race,racePredAccuracy,api,black,hispanic,white,raceHighest,raceSecondHighest,raceDiff,affProcessed,affiliation.2,ROR_AFFNAME,matchMethod,ROR_ID,type,countrypred,genderpred,racepred,afftypepred,countrypred_new
0,10.1093/joc/jqac004+1.0,10.1093/joc/jqac004,https://academic.oup.com/joc/article/72/3/297/6529454,2022,The Gender Divide in Wikipedia: Quantifying and Assessing the Impact of Two Feminist Interventions,JOC,2.0,1.0,Isabelle Langrock,Isabelle,Langrock,"Annenberg School for Communication, University of Pennsylvania , Philadelphia, PA 19104, USA",https://scholar.google.com/scholar?hl=en&as_sdt=0%252C50&q=Isabelle+Langrock,https://www.google.com/search?q=Isabelle+Langrock+annenberg+school+for+communication+university+of+pennsylvania+philadelphia+pa+usa,female,0.99,89728.0,High,Isabelle Langrock,Isabelle,Langrock,"Annenberg School for Communication, University of Pennsylvania , Philadelphia, PA 19104, USA",https://scholar.google.com/scholar?hl=en&as_sdt=0%252C50&q=Isabelle+Langrock,https://www.google.com/search?q=Isabelle+Langrock+annenberg+school+for+communication+university+of+pennsylvania+philadelphia+pa+usa,white,High,0.007762,0.066429,0.030049,0.89576,0.89576,0.066429,0.829331,annenberg school for communication university of pennsylvania philadelphia pa usa,"Annenberg School for Communication, University of Pennsylvania , Philadelphia, PA 19104, USA",University of Pennsylvania,Exact,https://ror.org/00b30xv10,R,US,F,White,Education,US
2,10.1093/joc/jqac009+1.0,10.1093/joc/jqac009,https://academic.oup.com/joc/article/72/3/322/6549217,2022,Mapping Exposure Diversity: The Divergent Effects of Algorithmic Curation on News Consumption,JOC,2.0,1.0,Pascal Jürgens,Pascal,Jürgens,"Department of Communication, Jakob-Welder-Weg 12, Johannes Gutenberg-Universität Mainz , 55099 Mainz, Rhineland-Palatinate, Germany",https://scholar.google.com/scholar?hl=en&as_sdt=0%252C50&q=Pascal+Jürgens,https://www.google.com/search?q=Pascal+J%C3%BCrgens+department+of+communication+jakob+welder+weg+johannes+gutenberg+universit+t+mainz+mainz+rhineland+palatinate+germany,male,0.99,84147.0,High,Pascal Jürgens,Pascal,Jürgens,"Department of Communication, Jakob-Welder-Weg 12, Johannes Gutenberg-Universität Mainz , 55099 Mainz, Rhineland-Palatinate, Germany",https://scholar.google.com/scholar?hl=en&as_sdt=0%252C50&q=Pascal+Jürgens,https://www.google.com/search?q=Pascal+J%C3%BCrgens+department+of+communication+jakob+welder+weg+johannes+gutenberg+universit+t+mainz+mainz+rhineland+palatinate+germany,white,High,0.007112,0.088763,0.031022,0.873102,0.873102,0.088763,0.78434,department of communication jakob welder weg johannes gutenberg universit t mainz mainz rhineland palatinate germany,"Department of Communication, Jakob-Welder-Weg 12, Johannes Gutenberg-Universität Mainz , 55099 Mainz, Rhineland-Palatinate, Germany",Johannes Gutenberg University of Mainz,API_QUERY,https://ror.org/023b0x485,R,DE,M,White,Education,DE


In [26]:
# (rows, columns) of the subset with racepred == 'White'.
white_authors.shape

(9304, 44)

In [27]:
# Number of rows per `genderpred` value within the White-author subset.
Counter(white_authors['genderpred'])

Counter({'F': 3430, 'M': 5869, 'N': 5})

In [28]:
# Share of White-author rows with genderpred == 'F'. The numerator is counted
# from the data instead of being copied by hand from an earlier output, so it
# cannot go stale when the input changes.
Counter(white_authors.genderpred)['F'] / white_authors.shape[0]

0.36865864144453997

In [29]:
# Share of White-author rows with genderpred == 'M'. The numerator is counted
# from the data instead of being copied by hand from an earlier output, so it
# cannot go stale when the input changes.
Counter(white_authors.genderpred)['M'] / white_authors.shape[0]

0.6308039552880481

In [30]:
# Rows per year within the White-author subset: as a (year, freq) frame and
# as a year -> count lookup dict.
yearly_white_authornum = (
    white_authors.groupby('year').size().to_frame(name='freq').reset_index()
)
yearly_white_authornum_dic = {
    year: freq
    for year, freq in zip(
        yearly_white_authornum['year'], yearly_white_authornum['freq']
    )
}

In [31]:
def get_freq_and_prop_for_white_authors(data, var):
    """Build a per-year frequency/proportion table of ``var`` for ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise; must contain a ``year`` column and ``var``.
    var : str
        Column to break each year down by.

    Returns
    -------
    pandas.DataFrame
        One row per (``year``, ``var``) combination with columns ``year``,
        ``var``, ``freq`` (rows in that combination), ``year total`` (all rows
        of ``data`` in that year) and ``prop`` (``freq`` divided by
        ``year total``, rounded to 3 decimals).
    """
    # Take the yearly denominators from the frame that was actually passed in.
    # Previously they always came from a global dict built from the
    # White-author subset, so any other ``data`` produced wrong proportions or
    # a KeyError for years missing from that dict.
    year_totals = data.groupby('year').size()
    df = data.groupby(['year', var]).size().to_frame('freq').reset_index()
    df['year total'] = df['year'].map(year_totals)
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [32]:
# Per-year counts and proportions of each `genderpred` value among White authors.
white_gender_ts = get_freq_and_prop_for_white_authors(white_authors, 'genderpred')
white_gender_ts.head(n=10)

Unnamed: 0,year,genderpred,freq,year total,prop
0,1951,M,24,24,1.0
1,1952,F,3,23,0.13
2,1952,M,20,23,0.87
3,1953,M,14,14,1.0
4,1954,M,16,16,1.0
5,1955,M,11,11,1.0
6,1956,M,14,14,1.0
7,1957,F,3,17,0.176
8,1957,M,14,17,0.824
9,1958,M,13,13,1.0


In [33]:
# Line chart of the yearly proportion of each `genderpred` value among White authors.
(
    alt.Chart(transform_year(white_gender_ts))
    .mark_line()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('prop', title='Proportion'),
        color=alt.Color(
            'genderpred',
            title='White author gender',
            scale=alt.Scale(
                domain=['M', 'F', 'N'],
                range=['steelblue', 'orange', 'pink'],
            ),
        ),
    )
    .properties(title='b', height=300, width=260)
)