In [421]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [422]:
def get_simple_prop_df(data, var):
    # get frequency and proportion over all the years
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        df = pd.DataFrame(Counter(authors[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_author_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    else:
        df = pd.DataFrame(Counter(papers[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_paper_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    return df

In [423]:
def get_freq_and_prop(data, var):
    # get time series data
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        dic = yearly_papernum_dic
        df = papers.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    else:
        dic = yearly_authornum_dic
        df = authors.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [424]:
def transform_year(df):
    dff = df.copy()
    dff['year'] = pd.to_datetime(dff['year'] , format='%Y')
    return dff

In [425]:
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv')
papers.replace({
    True: 'Yes',
    False: 'No'
}, inplace = True)

In [426]:
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic.keys())
authors['countrypred_new'] = authors['countrypred'].apply(
        lambda x: x if x in top_country else 'Other'
    )

In [427]:
for i in [papers, authors]:
    i.replace({
        'Communication Theory': 'CT',
        'Communication, Culture and Critique': 'CCC',
        'Human Communication Research': 'HCR',
        'Journal of Communication': 'JOC',
        'Journal of Computer-Mediated Communication': 'JCMC'
                      }, inplace = True)

In [428]:
total_paper_num = len(papers)
total_author_num = len(authors)

In [429]:
total_paper_num, total_author_num

(5712, 11292)

In [430]:
yearly_papernum = papers.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum = authors.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum_dic = dict(zip(yearly_authornum.year, yearly_authornum.freq))
yearly_papernum_dic = dict(zip(yearly_papernum.year, yearly_papernum.freq))

In [431]:
def get_table(df, var1, var2):
    '''this functin gets a table: var1 (row) by var 2 (col)
    Input:
        - df: authors
        - var1: most likely 'journal'
        - var2: 'racepred', 'genderpred', 'countrypred_new'
    '''
    dff = df.groupby([var1, var2]).size().to_frame('freq').reset_index()
    dff = dff.pivot(index=var1, columns=var2, values='freq')
    dff.fillna(0, inplace = True)
    dff['Total'] = dff.sum(axis=1)
    rowsum = dff.sum(axis = 0).tolist()
    dff.loc['Total'] = rowsum
    return dff

In [432]:
def update_row(row):
    '''this function divide each cell by the row total
    '''
    row_data = row.tolist()
    # get total
    total = row_data[-1]
    # remove total
    row_data = row_data[:-1]
    prop = ["{0:.1%}".format(x / total) for x in row_data]
    lst = []
    for i in range(len(row_data)):
        if row_data[i] > 0:
            string = str(int(row_data[i])) + ' ' +  '(' + prop[i] + ')'
        else:
            string = str(0)
        lst.append(string)
    lst.append(total)
    return lst

In [433]:
def update_df(df):
    '''This function updates df so that each cell is in the format of 'number (percentage)'
    Input:
        - df: j_race, j_gender, j_country
    '''
    tuples = []
    for index, row in df.iterrows():
        lst = update_row(row)
        lst.insert(0, index)
        tuples.append(tuple(lst))
    colnames = df.columns.tolist()
    if df.index.name == 'journal':
        colnames.insert(0, 'Journal')
    else:
        colnames.insert(0, 'Aff country')
    df = pd.DataFrame(tuples, columns = colnames)
    return df

In [434]:
yearly_papernum

Unnamed: 0,year,freq
0,1951,20
1,1952,24
2,1953,17
3,1954,15
4,1955,11
5,1956,13
6,1957,16
7,1958,13
8,1959,17
9,1960,16


In [435]:
papernum_by_year_chart = alt.Chart(transform_year(yearly_papernum)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications'
    )
).properties(
    title = 'a',
    height=300,
    width=260
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

papernum_by_year_chart

In [436]:
yearly_ave_authornum = yearly_papernum.merge(yearly_authornum, on = 'year')
yearly_ave_authornum['ave_authornum'] = round(yearly_ave_authornum[
    'freq_y'] / yearly_ave_authornum['freq_x'], 2)
yearly_ave_authornum.columns = ['year', 'papernum', 'authornu', 'ave_authornum']
yearly_ave_authornum

Unnamed: 0,year,papernum,authornu,ave_authornum
0,1951,20,24,1.2
1,1952,24,25,1.04
2,1953,17,17,1.0
3,1954,15,16,1.07
4,1955,11,11,1.0
5,1956,13,14,1.08
6,1957,16,19,1.19
7,1958,13,13,1.0
8,1959,17,21,1.24
9,1960,16,17,1.06


In [437]:
yearly_ave_authornum_chart = alt.Chart(transform_year(yearly_ave_authornum)).mark_line().encode(
    
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'ave_authornum',
        title = 'Average # of Authors'
    )
).properties(
    title = 'b',
    height=300,
    width=260
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

yearly_ave_authornum_chart

In [438]:
race_ts = get_freq_and_prop(data = 'authors', var = 'racepred')
race_ts.head()

Unnamed: 0,year,racepred,freq,year total,prop
0,1951,White,24,24,1.0
1,1952,Asian,2,25,0.08
2,1952,White,23,25,0.92
3,1953,Asian,1,17,0.059
4,1953,Black,1,17,0.059


In [439]:
unique_races = list(set(race_ts.racepred))
unique_races.sort()
unique_races

['Asian', 'Black', 'Hispanic', 'Indigenous', 'Middle Eastern', 'White']

In [440]:
race_colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f']

In [441]:
race_color_dict = dict(zip(unique_races, race_colors))

In [442]:
race_prop_chart = alt.Chart(transform_year(race_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        title = 'Porportion',
    ),
    color = alt.Color(
        'racepred',
        title = 'Race',
        scale=alt.Scale(domain=['White', 'Asian', 'Hispanic', 'Black', 'Middle Eastern', 'Indigenous'],
                      range=[race_color_dict[x] for x in [
                          'White', 'Asian', 'Hispanic', 'Black', 'Middle Eastern', 'Indigenous']]),
    )
).properties(
    title = 'c',
    height=300,
    width=280
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

race_prop_chart

In [443]:
gender_ts = get_freq_and_prop(data = 'authors', var = 'genderpred')
gender_ts.head(10)

Unnamed: 0,year,genderpred,freq,year total,prop
0,1951,M,24,24,1.0
1,1952,F,5,25,0.2
2,1952,M,20,25,0.8
3,1953,M,17,17,1.0
4,1954,M,16,16,1.0
5,1955,M,11,11,1.0
6,1956,M,14,14,1.0
7,1957,F,3,19,0.158
8,1957,M,16,19,0.842
9,1958,M,13,13,1.0


In [444]:
gender_ts_prop_chart = alt.Chart(transform_year(gender_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'genderpred',
        title = 'Gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    )
).properties(
    title = 'd',
    height=300,
    width=260
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

gender_ts_prop_chart 

In [445]:
chart10 = alt.hconcat(
    papernum_by_year_chart,
    yearly_ave_authornum_chart,
    race_prop_chart,
    gender_ts_prop_chart,
).resolve_scale(
    color='independent',
).configure_axis(
    labelFontSize=13,
    titleFontSize=20
).configure_legend(
    titleFontSize=16,
    labelFontSize=16
).configure_title(
    anchor='start',
    fontSize=22
).configure_point(
    size=100
)

chart10

In [446]:
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic.keys())
top_country

['US', 'NL', 'GB', 'DE', 'IL']

In [447]:
authors['countrypred_new'] = authors['countrypred'].apply(
        lambda x: x if x in top_country else 'Other'
    )

In [448]:
country_prop = get_simple_prop_df('authors', 'countrypred_new')
country_prop.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'Great Britain',
                    'IL': 'Israel'}, inplace = True)
country_prop

Unnamed: 0,countrypred_new,freq,prop
0,US,8467,0.75
2,Other,1660,0.147
5,Netherlands,365,0.032
3,Great Britain,297,0.026
1,Germany,276,0.024
4,Israel,227,0.02


In [449]:
countries = country_prop.countrypred_new.tolist()
country_colors = ['grey', 'orange', 'red', 'teal', 'pink', 'purple']

In [450]:
country_dist_chart = alt.Chart(country_prop).mark_bar().encode(
    x = alt.X(
        'countrypred_new',
        title = None,
        sort = '-y',
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors'
    ),
    color = alt.Color(
        'countrypred_new',
        title = 'Country/Region',
        legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    )
).properties(
    title = 'a',
    height=300,
    width=140
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

country_dist_chart

In [451]:
country_ts = get_freq_and_prop('authors', 'countrypred_new')
country_ts.columns = ['year', 'country', 'freq', 'year total', 'prop']
country_ts.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'Great Britain',
                    'IL': 'Israel'}, inplace = True)
country_ts

Unnamed: 0,year,country,freq,year total,prop
0,1951,US,24,24,1.0
1,1952,Other,2,25,0.08
2,1952,US,23,25,0.92
3,1953,Other,2,17,0.118
4,1953,US,15,17,0.882
5,1954,US,16,16,1.0
6,1955,Other,1,11,0.091
7,1955,US,10,11,0.909
8,1956,US,14,14,1.0
9,1957,US,19,19,1.0


In [452]:
line = alt.Chart(transform_year(country_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'country',
        legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    ),
)

points = line.mark_point().encode(
    color = alt.Color(
        'country',
        title = 'Country/Region',
#         legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    ),
    shape = alt.Shape(
        'country',
#         legend = None,
        scale = alt.Scale(
            domain = countries
        )
    )
)

country_prop_ts_chart = alt.layer(
    line,
    points,
).resolve_scale(
    color = 'independent',
    shape = 'independent'
).properties(
    title = 'b',
    height=300,
    width=300
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

country_prop_ts_chart

In [453]:
with_us_prop = get_simple_prop_df('papers', 'with_us_authors')
with_us_prop

Unnamed: 0,with_us_authors,freq,prop
0,Yes,4523,0.792
1,No,1189,0.208


In [454]:
source = with_us_prop
with_us_chart = alt.Chart(source).mark_bar().encode(
    x = alt.X(
        'with_us_authors',
        title = 'With US-based authors',
        sort = '-y',
        axis = alt.Axis(labelAngle = 0)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications'
    ),
    color = alt.Color(
        'with_us_authors',
        legend = None,
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    )
).properties(
    title = 'c',
    height=200,
    width=100
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

with_us_chart

In [455]:
with_us_prop_ts = get_freq_and_prop('papers', 'with_us_authors')
with_us_prop_ts.head()

Unnamed: 0,year,with_us_authors,freq,year total,prop
0,1951,Yes,20,20,1.0
1,1952,No,2,24,0.083
2,1952,Yes,22,24,0.917
3,1953,No,2,17,0.118
4,1953,Yes,15,17,0.882


In [456]:
with_us_ts_chart = alt.Chart(transform_year(with_us_prop_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'with_us_authors',
        title = 'With US-based authors',
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    )
).properties(
    title = 'd',
    height=260,
    width=260
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

with_us_ts_chart

In [457]:
afftype_prop = get_simple_prop_df('authors', 'afftypepred')
afftype_prop.replace({
    'Education': 'Edu',
    'Non Education': 'Non-Edu'
}, inplace = True)
afftype_prop

Unnamed: 0,afftypepred,freq,prop
0,Edu,10642,0.942
1,Non-Edu,650,0.058


In [458]:
afftype_dist_chart = alt.Chart(afftype_prop).mark_bar().encode(
    x = alt.X(
        'afftypepred',
        title = 'Affiliation type',
        axis = alt.Axis(labelAngle = -45),
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors'
#         scale = alt.Scale(type = 'log')
    ),
    color = alt.Color(
        'afftypepred',
        legend = None,
    )
).properties(
    title = 'e',
    height=200,
    width=100
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

afftype_dist_chart

In [459]:
chart11 = alt.hconcat(
    country_dist_chart,
    country_prop_ts_chart,
    with_us_chart,
    with_us_ts_chart,
    afftype_dist_chart
).resolve_scale(
    color='independent',
).configure_axis(
    labelFontSize=15,
    titleFontSize=20
).configure_legend(
    titleFontSize=16,
    labelFontSize=16
).configure_title(
    anchor='start',
    fontSize=22
).configure_point(
    size=50
)

chart11

In [460]:
cross_var_idx = [15, 16, 17, 18, 21, 22, 23, 24]
all_paper_cols = papers.columns.tolist()
cross_vars = [all_paper_cols[x] for x in cross_var_idx]
cross_vars

['cross_country',
 'cross_type',
 'cross_gender',
 'cross_race',
 'cross_gender_and_race',
 'cross_gender_and_country',
 'cross_country_and_race',
 'cross_gender_race_and_country']

In [461]:
cross_vars_new = [
    'Cross Country',
    'Cross Type',
    'Cross Gender',
    'Cross Race',
    'Cross Gender & Race',
    'Cross Gender & Country',
    'Cross Country & Race',
    'Cross Gender, Race & Country'
]

In [462]:
rename_cross_vars_dic = dict(zip(cross_vars, cross_vars_new))
rename_cross_vars_dic

{'cross_country': 'Cross Country',
 'cross_type': 'Cross Type',
 'cross_gender': 'Cross Gender',
 'cross_race': 'Cross Race',
 'cross_gender_and_race': 'Cross Gender & Race',
 'cross_gender_and_country': 'Cross Gender & Country',
 'cross_country_and_race': 'Cross Country & Race',
 'cross_gender_race_and_country': 'Cross Gender, Race & Country'}

In [463]:
dfs = []
for var in cross_vars:
    dff = get_simple_prop_df('papers', var)
    dff.rename(columns = {var: 'binary'}, inplace = True)
    dff['collab_type'] = var 
    dfs.append(dff)

In [464]:
collab_df = pd.concat(dfs, axis= 0, ignore_index = True)
collab_df.replace(rename_cross_vars_dic, inplace=True)
sorted_collab = collab_df[collab_df.binary == 'Yes'].sort_values('freq', ascending = False)['collab_type'].tolist()
sorted_collab

['Cross Gender',
 'Cross Race',
 'Cross Country',
 'Cross Gender & Race',
 'Cross Gender & Country',
 'Cross Type',
 'Cross Country & Race',
 'Cross Gender, Race & Country']

In [465]:
collab_df[collab_df.binary == 'Yes'].sort_values('freq', ascending = False)

Unnamed: 0,binary,freq,prop,collab_type
5,Yes,1663,0.291,Cross Gender
7,Yes,873,0.153,Cross Race
1,Yes,589,0.103,Cross Country
9,Yes,552,0.097,Cross Gender & Race
11,Yes,352,0.062,Cross Gender & Country
3,Yes,261,0.046,Cross Type
13,Yes,231,0.04,Cross Country & Race
15,Yes,155,0.027,"Cross Gender, Race & Country"


In [466]:
source = collab_df
cross_count_chart = alt.Chart(source).mark_bar().encode(
    x = alt.X(
        'collab_type',
        title = None,
        sort = sorted_collab,
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications',
        sort='-x'
    ),
    color = alt.Color(
        'binary',
        title = 'Binary',
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    ),
#     color = 'binary'
).properties(
    title = 'a',
    height=300,
    width=200
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

cross_count_chart

In [467]:
dfs = []
for var in cross_vars:
    dff = get_freq_and_prop('papers', var)
    dff.rename(columns = {var: 'binary'}, inplace = True)
    dff['collab_type'] = var 
    dff.drop(columns = ['year total'], inplace = True)
    dfs.append(dff)

In [468]:
collab_df_ts = pd.concat(dfs, axis= 0, ignore_index = True)
collab_df_ts.replace(rename_cross_vars_dic, inplace=True)
collab_df_ts = collab_df_ts[collab_df_ts.binary == 'Yes']
collab_df_ts = transform_year(collab_df_ts)
collab_df_ts.head()

Unnamed: 0,year,binary,freq,prop,collab_type
13,1963-01-01,Yes,1,0.042,Cross Country
22,1971-01-01,Yes,2,0.08,Cross Country
24,1972-01-01,Yes,1,0.036,Cross Country
27,1974-01-01,Yes,2,0.036,Cross Country
29,1975-01-01,Yes,3,0.029,Cross Country


In [469]:
source = collab_df_ts
all_cross_ts_chart = alt.Chart(source).mark_line(point=alt.OverlayMarkDef()).encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'collab_type',
        title = 'Collaboration',
        sort=alt.EncodingSortField('prop', op='mean', order='descending'),
    )
).properties(
    title = 'b',
    height=300,
    width=380
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=11
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

all_cross_ts_chart

In [470]:
cross_race_details_df = get_simple_prop_df('papers', 'cross_race_details')
cross_race_details_df.replace({'cross race': 'Cross race'}, inplace=True)
cross_race_details_df.columns = ['cross_details', 'freq', 'prop']
cross_race_details_df

Unnamed: 0,cross_details,freq,prop
1,White only,4319,0.756
0,Cross race,873,0.153
2,Asian only,367,0.064
3,Hispanic only,73,0.013
4,Black only,49,0.009
5,Middle Eastern only,29,0.005
6,Indigenous only,2,0.0


In [471]:
cross_gender_details_df = get_simple_prop_df('papers', 'cross_gender_details')
cross_gender_details_df.replace({'cross gender': 'Cross gender'}, inplace=True)
cross_gender_details_df.replace({
    'M only': 'Male only',
    'F only': 'Female only',
    'N only': 'Non-binary only'
}, inplace = True)
cross_gender_details_df.columns = ['cross_details', 'freq', 'prop']
cross_gender_details_df

Unnamed: 0,cross_details,freq,prop
2,Male only,2781,0.487
1,Cross gender,1663,0.291
0,Female only,1261,0.221
3,Non-binary only,7,0.001


In [472]:
cross_details_df = pd.concat([cross_race_details_df, cross_gender_details_df])
cross_details_df

Unnamed: 0,cross_details,freq,prop
1,White only,4319,0.756
0,Cross race,873,0.153
2,Asian only,367,0.064
3,Hispanic only,73,0.013
4,Black only,49,0.009
5,Middle Eastern only,29,0.005
6,Indigenous only,2,0.0
2,Male only,2781,0.487
1,Cross gender,1663,0.291
0,Female only,1261,0.221


In [473]:
cross_race_count_chart = alt.Chart(cross_details_df).mark_bar().encode(
    x = alt.X(
        'cross_details:N', 
        title = None,
        sort = '-y',
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications',
    ),
    color = alt.Color(
        'cross_details:N',
        legend = None,
    )
).properties(
    title = 'c',
    height=300,
    width=250
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

cross_race_count_chart

In [474]:
cross_race_details_ts = get_freq_and_prop('papers', 'cross_race_details')
cross_race_details_ts.replace({'cross race': 'Cross race'}, inplace=True)
cross_race_details_ts.columns = ['year', 'cross_details', 'freq', 'year total', 'prop']
cross_race_details_ts.head()

Unnamed: 0,year,cross_details,freq,year total,prop
0,1951,White only,20,20,1.0
1,1952,Asian only,1,24,0.042
2,1952,White only,22,24,0.917
3,1952,Cross race,1,24,0.042
4,1953,Asian only,1,17,0.059


In [475]:
cross_gender_details_ts = get_freq_and_prop('papers', 'cross_gender_details')
cross_gender_details_ts.replace({'cross gender': 'Cross gender'}, inplace=True)
cross_gender_details_ts.replace({
    'M only': 'Male only',
    'F only': 'Female only',
    'N only': 'Non-binary only'
}, inplace = True)
cross_gender_details_ts.columns = ['year', 'cross_details', 'freq', 'year total', 'prop']
cross_gender_details_ts.head()

Unnamed: 0,year,cross_details,freq,year total,prop
0,1951,Male only,20,20,1.0
1,1952,Female only,4,24,0.167
2,1952,Male only,19,24,0.792
3,1952,Cross gender,1,24,0.042
4,1953,Male only,17,17,1.0


In [476]:
cross_details_ts = pd.concat([cross_race_details_ts, cross_gender_details_ts])
cross_details_ts

Unnamed: 0,year,cross_details,freq,year total,prop
0,1951,White only,20,20,1.0
1,1952,Asian only,1,24,0.042
2,1952,White only,22,24,0.917
3,1952,Cross race,1,24,0.042
4,1953,Asian only,1,17,0.059
5,1953,Black only,1,17,0.059
6,1953,Hispanic only,1,17,0.059
7,1953,White only,14,17,0.824
8,1954,White only,15,15,1.0
9,1955,White only,11,11,1.0


In [477]:
cross_details_ts_chart = alt.Chart(transform_year(cross_details_ts)).mark_line(point=alt.OverlayMarkDef()).encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'cross_details:N',
        title = 'Details',
        sort=alt.EncodingSortField('prop', op='mean', order='descending')
    )
).properties(
    title = 'd',
    height=300,
    width=360
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

cross_details_ts_chart

In [478]:
chart20 = alt.hconcat(
    cross_count_chart,
    all_cross_ts_chart,
    cross_race_count_chart,
    cross_details_ts_chart
).resolve_scale(
    color='independent',
).configure_axis(
    labelFontSize=13,
    titleFontSize=20
).configure_legend(
    titleFontSize=16,
    labelFontSize=16
).configure_title(
    anchor='start',
    fontSize=22
).configure_point(
    size=50
)

chart20

In [479]:
race_count_dic_long = dict(Counter(authors.racepred))
race_count_dic_long

{'White': 9304,
 'Hispanic': 345,
 'Asian': 1361,
 'Black': 158,
 'Middle Eastern': 106,
 'Indigenous': 18}

In [480]:
gender_count_dic = dict(Counter(authors.genderpred))
gender_count_dic

{'F': 4405, 'M': 6876, 'N': 11}

In [481]:
gender_by_race_long = authors.groupby(
    ['genderpred', 'racepred']).size().to_frame('freq').reset_index()
gender_by_race_long['gender_total'] = [gender_count_dic[x] for x in gender_by_race_long.genderpred]
gender_by_race_long['race_total'] = [race_count_dic_long[x] for x in gender_by_race_long.racepred]
gender_by_race_long['race_in_gender'] = round(
    gender_by_race_long['freq'] / gender_by_race_long['gender_total'], 
    3)
gender_by_race_long['gender_in_race'] = round(
    gender_by_race_long['freq'] / gender_by_race_long['race_total'], 
    3)
gender_by_race_long

Unnamed: 0,genderpred,racepred,freq,gender_total,race_total,race_in_gender,gender_in_race
0,F,Asian,661,4405,1361,0.15,0.486
1,F,Black,81,4405,158,0.018,0.513
2,F,Hispanic,167,4405,345,0.038,0.484
3,F,Indigenous,14,4405,18,0.003,0.778
4,F,Middle Eastern,52,4405,106,0.012,0.491
5,F,White,3430,4405,9304,0.779,0.369
6,M,Asian,699,6876,1361,0.102,0.514
7,M,Black,77,6876,158,0.011,0.487
8,M,Hispanic,173,6876,345,0.025,0.501
9,M,Indigenous,4,6876,18,0.001,0.222


In [482]:
gender_and_race_3 = alt.Chart(gender_by_race_long).mark_bar().encode(
    x = alt.X(
        'genderpred',
        sort = ["M", "F", "N"],
        axis = None
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors',
#         scale=alt.Scale(type="log")  # Here the scale is applied
    ),
    color = alt.Color(
         'genderpred',
        title = 'Gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    ),
    column = alt.Column('racepred:N', 
                        title = None,
                        header=alt.Header(labelFontSize=13)
    )
).properties(
    title = 'a',
    height=300,
    width=60
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

gender_and_race_3

In [483]:
gender_and_race_4 = alt.Chart(gender_by_race_long.sample(gender_by_race_long.shape[0])).mark_bar().encode(
    x = alt.X(
        'racepred:N',
        axis = None
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors',
#         scale=alt.Scale(type="log")  # Here the scale is applied
    ),
    color = alt.Color(
         'racepred:N',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors)
    ),
    column = alt.Column(
        'genderpred:N', 
        title = None,
        sort = ["M", "F", "N"],
        header=alt.Header(labelFontSize=13)
    )
).properties(
    title = 'b',
    height=300,
    width=100
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

gender_and_race_4

In [484]:
female_prop_by_race = authors.groupby(
    ['year', 'genderpred', 'racepred']).size().to_frame('freq').reset_index()
female_prop_by_race.head()

Unnamed: 0,year,genderpred,racepred,freq
0,1951,M,White,24
1,1952,F,Asian,2
2,1952,F,White,3
3,1952,M,White,20
4,1953,M,Asian,1


In [485]:
tuples = []
for year in list(set(female_prop_by_race.year)):
    for race in list(set(female_prop_by_race.racepred)):
        year_race = female_prop_by_race[
            (female_prop_by_race.year == year) & (female_prop_by_race.racepred == race)]
        total = sum(year_race['freq'])
        try:
            female_num = year_race[year_race.genderpred == 'F'].iloc[0]['freq']
            female_prop = female_num / total
        except:
            female_num = 0
            female_prop = 0
        tuples.append((year, race, total, female_num, female_prop))

In [486]:
female_prop_by_race = pd.DataFrame(tuples, columns = [
    'year', 
    'race', 
    'yearly race total', 
    'female_num', 
    'female_prop'])
female_prop_by_race

Unnamed: 0,year,race,yearly race total,female_num,female_prop
0,1951,Asian,0,0,0.0
1,1951,Hispanic,0,0,0.0
2,1951,Black,0,0,0.0
3,1951,Middle Eastern,0,0,0.0
4,1951,White,24,0,0.0
5,1951,Indigenous,0,0,0.0
6,1952,Asian,2,2,1.0
7,1952,Hispanic,0,0,0.0
8,1952,Black,0,0,0.0
9,1952,Middle Eastern,0,0,0.0


In [487]:
# unique_races.remove('Indigenous')
# race_colors.remove(race_colors[3])

line = alt.Chart(transform_year(female_prop_by_race)).mark_line(
).encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'female_prop',
        title = 'Female Proportion'
    ),
    color = alt.Color(
         'race:N',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors),
        legend = None
    ),
    opacity=alt.condition(
        alt.datum.race == 'Indigenous' or alt.datum.race == 'Middle Eastern',
        alt.value(0.3),
        alt.value(1)
    ),
)

points = line.mark_point().encode(
    color = alt.Color(
        'race:N',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors),
    ),
    shape=alt.Shape('race', 
                    scale=alt.Scale(
                        domain=unique_races,
                        range=['cross', 'circle', 'square', 'triangle-right', 'diamond'])
    ),
    
)

female_prop_ts_chart = alt.layer(
    line,
    points
).resolve_scale(
    color = 'independent',
    shape = 'independent'
).properties(
    title = 'c',
    height=300,
    width=360
)

# .configure_axis(
#     labelFontSize=13,
#     titleFontSize=20
# ).configure_legend(
#     titleFontSize=16,
#     labelFontSize=16
# ).configure_title(
#     anchor='start',
#     fontSize=22
# )

female_prop_ts_chart

In [488]:
chart21 = alt.hconcat(
    gender_and_race_3,
    gender_and_race_4,
    female_prop_ts_chart
).resolve_scale(
    color='independent',
).configure_axis(
    labelFontSize=15,
    titleFontSize=20
).configure_legend(
    titleFontSize=16,
    labelFontSize=16
).configure_title(
    anchor='start',
    fontSize=22
).configure_point(
    size=100
)

chart21