In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum
# Raise pandas' display limits so long/wide tables are shown in full in cell
# output: up to 500 rows, and no cap on column count or column width.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [59]:
def get_simple_prop_df(data, var):
    """Tabulate frequency and overall proportion of each value of ``var``.

    Counts are taken over all years at once.

    Parameters
    ----------
    data : str
        'authors' or 'papers'. Selects the module-level ``authors`` /
        ``papers`` frame and the matching ``total_author_num`` /
        ``total_paper_num`` denominator.
    var : str
        Name of the column to tabulate.

    Returns
    -------
    pandas.DataFrame
        Columns ``[var, 'freq', 'prop']``, sorted by ``prop`` descending.
        ``prop`` is ``freq`` divided by the selected total, rounded to 3
        decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        source, denominator = authors, total_author_num
    else:
        source, denominator = papers, total_paper_num
    tally = Counter(source[var])
    result = pd.DataFrame(tally.items(), columns=[var, 'freq'])
    result['prop'] = round(result['freq'] / denominator, 3)
    return result.sort_values('prop', ascending=False)

In [60]:
def get_freq_and_prop(data, var):
    """Build a per-year time series of counts and shares for ``var``.

    Parameters
    ----------
    data : str
        'papers' or 'authors'. Selects the module-level ``papers`` /
        ``authors`` frame and the matching ``yearly_papernum_dic`` /
        ``yearly_authornum_dic`` lookup of per-year totals.
    var : str
        Column whose values are counted within each year.

    Returns
    -------
    pandas.DataFrame
        One row per (year, value) pair with columns
        ``['year', var, 'freq', 'year total', 'prop']``; ``prop`` is
        ``freq / year total`` rounded to 3 decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        source, yearly_totals = papers, yearly_papernum_dic
    else:
        source, yearly_totals = authors, yearly_authornum_dic
    counts = source.groupby(['year', var]).size().reset_index(name='freq')
    # Attach each row's yearly denominator via the precomputed lookup.
    counts['year total'] = [yearly_totals[year] for year in counts['year']]
    counts['prop'] = round(counts['freq'] / counts['year total'], 3)
    return counts

In [61]:
def transform_year(df):
    """Return a copy of ``df`` with its 'year' column parsed as datetimes.

    The values are parsed with ``format='%Y'``; the input frame is left
    unmodified.
    """
    parsed_year = pd.to_datetime(df['year'], format='%Y')
    return df.assign(year=parsed_year)

In [62]:
# Load the processed author-level and paper-level tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# Boolean values in the paper table are shown as 'Yes' / 'No' labels.
papers = pd.read_csv(
    '../data/processed/papers_to_study_expanded.csv'
).replace({True: 'Yes', False: 'No'})


In [63]:
# Keep the `top_num` most frequent predicted countries as their own categories
# and pool every other value into 'Other'.
top_num = 5
top_country_dic = dict(
    Counter(authors['countrypred']).most_common(top_num)
)
top_country = [*top_country_dic]
authors['countrypred_new'] = authors['countrypred'].map(
    lambda country: country if country in top_country else 'Other'
)

In [64]:
# Shorten the full journal titles to their abbreviations in both frames,
# modifying `papers` and `authors` in place.
# NOTE(review): DataFrame.replace with a flat dict matches these exact values
# in every column, not just the journal column — presumably only the journal
# column contains them; confirm.
for i in [papers, authors]:
    i.replace({
        'Communication Theory': 'CT',
        'Communication, Culture and Critique': 'CCC',
        'Human Communication Research': 'HCR',
        'Journal of Communication': 'JOC',
        'Journal of Computer-Mediated Communication': 'JCMC'
                      }, inplace = True)

In [65]:
# Overall row counts, used as denominators for the all-years proportions.
total_paper_num, total_author_num = len(papers), len(authors)

In [66]:
total_paper_num, total_author_num

(5712, 11292)

In [67]:
# Rows per year in each frame, as a two-column table (year, freq) ...
yearly_papernum = papers.groupby('year').size().reset_index(name='freq')
yearly_authornum = authors.groupby('year').size().reset_index(name='freq')
# ... and as year -> count lookups used for the per-year proportions.
yearly_papernum_dic = dict(
    zip(yearly_papernum['year'], yearly_papernum['freq'])
)
yearly_authornum_dic = dict(
    zip(yearly_authornum['year'], yearly_authornum['freq'])
)

In [68]:
def get_table(df, var1, var2):
    '''Cross-tabulate var1 (rows) by var2 (columns) with marginal totals.

    Input:
        - df: frame to count rows of (e.g. authors)
        - var1: row variable, most likely 'journal'
        - var2: column variable, e.g. 'racepred', 'genderpred',
          'countrypred_new'
    Output:
        - DataFrame of integer counts indexed by var1 (index name var1) with
          one column per value of var2, a trailing 'Total' column holding the
          row sums and a trailing 'Total' row holding the column sums.
    '''
    # unstack(fill_value=0) puts 0 in combinations that never occur without
    # introducing NaN, so the counts stay integers. The previous pivot +
    # fillna(0) upcast the whole table to float whenever a cell was missing,
    # which made totals render as e.g. '276.0'.
    counts = df.groupby([var1, var2]).size().unstack(fill_value=0)
    counts['Total'] = counts.sum(axis=1)
    # Column sums (including the new 'Total' column) become the last row.
    column_totals = counts.sum(axis=0).tolist()
    counts.loc['Total'] = column_totals
    return counts

In [69]:
def update_row(row):
    '''Format one count row as 'count (percentage)' strings plus its total.

    Input:
        - row: a pandas Series (anything with .tolist()) whose last element
          is the row total and whose other elements are counts.
    Output:
        - list with one string per count followed by the total. A positive
          count becomes e.g. '5 (25.0%)', where the percentage is the count
          divided by the row total; any other count becomes '0'. A
          whole-number total is returned as an int so that float-typed tables
          do not show totals like '276.0'.
    '''
    *counts, total = row.tolist()
    if float(total).is_integer():
        total = int(total)
    cells = []
    for count in counts:
        if count > 0:
            # Divide only for positive cells so an all-zero row (total 0)
            # does not raise ZeroDivisionError.
            cells.append('{} ({:.1%})'.format(int(count), count / total))
        else:
            cells.append('0')
    cells.append(total)
    return cells

In [70]:
def update_df(df):
    '''Rewrite a count table so each cell reads 'number (percentage)'.

    Input:
        - df: a table such as j_race, j_gender, j_country; every row is
          passed through update_row.
    Output:
        - a new DataFrame with a fresh default index. The old index labels
          become the first column, named 'Journal' when the index is named
          'journal' and 'Aff country' otherwise.
    '''
    label_column = 'Journal' if df.index.name == 'journal' else 'Aff country'
    header = [label_column] + df.columns.tolist()
    records = [
        (label, *update_row(values))
        for label, values in df.iterrows()
    ]
    return pd.DataFrame(records, columns=header)

## Dataset

### Statistics of paper number and author number by journal

In [71]:
# start_year_dic = {}
# for group in papers.groupby('journal'):
#     journal = group[0]
#     all_years = group[1].year.tolist()
#     start_year = min(all_years)
#     start_year_dic[journal] = start_year
# start_year_dic

In [72]:
def combine_two_cols(row, var1, var2):
    """Join two fields of ``row`` into a single 'value (percent)' string.

    ``row[var1]`` is shown as-is and ``row[var2]`` is rendered as a
    percentage with one decimal place, e.g. '2626 (46.0%)'.
    """
    value_text = str(row[var1])
    share_text = format(row[var2], '.1%')
    return f'{value_text} ({share_text})'

In [73]:
# Author counts per journal, plus a display column formatted 'count (share)'.
author_num_by_j = get_simple_prop_df('authors', 'journal')
author_num_by_j['author'] = author_num_by_j.apply(
    combine_two_cols, axis=1, args=('freq', 'prop')
)
# Prefix the numeric columns so they stay distinct after merging with papers.
author_num_by_j = author_num_by_j.rename(
    columns={'freq': 'a_freq', 'prop': 'a_prop'}
)

In [74]:
# Paper counts per journal, plus a display column formatted 'count (share)'.
paper_num_by_j = get_simple_prop_df('papers', 'journal')
paper_num_by_j['paper'] = paper_num_by_j.apply(
    combine_two_cols, axis=1, args=('freq', 'prop')
)
# Prefix the numeric columns so they stay distinct after merging with authors.
paper_num_by_j = paper_num_by_j.rename(
    columns={'freq': 'p_freq', 'prop': 'p_prop'}
)

In [75]:
# Table 1: papers, authors and mean authors per paper for each journal,
# ordered by number of papers.
t1 = (
    paper_num_by_j
    .merge(author_num_by_j, on='journal')
    .assign(
        average_author_num_per_paper=lambda d: round(d['a_freq'] / d['p_freq'], 2)
    )
    .sort_values(by='p_freq', ascending=False)
    .loc[:, ['journal', 'paper', 'author', 'average_author_num_per_paper']]
)
t1.columns = ['Journal', f'# of Papers ({total_paper_num})', f'# of Authors ({total_author_num})', 'Ave. # of Authors Per Paper']
t1

Unnamed: 0,Journal,# of Papers (5712),# of Authors (11292),Ave. # of Authors Per Paper
0,JOC,2626 (46.0%),4973 (44.0%),1.89
1,HCR,1153 (20.2%),2693 (23.8%),2.34
2,JCMC,845 (14.8%),2005 (17.8%),2.37
3,CT,629 (11.0%),982 (8.7%),1.56
4,CCC,459 (8.0%),639 (5.7%),1.39


In [76]:
print(t1.to_latex(index=False)) 

\begin{tabular}{lllr}
\toprule
Journal & \# of Papers (5712) & \# of Authors (11292) &  Ave. \# of Authors Per Paper \\
\midrule
    JOC &       2626 (46.0\%) &         4973 (44.0\%) &                         1.89 \\
    HCR &       1153 (20.2\%) &         2693 (23.8\%) &                         2.34 \\
   JCMC &        845 (14.8\%) &         2005 (17.8\%) &                         2.37 \\
     CT &        629 (11.0\%) &           982 (8.7\%) &                         1.56 \\
    CCC &         459 (8.0\%) &           639 (5.7\%) &                         1.39 \\
\bottomrule
\end{tabular}



  print(t1.to_latex(index=False))


### Total number of papers by year

In [77]:
yearly_papernum

Unnamed: 0,year,freq
0,1951,20
1,1952,24
2,1953,17
3,1954,15
4,1955,11
5,1956,13
6,1957,16
7,1958,13
8,1959,17
9,1960,16


In [78]:
# Panel 'a': number of publications per year as a line chart.
papernum_by_year_chart = (
    alt.Chart(transform_year(yearly_papernum))
    .mark_line()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('freq', title='# of Publications'),
    )
    .properties(title='a', height=300, width=260)
)
papernum_by_year_chart

## TS: average number of authors

In [79]:
# Per-year paper and author counts side by side, with their ratio
# (authors per paper) rounded to 2 decimals.
yearly_ave_authornum = (
    yearly_papernum
    .merge(yearly_authornum, on='year')
    .rename(columns={'freq_x': 'papernum', 'freq_y': 'authornum'})
    .assign(ave_authornum=lambda d: round(d['authornum'] / d['papernum'], 2))
)
yearly_ave_authornum

Unnamed: 0,year,papernum,authornum,ave_authornum
0,1951,20,24,1.2
1,1952,24,25,1.04
2,1953,17,17,1.0
3,1954,15,16,1.07
4,1955,11,11,1.0
5,1956,13,14,1.08
6,1957,16,19,1.19
7,1958,13,13,1.0
8,1959,17,21,1.24
9,1960,16,17,1.06


In [80]:
# Panel 'b': average number of authors per paper, by year.
yearly_ave_authornum_chart = (
    alt.Chart(transform_year(yearly_ave_authornum))
    .mark_line()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('ave_authornum', title='Average # of Authors'),
    )
    .properties(title='b', height=300, width=260)
)
yearly_ave_authornum_chart

## Race

In [81]:
get_simple_prop_df('authors', 'racepred')

Unnamed: 0,racepred,freq,prop
0,White,9304,0.824
2,Asian,1361,0.121
1,Hispanic,345,0.031
3,Black,158,0.014
4,Middle Eastern,106,0.009
5,Indigenous,18,0.002


In [82]:
# Affiliation country (rows) by predicted race (columns), cells as 'n (row %)'.
country_by_race = update_df(
    get_table(authors, 'countrypred_new', 'racepred')
)
country_by_race

Unnamed: 0,Aff country,Asian,Black,Hispanic,Indigenous,Middle Eastern,White,Total
0,DE,5 (1.8%),1 (0.4%),0,0,1 (0.4%),269 (97.5%),276.0
1,GB,19 (6.4%),4 (1.3%),8 (2.7%),0,3 (1.0%),263 (88.6%),297.0
2,IL,8 (3.5%),0,4 (1.8%),0,26 (11.5%),189 (83.3%),227.0
3,NL,5 (1.4%),0,1 (0.3%),1 (0.3%),2 (0.5%),355 (97.5%),364.0
4,Other,412 (24.8%),9 (0.5%),134 (8.1%),0,20 (1.2%),1086 (65.4%),1661.0
5,US,912 (10.8%),144 (1.7%),198 (2.3%),17 (0.2%),54 (0.6%),7142 (84.4%),8467.0
6,Total,1361 (12.1%),158 (1.4%),345 (3.1%),18 (0.2%),106 (0.9%),9304 (82.4%),11292.0


In [83]:
print(country_by_race.to_latex(index=False)) 

\begin{tabular}{lllllllr}
\toprule
Aff country &        Asian &      Black &   Hispanic & Indigenous & Middle Eastern &        White &   Total \\
\midrule
         DE &     5 (1.8\%) &   1 (0.4\%) &          0 &          0 &       1 (0.4\%) &  269 (97.5\%) &   276.0 \\
         GB &    19 (6.4\%) &   4 (1.3\%) &   8 (2.7\%) &          0 &       3 (1.0\%) &  263 (88.6\%) &   297.0 \\
         IL &     8 (3.5\%) &          0 &   4 (1.8\%) &          0 &     26 (11.5\%) &  189 (83.3\%) &   227.0 \\
         NL &     5 (1.4\%) &          0 &   1 (0.3\%) &   1 (0.3\%) &       2 (0.5\%) &  355 (97.5\%) &   364.0 \\
      Other &  412 (24.8\%) &   9 (0.5\%) & 134 (8.1\%) &          0 &      20 (1.2\%) & 1086 (65.4\%) &  1661.0 \\
         US &  912 (10.8\%) & 144 (1.7\%) & 198 (2.3\%) &  17 (0.2\%) &      54 (0.6\%) & 7142 (84.4\%) &  8467.0 \\
      Total & 1361 (12.1\%) & 158 (1.4\%) & 345 (3.1\%) &  18 (0.2\%) &     106 (0.9\%) & 9304 (82.4\%) & 11292.0 \\
\bottomrule
\end{tabular}



  print(country_by_race.to_latex(index=False))


In [84]:
# Journal (rows) by predicted race (columns), cells as 'n (row %)'.
j_race = update_df(
    get_table(authors, 'journal', 'racepred')
)
j_race

Unnamed: 0,Journal,Asian,Black,Hispanic,Indigenous,Middle Eastern,White,Total
0,CCC,108 (16.9%),31 (4.9%),35 (5.5%),3 (0.5%),13 (2.0%),449 (70.3%),639
1,CT,84 (8.6%),16 (1.6%),32 (3.3%),2 (0.2%),11 (1.1%),837 (85.2%),982
2,HCR,277 (10.3%),22 (0.8%),67 (2.5%),6 (0.2%),15 (0.6%),2306 (85.6%),2693
3,JCMC,426 (21.2%),15 (0.7%),84 (4.2%),3 (0.1%),25 (1.2%),1452 (72.4%),2005
4,JOC,466 (9.4%),74 (1.5%),127 (2.6%),4 (0.1%),42 (0.8%),4260 (85.7%),4973
5,Total,1361 (12.1%),158 (1.4%),345 (3.1%),18 (0.2%),106 (0.9%),9304 (82.4%),11292


In [85]:
print(j_race.to_latex(index=False)) 

\begin{tabular}{lllllllr}
\toprule
Journal &        Asian &      Black &   Hispanic & Indigenous & Middle Eastern &        White &  Total \\
\midrule
    CCC &  108 (16.9\%) &  31 (4.9\%) &  35 (5.5\%) &   3 (0.5\%) &      13 (2.0\%) &  449 (70.3\%) &    639 \\
     CT &    84 (8.6\%) &  16 (1.6\%) &  32 (3.3\%) &   2 (0.2\%) &      11 (1.1\%) &  837 (85.2\%) &    982 \\
    HCR &  277 (10.3\%) &  22 (0.8\%) &  67 (2.5\%) &   6 (0.2\%) &      15 (0.6\%) & 2306 (85.6\%) &   2693 \\
   JCMC &  426 (21.2\%) &  15 (0.7\%) &  84 (4.2\%) &   3 (0.1\%) &      25 (1.2\%) & 1452 (72.4\%) &   2005 \\
    JOC &   466 (9.4\%) &  74 (1.5\%) & 127 (2.6\%) &   4 (0.1\%) &      42 (0.8\%) & 4260 (85.7\%) &   4973 \\
  Total & 1361 (12.1\%) & 158 (1.4\%) & 345 (3.1\%) &  18 (0.2\%) &     106 (0.9\%) & 9304 (82.4\%) &  11292 \\
\bottomrule
\end{tabular}



  print(j_race.to_latex(index=False))


In [86]:
# Journal (rows) by predicted gender (columns), cells as 'n (row %)'.
j_gender = update_df(
    get_table(authors, 'journal', 'genderpred')
)
j_gender

Unnamed: 0,Journal,F,M,N,Total
0,CCC,366 (57.3%),268 (41.9%),5 (0.8%),639.0
1,CT,379 (38.6%),603 (61.4%),0,982.0
2,HCR,1055 (39.2%),1634 (60.7%),4 (0.1%),2693.0
3,JCMC,902 (45.0%),1102 (55.0%),1 (0.0%),2005.0
4,JOC,1703 (34.2%),3269 (65.7%),1 (0.0%),4973.0
5,Total,4405 (39.0%),6876 (60.9%),11 (0.1%),11292.0


In [87]:
print(j_gender.to_latex(index=False)) 

  print(j_gender.to_latex(index=False))


\begin{tabular}{llllr}
\toprule
Journal &            F &            M &         N &   Total \\
\midrule
    CCC &  366 (57.3\%) &  268 (41.9\%) &  5 (0.8\%) &   639.0 \\
     CT &  379 (38.6\%) &  603 (61.4\%) &         0 &   982.0 \\
    HCR & 1055 (39.2\%) & 1634 (60.7\%) &  4 (0.1\%) &  2693.0 \\
   JCMC &  902 (45.0\%) & 1102 (55.0\%) &  1 (0.0\%) &  2005.0 \\
    JOC & 1703 (34.2\%) & 3269 (65.7\%) &  1 (0.0\%) &  4973.0 \\
  Total & 4405 (39.0\%) & 6876 (60.9\%) & 11 (0.1\%) & 11292.0 \\
\bottomrule
\end{tabular}



In [88]:
# Journal (rows) by affiliation country (columns), cells as 'n (row %)',
# with country codes spelled out and columns put in presentation order.
j_country = (
    update_df(get_table(authors, 'journal', 'countrypred_new'))
    .rename(columns={
        'DE': 'Germany',
        'GB': 'Great Britain',
        'IL': 'Israel',
        'NL': 'Netherlands',
    })
    .loc[:, [
        'Journal',
        'US',
        'Other',
        'Netherlands',
        'Germany',
        'Great Britain',
        'Israel',
        'Total',
    ]]
)
j_country

Unnamed: 0,Journal,US,Other,Netherlands,Germany,Great Britain,Israel,Total
0,CCC,438 (68.5%),113 (17.7%),6 (0.9%),7 (1.1%),54 (8.5%),21 (3.3%),639
1,CT,712 (72.5%),156 (15.9%),21 (2.1%),33 (3.4%),20 (2.0%),40 (4.1%),982
2,HCR,2286 (84.9%),228 (8.5%),82 (3.0%),37 (1.4%),32 (1.2%),28 (1.0%),2693
3,JCMC,1171 (58.4%),530 (26.4%),114 (5.7%),62 (3.1%),84 (4.2%),44 (2.2%),2005
4,JOC,3860 (77.6%),634 (12.7%),141 (2.8%),137 (2.8%),107 (2.2%),94 (1.9%),4973
5,Total,8467 (75.0%),1661 (14.7%),364 (3.2%),276 (2.4%),297 (2.6%),227 (2.0%),11292


In [89]:
print(j_country.to_latex(index=False)) 

  print(j_country.to_latex(index=False))


\begin{tabular}{lllllllr}
\toprule
Journal &           US &        Other & Netherlands &    Germany & Great Britain &     Israel &  Total \\
\midrule
    CCC &  438 (68.5\%) &  113 (17.7\%) &    6 (0.9\%) &   7 (1.1\%) &     54 (8.5\%) &  21 (3.3\%) &    639 \\
     CT &  712 (72.5\%) &  156 (15.9\%) &   21 (2.1\%) &  33 (3.4\%) &     20 (2.0\%) &  40 (4.1\%) &    982 \\
    HCR & 2286 (84.9\%) &   228 (8.5\%) &   82 (3.0\%) &  37 (1.4\%) &     32 (1.2\%) &  28 (1.0\%) &   2693 \\
   JCMC & 1171 (58.4\%) &  530 (26.4\%) &  114 (5.7\%) &  62 (3.1\%) &     84 (4.2\%) &  44 (2.2\%) &   2005 \\
    JOC & 3860 (77.6\%) &  634 (12.7\%) &  141 (2.8\%) & 137 (2.8\%) &    107 (2.2\%) &  94 (1.9\%) &   4973 \\
  Total & 8467 (75.0\%) & 1661 (14.7\%) &  364 (3.2\%) & 276 (2.4\%) &    297 (2.6\%) & 227 (2.0\%) &  11292 \\
\bottomrule
\end{tabular}



### Race proportion by year

In [90]:
# Yearly counts and shares of authors by predicted race; preview the White rows.
race_ts = get_freq_and_prop('authors', 'racepred')
race_ts.loc[race_ts['racepred'] == 'White']

Unnamed: 0,year,racepred,freq,year total,prop
0,1951,White,24,24,1.0
2,1952,White,23,25,0.92
6,1953,White,14,17,0.824
7,1954,White,16,16,1.0
8,1955,White,11,11,1.0
9,1956,White,14,14,1.0
12,1957,White,17,19,0.895
13,1958,White,13,13,1.0
15,1959,White,20,21,0.952
17,1960,White,16,17,0.941


In [91]:
race_ts[race_ts.racepred == 'White'].sort_values("prop")

Unnamed: 0,year,racepred,freq,year total,prop
253,2019,White,179,269,0.665
264,2021,White,305,455,0.67
258,2020,White,210,299,0.702
237,2016,White,249,352,0.707
270,2022,White,145,200,0.725
242,2017,White,256,352,0.727
214,2012,White,256,348,0.736
203,2010,White,219,295,0.742
248,2018,White,274,368,0.745
208,2011,White,232,308,0.753


In [92]:
race_ts[race_ts.racepred == 'Asian'].sort_values("prop")

Unnamed: 0,year,racepred,freq,year total,prop
81,1983,Asian,1,144,0.007
50,1976,Asian,2,178,0.011
115,1992,Asian,2,121,0.017
60,1978,Asian,3,181,0.017
103,1989,Asian,2,113,0.018
45,1975,Asian,3,159,0.019
98,1988,Asian,2,102,0.02
37,1972,Asian,1,51,0.02
68,1980,Asian,4,204,0.02
95,1987,Asian,2,93,0.022


In [93]:
race_ts[race_ts.racepred == 'Hispanic'].sort_values("prop")

Unnamed: 0,year,racepred,freq,year total,prop
65,1979,Hispanic,1,182,0.005
186,2007,Hispanic,2,308,0.006
61,1978,Hispanic,1,181,0.006
73,1981,Hispanic,1,157,0.006
128,1995,Hispanic,1,144,0.007
121,1993,Hispanic,1,152,0.007
82,1983,Hispanic,1,144,0.007
117,1992,Hispanic,1,121,0.008
86,1984,Hispanic,1,120,0.008
89,1985,Hispanic,1,128,0.008


In [94]:
race_ts[race_ts.racepred == 'Hispanic'].sort_values("year")

Unnamed: 0,year,racepred,freq,year total,prop
5,1953,Hispanic,1,17,0.059
28,1968,Hispanic,1,41,0.024
40,1973,Hispanic,2,18,0.111
47,1975,Hispanic,2,159,0.013
52,1976,Hispanic,3,178,0.017
57,1977,Hispanic,6,217,0.028
61,1978,Hispanic,1,181,0.006
65,1979,Hispanic,1,182,0.005
70,1980,Hispanic,4,204,0.02
73,1981,Hispanic,1,157,0.006


In [95]:
race_ts[race_ts.racepred == 'Black'].sort_values("year")

Unnamed: 0,year,racepred,freq,year total,prop
4,1953,Black,1,17,0.059
11,1957,Black,1,19,0.053
16,1960,Black,1,17,0.059
23,1965,Black,1,19,0.053
33,1970,Black,1,33,0.03
43,1974,Black,3,91,0.033
46,1975,Black,2,159,0.013
51,1976,Black,1,178,0.006
56,1977,Black,2,217,0.009
69,1980,Black,2,204,0.01


In [96]:
race_ts[race_ts.racepred == 'Middle Eastern'].sort_values("prop")

Unnamed: 0,year,racepred,freq,year total,prop
187,2007,Middle Eastern,1,308,0.003
171,2004,Middle Eastern,1,248,0.004
269,2022,Middle Eastern,1,200,0.005
58,1977,Middle Eastern,1,217,0.005
62,1978,Middle Eastern,1,181,0.006
207,2011,Middle Eastern,2,308,0.006
53,1976,Middle Eastern,1,178,0.006
48,1975,Middle Eastern,1,159,0.006
75,1981,Middle Eastern,1,157,0.006
83,1983,Middle Eastern,1,144,0.007


In [97]:
race_ts[race_ts.racepred == 'Indigenous']

Unnamed: 0,year,racepred,freq,year total,prop
74,1981,Indigenous,1,157,0.006
140,1998,Indigenous,1,164,0.006
149,2000,Indigenous,1,171,0.006
158,2002,Indigenous,2,263,0.008
164,2003,Indigenous,1,214,0.005
170,2004,Indigenous,1,248,0.004
181,2006,Indigenous,1,311,0.003
197,2009,Indigenous,1,335,0.003
212,2012,Indigenous,1,348,0.003
223,2014,Indigenous,1,431,0.002


In [98]:
# Alphabetically ordered race labels; used as the colour-scale domain.
unique_races = sorted(set(race_ts.racepred))
unique_races

['Asian', 'Black', 'Hispanic', 'Indigenous', 'Middle Eastern', 'White']

In [99]:
race_colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f']

In [100]:
race_ts = transform_year(race_ts)

In [101]:
# Panel 'd': stacked area chart of yearly author counts, coloured by race.
race_stack_chart = (
    alt.Chart(race_ts)
    .mark_area()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('freq', title='# of Authors'),
        color=alt.Color(
            'racepred',
            title='Race',
            scale=alt.Scale(domain=unique_races, range=race_colors),
        ),
    )
    .properties(title='d', height=300, width=280)
)

race_stack_chart

In [102]:
# Panel 'f': yearly share of authors by predicted race, one line per race.
# Uses the same colour scale (unique_races -> race_colors) as the stacked chart.
race_prop_chart = alt.Chart(race_ts).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        # Axis title was misspelled 'Porportion'; now matches the
        # first-author chart's 'Proportion'.
        title = 'Proportion',
    ),
    color = alt.Color(
        'racepred',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors)
    )
).properties(
    title = 'f',
    height=300,
    width=280
)

race_prop_chart

## First author race

In [103]:
# Yearly counts and shares of papers by the first author's predicted race.
first_author_race_ts = get_freq_and_prop('papers', 'first_author_race')
first_author_race_ts.head(5)

Unnamed: 0,year,first_author_race,freq,year total,prop
0,1951,White,20,20,1.0
1,1952,Asian,2,24,0.083
2,1952,White,22,24,0.917
3,1953,Asian,1,17,0.059
4,1953,Black,1,17,0.059


In [104]:
first_author_race_ts.tail(10)

Unnamed: 0,year,first_author_race,freq,year total,prop
223,2021,Hispanic,6,167,0.036
224,2021,Indigenous,1,167,0.006
225,2021,Middle Eastern,3,167,0.018
226,2021,White,110,167,0.659
227,2022,Asian,11,78,0.141
228,2022,Black,3,78,0.038
229,2022,Hispanic,1,78,0.013
230,2022,Indigenous,1,78,0.013
231,2022,Middle Eastern,1,78,0.013
232,2022,White,61,78,0.782


In [105]:
# Yearly share of papers by first-author race, one line per race, using the
# same race colour scale as the author-level charts.
(
    alt.Chart(transform_year(first_author_race_ts))
    .mark_line()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('prop', title='Proportion'),
        color=alt.Color(
            'first_author_race',
            title='First author race',
            scale=alt.Scale(domain=unique_races, range=race_colors),
        ),
    )
    .properties(title='b', height=300, width=260)
)