In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [2]:
def get_simple_prop_df(data, var):
    # get frequency and proportion over all the years
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'authors':
        df = pd.DataFrame(Counter(authors[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_author_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    else:
        df = pd.DataFrame(Counter(papers[var]).items(), columns = [var, 'freq'])
        df['prop'] = round(df['freq'] / total_paper_num, 3)
        df.sort_values('prop', ascending = False, inplace = True)
    return df

In [3]:
def get_freq_and_prop(data, var):
    # get time series data
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        dic = yearly_papernum_dic
        df = papers.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    else:
        dic = yearly_authornum_dic
        df = authors.groupby(['year', var]).size().to_frame('freq').reset_index()
        df['year total'] = [dic[x] for x in df.year]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [4]:
def transform_year(df):
    dff = df.copy()
    dff['year'] = pd.to_datetime(dff['year'] , format='%Y')
    return dff

In [5]:
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv')
papers.replace({
    True: 'Yes',
    False: 'No'
}, inplace = True)

In [6]:
for i in [papers, authors]:
    i.replace({
        'Communication Theory': 'CT',
        'Communication, Culture and Critique': 'CCC',
        'Human Communication Research': 'HCR',
        'Journal of Communication': 'JOC',
        'Journal of Computer-Mediated Communication': 'JCMC'
                      }, inplace = True)

In [7]:
total_paper_num = len(papers)
total_author_num = len(authors)

In [8]:
total_paper_num, total_author_num

(3169, 7083)

In [9]:
yearly_papernum = papers.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum = authors.groupby(
    'year').size().to_frame(name='freq').reset_index()
yearly_authornum_dic = dict(zip(yearly_authornum.year, yearly_authornum.freq))
yearly_papernum_dic = dict(zip(yearly_papernum.year, yearly_papernum.freq))

## General Trends

### Total number of papers by year

In [10]:
yearly_papernum = transform_year(yearly_papernum)
yearly_papernum

Unnamed: 0,year,freq
0,2000-01-01,88
1,2001-01-01,92
2,2002-01-01,125
3,2003-01-01,112
4,2004-01-01,109
5,2005-01-01,143
6,2006-01-01,137
7,2007-01-01,152
8,2008-01-01,148
9,2009-01-01,151


In [11]:
papernum_by_year_chart = alt.Chart(yearly_papernum).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications'
    )
).properties(
    title = 'a',
    height=300,
    width=260
)
papernum_by_year_chart

### Average number of authors by year / by journal

In [12]:
yearly_ave_authornum = papers.groupby(
    'year')['numberOfAuthors'].mean().to_frame().reset_index()

In [13]:
yearly_ave_authornum = transform_year(yearly_ave_authornum)
yearly_ave_authornum

Unnamed: 0,year,numberOfAuthors
0,2000-01-01,1.943182
1,2001-01-01,1.923913
2,2002-01-01,2.104
3,2003-01-01,1.910714
4,2004-01-01,2.275229
5,2005-01-01,2.223776
6,2006-01-01,2.270073
7,2007-01-01,2.026316
8,2008-01-01,2.054054
9,2009-01-01,2.218543


In [14]:
yearly_ave_authornum_chart = alt.Chart(yearly_ave_authornum).mark_line().encode(
    
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'numberOfAuthors',
        title = 'Average # of Authors'
    )
).properties(
    title = 'b',
    height=300,
    width=260
)
yearly_ave_authornum_chart

## Gender & Race

In [15]:
gender_ts = get_freq_and_prop(data = 'authors', var = 'genderpred')
gender_ts.head()

Unnamed: 0,year,genderpred,freq,year total,prop
0,2000,F,74,171,0.433
1,2000,M,97,171,0.567
2,2001,F,80,177,0.452
3,2001,M,97,177,0.548
4,2002,F,118,263,0.449


In [16]:
gender_ts[gender_ts.genderpred == 'N'].freq.sum()

9

In [17]:
# gender_ts['year'] = pd.to_datetime(gender_ts['year'], format='%Y')

gender_stack_chart = alt.Chart(gender_ts).mark_bar().encode(
    x = alt.X(
        'year:N',
        axis = alt.Axis(
            tickCount= 4,
            labelAngle = - 90
    )
    ),
    y = alt.Y(
        'sum(freq)',
        title = '# of Authors'
    ),
    color = alt.Color(
        'genderpred',
        title = 'Gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    )
).properties(
    title = 'c',
    height=300,
    width=260
)

gender_stack_chart

In [18]:
gender_ts_prop_chart = alt.Chart(transform_year(gender_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'genderpred',
        title = 'Gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    )
).properties(
    title = 'b',
    height=300,
    width=260
)

gender_ts_prop_chart 

In [19]:
race_ts = get_freq_and_prop(data = 'authors', var = 'racepred')
race_ts.head()

Unnamed: 0,year,racepred,freq,year total,prop
0,2000,Asian,18,171,0.105
1,2000,Black,1,171,0.006
2,2000,Hispanic,5,171,0.029
3,2000,Indigenous,1,171,0.006
4,2000,White,146,171,0.854


In [20]:
unique_races = list(set(race_ts.racepred))
unique_races.sort()
unique_races

['Asian', 'Black', 'Hispanic', 'Indigenous', 'Middle Eastern', 'White']

In [21]:
race_colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f']

In [22]:
race_ts = transform_year(race_ts)

In [23]:
race_stack_chart = alt.Chart(race_ts).mark_area().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors',
    ),
    color = alt.Color(
        'racepred',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors)
    )
).properties(
    title = 'd',
    height=300,
    width=280
)

race_stack_chart

In [24]:
race_prop_chart = alt.Chart(race_ts).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        title = 'Porportion',
    ),
    color = alt.Color(
        'racepred',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors)
    )
).properties(
    title = 'f',
    height=300,
    width=280
)

race_prop_chart

## Intersection of Gender & Race

In [25]:
race_count_dic_long = dict(Counter(authors.racepred))
race_count_dic_long

{'White': 5354,
 'Hispanic': 286,
 'Asian': 1206,
 'Black': 133,
 'Middle Eastern': 88,
 'Indigenous': 16}

In [26]:
gender_count_dic = dict(Counter(authors.genderpred))
gender_count_dic

{'F': 3275, 'M': 3799, 'N': 9}

In [27]:
gender_by_race_long = authors.groupby(
    ['genderpred', 'racepred']).size().to_frame('freq').reset_index()
gender_by_race_long['gender_total'] = [gender_count_dic[x] for x in gender_by_race_long.genderpred]
gender_by_race_long['race_total'] = [race_count_dic_long[x] for x in gender_by_race_long.racepred]
gender_by_race_long['race_in_gender'] = round(
    gender_by_race_long['freq'] / gender_by_race_long['gender_total'], 
    3)
gender_by_race_long['gender_in_race'] = round(
    gender_by_race_long['freq'] / gender_by_race_long['race_total'], 
    3)
gender_by_race_long

Unnamed: 0,genderpred,racepred,freq,gender_total,race_total,race_in_gender,gender_in_race
0,F,Asian,622,3275,1206,0.19,0.516
1,F,Black,73,3275,133,0.022,0.549
2,F,Hispanic,149,3275,286,0.045,0.521
3,F,Indigenous,13,3275,16,0.004,0.812
4,F,Middle Eastern,47,3275,88,0.014,0.534
5,F,White,2371,3275,5354,0.724,0.443
6,M,Asian,583,3799,1206,0.153,0.483
7,M,Black,60,3799,133,0.016,0.451
8,M,Hispanic,133,3799,286,0.035,0.465
9,M,Indigenous,3,3799,16,0.001,0.188


In [28]:
gender_and_race_3 = alt.Chart(gender_by_race_long).mark_bar().encode(
    x = alt.X(
        'genderpred',
        sort = ["M", "F", "N"],
        axis = None
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors',
#         scale=alt.Scale(type="log")  # Here the scale is applied
    ),
    color = alt.Color(
         'genderpred',
        title = 'Gender',
        scale = alt.Scale(
            domain = ['M', 'F', 'N'],
            range = ['steelblue', 'orange', 'pink']
        )
    ),
    column = alt.Column('racepred:N', 
                        title = None,
                        header=alt.Header(labelFontSize=13)
    )
).properties(
    title = 'a',
    height=300,
    width=60
)

gender_and_race_3

In [29]:
gender_and_race_4 = alt.Chart(gender_by_race_long.sample(gender_by_race_long.shape[0])).mark_bar().encode(
    x = alt.X(
        'racepred:N',
        axis = None
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors',
#         scale=alt.Scale(type="log")  # Here the scale is applied
    ),
    color = alt.Color(
         'racepred:N',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors)
    ),
    column = alt.Column(
        'genderpred:N', 
        title = None,
        sort = ["M", "F", "N"],
        header=alt.Header(labelFontSize=13)
    )
).properties(
    title = 'b',
    height=300,
    width=100
)

gender_and_race_4

In [30]:
female_prop_by_race = authors.groupby(
    ['year', 'genderpred', 'racepred']).size().to_frame('freq').reset_index()
female_prop_by_race.head()

Unnamed: 0,year,genderpred,racepred,freq
0,2000,F,Asian,8
1,2000,F,Hispanic,1
2,2000,F,Indigenous,1
3,2000,F,White,64
4,2000,M,Asian,10


In [31]:
tuples = []
for year in list(set(female_prop_by_race.year)):
    for race in list(set(female_prop_by_race.racepred)):
        year_race = female_prop_by_race[
            (female_prop_by_race.year == year) & (female_prop_by_race.racepred == race)]
        total = sum(year_race['freq'])
        try:
            female_num = year_race[year_race.genderpred == 'F'].iloc[0]['freq']
            female_prop = female_num / total
        except:
            female_num = 0
            female_prop = 0
        tuples.append((year, race, total, female_num, female_prop))

In [32]:
female_prop_by_race = pd.DataFrame(tuples, columns = [
    'year', 
    'race', 
    'yearly race total', 
    'female_num', 
    'female_prop'])
female_prop_by_race

Unnamed: 0,year,race,yearly race total,female_num,female_prop
0,2000,White,146,64,0.438356
1,2000,Asian,18,8,0.444444
2,2000,Indigenous,1,1,1.000000
3,2000,Black,1,0,0.000000
4,2000,Hispanic,5,1,0.200000
...,...,...,...,...,...
133,2022,Asian,39,19,0.487179
134,2022,Indigenous,1,1,1.000000
135,2022,Black,6,4,0.666667
136,2022,Hispanic,8,6,0.750000


In [33]:
# unique_races.remove('Indigenous')
# race_colors.remove(race_colors[3])

line = alt.Chart(transform_year(female_prop_by_race)).mark_line(
).encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'female_prop',
        title = 'Female Proportion'
    ),
    color = alt.Color(
         'race:N',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors),
        legend = None
    ),
)

points = line.mark_point().encode(
    color = alt.Color(
        'race:N',
        title = 'Race',
        scale=alt.Scale(domain=unique_races,
                      range=race_colors),
    ),
    shape=alt.Shape('race', 
                    scale=alt.Scale(
                        domain=unique_races,
                        range=['cross', 'circle', 'square', 'triangle-right', 'diamond'])
    ),
    
)

female_prop_ts_chart = alt.layer(
    line,
    points
).resolve_scale(
    color = 'independent',
    shape = 'independent'
).properties(
    title = 'c',
    height=300,
    width=360
)

female_prop_ts_chart

## Country and affiliations

In [34]:
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic.keys())
top_country

['US', 'NL', 'DE', 'GB', 'IL']

In [35]:
all_country = list(set(authors['countrypred']))
len(all_country)

62

In [36]:
authors['countrypred_new'] = authors['countrypred'].apply(
        lambda x: x if x in top_country else 'Other'
    )

In [37]:
country_prop = get_simple_prop_df('authors', 'countrypred_new')
country_prop.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'Great Britain',
                    'IL': 'Israel'}, inplace = True)
country_prop

Unnamed: 0,countrypred_new,freq,prop
0,US,4794,0.677
2,Other,1294,0.183
5,Netherlands,345,0.049
1,Germany,253,0.036
3,Great Britain,224,0.032
4,Israel,173,0.024


In [38]:
countries = country_prop.countrypred_new.tolist()
country_colors = ['grey', 'orange', 'red', 'teal', 'pink', 'purple']

In [39]:
country_dist_chart = alt.Chart(country_prop).mark_bar().encode(
    x = alt.X(
        'countrypred_new',
        title = None,
        sort = '-y',
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors'
    ),
    color = alt.Color(
        'countrypred_new',
        title = 'Country/Region',
        legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    )
).properties(
    title = 'a',
    height=300,
    width=140
)

country_dist_chart

In [40]:
country_ts = get_freq_and_prop('authors', 'countrypred_new')
country_ts.columns = ['year', 'country', 'freq', 'year total', 'prop']
country_ts.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'Great Britain',
                    'IL': 'Israel'}, inplace = True)
country_ts

Unnamed: 0,year,country,freq,year total,prop
0,2000,Germany,2,171,0.012
1,2000,Great Britain,7,171,0.041
2,2000,Israel,2,171,0.012
3,2000,Netherlands,8,171,0.047
4,2000,Other,16,171,0.094
...,...,...,...,...,...
129,2022,Great Britain,9,200,0.045
130,2022,Israel,5,200,0.025
131,2022,Netherlands,20,200,0.100
132,2022,Other,38,200,0.190


In [41]:
source = country_ts

country_stacked_chart = alt.Chart(transform_year(source)).mark_area().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y=alt.Y(
        "freq:Q",
        title = '# of Authors'
    ),
    color=alt.Color(
        "country:N",
        title = 'Country/Region',
#         legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    )
).properties(
    title = 'a',
    height=300,
    width=260
)

country_stacked_chart

In [42]:
line = alt.Chart(transform_year(source)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'country',
        legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    ),
)

points = line.mark_point().encode(
    color = alt.Color(
        'country',
        title = 'Country/Region',
#         legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    ),
    shape = alt.Shape(
        'country',
#         legend = None,
        scale = alt.Scale(
            domain = countries
        )
    )
)

country_prop_ts_chart = alt.layer(
    line,
    points,
).resolve_scale(
    color = 'independent',
    shape = 'independent'
).properties(
    title = 'c',
    height=300,
    width=300
)

country_prop_ts_chart

In [43]:
with_us_prop = get_simple_prop_df('papers', 'with_us_authors')
with_us_prop

Unnamed: 0,with_us_authors,freq,prop
0,Yes,2290,0.723
1,No,879,0.277


In [44]:
source = with_us_prop
with_us_chart = alt.Chart(source).mark_bar().encode(
    x = alt.X(
        'with_us_authors',
        title = 'With US authors',
        sort = '-y',
        axis = alt.Axis(labelAngle = 0)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications'
    ),
    color = alt.Color(
        'with_us_authors',
        legend = None,
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    )
).properties(
    title = 'd',
    height=200,
    width=100
)

with_us_chart

In [45]:
with_us_prop_ts = get_freq_and_prop('papers', 'with_us_authors')
with_us_prop_ts.head()

Unnamed: 0,year,with_us_authors,freq,year total,prop
0,2000,No,16,88,0.182
1,2000,Yes,72,88,0.818
2,2001,No,11,92,0.12
3,2001,Yes,81,92,0.88
4,2002,No,27,125,0.216


In [46]:
source = with_us_prop_ts

with_us_stacked_chart = alt.Chart(transform_year(source)).mark_area().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y=alt.Y(
        "freq:Q",
        title = '# of Papers'
    ),
    color=alt.Color(
        "with_us_authors:N",
        title = 'Involving US Authors?',
#         legend = None,
        scale = alt.Scale(
            domain = ['No', 'Yes'],
            range = ['orange', 'steelblue'],
        )
    )
).properties(
    title = 'b',
    height=300,
    width=260
)

with_us_stacked_chart

In [47]:
with_us_ts_chart = alt.Chart(transform_year(with_us_prop_ts)).mark_line().encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'with_us_authors',
        title = 'With US authors',
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    )
).properties(
    title = 'e',
    height=200,
    width=200
)

with_us_ts_chart

### Number of countries

In [48]:
num_country_df = get_simple_prop_df('papers', 'num_country')
num_country_df

Unnamed: 0,num_country,freq,prop
0,1,2690,0.849
1,2,408,0.129
3,3,57,0.018
2,4,9,0.003
5,9,3,0.001
4,6,1,0.0
6,7,1,0.0


In [49]:
country_num_dist_chart = alt.Chart(num_country_df).mark_bar().encode(
    x = alt.X(
        'num_country:N',
        title = 'Number of countries',
        axis = alt.Axis(labelAngle = 0)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications'
    ),
    color = alt.Color(
        'num_country:N',
        legend = None,
    )
).properties(
    title = 'f',
    height=200,
    width=150
)

country_num_dist_chart

In [50]:
num_country_ts = get_freq_and_prop('papers', 'num_country')
num_country_ts = transform_year(num_country_ts)
num_country_ts.head()

Unnamed: 0,year,num_country,freq,year total,prop
0,2000-01-01,1,76,88,0.864
1,2000-01-01,2,10,88,0.114
2,2000-01-01,3,2,88,0.023
3,2001-01-01,1,83,92,0.902
4,2001-01-01,2,7,92,0.076


In [51]:
alt.Chart(num_country_ts).mark_line().encode(
    x = 'year',
    y = 'prop',
    color = 'num_country:N'
)

### Affiliation types

In [52]:
afftype_prop = get_simple_prop_df('authors', 'afftypepred')
afftype_prop.replace({
    'Education': 'Edu',
    'Non Education': 'Non-Edu'
}, inplace = True)
afftype_prop

Unnamed: 0,afftypepred,freq,prop
0,Edu,6801,0.96
1,Non-Edu,282,0.04


In [53]:
afftype_dist_chart = alt.Chart(afftype_prop).mark_bar().encode(
    x = alt.X(
        'afftypepred',
        title = 'Affiliation type',
        axis = alt.Axis(labelAngle = -45),
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors'
#         scale = alt.Scale(type = 'log')
    ),
    color = alt.Color(
        'afftypepred',
        legend = None,
    )
).properties(
    title = 'g',
    height=200,
    width=100
)

afftype_dist_chart

## Collaborations

In [54]:
cross_var_idx = [15, 16, 17, 18, 21, 22, 23, 24]
all_paper_cols = papers.columns.tolist()
cross_vars = [all_paper_cols[x] for x in cross_var_idx]
cross_vars

['cross_country',
 'cross_type',
 'cross_gender',
 'cross_race',
 'cross_gender_and_race',
 'cross_gender_and_country',
 'cross_country_and_race',
 'cross_gender_race_and_country']

In [55]:
cross_vars_new = [
    'Cross Country',
    'Cross Type',
    'Cross Gender',
    'Cross Race',
    'Cross Gender & Race',
    'Cross Gender & Country',
    'Cross Country & Race',
    'Cross Gender, Race & Country'
]

In [56]:
rename_cross_vars_dic = dict(zip(cross_vars, cross_vars_new))
rename_cross_vars_dic

{'cross_country': 'Cross Country',
 'cross_type': 'Cross Type',
 'cross_gender': 'Cross Gender',
 'cross_race': 'Cross Race',
 'cross_gender_and_race': 'Cross Gender & Race',
 'cross_gender_and_country': 'Cross Gender & Country',
 'cross_country_and_race': 'Cross Country & Race',
 'cross_gender_race_and_country': 'Cross Gender, Race & Country'}

In [57]:
dfs = []
for var in cross_vars:
    dff = get_simple_prop_df('papers', var)
    dff.rename(columns = {var: 'binary'}, inplace = True)
    dff['collab_type'] = var 
    dfs.append(dff)

In [58]:
collab_df = pd.concat(dfs, axis= 0, ignore_index = True)
collab_df.replace(rename_cross_vars_dic, inplace=True)
sorted_collab = collab_df[collab_df.binary == 'Yes'].sort_values('freq', ascending = False)['collab_type'].tolist()
sorted_collab

['Cross Gender',
 'Cross Race',
 'Cross Gender & Race',
 'Cross Country',
 'Cross Gender & Country',
 'Cross Country & Race',
 'Cross Type',
 'Cross Gender, Race & Country']

In [59]:
collab_df[collab_df.binary == 'Yes'].sort_values('freq', ascending = False)

Unnamed: 0,binary,freq,prop,collab_type
5,Yes,1181,0.373,Cross Gender
7,Yes,748,0.236,Cross Race
9,Yes,487,0.154,Cross Gender & Race
1,Yes,479,0.151,Cross Country
11,Yes,309,0.098,Cross Gender & Country
13,Yes,213,0.067,Cross Country & Race
3,Yes,160,0.05,Cross Type
15,Yes,151,0.048,"Cross Gender, Race & Country"


In [60]:
source = collab_df
cross_count_chart = alt.Chart(source).mark_bar().encode(
    x = alt.X(
        'collab_type',
        title = None,
        sort = sorted_collab,
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications',
        sort='-x'
    ),
    color = alt.Color(
        'binary',
        title = 'Binary',
        scale = alt.Scale(
            domain = ['Yes', 'No'],
        )
    ),
#     color = 'binary'
).properties(
    title = 'a',
    height=300,
    width=160
)

cross_count_chart

In [61]:
dfs = []
for var in cross_vars:
    dff = get_freq_and_prop('papers', var)
    dff.rename(columns = {var: 'binary'}, inplace = True)
    dff['collab_type'] = var 
    dff.drop(columns = ['year total'], inplace = True)
    dfs.append(dff)

In [62]:
collab_df_ts = pd.concat(dfs, axis= 0, ignore_index = True)
collab_df_ts.replace(rename_cross_vars_dic, inplace=True)
collab_df_ts = collab_df_ts[collab_df_ts.binary == 'Yes']
collab_df_ts = transform_year(collab_df_ts)
collab_df_ts.head()

Unnamed: 0,year,binary,freq,prop,collab_type
1,2000-01-01,Yes,12,0.136,Cross Country
3,2001-01-01,Yes,9,0.098,Cross Country
5,2002-01-01,Yes,10,0.08,Cross Country
7,2003-01-01,Yes,10,0.089,Cross Country
9,2004-01-01,Yes,13,0.119,Cross Country


In [63]:
source = collab_df_ts

collab_ts_stacked_chart = alt.Chart(transform_year(source)).mark_area().encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y=alt.Y(
        "freq:Q",
        title = '# of Papers'
    ),
    color=alt.Color(
        "collab_type:N",
        title = 'Collaboration',
#         sort=alt.EncodingSortField('freq', op='sum', order='descending'),
    )
).properties(
    title = 'b',
    height=300,
    width=260
)

collab_ts_stacked_chart

In [64]:
source = collab_df_ts
all_cross_ts_chart = alt.Chart(source).mark_line(point=alt.OverlayMarkDef()).encode(
    x = alt.X(
        'year',
        title = 'Year',
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'collab_type',
        title = 'Collaboration',
        sort=alt.EncodingSortField('prop', op='mean', order='descending'),
    )
).properties(
    title = 'b',
    height=300,
    width=300
)

all_cross_ts_chart

## Cross-race 

In [65]:
cross_race_details_df = get_simple_prop_df('papers', 'cross_race_details')
cross_race_details_df.replace({'cross race': 'Cross race'}, inplace=True)
cross_race_details_df.columns = ['cross_details', 'freq', 'prop']
cross_race_details_df

Unnamed: 0,cross_details,freq,prop
1,White only,1993,0.629
0,Cross race,748,0.236
2,Asian only,318,0.1
3,Hispanic only,50,0.016
4,Black only,40,0.013
5,Middle Eastern only,19,0.006
6,Indigenous only,1,0.0


In [66]:
alt.Chart(cross_race_details_df).mark_bar().encode(
    x = alt.X(
        'cross_details:N', 
        title = None,
        sort = '-y',
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications',
    ),
    color = alt.Color(
        'cross_details:N',
        legend = None,
    )
).properties(
    title = 'c',
    height=300,
    width=160
)


In [67]:
cross_gender_details_df = get_simple_prop_df('papers', 'cross_gender_details')
cross_gender_details_df.replace({'cross gender': 'Cross gender'}, inplace=True)
cross_gender_details_df.replace({
    'M only': 'Male only',
    'F only': 'Female only',
    'N only': 'Non-binary only'
}, inplace = True)
cross_gender_details_df.columns = ['cross_details', 'freq', 'prop']
cross_gender_details_df

Unnamed: 0,cross_details,freq,prop
1,Cross gender,1181,0.373
2,Male only,1118,0.353
0,Female only,864,0.273
3,Non-binary only,6,0.002


In [68]:
cross_details_df = pd.concat([cross_race_details_df, cross_gender_details_df])
cross_details_df

Unnamed: 0,cross_details,freq,prop
1,White only,1993,0.629
0,Cross race,748,0.236
2,Asian only,318,0.1
3,Hispanic only,50,0.016
4,Black only,40,0.013
5,Middle Eastern only,19,0.006
6,Indigenous only,1,0.0
1,Cross gender,1181,0.373
2,Male only,1118,0.353
0,Female only,864,0.273


In [69]:
cross_race_count_chart = alt.Chart(cross_details_df).mark_bar().encode(
    x = alt.X(
        'cross_details:N', 
        title = None,
        sort = '-y',
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Publications',
    ),
    color = alt.Color(
        'cross_details:N',
        legend = None,
    )
).properties(
    title = 'c',
    height=300,
    width=250
)

cross_race_count_chart

In [70]:
cross_race_details_ts = get_freq_and_prop('papers', 'cross_race_details')
cross_race_details_ts.replace({'cross race': 'Cross race'}, inplace=True)
cross_race_details_ts.columns = ['year', 'cross_details', 'freq', 'year total', 'prop']
cross_race_details_ts.head()

Unnamed: 0,year,cross_details,freq,year total,prop
0,2000,Asian only,3,88,0.034
1,2000,Black only,1,88,0.011
2,2000,Hispanic only,1,88,0.011
3,2000,White only,68,88,0.773
4,2000,Cross race,15,88,0.17


In [71]:
cross_gender_details_ts = get_freq_and_prop('papers', 'cross_gender_details')
cross_gender_details_ts.replace({'cross gender': 'Cross gender'}, inplace=True)
cross_gender_details_ts.replace({
    'M only': 'Male only',
    'F only': 'Female only',
    'N only': 'Non-binary only'
}, inplace = True)
cross_gender_details_ts.columns = ['year', 'cross_details', 'freq', 'year total', 'prop']
cross_gender_details_ts.head()

Unnamed: 0,year,cross_details,freq,year total,prop
0,2000,Female only,27,88,0.307
1,2000,Male only,40,88,0.455
2,2000,Cross gender,21,88,0.239
3,2001,Female only,25,92,0.272
4,2001,Male only,36,92,0.391


In [72]:
cross_details_ts = pd.concat([cross_race_details_ts, cross_gender_details_ts])
cross_details_ts

Unnamed: 0,year,cross_details,freq,year total,prop
0,2000,Asian only,3,88,0.034
1,2000,Black only,1,88,0.011
2,2000,Hispanic only,1,88,0.011
3,2000,White only,68,88,0.773
4,2000,Cross race,15,88,0.170
...,...,...,...,...,...
69,2021,Male only,46,167,0.275
70,2021,Cross gender,66,167,0.395
71,2022,Female only,25,78,0.321
72,2022,Male only,23,78,0.295


In [73]:
alt.Chart(transform_year(cross_details_ts)).mark_line(point=alt.OverlayMarkDef()).encode(
    x = alt.X(
        'year',
        title = 'Year'
    ),
    y = alt.Y(
        'prop',
        title = 'Proportion'
    ),
    color = alt.Color(
        'cross_details:N',
        title = 'Details',
        sort=alt.EncodingSortField('prop', op='mean', order='descending')
    )
)

## Flows of collaborations