In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [58]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of ``var`` occurs over all the years.

    Parameters
    ----------
    data : str
        Either 'papers' or 'authors'. Selects the notebook-level frame
        (``papers`` / ``authors``) and the matching overall total
        (``total_paper_num`` / ``total_author_num``).
    var : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``var``, 'freq' and 'prop' (freq divided by the overall
        total, rounded to 3 decimals), sorted by 'prop' in descending order.

    Raises
    ------
    AssertionError
        If ``data`` is neither 'papers' nor 'authors'.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    # Pick the frame and its denominator once; the rest is identical for both.
    if data == 'authors':
        source, denominator = authors, total_author_num
    else:
        source, denominator = papers, total_paper_num
    counts = Counter(source[var])
    df = pd.DataFrame(counts.items(), columns=[var, 'freq'])
    df['prop'] = (df['freq'] / denominator).round(3)
    return df.sort_values('prop', ascending=False)

In [59]:
def get_freq_and_prop(data, var):
    """Build the per-year frequency and proportion of each value of ``var``.

    Parameters
    ----------
    data : str
        Either 'papers' or 'authors'. Selects the notebook-level frame
        (``papers`` / ``authors``) and the matching year -> total lookup
        (``yearly_papernum_dic`` / ``yearly_authornum_dic``).
    var : str
        Name of the column that is counted within each year.

    Returns
    -------
    pandas.DataFrame
        One row per (year, value) combination with columns 'year', ``var``,
        'freq', 'year total' and 'prop' (freq / year total, 3 decimals).

    Raises
    ------
    AssertionError
        If ``data`` is neither 'papers' nor 'authors'.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        frame, dic = papers, yearly_papernum_dic
    else:
        frame, dic = authors, yearly_authornum_dic
    df = frame.groupby(['year', var]).size().to_frame('freq').reset_index()
    # Attach each row's yearly denominator before computing the share.
    df['year total'] = [dic[year] for year in df['year']]
    df['prop'] = (df['freq'] / df['year total']).round(3)
    return df

In [60]:
def transform_year(df):
    """Return a copy of ``df`` whose 'year' column is parsed as datetimes.

    The values of 'year' are parsed with the ``'%Y'`` format. The frame that
    was passed in is left unchanged.
    """
    parsed_year = pd.to_datetime(df['year'], format='%Y')
    return df.assign(year=parsed_year)

In [61]:
# Load the processed author-level and paper-level tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# In the papers table, boolean values are recoded as 'Yes' / 'No'.
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv').replace(
    {True: 'Yes', False: 'No'}
)


In [62]:
# Relabel the predicted gender: 'F' becomes 'W'; 'M' and 'N' are kept.
# Series.map turns every value missing from the mapping into NaN, so 'W' is
# mapped to itself: re-running this cell then leaves already relabelled rows
# intact instead of wiping them out.
authors['genderpred'] = authors['genderpred'].map(
    {'F': 'W', 'W': 'W', 'M': 'M', 'N': 'N'}
)

In [63]:
# Shorten two predicted-race labels: 'Hispanic' -> 'HLS' and
# 'Middle Eastern' -> 'MENA'; the other labels are kept.
# Series.map turns every value missing from the mapping into NaN, so the short
# labels are mapped to themselves: re-running this cell then keeps the rows
# that were already relabelled instead of turning them into NaN.
authors['racepred'] = authors['racepred'].map({
    "White": "White",
    "Hispanic": "HLS",
    "HLS": "HLS",
    "Black": "Black",
    "Indigenous": "Indigenous",
    "Middle Eastern": "MENA",
    "MENA": "MENA",
    "Asian": "Asian",
})

In [64]:
# Keep the `top_num` most frequent predicted countries as they are and
# collapse every other value into 'Other'.
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = list(top_country_dic)
authors['countrypred_new'] = authors['countrypred'].map(
    lambda code: code if code in top_country else 'Other'
)

In [65]:
# Abbreviate the five journal titles in both tables. The replacement is done
# in place on the whole frame, so a cell equal to one of these titles is
# abbreviated whichever column it sits in.
for i in [papers, authors]:
    i.replace({
        'Communication Theory': 'CT',
        'Communication, Culture and Critique': 'CCC',
        'Human Communication Research': 'HCR',
        'Journal of Communication': 'JOC',
        'Journal of Computer-Mediated Communication': 'JCMC'
                      }, inplace = True)

In [66]:
# Overall row counts; get_simple_prop_df divides by these to get proportions.
total_paper_num = len(papers)
total_author_num = len(authors)

In [67]:
# Display both totals as a (papers, authors) tuple.
total_paper_num, total_author_num

(5712, 11292)

In [68]:
# Number of rows per year in each table, as a two-column frame (year, freq) ...
yearly_papernum = papers.groupby('year').size().reset_index(name='freq')
yearly_authornum = authors.groupby('year').size().reset_index(name='freq')
# ... and as year -> count lookups, which get_freq_and_prop uses as denominators.
yearly_authornum_dic = dict(
    zip(yearly_authornum['year'], yearly_authornum['freq']))
yearly_papernum_dic = dict(
    zip(yearly_papernum['year'], yearly_papernum['freq']))

## Cross-country collaborations

In [69]:
# (rows, columns) of the papers whose cross_country value is 'Yes'.
papers[papers.cross_country == 'Yes'].shape

(589, 34)

In [70]:
# Distinct ROR_AFFNAME values among authors whose countrypred is 'GE'
# (presumably a check of which country the code 'GE' stands for -- confirm).
list(set(authors[authors.countrypred == 'GE']['ROR_AFFNAME']))

['Georgian Institute of Public Affairs']

In [71]:
def get_dic_from_firstauthor(DF, var):
    '''Count ordered (first author value, co-author value) pairs over papers.

    Rows of ``DF`` are grouped by 'doi' and ordered by 'authorPosition'.
    Within each group the value of ``var`` in the first row is paired with
    the value of ``var`` in every later row; pairs whose two values are equal
    are skipped, and groups with a single row contribute nothing.

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain the columns 'doi', 'authorPosition' and ``var``.
        It is not modified.
    var : str
        Column to pair up, e.g. a country code or another variable.

    Returns
    -------
    dict
        Maps each ``(first_author_value, other_author_value)`` tuple to the
        number of times it occurs, in order of first occurrence.
    '''
    pair_counts = Counter()
    for _, paper_rows in DF.groupby('doi'):
        # Sort into a new frame instead of sorting the group in place, and use
        # a stable sort so that rows with equal positions keep their order.
        ordered = paper_rows.sort_values(by='authorPosition', kind='mergesort')
        values = ordered[var].tolist()
        first_value = values[0]
        # values[1:] is empty for single-author papers, so nothing is added.
        pair_counts.update(
            (first_value, other)
            for other in values[1:]
            if other != first_value
        )
    return dict(pair_counts)

In [72]:
import itertools
# Count (first author country, co-author country) pairs over all papers,
# keeping only pairs whose two countries differ, then display the dictionary.
bicode_counts_dic = get_dic_from_firstauthor(authors, var = 'countrypred')
bicode_counts_dic

{('US', 'CA'): 30,
 ('CA', 'US'): 17,
 ('TR', 'DE'): 1,
 ('US', 'PH'): 26,
 ('IN', 'US'): 2,
 ('CN', 'US'): 18,
 ('CL', 'GB'): 1,
 ('AU', 'NL'): 1,
 ('NL', 'SE'): 2,
 ('CA', 'GB'): 2,
 ('DE', 'CA'): 1,
 ('AT', 'DE'): 2,
 ('NO', 'GB'): 1,
 ('GB', 'NL'): 6,
 ('CH', 'US'): 1,
 ('US', 'CH'): 6,
 ('ES', 'GB'): 2,
 ('US', 'NZ'): 6,
 ('ES', 'CO'): 1,
 ('SE', 'AR'): 1,
 ('FR', 'CA'): 3,
 ('ES', 'US'): 6,
 ('BE', 'US'): 2,
 ('GB', 'DK'): 1,
 ('US', 'CN'): 20,
 ('US', 'BE'): 2,
 ('US', 'CO'): 2,
 ('PH', 'US'): 39,
 ('CN', 'SG'): 1,
 ('NL', 'DE'): 4,
 ('DE', 'NL'): 15,
 ('IL', 'GB'): 1,
 ('US', 'FR'): 4,
 ('US', 'CL'): 4,
 ('GB', 'CH'): 1,
 ('US', 'DE'): 30,
 ('US', 'IL'): 11,
 ('AT', 'US'): 10,
 ('AT', 'NZ'): 2,
 ('US', 'JP'): 19,
 ('CL', 'NL'): 4,
 ('US', 'KR'): 20,
 ('DE', 'NO'): 2,
 ('DE', 'AT'): 1,
 ('SG', 'US'): 21,
 ('DE', 'US'): 35,
 ('US', 'NL'): 20,
 ('SG', 'AT'): 2,
 ('BE', 'NL'): 3,
 ('US', 'GB'): 27,
 ('US', 'SG'): 14,
 ('US', 'AU'): 18,
 ('PH', 'AR'): 1,
 ('KR', 'US'): 14,
 ('CN', '

In [73]:
def get_chord_df(DIC): # DIC here is bicode_counts_dic
    """Reshape a pair-count dictionary into a source/target/value table.

    Args:
        DIC: mapping whose keys are ``(source, target)`` tuples and whose
            values are the counts of those pairs.

    Return:
        A dataframe containing three columns: source, target, value.
        Rows are sorted by value in descending order and re-indexed from 0.
    """
    edges = pd.DataFrame(DIC.items(), columns=['pairs', 'value'])
    # Split each key tuple into its two ends.
    edges['source'] = [pair[0] for pair in edges['pairs']]
    edges['target'] = [pair[1] for pair in edges['pairs']]
    ranked = edges.sort_values(by='value', ascending=False)
    return ranked[['source', 'target', 'value']].reset_index(drop=True)

In [74]:
# Source/target/value table of the first author -> co-author country pairs.
author_chord_cntry_from_firstauthor = get_chord_df(bicode_counts_dic)

In [75]:
# Total number of counted country pairs.
author_chord_cntry_from_firstauthor.value.sum()

860

In [76]:
# Write the country pair table to disk, without the row index.
author_chord_cntry_from_firstauthor.to_csv(
    '../data/plots/cntry_chord_from_firstauthor.csv', index=False)

In [77]:
# Shorter alias for the country pair table (same object, not a copy).
cntry_chord = author_chord_cntry_from_firstauthor
cntry_chord

Unnamed: 0,source,target,value
0,PH,US,39
1,DE,US,35
2,US,CA,30
3,US,DE,30
4,US,GB,27
...,...,...,...
232,JP,CR,1
233,AU,FR,1
234,CH,FR,1
235,PT,IL,1


In [78]:
# Number of distinct countries on the source side and on the target side.
len(np.unique(cntry_chord.source)), len(np.unique(cntry_chord.target))

(46, 60)

In [79]:
# Every country that appears on either side of a pair, and how many there are.
unique_cntry = np.unique(cntry_chord.source.tolist() + cntry_chord.target.tolist())
len(unique_cntry)

65

In [80]:
# (country, number of pairs) for each country that appears as a source,
# i.e. as the first author's country.
tuples = [
    (code, cntry_chord.loc[cntry_chord.source == code, 'value'].sum())
    for code in np.unique(cntry_chord.source)
]

In [81]:
# country chord source: number of pairs per source country, with `prop` the
# share of all counted pairs (3 decimals); show the five largest shares.
ccs = pd.DataFrame(tuples, columns = ['country', 'total'])
ccs['prop'] = round(ccs['total'] / cntry_chord.value.sum(), 3)
ccs.sort_values('prop', ascending = False).head()

Unnamed: 0,country,total,prop
44,US,339,0.394
10,DE,82,0.095
15,GB,51,0.059
31,PH,45,0.052
28,NL,36,0.042


In [82]:
# (country, number of pairs) for every country, counting the pairs in which
# it appears on either side (as source or as target).
tuples = [
    (
        code,
        cntry_chord.loc[
            (cntry_chord.source == code) | (cntry_chord.target == code),
            'value',
        ].sum(),
    )
    for code in unique_cntry
]

In [83]:
# How many of the most active countries are singled out further down.
NUM_OF_TOP_COUNTRY = 5
# Node table: countries ranked by the number of pairs they take part in.
cntry_chord_nodes = pd.DataFrame(tuples, columns = ['country', 'value']).sort_values(
    by=['value'], ascending = False).reset_index(drop=True)
# Each pair is counted for both its source and its target country, so this
# total counts every pair twice.
total_node = cntry_chord_nodes.value.sum()
total_node

1720

In [84]:
# Share of all counted pairs in which each country takes part. The denominator
# is the number of pairs, while every pair is counted for both of its
# countries, so this column does not sum to 1.
cntry_chord_nodes['prop'] = round(cntry_chord_nodes['value']/cntry_chord.value.sum(), 3)

In [85]:
# The five countries that take part in the most pairs.
cntry_chord_nodes.head()

Unnamed: 0,country,value,prop
0,US,613,0.713
1,DE,134,0.156
2,GB,116,0.135
3,NL,88,0.102
4,PH,73,0.085


In [86]:
# Sum of `prop` over the first five rows. It can exceed 1 because a pair
# linking two of these countries is counted for both of them.
cntry_chord_nodes.head().prop.sum()

1.191

In [87]:
# Sum of the pair counts of the first five rows (pairs linking two of these
# countries are counted twice here).
cntry_chord_nodes.head().value.sum()

1024

In [88]:
# Recompute the US node value directly from the pair table as a cross-check.
cntry_chord[(cntry_chord.source == 'US') | (cntry_chord.target=='US')].value.sum()

613

In [89]:
# Codes of the NUM_OF_TOP_COUNTRY countries that take part in the most pairs.
most_active_cntry = cntry_chord_nodes.head(NUM_OF_TOP_COUNTRY).country.tolist()
most_active_cntry

['US', 'DE', 'GB', 'NL', 'PH']

In [90]:
# Number of pairs in which at least one of the most active countries takes
# part, as source or as target; a pair linking two of them is counted once.
# TODO (original author's open question): decide whether the figure wanted
# here is instead `num_most_active_participated - between_top`, i.e. only the
# pairs linking a most active country to a country outside that group.
num_most_active_participated = cntry_chord[(
    cntry_chord.source.isin(most_active_cntry)) | (
    cntry_chord.target.isin(most_active_cntry))].value.sum()
num_most_active_participated

777

In [91]:
# Number of pairs in which both the source and the target are among the most
# active countries.
between_top = cntry_chord[(
    cntry_chord.source.isin(most_active_cntry)) & (
    cntry_chord.target.isin(most_active_cntry))].value.sum()
between_top

247

In [92]:
# Share of all counted pairs that link two of the most active countries.
between_top / author_chord_cntry_from_firstauthor.value.sum()

0.2872093023255814

### Author chords from first authors (gender and race)

In [93]:
# (rows, columns) of the papers whose cross_gender_and_race value is 'Yes'.
papers[papers.cross_gender_and_race == 'Yes'].shape

(552, 34)

In [94]:
# Combined label of the form '<genderpred>_<racepred>', e.g. 'W_White'.
# NOTE(review): rows with a missing gender or race presumably end up with a
# missing combined label -- confirm how those should be handled.
authors['gender_and_race'] = authors['genderpred'] + '_' + authors['racepred']

In [95]:
# Same pair counting as for countries, now on the combined gender/race label.
# Note that this overwrites `bicode_counts_dic`, which held the country pairs.
bicode_counts_dic = get_dic_from_firstauthor(authors, var = 'gender_and_race')
author_chord_gr_from_firstauthor = get_chord_df(bicode_counts_dic)

In [96]:
# Total number of counted gender/race pairs.
author_chord_gr_from_firstauthor.value.sum()

3121

In [97]:
# Write the gender/race pair table to disk, without the row index.
author_chord_gr_from_firstauthor.to_csv('../data/plots/author_chord_gr_from_firstauthor.csv', index=False)

In [98]:
# Shorter alias for the gender/race pair table (same object, not a copy).
gr_chord = author_chord_gr_from_firstauthor
gr_chord

Unnamed: 0,source,target,value
0,M_White,W_White,906
1,W_White,M_White,741
2,M_Asian,M_White,159
3,M_White,M_Asian,126
4,M_White,W_Asian,122
...,...,...,...
79,W_MENA,M_Asian,1
80,M_MENA,W_Asian,1
81,N_Asian,W_Black,1
82,N_Asian,W_HLS,1


In [99]:
# Distinct gender/race labels that appear as a source (first author).
set(gr_chord.source)

{'M_Asian',
 'M_Black',
 'M_HLS',
 'M_Indigenous',
 'M_MENA',
 'M_White',
 'N_Asian',
 'N_White',
 'W_Asian',
 'W_Black',
 'W_HLS',
 'W_Indigenous',
 'W_MENA',
 'W_White'}

In [100]:
# Pairs whose first author is labelled 'W_Indigenous'.
gr_chord[gr_chord.source == 'W_Indigenous']

Unnamed: 0,source,target,value
56,W_Indigenous,M_White,2
65,W_Indigenous,W_Black,1
68,W_Indigenous,M_Asian,1
69,W_Indigenous,W_White,1
70,W_Indigenous,W_Asian,1


In [101]:
# Pairs whose first author is labelled 'M_Indigenous'.
gr_chord[gr_chord.source == 'M_Indigenous']

Unnamed: 0,source,target,value
57,M_Indigenous,W_White,2
63,M_Indigenous,M_White,1


In [102]:
# Pairs whose first author is labelled 'M_White'.
gr_chord[gr_chord.source == 'M_White']

Unnamed: 0,source,target,value
0,M_White,W_White,906
3,M_White,M_Asian,126
4,M_White,W_Asian,122
13,M_White,W_HLS,38
14,M_White,M_HLS,28
21,M_White,M_Black,17
22,M_White,W_Black,16
23,M_White,M_MENA,15
24,M_White,W_MENA,13
41,M_White,W_Indigenous,5


In [103]:
# Pairs whose first author is labelled 'W_White'.
gr_chord[gr_chord.source == 'W_White']

Unnamed: 0,source,target,value
1,W_White,M_White,741
7,W_White,W_Asian,78
11,W_White,M_Asian,55
15,W_White,M_HLS,24
16,W_White,W_HLS,24
19,W_White,W_Black,21
25,W_White,M_Black,12
44,W_White,W_Indigenous,5
46,W_White,M_MENA,4
50,W_White,W_MENA,3


In [104]:
# Number of distinct labels on the source side and on the target side.
len(np.unique(gr_chord.source)), len(np.unique(gr_chord.target))

(14, 13)

In [105]:
# Every label that appears on either side of a pair, and how many there are.
unique_label = np.unique(gr_chord.source.tolist() + gr_chord.target.tolist())
len(unique_label)

15

In [106]:
# (label, number of pairs) for each gender/race label that appears as a
# source, i.e. as the first author's label.
tuples = []
for i in np.unique(gr_chord.source):
    dff = gr_chord[gr_chord.source == i]
    total = dff.value.sum()
    tuples.append((i, total))

# NOTE(review): the first column is named 'country' although it holds
# gender/race labels here -- confirm whether the column should be renamed.
grs = pd.DataFrame(tuples, columns = ['country', 'total'])
# Share of all counted gender/race pairs, rounded to 3 decimals.
grs['prop'] = round(grs['total'] / gr_chord.value.sum(), 3)
grs.sort_values('prop', ascending = False).head()

Unnamed: 0,country,total,prop
5,M_White,1289,0.413
13,W_White,967,0.31
0,M_Asian,315,0.101
8,W_Asian,304,0.097
2,M_HLS,80,0.026
