In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [8]:
def get_simple_prop_df(data, var):
    """Tabulate how often each value of `var` occurs, pooled over all years.

    Reads the module-level tables `papers` / `authors` and their row counts
    `total_paper_num` / `total_author_num`.

    Args:
        data: 'papers' or 'authors'; selects which module-level table to use.
        var: name of the column whose values are counted.

    Returns:
        DataFrame with columns [var, 'freq', 'prop'], ordered by 'prop'
        descending. 'prop' is freq divided by the table's row count,
        rounded to three decimals.

    Raises:
        AssertionError: if `data` is neither 'papers' nor 'authors'.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    # Pick the table and its denominator once; the rest is shared.
    if data == 'authors':
        table, table_size = authors, total_author_num
    else:
        table, table_size = papers, total_paper_num
    value_counts = Counter(table[var])
    df = pd.DataFrame(value_counts.items(), columns=[var, 'freq'])
    df['prop'] = round(df['freq'] / table_size, 3)
    return df.sort_values('prop', ascending=False)

In [9]:
def get_freq_and_prop(data, var):
    """Build a per-year frequency / proportion table for `var`.

    Reads the module-level tables `papers` / `authors` and the matching
    year -> row-count dictionaries `yearly_papernum_dic` /
    `yearly_authornum_dic`.

    Args:
        data: 'papers' or 'authors'; selects which module-level table to use.
        var: name of the column to break down within each year.

    Returns:
        DataFrame with columns ['year', var, 'freq', 'year total', 'prop'],
        where 'prop' is freq / 'year total' rounded to three decimals.

    Raises:
        AssertionError: if `data` is neither 'papers' nor 'authors'.
        KeyError: if a year in the table is missing from the yearly dictionary.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'
    if data == 'papers':
        table, dic = papers, yearly_papernum_dic
    else:
        table, dic = authors, yearly_authornum_dic
    # One row per (year, value) combination with its number of occurrences.
    df = table.groupby(['year', var]).size().reset_index(name='freq')
    df['year total'] = [dic[year] for year in df['year']]
    df['prop'] = round(df['freq'] / df['year total'], 3)
    return df

In [10]:
def transform_year(df):
    """Return a copy of `df` whose 'year' column is parsed into datetimes.

    The values are parsed with the '%Y' format; the input frame itself is
    not modified.
    """
    parsed_years = pd.to_datetime(df['year'], format='%Y')
    # `assign` works on a copy, so the caller's frame keeps its raw years.
    return df.assign(year=parsed_years)

In [11]:
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')
# Recode boolean values in the paper table as 'Yes' / 'No' labels on load.
bool_labels = {True: 'Yes', False: 'No'}
papers = pd.read_csv(
    '../data/processed/papers_to_study_expanded.csv'
).replace(bool_labels)


In [12]:
# Keep the `top_num` most frequent predicted countries; every other value is
# relabelled 'Other' in the new `countrypred_new` column.
top_num = 5
top_country_dic = dict(Counter(authors['countrypred']).most_common(top_num))
top_country = [*top_country_dic]
authors['countrypred_new'] = authors['countrypred'].map(
    lambda code: code if code in top_country else 'Other'
)

In [13]:
# Replace full journal titles with their abbreviations in both tables.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}
for i in (papers, authors):
    i.replace(journal_abbreviations, inplace=True)

In [14]:
# Row counts of the two tables; used as denominators for overall proportions.
total_paper_num, total_author_num = len(papers), len(authors)

In [15]:
# Display the number of rows in `papers` and in `authors`.
total_paper_num, total_author_num

(3169, 7083)

In [16]:
# Rows per year in each table, as a frame (year, freq) and as a year -> freq dict.
yearly_papernum = papers.groupby('year').size().reset_index(name='freq')
yearly_authornum = authors.groupby('year').size().reset_index(name='freq')
yearly_papernum_dic = dict(zip(yearly_papernum['year'], yearly_papernum['freq']))
yearly_authornum_dic = dict(zip(yearly_authornum['year'], yearly_authornum['freq']))

## Cross-country collaborations

In [24]:
# (rows, columns) of the subset of papers whose `cross_country` value is 'Yes'.
papers[papers.cross_country == 'Yes'].shape

(479, 33)

In [26]:
def get_dic_from_firstauthor(DF, var):
    """Count ordered (first author value, co-author value) pairs per paper.

    Rows are grouped by 'doi' and ordered by 'authorPosition'. Within each
    group the first row's `var` value is paired with the `var` value of every
    later row, and a pair is counted only when the two values differ.

    Pairs involving a missing value are skipped. Missing values compare
    unequal to everything (including each other), so without this check
    they would be counted as if they were a different country/category.

    Args:
        DF: author-level DataFrame with columns 'doi', 'authorPosition'
            and `var`. It is not modified.
        var: column to pair on, e.g. a country code or another label.

    Returns:
        dict mapping (first_author_value, other_author_value) tuples to the
        number of times that pair occurs, in order of first occurrence.
    """
    pair_counts = Counter()
    for _, paper_authors in DF.groupby('doi'):
        # Sort into a new frame rather than in place: the group is a slice of
        # DF, and in-place edits on slices trigger pandas copy warnings.
        ordered_values = paper_authors.sort_values(by='authorPosition')[var].tolist()
        first_value = ordered_values[0]
        if pd.isna(first_value):
            continue
        # A single-author paper has no remaining values, so the loop is a no-op.
        for other_value in ordered_values[1:]:
            if pd.isna(other_value) or other_value == first_value:
                continue
            pair_counts[(first_value, other_value)] += 1
    return dict(pair_counts)

In [27]:
import itertools
# Count ordered (first-author country, co-author country) pairs, grouped by DOI;
# only pairs whose two countries differ are counted.
bicode_counts_dic = get_dic_from_firstauthor(authors, var = 'countrypred')
bicode_counts_dic

{('US', 'CA'): 22,
 ('CA', 'US'): 12,
 ('TR', 'DE'): 1,
 ('US', 'PH'): 16,
 ('IN', 'US'): 2,
 ('CN', 'US'): 17,
 ('CL', 'GB'): 1,
 ('AU', 'NL'): 1,
 ('NL', 'SE'): 2,
 ('CA', 'GB'): 1,
 ('DE', 'CA'): 1,
 ('AT', 'DE'): 2,
 ('NO', 'GB'): 1,
 ('GB', 'NL'): 5,
 ('CH', 'US'): 1,
 ('US', 'CH'): 5,
 ('ES', 'GB'): 2,
 ('US', 'NZ'): 4,
 ('ES', 'CO'): 1,
 ('SE', 'AR'): 1,
 ('FR', 'CA'): 3,
 ('ES', 'US'): 6,
 ('BE', 'US'): 2,
 ('GB', 'DK'): 1,
 ('US', 'CN'): 19,
 ('US', 'BE'): 1,
 ('US', 'CO'): 1,
 ('PH', 'US'): 21,
 ('CN', 'SG'): 1,
 ('NL', 'DE'): 4,
 ('DE', 'NL'): 14,
 ('IL', 'GB'): 1,
 ('US', 'FR'): 2,
 ('US', 'CL'): 4,
 ('GB', 'CH'): 1,
 ('US', 'DE'): 27,
 ('US', 'IL'): 10,
 ('AT', 'US'): 10,
 ('AT', 'NZ'): 2,
 ('US', 'JP'): 14,
 ('CL', 'NL'): 4,
 ('US', 'KR'): 17,
 ('DE', 'NO'): 2,
 ('DE', 'AT'): 1,
 ('SG', 'US'): 20,
 ('DE', 'US'): 34,
 ('US', 'NL'): 20,
 ('SG', 'AT'): 2,
 ('BE', 'NL'): 3,
 ('US', 'GB'): 17,
 ('US', 'SG'): 13,
 ('US', 'AU'): 13,
 ('PH', 'AR'): 1,
 ('KR', 'US'): 13,
 ('GE', '

In [28]:
def get_chord_df(DIC): # DIC here is bicode_counts_dic
    """Turn a {(source, target): count} dictionary into a long-format table.

    Args:
        DIC: dictionary keyed by 2-tuples with counts as values.

    Return:
        A dataframe containing three columns: source, target, value,
        sorted by value in descending order with a fresh 0..n-1 index.
        The columns are named `source` and `target`, but the original
        author notes that the network is meant to be read as undirected.
    """
    records = [(pair[0], pair[1], count) for pair, count in DIC.items()]
    chord_df = pd.DataFrame(records, columns=['source', 'target', 'value'])
    ranked = chord_df.sort_values(by='value', ascending=False)
    return ranked.reset_index(drop=True)

In [29]:
# Flatten the country pair counts into a source/target/value table, largest first.
author_chord_cntry_from_firstauthor = get_chord_df(bicode_counts_dic)

In [30]:
# Total number of counted (first-author country, co-author country) pairs.
author_chord_cntry_from_firstauthor.value.sum()

744

In [31]:
# Write the country pair table to CSV, omitting the index column.
author_chord_cntry_from_firstauthor.to_csv(
    '../data/plots/cntry_chord_from_firstauthor.csv', index=False)

In [32]:
# Shorter alias for the country pair table (same object, not a copy).
cntry_chord = author_chord_cntry_from_firstauthor
cntry_chord

Unnamed: 0,source,target,value
0,DE,US,34
1,GE,US,28
2,US,DE,27
3,US,CA,22
4,PH,US,21
...,...,...,...
206,NL,IL,1
207,NL,CN,1
208,DE,IL,1
209,CA,IL,1


In [33]:
# Number of distinct countries in the source column and in the target column.
len(np.unique(cntry_chord.source)), len(np.unique(cntry_chord.target))

(38, 53)

In [34]:
# Every country that appears on either side of a pair, de-duplicated.
unique_cntry = np.unique(cntry_chord.source.tolist() + cntry_chord.target.tolist())
len(unique_cntry)

55

In [35]:
# For each source (first-author) country, total the counts of the pairs it leads.
tuples = []
for i in np.unique(cntry_chord.source):
    dff = cntry_chord[cntry_chord.source == i]
    total = dff.value.sum()
    tuples.append((i, total))

In [36]:
# country chord source
ccs = pd.DataFrame(tuples, columns = ['country', 'total'])
ccs['prop'] = round(ccs['total'] / cntry_chord.value.sum(), 3)
ccs.sort_values('prop', ascending = False).head()

Unnamed: 0,country,total,prop
36,US,279,0.375
8,DE,76,0.102
13,GB,41,0.055
25,NL,35,0.047
14,GE,31,0.042


In [38]:
# For each country, total the counts of all pairs in which it appears as
# either source or target. This overwrites the `tuples` list built earlier.
tuples = []
for i in unique_cntry:
    dff = cntry_chord[(cntry_chord.source == i) | (cntry_chord.target == i)]
    total = dff.value.sum()
    tuples.append((i, total))

In [39]:
NUM_OF_TOP_COUNTRY = 5
# Per-country participation counts, most active country first.
cntry_chord_nodes = pd.DataFrame(tuples, columns = ['country', 'value']).sort_values(
    by=['value'], ascending = False).reset_index(drop=True)
# Each pair is counted once for its source and once for its target, so this
# total is twice the number of pairs.
total_node = cntry_chord_nodes.value.sum()
total_node

1488

In [40]:
# Share of all pairs in which each country takes part. The denominator is the
# number of pairs, and every pair has two endpoints, so these shares add up to
# roughly 2 across all countries rather than 1.
cntry_chord_nodes['prop'] = round(cntry_chord_nodes['value']/cntry_chord.value.sum(), 3)

In [41]:
# First five rows, i.e. the five most active countries.
cntry_chord_nodes.head()

Unnamed: 0,country,value,prop
0,US,523,0.703
1,DE,124,0.167
2,GB,88,0.118
3,NL,86,0.116
4,CN,48,0.065


In [42]:
# Summed participation share of the five most active countries. It can exceed 1
# because a pair linking two of them is counted for both.
cntry_chord_nodes.head().prop.sum()

1.169

In [43]:
# Summed participation count of the five most active countries (a pair linking
# two of them is counted twice here).
cntry_chord_nodes.head().value.sum()

869

In [44]:
# Cross-check: recompute the US participation count directly from the pair table.
cntry_chord[(cntry_chord.source == 'US') | (cntry_chord.target=='US')].value.sum()

523

In [45]:
# Country codes of the NUM_OF_TOP_COUNTRY most active countries.
most_active_cntry = cntry_chord_nodes.head(NUM_OF_TOP_COUNTRY).country.tolist()
most_active_cntry

['US', 'DE', 'GB', 'NL', 'CN']

In [46]:
# Number of pairs in which at least one of the most active countries appears.
# The OR mask selects each row of `cntry_chord` at most once, so a pair between
# two top countries is counted once here; there is no need to subtract
# `between_top` to correct for double counting.
num_most_active_participated = cntry_chord[(
    cntry_chord.source.isin(most_active_cntry)) | (
    cntry_chord.target.isin(most_active_cntry))].value.sum()
num_most_active_participated

673

In [47]:
# Number of pairs made up by collaborations between most active countries
between_top = cntry_chord[(
    cntry_chord.source.isin(most_active_cntry)) & (
    cntry_chord.target.isin(most_active_cntry))].value.sum()
between_top

196

In [48]:
# Share of all pairs that link two of the most active countries.
between_top / author_chord_cntry_from_firstauthor.value.sum()

0.26344086021505375

### Author chords from first authors (gender and race)

In [49]:
# Combine the predicted gender and race labels into one 'gender_race' label.
authors['gender_and_race'] = authors['genderpred'] + '_' + authors['racepred']

In [50]:
# Same pairing as for countries, but on the combined gender/race label.
# Note: this overwrites the country-level `bicode_counts_dic` computed earlier.
bicode_counts_dic = get_dic_from_firstauthor(authors, var = 'gender_and_race')
author_chord_gr_from_firstauthor = get_chord_df(bicode_counts_dic)

In [51]:
# Total number of counted (first-author label, co-author label) pairs.
author_chord_gr_from_firstauthor.value.sum()

2403

In [52]:
# Write the gender/race pair table to CSV, omitting the index column.
author_chord_gr_from_firstauthor.to_csv('../data/plots/author_chord_gr_from_firstauthor.csv', index=False)

In [54]:
# Shorter alias for the gender/race pair table (same object, not a copy).
gr_chord = author_chord_gr_from_firstauthor
gr_chord

Unnamed: 0,source,target,value
0,M_White,F_White,586
1,F_White,M_White,509
2,M_Asian,M_White,128
3,F_Asian,M_White,114
4,M_White,F_Asian,113
...,...,...,...
76,M_White,N_Hispanic,1
77,F_Hispanic,F_Black,1
78,N_White,M_Hispanic,1
79,F_Indigenous,F_Black,1


In [55]:
# Number of distinct labels in the source column and in the target column.
len(np.unique(gr_chord.source)), len(np.unique(gr_chord.target))

(14, 13)

In [56]:
# Every gender/race label that appears on either side of a pair, de-duplicated.
unique_label = np.unique(gr_chord.source.tolist() + gr_chord.target.tolist())
len(unique_label)

15

In [59]:
# For each source (first-author) gender/race label, total the counts of the
# pairs it leads. This overwrites the `tuples` list built for countries.
tuples = []
for i in np.unique(gr_chord.source):
    dff = gr_chord[gr_chord.source == i]
    total = dff.value.sum()
    tuples.append((i, total))

# NOTE(review): the column is named 'country' but holds gender/race labels
# (left over from the country section); rename once it is confirmed that no
# later cell reads `grs['country']`.
grs = pd.DataFrame(tuples, columns = ['country', 'total'])
# Share of all gender/race pairs led by each label, rounded to 3 decimals.
grs['prop'] = round(grs['total'] / gr_chord.value.sum(), 3)
grs.sort_values('prop', ascending = False).head()

Unnamed: 0,country,total,prop
11,M_White,913,0.38
5,F_White,713,0.297
0,F_Asian,291,0.121
6,M_Asian,273,0.114
2,F_Hispanic,59,0.025
