In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import altair as alt
from altair import datum

In [13]:
def get_simple_prop_df(data, var):
    """Tabulate the frequency and overall proportion of each value of ``var``.

    Reads the module-level ``authors`` or ``papers`` frame (selected by
    ``data``) and divides each count by the matching module-level total
    (``total_author_num`` or ``total_paper_num``).

    Parameters
    ----------
    data : str
        Either ``'papers'`` or ``'authors'``; anything else fails the assert.
    var : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        Columns ``[var, 'freq', 'prop']`` sorted by ``prop`` in descending
        order; ``prop`` is rounded to three decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'

    # Pick the source frame and its denominator once; the tabulation below is
    # the same for both data sets.
    if data == 'authors':
        frame, total = authors, total_author_num
    else:
        frame, total = papers, total_paper_num

    value_counts = Counter(frame[var])
    df = pd.DataFrame(value_counts.items(), columns=[var, 'freq'])
    df['prop'] = (df['freq'] / total).round(3)
    return df.sort_values('prop', ascending=False)

In [14]:
def get_freq_and_prop(data, var):
    """Build a per-year frequency/proportion table for the values of ``var``.

    Groups the module-level ``papers`` or ``authors`` frame (selected by
    ``data``) by ``['year', var]`` and relates each group size to that year's
    total, looked up in ``yearly_papernum_dic`` or ``yearly_authornum_dic``.

    Parameters
    ----------
    data : str
        Either ``'papers'`` or ``'authors'``; anything else fails the assert.
    var : str
        Name of the column to break down within each year.

    Returns
    -------
    pandas.DataFrame
        Columns ``['year', var, 'freq', 'year total', 'prop']`` where ``prop``
        is ``freq / year total`` rounded to three decimals.
    """
    assert data in ['papers', 'authors'], 'data is wrong!'

    # Only the source frame and the yearly-total lookup differ between the
    # two data sets.
    if data == 'papers':
        frame, yearly_totals = papers, yearly_papernum_dic
    else:
        frame, yearly_totals = authors, yearly_authornum_dic

    group_sizes = frame.groupby(['year', var]).size()
    df = group_sizes.reset_index(name='freq')
    df['year total'] = [yearly_totals[year] for year in df.year]
    df['prop'] = (df['freq'] / df['year total']).round(3)
    return df

In [15]:
def transform_year(df):
    """Return a copy of ``df`` whose ``year`` column is parsed as datetimes.

    The values are parsed with the ``'%Y'`` format. The input frame itself is
    left untouched.
    """
    return df.assign(year=pd.to_datetime(df['year'], format='%Y'))

In [16]:
# Load the processed author-level and paper-level tables.
authors = pd.read_csv('../data/processed/authors_to_study_expanded.csv')

# In the papers table, boolean True/False values are relabelled as 'Yes'/'No'
# so that they read naturally on chart axes and legends.
papers = pd.read_csv('../data/processed/papers_to_study_expanded.csv').replace(
    {True: 'Yes', False: 'No'}
)

In [17]:
# Shorten full journal titles to their abbreviations in both tables.
journal_abbreviations = {
    'Communication Theory': 'CT',
    'Communication, Culture and Critique': 'CCC',
    'Human Communication Research': 'HCR',
    'Journal of Communication': 'JOC',
    'Journal of Computer-Mediated Communication': 'JCMC',
}

# The replacement is done in place so the module-level frames themselves change.
for frame in (papers, authors):
    frame.replace(journal_abbreviations, inplace=True)

In [18]:
# Overall row counts; used as denominators by get_simple_prop_df.
total_paper_num, total_author_num = len(papers), len(authors)

In [19]:
# Rows per publication year for each table ...
yearly_papernum = papers.groupby('year').size().reset_index(name='freq')
yearly_authornum = authors.groupby('year').size().reset_index(name='freq')

# ... and the same counts as {year: count} lookups, used as per-year
# denominators by get_freq_and_prop.
yearly_authornum_dic = dict(zip(yearly_authornum['year'], yearly_authornum['freq']))
yearly_papernum_dic = dict(zip(yearly_papernum['year'], yearly_papernum['freq']))

## Analysis

In [20]:
# Number of individual countries to keep before pooling the rest.
top_num = 5

# {country code: author rows} for the `top_num` most common predicted countries.
top_country_dic = {
    code: count
    for code, count in Counter(authors['countrypred']).most_common(top_num)
}
top_country = [*top_country_dic]
top_country

['US', 'NL', 'DE', 'GB', 'IL']

In [21]:
# Distinct predicted country codes across all author rows.
all_country = [*set(authors['countrypred'])]
len(all_country)

52

In [22]:
# Keep codes listed in `top_country` as they are; pool everything else as 'Other'.
authors['countrypred_new'] = authors['countrypred'].where(
    authors['countrypred'].isin(top_country), 'Other'
)

In [23]:
country_prop = get_simple_prop_df('authors', 'countrypred_new')
country_prop.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'United Kimdom',
                    'IL': 'Israel'}, inplace = True)
country_prop

Unnamed: 0,countrypred_new,freq,prop
0,US,2784,0.628
2,Other,874,0.197
5,Netherlands,283,0.064
1,Germany,209,0.047
3,United Kimdom,158,0.036
4,Israel,126,0.028


In [24]:
countries = country_prop.countrypred_new.tolist()
country_colors = ['grey', 'orange', 'red', 'teal', 'pink', 'purple']

In [25]:
country_dist_chart = alt.Chart(country_prop).mark_bar().encode(
    x = alt.X(
        'countrypred_new',
        title = None,
        sort = '-y',
        axis = alt.Axis(labelAngle = -45)
    ),
    y = alt.Y(
        'freq',
        title = '# of Authors'
    ),
    color = alt.Color(
        'countrypred_new',
        title = 'Country/Region',
        legend = None,
        scale = alt.Scale(
            domain = countries,
            range = country_colors
        )
    )
).properties(
    title = 'a',
    height=300,
    width=140
)

country_dist_chart

In [26]:
country_ts = get_freq_and_prop('authors', 'countrypred_new')
country_ts.columns = ['year', 'country', 'freq', 'year total', 'prop']
country_ts.replace({'NL': 'Netherlands',
                    'DE': 'Germany',
                    'GB': 'United Kimdom',
                    'IL': 'Israel'}, inplace = True)
country_ts

Unnamed: 0,year,country,freq,year total,prop
0,2010,Germany,3,295,0.010
1,2010,United Kimdom,3,295,0.010
2,2010,Israel,11,295,0.037
3,2010,Netherlands,9,295,0.031
4,2010,Other,50,295,0.169
...,...,...,...,...,...
73,2022,United Kimdom,9,200,0.045
74,2022,Israel,5,200,0.025
75,2022,Netherlands,20,200,0.100
76,2022,Other,38,200,0.190


In [27]:
source = country_ts

# Panel b: stacked area of yearly author counts, coloured per country/region
# with the `countries` -> `country_colors` pairing.
country_stacked_chart = (
    alt.Chart(transform_year(source))
    .mark_area()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y("freq:Q", title='# of Authors'),
        color=alt.Color(
            "country:N",
            title='Country/Region',
            legend=None,
            scale=alt.Scale(domain=countries, range=country_colors),
        ),
    )
    .properties(title='b', height=300, width=260)
)

country_stacked_chart

In [28]:
# Panel c, line layer: yearly proportion per country/region, legend disabled.
line = (
    alt.Chart(transform_year(source))
    .mark_line()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('prop', title='Proportion'),
        color=alt.Color(
            'country',
            legend=None,
            scale=alt.Scale(domain=countries, range=country_colors),
        ),
    )
)

# Point layer: inherits the line encodings, then re-encodes colour and adds a
# shape channel. The legend is not disabled on this layer.
points = line.mark_point().encode(
    color=alt.Color(
        'country',
        title='Country/Region',
        scale=alt.Scale(domain=countries, range=country_colors),
    ),
    shape=alt.Shape(
        'country',
        scale=alt.Scale(domain=countries),
    ),
)

# Colour and shape scales are resolved independently between the two layers.
country_prop_ts_chart = (
    alt.layer(line, points)
    .resolve_scale(color='independent', shape='independent')
    .properties(title='c', height=300, width=300)
)

country_prop_ts_chart

In [29]:
# Top row of the figure: panels a, b and c side by side.
country_panels = (country_dist_chart, country_stacked_chart, country_prop_ts_chart)
chart1 = alt.hconcat(*country_panels)

chart1

### With US authors

In [30]:
# Overall count and share of papers by their `with_us_authors` value.
with_us_prop = get_simple_prop_df(data='papers', var='with_us_authors')
with_us_prop

Unnamed: 0,with_us_authors,freq,prop
0,Yes,1301,0.68
1,No,611,0.32


In [31]:
source = with_us_prop

# Panel d: number of publications by `with_us_authors`, bars sorted by height.
with_us_chart = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        x=alt.X(
            'with_us_authors',
            title='With US authors',
            sort='-y',
            axis=alt.Axis(labelAngle=0),
        ),
        y=alt.Y('freq', title='# of Publications'),
        color=alt.Color(
            'with_us_authors',
            legend=None,
            scale=alt.Scale(domain=['Yes', 'No']),
        ),
    )
    .properties(title='d', height=200, width=100)
)

with_us_chart

In [32]:
# Yearly count and within-year share of papers by their `with_us_authors` value.
with_us_prop_ts = get_freq_and_prop(data='papers', var='with_us_authors')
with_us_prop_ts.head()

Unnamed: 0,year,with_us_authors,freq,year total,prop
0,2010,No,31,139,0.223
1,2010,Yes,108,139,0.777
2,2011,No,23,138,0.167
3,2011,Yes,115,138,0.833
4,2012,No,57,163,0.35


In [33]:
# Panel e: yearly proportion of papers by `with_us_authors`.
with_us_ts_chart = (
    alt.Chart(transform_year(with_us_prop_ts))
    .mark_line()
    .encode(
        x=alt.X('year', title='Year'),
        y=alt.Y('prop', title='Proportion'),
        color=alt.Color(
            'with_us_authors',
            title='With US authors',
            scale=alt.Scale(domain=['Yes', 'No']),
        ),
    )
    .properties(title='e', height=200, width=200)
)

with_us_ts_chart

## Number of countries

In [34]:
# Overall count and share of papers for each `num_country` value.
num_country_df = get_simple_prop_df(data='papers', var='num_country')
num_country_df

Unnamed: 0,num_country,freq,prop
0,1,1581,0.827
1,2,278,0.145
3,3,43,0.022
2,4,6,0.003
5,9,3,0.002
4,6,1,0.001


In [35]:
# Panel f: number of publications per `num_country` value, treated as a
# nominal category (`:N`) on both the x axis and the colour channel.
country_num_dist_chart = (
    alt.Chart(num_country_df)
    .mark_bar()
    .encode(
        x=alt.X(
            'num_country:N',
            title='Number of countries',
            axis=alt.Axis(labelAngle=0),
        ),
        y=alt.Y('freq', title='# of Publications'),
        color=alt.Color('num_country:N', legend=None),
    )
    .properties(title='f', height=200, width=150)
)

country_num_dist_chart

In [36]:
# Yearly breakdown of papers by `num_country`, with `year` parsed to datetimes.
num_country_ts = transform_year(get_freq_and_prop('papers', 'num_country'))
num_country_ts.head()

Unnamed: 0,year,num_country,freq,year total,prop
0,2010-01-01,1,120,139,0.863
1,2010-01-01,2,16,139,0.115
2,2010-01-01,3,3,139,0.022
3,2011-01-01,1,120,138,0.87
4,2011-01-01,2,15,138,0.109


In [37]:
# Quick look at the yearly proportion per `num_country` value; this chart is
# only displayed here and is not assigned to a name.
alt.Chart(num_country_ts).mark_line().encode(x='year', y='prop', color='num_country:N')

## Affiliation types

In [38]:
# Overall count and share of author rows per predicted affiliation type,
# with the two type names shortened for the axis labels.
afftype_short_labels = {'Education': 'Edu', 'Non Education': 'Non-Edu'}
afftype_prop = get_simple_prop_df('authors', 'afftypepred').replace(afftype_short_labels)
afftype_prop

Unnamed: 0,afftypepred,freq,prop
0,Edu,4293,0.968
1,Non-Edu,141,0.032


In [39]:
# Panel g: number of author rows per affiliation type.
afftype_dist_chart = (
    alt.Chart(afftype_prop)
    .mark_bar()
    .encode(
        x=alt.X(
            'afftypepred',
            title='Affiliation type',
            axis=alt.Axis(labelAngle=-45),
        ),
        y=alt.Y('freq', title='# of Authors'),
        color=alt.Color('afftypepred', legend=None),
    )
    .properties(title='g', height=200, width=100)
)

afftype_dist_chart

In [40]:
# Bottom row of the figure: panels d, e, f and g. Each panel keeps its own
# colour scale because they encode different fields.
bottom_panels = (
    with_us_chart,
    with_us_ts_chart,
    country_num_dist_chart,
    afftype_dist_chart,
)
chart2 = alt.hconcat(*bottom_panels).resolve_scale(color='independent')

chart2

In [41]:
# Full figure: the two rows stacked vertically, with font sizes, title
# anchoring and point size configured once for every panel.
(
    alt.vconcat(chart1, chart2)
    .configure_axis(labelFontSize=15, titleFontSize=20)
    .configure_legend(titleFontSize=16, labelFontSize=16)
    .configure_title(anchor='start', fontSize=22)
    .configure_point(size=100)
)

## Country tier / north-south analysis

In [53]:
from pycountry_convert import *

# Distinct predicted country codes across all author rows.
country_codes = [*set(authors['countrypred'])]

country_codes

['AT',
 'TK',
 'ES',
 'IN',
 'FR',
 'DK',
 'CA',
 'PL',
 'GB',
 'TH',
 'ET',
 'SG',
 'JP',
 'SI',
 'CR',
 'AU',
 'UG',
 'GR',
 'CY',
 'IS',
 'PT',
 'CL',
 'LV',
 'PH',
 'LB',
 'MX',
 'CO',
 'US',
 'BR',
 'IT',
 'EG',
 'NZ',
 'KE',
 'CZ',
 'DE',
 'GE',
 'AR',
 'TW',
 'HU',
 'KR',
 'NO',
 'LK',
 'IE',
 'SE',
 'FI',
 'CN',
 'TR',
 'BE',
 'CH',
 'ZA',
 'NL',
 'IL']

In [52]:
# Look up the country name for every code, in the same order as `country_codes`.
list(map(country_alpha2_to_country_name, country_codes))

['Austria',
 'Tokelau',
 'Spain',
 'India',
 'France',
 'Denmark',
 'Canada',
 'Poland',
 'United Kingdom',
 'Thailand',
 'Ethiopia',
 'Singapore',
 'Japan',
 'Slovenia',
 'Costa Rica',
 'Australia',
 'Uganda',
 'Greece',
 'Cyprus',
 'Iceland',
 'Portugal',
 'Chile',
 'Latvia',
 'Philippines',
 'Lebanon',
 'Mexico',
 'Colombia',
 'United States',
 'Brazil',
 'Italy',
 'Egypt',
 'New Zealand',
 'Kenya',
 'Czechia',
 'Germany',
 'Georgia',
 'Argentina',
 'Taiwan, Province of China',
 'Hungary',
 'Korea, Republic of',
 'Norway',
 'Sri Lanka',
 'Ireland',
 'Sweden',
 'Finland',
 'China',
 'Turkey',
 'Belgium',
 'Switzerland',
 'South Africa',
 'Netherlands',
 'Israel']