In [1]:
%config InlineBackend.figure_format = 'retina'
# findspark is initialised first, ahead of the pyspark imports below.
import findspark
findspark.init()

import ast
import os
import time

import pandas as pd
import numpy as np
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext, Row, SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
import matplotlib.pyplot as plt
import matplotlib.style as style
from matplotlib.ticker import FuncFormatter
import seaborn as sns
from slugify import slugify

In [2]:
# Reuse the active SparkContext if one exists, otherwise create it.
sc = SparkContext.getOrCreate()
# SQL entry point built on that context; the parquet reads below go through it.
sqlContext = SQLContext(sc)

In [3]:
# Load the parquet datasets used by the rest of the notebook.
authors = sqlContext.read.parquet('./data/authors')
co_networks = sqlContext.read.parquet('./data/coauthors')
ego_alters = sqlContext.read.parquet('./data/ego_alters')

# Collaborations: parse `published` into a date, derive the publication
# `year`, rename the author key to `auth_id` and cast it to bigint.
collaborations = (
    sqlContext.read.parquet('./data/collaborations')
    .withColumn('published', F.to_date(F.col('published')))
    .withColumn('year', F.year(F.col('published')))
    .withColumnRenamed('author_id', 'auth_id')
    .withColumn('auth_id', F.col('auth_id').cast('bigint'))
)


In [4]:
# Ego networks to build, sorted by origin country. `id` is stored as a string
# and `coauthors_count` is the length of the list literal held in `co_list`.
to_build = (
    pd.read_csv('./networks_to_build.csv')
    .assign(id=lambda frame: frame['id'].astype(str))
    .sort_values(by=['country_origin'])
    .assign(coauthors_count=lambda frame: frame['co_list'].map(lambda raw: len(ast.literal_eval(raw))))
)

# (ego_id, abstract id) pairs for every ego listed in `to_build` ...
ego_publications = (
    collaborations
    .filter(collaborations.auth_id.isin(to_build['id'].tolist()))
    .select(collaborations.auth_id.alias('ego_id'), collaborations.abs_id.alias('todel1'))
)
# ... joined back onto collaborations so each pair fans out to all rows of
# that abstract.
ego_publications = ego_publications.join(
    collaborations,
    ego_publications.todel1 == collaborations.abs_id,
    'inner',
)

# Per ego, count the joined rows whose author is the ego itself.
missing_data = (
    ego_publications
    .filter(ego_publications.ego_id == ego_publications.auth_id)
    .groupby([ego_publications.ego_id, ego_publications.auth_id])
    .count()
    .select('ego_id', 'count')
    .toPandas()
)
#to_build = to_build[~to_build.id.isin(list(missing_data['ego_id'].values))]
missing_data.to_csv('./ego_colls.csv')


In [None]:
# Per-country summary statistics of the ego table, exported to Excel.
# NOTE(review): this groups by 'EGO_COUNTRY' while the other cells use
# 'country_origin' — confirm the CSV really carries both columns.
(
    to_build
    .groupby('EGO_COUNTRY')
    .agg(['count', 'min', 'max', 'mean', 'std'])
    .to_excel('./export/phys/ego_networks_description.xlsx')
)

In [None]:
# Distribution of accumulated citations per origin country: every ego is drawn
# as a jittered point, with a transparent box plot layered on top.
sns.set(rc={
    'figure.figsize':(12,14),
    'font.size':20,
    'axes.titlesize':20,
    'axes.labelsize':20,
})

# Show each observation with a scatterplot
sns.stripplot(x="cited_by_count", y="country_origin",
              data=to_build, dodge=True, jitter=True,
              alpha=.30, size=7,color='.30', zorder=1)

# fliersize=0 hides the box plot's outlier markers; the strip plot already
# shows every point.
g = sns.boxplot(x="cited_by_count", y="country_origin", data=to_build, fliersize=0, boxprops=dict(facecolor=(0,0,0,0), linewidth=2))

# Label ticks in thousands through a formatter instead of overwriting the tick
# label texts: fixed label strings are not tied to tick positions and can end
# up on the wrong ticks once the locator recomputes them (e.g. at save time).
g.xaxis.set_major_formatter(
    FuncFormatter(lambda value, _pos: '{:,.0f}'.format(value / 1000) + 'K')
)

g.set_xlabel("Accumulated Citations",fontsize=14)
g.set_ylabel("Origin country of author",fontsize=20)
sns.despine(offset=20, left=True, bottom=True)
g.get_figure().savefig("./export/phys/ego_networks_distribution.png", format='png', dpi=300)


In [None]:
# Reload the parquet datasets from disk.
# NOTE(review): unlike the first load cell, this one neither casts `auth_id`
# to bigint nor reloads `ego_alters`, so the dtype of `collaborations.auth_id`
# depends on which load cell ran last — confirm this difference is intended.
authors = sqlContext.read.parquet('./data/authors')
co_networks = sqlContext.read.parquet('./data/coauthors')

# Collaborations: parse `published` into a date, derive the publication
# `year` and rename the author key to `auth_id`.
collaborations = (
    sqlContext.read.parquet('./data/collaborations')
    .withColumn('published', F.to_date(F.col('published')))
    .withColumn('year', F.year(F.col('published')))
    .withColumnRenamed('author_id', 'auth_id')
)

In [None]:
# (ego_id, abstract id) pairs for every ego, joined back onto collaborations
# so that each publication lists all of its authors.
ego_publications = (
    collaborations
    .filter(collaborations.auth_id.isin(to_build['id'].tolist()))
    .select(collaborations.auth_id.alias('ego_id'), collaborations.abs_id.alias('todel1'))
)
ego_publications = ego_publications.join(
    collaborations,
    ego_publications.todel1 == collaborations.abs_id,
    'inner',
)

# Number of distinct authors found on each ego's publications.
# NOTE(review): the ego's own (ego_id, auth_id) row is not filtered out here,
# so the count appears to include the ego — confirm that is intended.
ego_coauthors = (
    ego_publications
    .dropDuplicates(['ego_id', 'auth_id'])
    .groupby([ego_publications.ego_id])
    .count()
    .toPandas()
)

# Combine with the ego table to get all info about them, replacing the
# CSV-derived `coauthors_count` with the count computed from the data.
ego_coauthors = (
    to_build
    .merge(ego_coauthors, how='left', left_on='id', right_on='ego_id')
    .drop(['co_list', 'ego_id'], axis=1)
    .assign(coauthors=lambda frame: frame['count'])
    .drop(['coauthors_count', 'count'], axis=1)
)
ego_coauthors.to_excel('./export/phys/ego_coauthor_description.xlsx')

In [None]:
def compute_network_density():
    """Compute the density of every ego network and export it to Excel.

    For each network listed in ``to_build`` the members are read from
    ``co_networks``, their publications are looked up in ``collaborations``
    and every (member, co-author) pair on a shared publication becomes an
    edge row. Self-pairs and pairs involving an id listed in ``to_build`` are
    dropped. Per network the function then computes

    * ``total_edges``: number of remaining edge rows,
    * ``total_nodes``: number of distinct ids seen as source or target,
    * ``density``: ``total_edges / (total_nodes * (total_nodes - 1) / 2)``.

    The result is written to ``./export/phys/ego_networks_density.xlsx``.
    Nothing is returned.

    NOTE(review): edge rows are counted once per publication and per
    direction, so a pair of members with shared papers contributes several
    rows and the density can exceed 1 — confirm whether distinct undirected
    pairs were intended.
    """
    # Ids of the egos; used to select the networks and to exclude the egos
    # themselves from the edge list.
    ego_ids = list(to_build['id'].values)

    # One row per (network, member).
    members = co_networks \
        .filter(co_networks.network_id.isin(ego_ids)) \
        .select(co_networks.network_id, co_networks.author_id.alias('source'))

    # Publications written by each member.
    member_publications = members \
        .join(collaborations, members.source == collaborations.auth_id, 'inner') \
        .select(members.network_id, members.source, collaborations.abs_id.alias('todel2'))

    # Every author of those publications becomes the target of an edge.
    network = member_publications \
        .join(collaborations, member_publications.todel2 == collaborations.abs_id, how='inner') \
        .select(member_publications.network_id, member_publications.source, collaborations.auth_id.alias('target'))

    network = network.withColumn('source', network.source.cast('bigint'))
    network = network.withColumn('target', network.target.cast('bigint'))

    # Drop self-loops and any edge that touches an ego.
    network = network.filter(network.source != network.target) \
        .filter(~network.source.isin(ego_ids)) \
        .filter(~network.target.isin(ego_ids))

    network = network.groupby('network_id') \
        .agg(F.count('source').alias('total_edges'), F.array_union(F.collect_set('source'),F.collect_set('target')).alias('total_nodes'))

    network = network.toPandas()
    network['total_nodes'] = network['total_nodes'].map(len)

    # Column-wise computation: same values as a row-wise apply, and it also
    # yields a (empty) Series when no network survives the filters.
    possible_edges = network['total_nodes'] * (network['total_nodes'] - 1) / 2
    network['density'] = network['total_edges'] / possible_edges
    network.to_excel('./export/phys/ego_networks_density.xlsx')

compute_network_density()

In [None]:
# Members of the ego networks listed in `to_build`, with their country
# (left join: members without an `authors` row keep a null country).
auths = co_networks.filter(co_networks.network_id.isin(list(to_build['id'].values))) \
    .join(authors, co_networks.author_id == authors.id, 'left') \
    .select(co_networks.network_id,co_networks.author_id, authors.country)

# Publications of those members from 2007 onwards. This definition must be
# live: both this cell and the next one read `auth_articles`, and on a fresh
# kernel nothing else defines it.
# NOTE(review): an author who belongs to several networks appears once per
# network in `auths`, so their articles are counted once per membership —
# confirm that is intended.
auth_articles = auths \
    .join(collaborations, auths.author_id == collaborations.auth_id, 'inner') \
    .filter(collaborations.year > 2006)

# Rows: country; columns: one per year 2007..2017; values: number of
# (member, article) rows.
auths_per_year = auth_articles \
    .groupBy(auth_articles.country) \
    .pivot('year', list(range(2007, 2018))).agg(F.count('abs_id')).toPandas()

auths_per_year.to_csv('./per_country_year_article.csv')

In [None]:
# Active authors per country and year: keep one row per (author, year), then
# count authors by country with one column per year 2007..2017.
auths_per_year = (
    auth_articles
    .dropDuplicates(['author_id', 'year'])
    .groupBy(auth_articles.country)
    .pivot('year', range(2007,2018))
    .agg(F.count('author_id'))
    .toPandas()
)

auths_per_year.to_csv('./per_country_year_authors.csv')

In [None]:
def country_network(name):
    """Build the yearly weighted co-authorship edge list for one origin country.

    Parameters
    ----------
    name : str
        Value of ``to_build.country_origin`` selecting the ego networks to use.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per (source, target, year) with year > 2006 and columns
        ``source``, ``target``, ``year``, ``weight`` (number of joined
        publication rows for the pair in that year), ``source_country``,
        ``source_citations``, ``target_country``, ``target_cits`` and ``type``.
        ``type`` is ``'domestic'`` when both countries equal ``name``,
        ``'intradomestic'`` when exactly one does, ``'nondomestic'`` when
        neither does, and ``'missing'`` when either country is null.

    Only Spark transformations are built here; no action is triggered.
    """
    networks = list(to_build[to_build.country_origin == name]['id'].values)

    # Distinct members of the selected networks.
    auths = co_networks.filter(co_networks.network_id.isin(networks)).dropDuplicates(['author_id']) \
        .select(co_networks.author_id.alias('source'))

    # Publications written by each member.
    articles = auths \
        .join(collaborations, auths.source == collaborations.auth_id, 'inner') \
        .select(auths.source, collaborations.abs_id.alias('todel2'))

    # Every other author on those publications becomes an edge target.
    big_network = articles \
        .join(collaborations, articles.todel2 == collaborations.abs_id, how='inner') \
        .select(articles.source,collaborations.abs_id.alias('todel3'),collaborations.auth_id.alias('target'), collaborations.year) \
        .filter(articles.source != collaborations.auth_id)

    # Edge weight = number of joined publication rows per pair and year.
    grouped = big_network.filter(big_network.year > 2006) \
        .groupby([big_network.source, big_network.target, big_network.year]).agg(F.count(F.col('todel3')).alias('weight'))

    # Attach country and citation count of the source author ...
    se_data = grouped.join(authors, grouped.source == authors.id, how='left') \
        .select(grouped.source, grouped.target, grouped.year, grouped.weight, authors.country.alias('source_country'), authors.cited_by_count.alias('source_citations'))

    # ... and of the target author. The join key is taken from `se_data`, the
    # frame actually being joined.
    te_data = se_data.join(authors, se_data.target == authors.id, how='left') \
        .select(se_data.source, se_data.target, se_data.year, se_data.weight, se_data.source_country, se_data.source_citations, \
               authors.country.alias('target_country'), authors.cited_by_count.alias('target_cits'))

    def edge_type(scountry, tcountry):
        """Classify an edge by how its two endpoint countries relate to `name`."""
        # Both joins above are left joins, so either country can be null.
        # Without this guard the 'missing' label could never be produced and
        # unknown countries were silently treated as foreign.
        if scountry is None or tcountry is None:
            return 'missing'

        source_is_home = scountry == name
        target_is_home = tcountry == name

        if source_is_home and target_is_home:
            return 'domestic'
        if source_is_home or target_is_home:
            return 'intradomestic'
        return 'nondomestic'

    check_type = F.udf(edge_type, StringType())
    te_data = te_data.withColumn('type', check_type('source_country', 'target_country'))
    return te_data


In [None]:
# Build the edge list of every origin country and report the time per country.
# NOTE(review): country_network() only assembles Spark transformations, and
# the parquet write below is commented out, so no Spark action runs in this
# loop and the timings cover plan construction only.
initial = time.time()

origin_countries = to_build['country_origin'].unique()
for country in origin_countries:
    start = time.time()
    slug = slugify(country)
    data = country_network(country)
    #data.write.parquet('./data/per_country/{}'.format(slug))
    elapsed = time.time() - start
    print('{}: Took (s): {}'.format(slug, elapsed))

total_elapsed = time.time() - initial
print('Total Took (s): {}'.format(total_elapsed))
