In [3]:
import os
import math
import json
import csv
import time
import random
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import reverse_geocode
import matplotlib.colors
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from collections import defaultdict, Counter
from datetime import datetime

In [4]:
# Root directory that every data path in this notebook is built from.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
Data_Root = '/Data/Promotion/'

In [5]:
# pd.options.display.precision = 20

In [6]:
# z-scores keyed by confidence level (in percent, as a string). The values match the
# two-sided standard-normal critical values for 90%, 95% and 99% confidence intervals.
CIs = {'90': 1.645, '95': 1.96, '99': 2.576}

In [7]:
def yield_one_line(filename, delimiter = '\t', quote = csv.QUOTE_NONE):
    '''Generator that yields the rows of a delimited text file one at a time.

    filename: path of the file to read.
    delimiter: field separator passed to csv.reader (a tab by default).
    quote: csv quoting mode (csv.QUOTE_NONE by default, so quote characters
        are treated as ordinary data).

    Yields each row as a list of strings. The file is only opened (and the
    progress message printed) once the generator is first iterated.
    '''
    # The csv module requires files to be opened with newline='' so that the
    # reader does its own line-ending handling; otherwise line breaks embedded
    # in quoted fields are silently rewritten by universal-newline translation.
    with open(filename, 'r', newline='') as file:
        print('processing %s...' %(filename))
        reader = csv.reader(file, delimiter=delimiter, quoting=quote)
        yield from reader

In [8]:
# Group labels; the colour list defined in the next cell is sized to match this list.
# presumably the two gender groups compared in later plots — verify against usage
labels = ['Male', 'Female']

In [9]:
# One colour per entry in `labels`, taken from the front of seaborn's current palette.
colors = sns.color_palette()[:len(labels)]

In [40]:
# Force the identifier columns to be read as strings instead of letting pandas
# infer a dtype for them.
utype = dict.fromkeys(
    ('author_id', 'matched_tid', 'matched_tid_original', 'matched_tid_retweet'),
    str,
)
reg_data = pd.read_csv(
    Data_Root + "revision/reg_data_drop_missing.csv",
    dtype=utype,
    header=0,
)

In [41]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
reg_data.shape

(14552304, 83)

In [42]:
# Preview the first three rows.
reg_data.head(3)

Unnamed: 0,doi,pub_year,author_name,authorship_seq,authorship_pos,author_id,affiliation_ids,self_promotion_original,matched_tid_original,self_promotion_retweet,...,is_active_on_twitter,follower_cn_snapshot,matched_uid,is_active_on_twitter_ours,follower_cn_snapshot_ours,is_active_on_twitter_combine,follower_cn_snapshot_combine,self_promotion_first,matched_tid_retweet_cn,matched_tid_likes_cn
0,10.4202/app.00261.2016,2016,Michal Zaton,2,last_position,2064717215,864159182,False,,False,...,False,-1,,False,-1,False,-1,False,-1,-1
1,10.1016/j.foodchem.2013.11.152,2014,Hee-Woong Kim,2,middle_position,2099457132,165507594,False,,False,...,False,-1,,False,-1,False,-1,False,-1,-1
2,10.1016/j.foodchem.2013.11.152,2014,Deug-Chan Lee,4,middle_position,2322741405,165507594,False,,False,...,False,-1,,False,-1,False,-1,False,-1,-1


Add author first publication year

In [43]:
# Map each author id to the earliest year in which that author published.
# The input is read as JSON Lines: one object per line with an 'aid' field and a
# 'year_pids' collection whose iteration yields publication years
# (presumably a year -> paper-ids mapping — verify against the file).
aid_first_pub_year = {}

with open(Data_Root + 'revision/aid_year_paper_list.json', 'r') as ifile:
    for line in ifile:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline
        record = json.loads(line)
        years = record['year_pids']
        if not years:
            continue  # nothing to take a minimum of; the author stays unmapped (NaN after .map)
        # min() avoids sorting the whole collection just to read its first element.
        # Years are converted to int so they compare numerically and so that
        # pub_year - aut_first_pub_year is valid arithmetic (JSON object keys are strings).
        # Ids are converted to str because reg_data['author_id'] is loaded with dtype str.
        aid_first_pub_year[str(record['aid'])] = min(int(year) for year in years)

In [44]:
# Look up each row's author in aid_first_pub_year; ids absent from the mapping become NaN.
reg_data['aut_first_pub_year'] = reg_data['author_id'].map(aid_first_pub_year)

In [45]:
# Number of rows per first-publication year, most frequent first.
reg_data['aut_first_pub_year'].value_counts()

2013    868354
2014    816915
2015    769534
2016    681486
2012    642743
         ...  
1824         2
1801         2
1826         1
1817         1
1802         1
Name: aut_first_pub_year, Length: 212, dtype: int64

In [12]:
# Gap in years between the row's publication year and its author's first publication year.
# Requires the 'aut_first_pub_year' column to be present already.
reg_data['no_years_after_first_pub'] = reg_data['pub_year'] - reg_data['aut_first_pub_year']

In [15]:
# Negative gaps (publication year earlier than the recorded first publication year)
# are raised to zero; 14978 rows are affected.
reg_data['no_years_after_first_pub'] = reg_data['no_years_after_first_pub'].clip(lower=0)

In [21]:
# Exploratory: quantile-bin the career-length values and keep the bin edges for inspection.
# q=11 quantiles are requested, but duplicates='drop' discards repeated edges, so fewer
# bins can result. labels=False makes `out` hold integer bin codes instead of intervals.
out, bins = pd.qcut(reg_data['no_years_after_first_pub'], q=11, labels=False, duplicates='drop', retbins=True)


In [22]:
# Show floats in plain notation (no scientific notation), then display the bin edges.
# Note: set_printoptions changes NumPy's global print settings for the rest of the session.
np.set_printoptions(suppress=True)
bins

array([  0.,   2.,   3.,   6.,   8.,  11.,  15.,  19.,  24.,  31., 218.])

In [23]:
# Career-stage category: integer quantile-bin code of no_years_after_first_pub.
# NOTE(review): same pd.qcut arguments as the exploratory cell that produced `out`/`bins`
# (minus retbins), so this recomputes the same binning a second time.
reg_data['author_career_year_cate'] = pd.qcut(reg_data['no_years_after_first_pub'], q=11, labels=False, duplicates='drop')


In [24]:
# Number of rows in each career-stage bin, largest bin first.
reg_data.author_career_year_cate.value_counts()

0    3283264
2    1735067
5    1510601
7    1332487
4    1305369
6    1271282
8    1236010
9    1218489
3     973235
1     686500
Name: author_career_year_cate, dtype: int64

Add author affiliation country

In [None]:
# Build three lookup tables keyed by affiliation id from the tab-separated
# Affiliations.txt: rank (column 1), display name (column 3) and country.
# The country is inferred from latitude/longitude (columns 9 and 10) by reverse
# geocoding; rows with an empty latitude or longitude get no country entry.
affi_rank = {}
affi_country = {}
affi_name = {}

# Coordinates are collected first and resolved in ONE reverse_geocode.search call,
# which accepts a list of (lat, lon) tuples, instead of one call per affiliation.
geo_affi_ids = []
geo_coords = []
for line in yield_one_line(Data_Root+'Affiliations.txt'):
    affi_id, rank, dname, lat, lon = line[0], line[1], line[3], line[9], line[10]
    affi_rank[affi_id] = int(rank)
    affi_name[affi_id] = dname
    if lat != "" and lon != "":
        geo_affi_ids.append(affi_id)
        geo_coords.append((float(lat), float(lon)))

# Guard the empty case so the geocoder is never queried with no points.
if geo_coords:
    # Results come back in input order, so zip pairs each id with its own match.
    # A repeated id keeps the country of its last occurrence in the file.
    for affi_id, match in zip(geo_affi_ids, reverse_geocode.search(geo_coords)):
        affi_country[affi_id] = match['country']

In [47]:
def get_affi_country(affis, country_by_affi=None):
    '''Pick a single country for a '|'-separated string of affiliation ids.

    Ids with no known country are ignored. If any of the remaining affiliations
    is in 'United States', that is returned; otherwise the country of the first
    id that has one is returned.

    affis: '|'-separated affiliation ids. Non-string values are converted with
        str(); a missing value (None or NaN) yields 'unknown'.
    country_by_affi: optional mapping of affiliation id -> country name.
        Defaults to the notebook-level `affi_country` dict.

    Returns the chosen country name, or 'unknown' when no id can be resolved.
    '''
    if country_by_affi is None:
        country_by_affi = affi_country
    # A missing cell arrives as None or float NaN, neither of which has .split().
    if affis is None or (isinstance(affis, float) and math.isnan(affis)):
        return 'unknown'
    # many affi ids in MAG do not have latitude and longitude info for us to infer their country
    cns = [country_by_affi[affi_id]
           for affi_id in str(affis).split('|')
           if affi_id in country_by_affi]
    if not cns:
        return 'unknown'
    return 'United States' if 'United States' in cns else cns[0]

In [48]:
# Derive one country per row from that row's '|'-separated affiliation ids.
reg_data['affiliation_country'] = reg_data['affiliation_ids'].apply(get_affi_country)

In [49]:
# Row counts for the 20 most frequent affiliation_country values.
reg_data['affiliation_country'].value_counts()[:20]

United States         4818093
China                 1230968
United Kingdom        1164938
Australia              680220
Germany                674825
Japan                  563090
Canada                 520958
France                 427215
Italy                  388833
Netherlands            368337
Korea, Republic of     314694
Spain                  299167
Brazil                 243030
Switzerland            210782
India                  208746
Sweden                 203101
Belgium                190848
Denmark                136993
Taiwan                 127730
Finland                 93212
Name: affiliation_country, dtype: int64

In [51]:
# Row count after adding the new columns.
len(reg_data)

14552304

In [26]:
# Persist the augmented frame as UTF-8 CSV with a header row and without the index.
# NOTE(review): this writes to the same path the frame was loaded from, so the input
# file is overwritten in place.
reg_data.to_csv(Data_Root+"revision/reg_data_drop_missing.csv", index=False, header=True, encoding='utf-8')


In [10]:
# Reload the saved frame, forcing the identifier columns to be read as strings.
# Every column uses the builtin `str` (rather than a mix of `str` and 'str') so this
# dtype spec is identical to the one used by the other load cell in this notebook.
utype = {'author_id': str, 'matched_tid': str, 'matched_tid_original': str, 'matched_tid_retweet': str}
reg_data = pd.read_csv(Data_Root + "revision/reg_data_drop_missing.csv", header=0, dtype=utype)


In [11]:
# Row count of the reloaded frame.
len(reg_data)

14552304