In [2]:
import ast
import csv
import json
import math
import os
import random
import time
from datetime import datetime

import matplotlib.cm as cm
import matplotlib.colors
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import requests
import seaborn as sns

In [3]:
Data_Root = '/Data/Promotion/'

In [4]:
# pd.options.display.precision = 20

In [5]:
CIs = {'90': 1.645, '95': 1.96, '99': 2.576}

In [6]:
labels = ['Male', 'Female']

In [7]:
colors = sns.color_palette()[:len(labels)]

In [7]:
# Load the regression table (basic controls only). The author ID and the three
# matched-tweet ID columns are read as strings rather than numbers.
id_columns = ['author_id', 'matched_tid', 'matched_tid_original', 'matched_tid_retweet']
utype = dict.fromkeys(id_columns, str)
reg_data = pd.read_csv(Data_Root + "revision/reg_data.csv", header=0, dtype=utype)

In [10]:
# before dropping missing values
reg_data.shape

(31956185, 58)

In [None]:
reg_data.isna().sum()

In [None]:
# Index labels of the rows usable in the regression: complete on every column
# except the matched-tweet ID columns, which are not required to be present.
tweet_id_columns = ['matched_tid', 'matched_tid_original', 'matched_tid_retweet']
required_columns = reg_data.columns.difference(tweet_id_columns)
inx = reg_data.dropna(subset=required_columns).index


In [13]:
# mainly dropped due to missing data on affiliation and journal impact.
len(inx)

14552304

In [None]:
# Keep only the complete rows and renumber them 0..n-1.
reg_data = reg_data.loc[inx].reset_index(drop=True)

In [None]:
del inx

### Add total mentions and self-promotion position

In [15]:
dois = set(reg_data.doi)

In [16]:
len(dois)

2986263

In [17]:
# Map every DOI of the regression sample to the IDs of the tweets recorded for
# it. The input file holds one JSON record per line.
doi_tids = {}
with open(Data_Root+'revision/papers_2013_2018.json', 'r') as ifile:
    for raw_record in ifile:
        paper = json.loads(raw_record)
        paper_doi = paper['doi']
        if paper_doi not in dois:
            continue
        doi_tids[paper_doi] = [tw['tweet_id'] for tw in paper['data']['tweets']]

In [18]:
len(doi_tids)

2986263

In [19]:
# Every tweet ID attached to any paper in the sample, de-duplicated.
tids = {tid for paper_tids in doi_tids.values() for tid in paper_tids}

In [20]:
len(tids)

23372559

In [21]:
tid_date_all = {}
tid_follow_cn_all = {}

# Layout of the `created_at` field; the UTC offset is matched literally as
# "+0000", so the parsed datetimes are naive.
TWEET_TIME_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'

# Both dumps hold one tweet JSON object per line. The second file was collected
# earlier (without full text) and is read last, so when a tweet appears in both
# files its values from the second file are the ones kept.
tweet_files = ['tweets_v2.json', 'tweets_v2_non_full_text.json']

for tweet_file in tweet_files:
    with open(Data_Root + tweet_file, 'r') as ifile:
        for raw_tweet in ifile:
            tweet = json.loads(raw_tweet)
            tid = tweet['id_str']
            # only tweets that mention a paper of the regression sample matter
            if tid in tids:
                tid_date_all[tid] = datetime.strptime(tweet['created_at'], TWEET_TIME_FORMAT)
                tid_follow_cn_all[tid] = tweet['user']['followers_count']

In [22]:
len(tid_date_all)

21444419

In [23]:
len(tid_follow_cn_all)

21444419

Count the number of tweets for the final set of papers

In [24]:
# focus on Male and Female
len(reg_data.loc[reg_data['gender'].isin(['Female', 'Male'])])

11420039

In [25]:
# Sum the per-paper tweet counts over the distinct DOIs that have at least one
# row with gender Male or Female.
male_female_dois = set(reg_data.loc[reg_data['gender'].isin(['Female', 'Male']), 'doi'])
cn = sum(len(doi_tids[doi]) for doi in male_female_dois)

In [26]:
cn

23082422

In [27]:
def check_early_self_promo(row):
    """Locate an author's matched tweet among the time-ordered tweets of the paper.

    Parameters
    ----------
    row : mapping (DataFrame row or dict)
        Must provide 'doi', 'self_promotion' and 'matched_tid'.

    Returns
    -------
    tuple
        (total, position, fraction) where
        * total    - number of the paper's tweets that have a timestamp in
                     ``tid_date_all``;
        * position - 1-based rank of the matched tweet once those tweets are
                     sorted by timestamp;
        * fraction - position / total.
        ``position`` and ``fraction`` are -1 when the row is not a
        self-promotion, or when the matched tweet is not among the
        timestamped tweets of the paper.

    Raises
    ------
    KeyError
        If the DOI is missing from ``doi_tids``.
    """
    doi, promoted, matched_tid = row['doi'], row['self_promotion'], row['matched_tid']
    dated_tids = [tid for tid in doi_tids[doi] if tid in tid_date_all]
    total = len(dated_tids)

    if not promoted:
        return (total, -1, -1)

    # A matched tweet without a timestamp (or not listed for this paper) used to
    # raise ValueError from list.index and abort the whole row-wise apply;
    # report "position unknown" with the same -1 sentinel instead.
    if matched_tid not in dated_tids:
        return (total, -1, -1)

    ordered = sorted(dated_tids, key=tid_date_all.get)
    position = ordered.index(matched_tid) + 1
    return (total, position, position / total)

In [28]:
# Row-wise computation: every result is a (total, position, fraction) tuple,
# which is unpacked into three new columns aligned on the frame's index.
promo_position_stats = reg_data.apply(check_early_self_promo, axis=1)
reg_data[['total_num_tweets', 'self_promo_position', 'self_promo_position_frac']] = pd.DataFrame(promo_position_stats.tolist(), index=reg_data.index)


In [29]:
np.sum(reg_data['self_promo_position'] == 1)

154460

In [30]:
np.sum(reg_data['self_promo_position'] == 1) / np.sum(reg_data['self_promotion'] == True)

0.2937104956911305

### Add num. of followers control

In [31]:
def get_tid_follower_cn(row):
    """Follower count recorded for the author's matched tweet.

    Parameters
    ----------
    row : mapping (DataFrame row or dict)
        Must provide 'self_promotion'; 'matched_tid' is read only when
        'self_promotion' is truthy.

    Returns
    -------
    The value stored in ``tid_follow_cn_all`` for the matched tweet, or -1 when
    the row is not a self-promotion.

    Raises
    ------
    KeyError
        If the matched tweet has no entry in ``tid_follow_cn_all``.
    """
    if not row['self_promotion']:
        return -1
    return tid_follow_cn_all[row['matched_tid']]

In [32]:
reg_data['matched_tid_follower_cn'] = reg_data.apply(get_tid_follower_cn, axis=1)

In [33]:
reg_data.head()

Unnamed: 0,doi,pub_year,author_name,authorship_seq,authorship_pos,author_id,affiliation_ids,self_promotion_original,matched_tid_original,self_promotion_retweet,...,General,Life_Sciences,Health_Sciences,Physical_Sciences,affiliation_rank_cate,author_pub_count_cate,total_num_tweets,self_promo_position,self_promo_position_frac,matched_tid_follower_cn
0,10.4202/app.00261.2016,2016,Michal Zaton,2,last_position,2064717215,864159182,False,,False,...,0,0,0,0,8.0,7.0,0,-1,-1.0,-1
1,10.1016/j.foodchem.2013.11.152,2014,Hee-Woong Kim,2,middle_position,2099457132,165507594,False,,False,...,0,1,0,1,7.0,2.0,1,-1,-1.0,-1
2,10.1016/j.foodchem.2013.11.152,2014,Deug-Chan Lee,4,middle_position,2322741405,165507594,False,,False,...,0,1,0,1,7.0,2.0,1,-1,-1.0,-1
3,10.1016/j.foodchem.2013.11.152,2014,Hae-Ik Rhee,5,last_position,2463841131,165507594,False,,False,...,0,1,0,1,7.0,5.0,1,-1,-1.0,-1
4,10.1016/j.foodchem.2013.11.152,2014,He Li,3,middle_position,2569941301,4750791,False,,False,...,0,1,0,1,9.0,1.0,1,-1,-1.0,-1


### Add num of retweets for self-promotional tweets

In [9]:
len(reg_data)

14552304

In [10]:
len(reg_data.loc[reg_data['self_promotion'] == True])

525892

In [19]:
np.sum(reg_data.loc[reg_data['self_promotion'] == True]['total_num_tweets'].isna())

0

In [16]:
len(reg_data.loc[~reg_data['matched_tid'].isna()])

525892

In [17]:
# Distinct IDs of the matched (self-promotional) tweets.
tids_ = set(reg_data['matched_tid'].dropna())

In [20]:
# (retweet_count, favorite_count) for every matched tweet found in the dumps.
tid_engage_stats = {}

# Both dumps hold one tweet JSON object per line. The second file was collected
# earlier and is read last, so when a tweet appears in both files its counts
# from the second file are the ones kept.
for tweet_file in ('tweets_v2.json', 'tweets_v2_non_full_text.json'):
    with open(Data_Root + tweet_file, 'r') as ifile:
        for raw_tweet in ifile:
            tweet = json.loads(raw_tweet)
            tid = tweet['id_str']
            if tid in tids_:
                tid_engage_stats[tid] = (tweet['retweet_count'], tweet['favorite_count'])

In [21]:
# Retweet and like counts of each row's matched tweet. Rows whose matched_tid
# has no entry in tid_engage_stats (including rows without a matched tweet) get
# the sentinel -1. The previous fallback of 0 was indistinguishable from a
# tweet with no engagement and contradicted both the saved preview (which shows
# -1) and the later `matched_tid_retweet_cn != -1` row count.
reg_data['matched_tid_retweet_cn'] = reg_data['matched_tid'].apply(lambda tid: tid_engage_stats[tid][0] if tid in tid_engage_stats else -1)
reg_data['matched_tid_likes_cn'] = reg_data['matched_tid'].apply(lambda tid: tid_engage_stats[tid][1] if tid in tid_engage_stats else -1)

In [22]:
reg_data.head()

Unnamed: 0,doi,pub_year,author_name,authorship_seq,authorship_pos,author_id,affiliation_ids,self_promotion_original,matched_tid_original,self_promotion_retweet,...,is_active_on_twitter,follower_cn_snapshot,matched_uid,is_active_on_twitter_ours,follower_cn_snapshot_ours,is_active_on_twitter_combine,follower_cn_snapshot_combine,self_promotion_first,matched_tid_retweet_cn,matched_tid_likes_cn
0,10.4202/app.00261.2016,2016,Michal Zaton,2,last_position,2064717215,864159182,False,,False,...,False,-1,,False,-1,False,-1,False,-1,-1
1,10.1016/j.foodchem.2013.11.152,2014,Hee-Woong Kim,2,middle_position,2099457132,165507594,False,,False,...,False,-1,,False,-1,False,-1,False,-1,-1
2,10.1016/j.foodchem.2013.11.152,2014,Deug-Chan Lee,4,middle_position,2322741405,165507594,False,,False,...,False,-1,,False,-1,False,-1,False,-1,-1
3,10.1016/j.foodchem.2013.11.152,2014,Hae-Ik Rhee,5,last_position,2463841131,165507594,False,,False,...,False,-1,,False,-1,False,-1,False,-1,-1
4,10.1016/j.foodchem.2013.11.152,2014,He Li,3,middle_position,2569941301,4750791,False,,False,...,False,-1,,False,-1,False,-1,False,-1,-1


In [23]:
len(reg_data.loc[reg_data['matched_tid_retweet_cn'] != -1])

525892

### Different definition of self-promotion

Define self-promotion timing as the time difference (in seconds) between the promotion tweet and the paper's recorded publication date

In [35]:
doi_pub_date = {}
doi_tw_aud = {}

cn_lines = 0
with open(Data_Root + 'merged.txt', 'r') as ifile:
    for cn_lines, raw_line in enumerate(ifile, start=1):
        if cn_lines % 1000000 == 0:
            print('processed %d lines...'%cn_lines)
        # Some lines carry several records glued together as '}{"altmetric_id"';
        # putting a newline between them lets good and bad lines be split the
        # same way.
        separated = raw_line.replace('}{"altmetric_id"', '}\n{"altmetric_id"')
        for record in separated.split('\n'):
            if record == '':
                continue
            paper = json.loads(record)
            citation = paper['citation']
            doi = citation.get('doi', '')
            if doi not in dois:
                continue
            # 'pubdate' when present, otherwise 'epubdate', otherwise ''
            doi_pub_date[doi] = citation.get('pubdate', citation.get('epubdate', ''))
            demographics = paper['demographics']
            if 'poster_types' in demographics:
                doi_tw_aud[doi] = demographics['poster_types']

processed 1000000 lines...
processed 2000000 lines...
processed 3000000 lines...
processed 4000000 lines...
processed 5000000 lines...
processed 6000000 lines...
processed 7000000 lines...
processed 8000000 lines...
processed 9000000 lines...
processed 10000000 lines...
processed 11000000 lines...
processed 12000000 lines...
processed 13000000 lines...
processed 14000000 lines...
processed 15000000 lines...
processed 16000000 lines...
processed 17000000 lines...
processed 18000000 lines...
processed 19000000 lines...
processed 20000000 lines...
processed 21000000 lines...
processed 22000000 lines...
processed 23000000 lines...
processed 24000000 lines...
processed 25000000 lines...


In [36]:
def cal_self_promo_time(row):
    """Seconds elapsed from a paper's recorded publication date to the matched tweet.

    Parameters
    ----------
    row : mapping (DataFrame row or dict)
        Must provide 'doi', 'self_promotion' and 'matched_tid'.

    Returns
    -------
    -1 when the row is not a self-promotion; otherwise the delay in seconds,
    with zero or negative delays reported as 0.
    """
    paper_doi, promoted, tweet_id = row['doi'], row['self_promotion'], row['matched_tid']
    if not promoted:
        return -1

    tweeted_at = tid_date_all[tweet_id]
    published_at = datetime.strptime(doi_pub_date[paper_doi], '%Y-%m-%dT%H:%M:%S+00:00')
    elapsed = (tweeted_at - published_at).total_seconds()
    # The promotion often happens before the recorded publication date;
    # such cases are reported as 0.
    return elapsed if elapsed > 0 else 0

In [37]:
# One value per row: -1 for non-promoting rows, else the delay in seconds.
reg_data['self_promo_time'] = reg_data.apply(cal_self_promo_time, axis=1)

In [38]:
reg_data.loc[reg_data['self_promo_time'] != -1]

Unnamed: 0,doi,pub_year,author_name,authorship_seq,authorship_pos,author_id,affiliation_ids,self_promotion_original,matched_tid_original,self_promotion_retweet,...,Life_Sciences,Health_Sciences,Physical_Sciences,affiliation_rank_cate,author_pub_count_cate,total_num_tweets,self_promo_position,self_promo_position_frac,matched_tid_follower_cn,self_promo_time
30,10.1111/jofo.12266,2018,David Canal,3,middle_position,2168392039,134820265|151201029,False,,True,...,1,0,0,1.0,5.0,13,13,1.000000,588,1438756.0
52,10.1016/j.neulet.2017.04.062,2017,Yi Wang,4,middle_position,2549369961,156144747,False,,True,...,1,0,0,8.0,1.0,2,2,1.000000,3609,0.0
61,10.1111/ejn.13879,2018,Abigail M. Polter,1,first_position,1994828131,27804330,True,973983994038628352,False,...,1,0,0,2.0,3.0,10,1,0.100000,1362,0.0
149,10.1136/bjsports-2017-098005,2017,Keith A Stokes,5,last_position,2038353575,51601045,False,,True,...,0,1,0,5.0,8.0,141,17,0.120567,1801,123315.0
150,10.1136/bjsports-2017-098005,2017,Simon P Roberts,2,middle_position,2117520759,51601045,True,922015387457290240,True,...,0,1,0,5.0,4.0,141,9,0.063830,286,116573.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14552117,10.1038/ng.2891,2014,Paul A Bates,17,middle_position,2120952471,2801316944,True,464430132246372352,False,...,1,0,0,8.0,7.0,32,18,0.562500,3448,5931746.0
14552129,10.1038/ng.2891,2014,David L Nicol,21,middle_position,2527823134,1325846038,True,430453624884301824,False,...,1,0,0,7.0,3.0,32,6,0.187500,6341,0.0
14552138,10.1002/grl.50374,2013,Gemma L. Smith,1,first_position,2150138117,180144235,True,388305956263575552,False,...,0,0,1,9.0,1.0,27,3,0.111111,246,14307129.0
14552281,10.1016/j.amjsurg.2018.07.064,2018,Arghavan Salles,4,last_position,2115201536,97018004,True,1147915473536737280,True,...,0,1,0,0.0,5.0,18,18,1.000000,43622,24167350.0


In [39]:
np.median(reg_data[reg_data.self_promotion == True]['self_promo_time'])

0.0

In [40]:
np.mean(reg_data[reg_data.self_promotion == True]['self_promo_time'])

7122271.972650278

In [41]:
np.mean(reg_data.total_num_tweets)

9.493099305786904

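Alternative definition: count a promotion as self-promotion only if it happens within one day after publication (promotions made before the recorded publication date have a delay of 0 and are therefore included)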

In [42]:
day_sec = 24 * 60 * 60   # seconds in one day
thsh = 1                 # threshold, in days

def promo_fun(delta):
    """Tell whether a promotion delay falls within the threshold window.

    ``delta`` is a delay in seconds, with -1 meaning "no self-promotion".
    Returns False for -1, otherwise True exactly when
    ``delta <= day_sec * thsh``.
    """
    if delta == -1:
        return False
    return bool(delta <= day_sec * thsh)

In [43]:
reg_data['self_promotion_def'] = reg_data['self_promo_time'].apply(promo_fun)

In [53]:
reg_data.loc[reg_data['self_promotion_def'] == True, ['self_promo_time', 'self_promotion_def']].sample(100)

Unnamed: 0,self_promo_time,self_promotion_def
13875592,0.0,True
12301024,0.0,True
5982463,0.0,True
366531,0.0,True
1338354,0.0,True
...,...,...
12900579,0.0,True
6685511,0.0,True
10398454,0.0,True
5676775,0.0,True


Code the dependent variable (DV) as whether the author is the first one to self-promote a paper

In [31]:
# For every paper with at least one self-promoting author, record the author
# with the smallest self_promo_time. This replaces a Python loop over every DOI
# group (each with its own filter and sort) by one filter, one sort and one
# de-duplication; the resulting mapping is the same.
promo_rows = reg_data.loc[reg_data['self_promo_time'] != -1, ['doi', 'author_id', 'self_promo_time']]

# mergesort is stable: rows with equal self_promo_time keep their original
# order, so ties are still resolved in favour of the row that appears first.
# NOTE(review): delays are reported as 0 whenever the tweet precedes the
# recorded publication date, so ties at 0 are presumably frequent and "first"
# then depends on row order - consider ranking by the tweet timestamp instead.
first_promo_rows = (
    promo_rows
    .sort_values('self_promo_time', kind='mergesort')
    .drop_duplicates(subset='doi', keep='first')
)
doi_first_aid = dict(zip(first_promo_rows['doi'], first_promo_rows['author_id']))

In [32]:
len(doi_first_aid)

356542

In [35]:
len(set(reg_data[reg_data['self_promotion'] == True]['doi']))

356542

In [33]:
def is_first_to_promo(row):
    """Tell whether this row's author is the one recorded as first promoter of its paper.

    Reads 'doi' and 'author_id' from ``row`` and compares the author with the
    entry of ``doi_first_aid`` for that DOI. Returns False when the DOI has no
    entry.
    """
    paper_doi, author = row['doi'], row['author_id']
    if paper_doi not in doi_first_aid:
        return False
    return bool(author == doi_first_aid[paper_doi])

In [34]:
# True only on the row of the author recorded as first promoter of the paper.
reg_data['self_promotion_first'] = reg_data.apply(is_first_to_promo, axis=1)

In [38]:
sum(reg_data['self_promotion_first']==True)

356542

### Add audiences (DV)

In [55]:
len(dois)

2986263

In [56]:
len(doi_tw_aud)

2213001

In [47]:
doi_tw_aud['10.1001/jama.2018.17121']

{'member_of_the_public': 47,
 'researcher': 11,
 'practitioner': 21,
 'science_communicator': 4}

In [48]:
def get_num_audiences(doi, kind = 'member_of_the_public'):
    """Number recorded for one poster type of a paper.

    Looks up ``kind`` in the poster-type counts stored for ``doi`` in
    ``doi_tw_aud``. Returns 0 when the paper has no entry or the entry has no
    such poster type.
    """
    return doi_tw_aud.get(doi, {}).get(kind, 0)

In [49]:
# One column per poster type, filled with the paper-level count (0 if absent).
for audience_kind in ('member_of_the_public', 'researcher', 'practitioner', 'science_communicator'):
    reg_data[audience_kind] = reg_data['doi'].apply(get_num_audiences, args=(audience_kind,))

In [50]:
reg_data['num_non_scientists'] = reg_data[['member_of_the_public', 'practitioner', 'science_communicator']].sum(axis=1)


In [51]:
np.sum(reg_data['researcher'])

28239431

In [52]:
np.sum(reg_data['num_non_scientists'])

99928127

### Add author ethnicity (focus on male and female to reduce the number of authors queried)

In [17]:
# Maps a fine-grained ethnicity label (the values stored in
# 'author_eth_ethnea') to the broader category written to
# 'author_eth_ethnea_broad'. Labels missing from this map become NaN when the
# map is applied with Series.map.
eth_cat_map = {
 # Middle East
 'ISRAELI': 'MiddleEastern',
 'ARAB': 'MiddleEastern',
 'TURKISH': 'MiddleEastern',
 # English
 'ENGLISH': 'English',
 # Southern Europe
 'HISPANIC': 'SouthernEuropean',
 'ITALIAN': 'SouthernEuropean',    
 'GREEK': 'SouthernEuropean',
 # Western / Northern Europe
 'GERMAN': 'WesternNorthernEuropean',
 'NORDIC': 'WesternNorthernEuropean',
 'DUTCH': 'WesternNorthernEuropean',
 'FRENCH': 'WesternNorthernEuropean',
 'BALTIC': 'WesternNorthernEuropean',
 # Eastern Europe
 'HUNGARIAN': 'EasternEuropean',
 'ROMANIAN': 'EasternEuropean',
 'SLAV': 'EasternEuropean',
 # Single-label categories
 'CHINESE': 'Chinese',
 'INDIAN': 'Indian',
 'AFRICAN': 'African',
 # East Asia (as grouped here; also holds the South-East Asian labels)
 'KOREAN': 'EastAsian',
 'JAPANESE': 'EastAsian',
 'THAI': 'EastAsian',
 'VIETNAMESE': 'EastAsian',
 'INDONESIAN': 'EastAsian',
 'MONGOLIAN': 'EastAsian',
 # NOTE(review): these two keep the upper-case label as category name, unlike
 # the CamelCase names above - confirm this is intentional.
 'CARIBBEAN': 'CARIBBEAN',
 'POLYNESIAN': 'POLYNESIAN',
 # Placeholder values produced by get_author_eth_gen, passed through unchanged
 'org': 'org',
 'unknown': 'unknown'}

In [18]:
def parse_author_name(name):
    """Split an author name into the first/last name payload used to query the API.

    Returns the string 'invalid' when the name has fewer than two
    whitespace-separated words. Otherwise returns
    ``{'Fname': <given name>, 'Lname': <last word>}``. The given name is the
    first word, except when the name has at least three words, the first word
    is an initial ("C" or "C.") and the second word is not an initial; then the
    second word is used (e.g. "C. Kirabo Jackson" -> "Kirabo").
    """
    tokens = name.split()
    # a first and a last name are both required to query the API
    if len(tokens) < 2:
        return 'invalid'

    given, family = tokens[0], tokens[-1]
    if len(tokens) >= 3:
        second = tokens[1]
        # "C" or "C."
        starts_with_initial = len(given) == 1 or (len(given) == 2 and given.endswith('.'))
        # rules out "C. J. Jackson" but accepts "C. Del Jackson"
        second_is_full_word = len(second) > 2 or (len(second) == 2 and not second.endswith('.'))
        if starts_with_initial and second_is_full_word:
            given = second
    return {'Fname': given, 'Lname': family}

# Last words (lower-cased) that mark a name as an organisation, not a person.
name_filter = {'consortium', 'collaboration', 'collaboration*', 'editor', 'bank'}

def get_author_eth_gen(name):
    """Look up the cached ethnicity and gender inferred for an author name.

    Returns a two-item list ``[ethnicity, gender]``:
    * ``['unknown', 'unknown']`` when the name cannot be parsed into a first
      and last name, or has no usable cached value;
    * ``['org', 'org']`` when the parsed last name is in ``name_filter``
      (e.g. "The XXX Collaboration");
    * otherwise the part of the cached 'Ethnea' value before the first '-'
      (unless it is ERROR / UNKNOWN / TOOSHORT) and the cached 'Genni' value
      (unless it is '-'), each falling back to 'unknown'.
    """
    parsed = parse_author_name(name)
    # single-word names are not parsed into a dict
    if not isinstance(parsed, dict):
        return ['unknown', 'unknown']
    if parsed['Lname'].lower() in name_filter:
        return ['org', 'org']

    ethnicity, gender = 'unknown', 'unknown'
    if name in name_eth_gen:
        cached = name_eth_gen[name]
        major = cached['Ethnea'].split('-')[0]
        if major not in ('ERROR', 'UNKNOWN', 'TOOSHORT'):
            ethnicity = major
        genni = cached['Genni']
        if genni != '-':
            gender = genni
    return [ethnicity, gender]

In [57]:
# focus on Male and Female
len(reg_data.loc[reg_data['gender'].isin(['Female', 'Male'])])

11420039

In [64]:
len(set(reg_data.loc[reg_data['gender'].isin(['Female', 'Male'])]['author_name']))

3454214

In [19]:
eth_url = 'http://abel.lis.illinois.edu/cgi-bin/ethnea/search.py?format=json'

In [20]:
name_eth_gen = {}

# Reuse the data collected for the initial submission
# (one {"name": ..., "info": ...} JSON object per line).
with open(Data_Root+'name_eth_gen.json', 'r') as ifile:
    for raw_entry in ifile:
        entry = json.loads(raw_entry)
        name_eth_gen[entry['name']] = entry['info']

In [21]:
len(name_eth_gen)

1309714

In [25]:
# focus on Male and Female
ix = 0
for name in set(reg_data.loc[reg_data['gender'].isin(['Female', 'Male'])]['author_name']):
    if name not in name_eth_gen:
        payload = parse_author_name(name)
        if payload != 'invalid':
            ix += 1
            if ix%5000 == 0:
                time.sleep(5)
            response = requests.get(eth_url, params=payload)
            try:
                j = eval(response.text)
                genni = j['Genni']
                ethnea = j['Ethnea']
                name_eth_gen[name] = {'Ethnea': ethnea, 'Genni': genni}
            except:
                continue
            if ix%100000 == 0:
                print(ix)

In [27]:
# Save the cache, one {"name": ..., "info": ...} JSON object per line.
with open(Data_Root+'revision/name_eth_gen.json', 'w') as ofile:
    ofile.writelines(
        json.dumps({'name': name, 'info': info}) + '\n'
        for name, info in name_eth_gen.items()
    )

In [28]:
name_eth_gen = {}

# Reload the cache saved for the revision
# (one {"name": ..., "info": ...} JSON object per line).
with open(Data_Root+'revision/name_eth_gen.json', 'r') as ifile:
    for raw_entry in ifile:
        entry = json.loads(raw_entry)
        name_eth_gen[entry['name']] = entry['info']

In [29]:
len(name_eth_gen)

3629470

Add inferred ethnicity (based on Ethnea)

In [31]:
# Each lookup returns an [ethnicity, gender] pair, unpacked into two columns;
# the broad category is then derived from the fine-grained ethnicity label.
eth_gen_pairs = reg_data['author_name'].apply(get_author_eth_gen)
reg_data[['author_eth_ethnea', 'author_gender_ethnea']] = pd.DataFrame(eth_gen_pairs.tolist(), index=reg_data.index)
reg_data['author_eth_ethnea_broad'] = reg_data['author_eth_ethnea'].map(eth_cat_map)


In [32]:
len(reg_data)

14552304

In [33]:
reg_data.head(3)

Unnamed: 0,doi,pub_year,author_name,authorship_seq,authorship_pos,author_id,affiliation_ids,self_promotion_original,matched_tid_original,self_promotion_retweet,...,self_promo_time,self_promotion_def,member_of_the_public,researcher,practitioner,science_communicator,num_non_scientists,author_eth_ethnea,author_gender_ethnea,author_eth_ethnea_broad
0,10.4202/app.00261.2016,2016,Michal Zaton,2,last_position,2064717215,864159182,False,,False,...,-1.0,False,0,0,0,0,0,SLAV,M,EasternEuropean
1,10.1016/j.foodchem.2013.11.152,2014,Hee-Woong Kim,2,middle_position,2099457132,165507594,False,,False,...,-1.0,False,1,0,0,0,1,unknown,unknown,unknown
2,10.1016/j.foodchem.2013.11.152,2014,Deug-Chan Lee,4,middle_position,2322741405,165507594,False,,False,...,-1.0,False,1,0,0,0,1,unknown,unknown,unknown


In [35]:
reg_data.columns

Index(['doi', 'pub_year', 'author_name', 'authorship_seq', 'authorship_pos',
       'author_id', 'affiliation_ids', 'self_promotion_original',
       'matched_tid_original', 'self_promotion_retweet', 'matched_tid_retweet',
       'self_promotion', 'matched_tid', 'gender', 'num_authors',
       'author_pub_count', 'author_citation', 'author_rank',
       'affiliation_name', 'affiliation_cate', 'affiliation_rank',
       'journal_title', 'journal_impact', 'top_journal',
       'author_num_papers_in_data', 'author_self_promotion_rate',
       'Social_Sciences', 'Materials_Science', 'Engineering', 'Chemistry',
       'Biochemistry__Genetics_and_Molecular_Biology', 'Medicine', 'Nursing',
       'Agricultural_and_Biological_Sciences',
       'Pharmacology__Toxicology_and_Pharmaceutics', 'Neuroscience',
       'Business__Management_and_Accounting',
       'Economics__Econometrics_and_Finance', 'Chemical_Engineering',
       'Physics_and_Astronomy', 'Computer_Science', 'Decision_Sciences',
   

In [14]:
reg_data.to_csv(Data_Root+"revision/reg_data_drop_missing.csv", index=False, header=True, encoding='utf-8')

In [13]:
len(reg_data)

14552304

In [8]:
# utype = {'author_id': str, 'matched_tid': str, 'matched_tid_original': str, 'matched_tid_retweet': str}
# reg_data = pd.read_csv(Data_Root + "revision/reg_data_drop_missing.csv", header=0, dtype=utype)