In [2]:
import os
import math
import json
import csv
import time
import random
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.colors
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from collections import defaultdict, Counter
from datetime import datetime

In [3]:
# Root directory of the data files. Paths in this notebook are built by plain
# string concatenation (Data_Root + "<relative path>"), so the trailing slash
# is required.
Data_Root = '/Data/Promotion/'

In [4]:
# pd.options.display.precision = 20

In [4]:
# Multipliers keyed by confidence level in percent (as strings).
# NOTE(review): presumably the two-sided standard-normal critical values
# (z-scores) for 90/95/99% confidence intervals -- confirm at the point of use.
CIs = {'90': 1.645, '95': 1.96, '99': 2.576}

In [5]:
labels = ['Male', 'Female']

In [6]:
# One color per entry in `labels`, taken from the start of seaborn's current palette.
colors = sns.color_palette()[:len(labels)]

In [7]:
# Force the identifier columns to be parsed as strings when loading the CSV.
utype = dict.fromkeys(
    ['author_id', 'matched_tid', 'matched_tid_original', 'matched_tid_retweet'],
    str,
)
reg_data = pd.read_csv(
    Data_Root + "revision/reg_data_drop_missing.csv",
    header=0,
    dtype=utype,
)

In [49]:
reg_data.shape

(14552304, 81)

In [8]:
reg_data.head(3)

Unnamed: 0,doi,pub_year,author_name,authorship_seq,authorship_pos,author_id,affiliation_ids,self_promotion_original,matched_tid_original,self_promotion_retweet,...,author_eth_ethnea_broad,pub_date,is_active_on_twitter,follower_cn_snapshot,matched_uid,is_active_on_twitter_ours,follower_cn_snapshot_ours,is_active_on_twitter_combine,follower_cn_snapshot_combine,self_promotion_first
0,10.4202/app.00261.2016,2016,Michal Zaton,2,last_position,2064717215,864159182,False,,False,...,EasternEuropean,2016-01-01,False,-1,,False,-1,False,-1,False
1,10.1016/j.foodchem.2013.11.152,2014,Hee-Woong Kim,2,middle_position,2099457132,165507594,False,,False,...,unknown,2014-06-01,False,-1,,False,-1,False,-1,False
2,10.1016/j.foodchem.2013.11.152,2014,Deug-Chan Lee,4,middle_position,2322741405,165507594,False,,False,...,unknown,2014-06-01,False,-1,,False,-1,False,-1,False


Add paper publication date

In [9]:
# Map each DOI to its publication timestamp; the input holds one JSON record per line.
doi_pub_date = {}
with open(Data_Root + 'revision/papers_2013_2018.json', 'r') as papers_file:
    for record in map(json.loads, papers_file):
        doi_pub_date[record['doi']] = record['data']['pub_date']

In [10]:
len(doi_pub_date)

6601528

In [11]:
doi_pub_date['10.4202/app.00261.2016']

'2016-01-01T00:00:00+00:00'

In [12]:
# Keep only the first 10 characters (the date part) of each paper's timestamp.
# A DOI that is missing from doi_pub_date raises KeyError.
reg_data['pub_date'] = [doi_pub_date[doi][:10] for doi in reg_data['doi']]

In [13]:
# https://docs.openalex.org/about-the-data
# manually check these ids on openAlex: https://api.openalex.org/authors/A2569941301
reg_data[['author_id', 'author_name']]

Unnamed: 0,author_id,author_name
0,2064717215,Michal Zaton
1,2099457132,Hee-Woong Kim
2,2322741405,Deug-Chan Lee
3,2463841131,Hae-Ik Rhee
4,2569941301,He Li
...,...,...
14552299,2429127808,Michael J. Pennock
14552300,2135560445,Mark A. Cohen
14552301,2166563239,Sally S. Simpson
14552302,2240327441,Thomas A. Loughran


In [2]:
# reg_data.isna().sum()

### Using author-tweeter matching data from the external source

Data: `Costas et al. 2020 - Large-scale identification and characterization of scholars on Twitter`

In [17]:
# Load the author <-> tweeter matches; tweeter ids are kept as strings.
aid_uid_df = pd.read_csv(
    Data_Root + "revision/authors_tweeters_2022_08_21.csv",
    header=0,
    dtype={'tweeter_id': str},
)

In [18]:
len(aid_uid_df)

498672

In [19]:
# Strip the OpenAlex URL prefix so only the bare author id (as a string) remains.
openalex_author_prefix = 'https://openalex.org/A'
aid_uid_df['mag_aid'] = aid_uid_df['author_id'].map(
    lambda url: url.split(openalex_author_prefix)[1]
)

In [20]:
aid_uid_df.sample(10)

Unnamed: 0,author_id,tweeter_id,criteria,valid,mag_aid
119559,https://openalex.org/A2257728130,839792430023000064,"full name (profile name),full name substring (...",,2257728130
23084,https://openalex.org/A3216699643,1126876446260125696,first initial + last name (profile name),1.0,3216699643
345060,https://openalex.org/A2885259549,22415100,"first token + last name (profile name),first i...",,2885259549
125106,https://openalex.org/A2960088177,858154875556909056,"full name (profile name),full name substring (...",,2960088177
303909,https://openalex.org/A1446706271,1496755333586427904,"full name (profile name),full name substring (...",,1446706271
433025,https://openalex.org/A2086255120,357343726,"full name (profile name),full name substring (...",,2086255120
69292,https://openalex.org/A2133902455,635354503,"first token + last name (profile name),all ini...",1.0,2133902455
190410,https://openalex.org/A2107328086,10547982,"full name (profile name),full name substring (...",,2107328086
184064,https://openalex.org/A3133699441,1035912728504938497,"full name (profile name),full name substring (...",,3133699441
109497,https://openalex.org/A2426007080,808463595755466752,"full name (profile name),full name substring (...",,2426007080


In [21]:
aid_uid_df.isna().sum()

author_id          0
tweeter_id         0
criteria           0
valid         422693
mag_aid            0
dtype: int64

In [22]:
aid_uid_df['valid'].value_counts(dropna=False)

NaN    422693
1.0     69688
0.0      6291
Name: valid, dtype: int64

In [23]:
np.mean(aid_uid_df['valid'])

0.9172008054857264

In [24]:
len(aid_uid_df.loc[aid_uid_df['valid'] != 0])

492381

In [25]:
# Map each author id to the tweeter ids matched to it.
# Rows with valid == 0 (checked and rejected) are dropped; rows with valid == 1
# and rows that were never checked (valid is NaN) are both kept.
non_rejected_matches = aid_uid_df.loc[aid_uid_df['valid'] != 0]

# The ids are stored as a *sorted* list rather than list(set(...)): iteration
# order of a set of strings depends on the per-process hash seed, and downstream
# code stops at the first matching account, so an unsorted list would make the
# chosen account (and its follower count) change between kernel restarts.
aid_uids = {
    aid: sorted(set(gp['tweeter_id']))
    for aid, gp in non_rejected_matches.groupby('mag_aid')
}

In [26]:
len(aid_uids)

488178

In [32]:
len(set(reg_data.loc[reg_data['author_id'].isin(aid_uids)]['author_id']))

201425

In [27]:
# Number of authors whose id is matched to more than one tweeter id.
cn = sum(1 for matched_uids in aid_uids.values() if len(matched_uids) > 1)

In [28]:
# Some authors are matched to multiple Twitter accounts.
cn

3975

In [29]:
# Flatten the per-author lists into the set of all matched tweeter ids.
all_uids = {uid for matched_uids in aid_uids.values() for uid in matched_uids}

In [30]:
# Number of unique Twitter user ids.
len(all_uids)

420813

In [106]:
def _collect_account_activity(path, wanted_uids, account_info, active_times):
    """Scan one line-delimited tweet JSON file and record per-account activity.

    Parameters
    ----------
    path : str
        File with one JSON-encoded tweet per line.
    wanted_uids : set
        Tweeter ids to keep; tweets from any other user are ignored.
    account_info : dict
        Updated in place: uid -> (account created_at, followers_count), taken
        from the first tweet seen for that uid.
    active_times : collections.defaultdict(list)
        Updated in place: uid -> list of raw tweet `created_at` strings.
    """
    with open(path, 'r') as ifile:
        for raw_line in ifile:
            tweet = json.loads(raw_line)
            user = tweet['user']
            uid = user['id_str']
            if uid not in wanted_uids:
                continue
            if uid not in account_info:
                # The follower count is as of data collection time. All tweets
                # were collected around the same time.
                account_info[uid] = (user['created_at'], user['followers_count'])
            active_times[uid].append(tweet['created_at'])


uid_account_creation_time_follower = {}
uid_active_times = defaultdict(list)

# Both tweet dumps share the same record layout, so they go through one helper.
for tweets_file in ('tweets_v2.json', 'tweets_v2_non_full_text.json'):
    _collect_account_activity(
        Data_Root + tweets_file,
        all_uids,
        uid_account_creation_time_follower,
        uid_active_times,
    )

In [124]:
# Replace the defaultdict with a plain dict holding the same items, so that
# looking up an unknown uid raises KeyError instead of silently creating an
# empty list.
uid_active_times = dict(uid_active_times)

Count how many of their user ids appear in our Twitter data.

In [107]:
len(uid_account_creation_time_follower)

288883

In [108]:
len(uid_active_times)

288883

In [109]:
# Persist the collected account information, one JSON object per line.
with open(Data_Root + 'revision/uid_account_info.json', 'w') as account_file:
    for uid, active_time in uid_active_times.items():
        created_at, follower_cn = uid_account_creation_time_follower[uid]
        record = {
            'tweeter_id': uid,
            'created_at': created_at,
            'follower_cn_snapshot': follower_cn,
            'active_time': active_time,
        }
        account_file.write(json.dumps(record) + '\n')

In [110]:
# Format of the raw tweet `created_at` strings. The UTC offset is matched as the
# literal text '+0000', so the parsed datetimes are naive.
TWEET_TIME_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'

# Convert every account's tweet times to datetimes, sorted ascending (earliest
# first). Values that are already datetimes are passed through unchanged, so
# re-running this cell is a no-op instead of a TypeError from strptime.
# Only existing keys are reassigned, which is safe while iterating over items().
for uid, raw_times in uid_active_times.items():
    uid_active_times[uid] = sorted(
        stamp if isinstance(stamp, datetime)
        else datetime.strptime(stamp, TWEET_TIME_FORMAT)
        for stamp in raw_times
    )

In [111]:
uid_active_times['11100372']

[datetime.datetime(2012, 6, 25, 18, 37, 17),
 datetime.datetime(2013, 9, 30, 15, 12, 34),
 datetime.datetime(2014, 6, 11, 10, 52, 48),
 datetime.datetime(2014, 11, 25, 14, 2, 7),
 datetime.datetime(2015, 9, 29, 15, 1, 50),
 datetime.datetime(2015, 10, 27, 14, 1, 53),
 datetime.datetime(2015, 11, 5, 16, 2, 9),
 datetime.datetime(2016, 4, 15, 11, 0, 22),
 datetime.datetime(2018, 1, 15, 15, 46, 9),
 datetime.datetime(2018, 1, 15, 16, 26, 49),
 datetime.datetime(2018, 1, 17, 14, 10, 26),
 datetime.datetime(2018, 7, 19, 8, 35),
 datetime.datetime(2018, 11, 5, 16, 53, 44),
 datetime.datetime(2019, 6, 30, 12, 42, 55),
 datetime.datetime(2019, 9, 25, 13, 0, 59)]

In [132]:
def code_tw_active_status_fcn(row, aid_uids_map=None, active_times=None, account_info=None):
    """Code whether an author was active on Twitter by the paper's publication date.

    An author counts as active if at least one of their matched accounts has its
    earliest recorded tweet at or before midnight of the publication date.

    Parameters
    ----------
    row : mapping
        Must provide 'author_id' and 'pub_date' (a 'YYYY-MM-DD' string).
    aid_uids_map : dict, optional
        author id -> iterable of tweeter ids. Defaults to the notebook-level
        `aid_uids`.
    active_times : dict, optional
        tweeter id -> ascending list of datetimes. Defaults to the notebook-level
        `uid_active_times`.
    account_info : dict, optional
        tweeter id -> (created_at, followers_count). Defaults to the
        notebook-level `uid_account_creation_time_follower`.

    Returns
    -------
    tuple
        (is_active, follower_count); (False, -1) when no matched account
        qualifies.

    Raises
    ------
    ValueError
        If row['pub_date'] does not match '%Y-%m-%d'.
    """
    # Fall back to the notebook globals only when no lookup is injected, so
    # existing single-argument calls keep working.
    if aid_uids_map is None:
        aid_uids_map = aid_uids
    if active_times is None:
        active_times = uid_active_times
    if account_info is None:
        account_info = uid_account_creation_time_follower

    pub_date = datetime.strptime(row['pub_date'], '%Y-%m-%d')

    # Try accounts in sorted order so that, for authors matched to several
    # accounts, the selected account does not depend on list/set ordering.
    for uid in sorted(aid_uids_map.get(row['author_id'], ())):
        times = active_times.get(uid)
        # `times` is sorted ascending, so times[0] is the earliest activity;
        # an account with no recorded tweets is skipped.
        if times and times[0] <= pub_date:
            return (True, account_info[uid][1])
    return (False, -1)

In [134]:
# The status is coded per row: the same author_id can be active for some of
# their observations and inactive for others, depending on each pub_date.
reg_data['Tem'] = reg_data.apply(code_tw_active_status_fcn, axis=1)

In [135]:
# Split the (flag, follower count) tuples in 'Tem' into two columns, then drop
# the temporary column.
status_columns = ['is_active_on_twitter', 'follower_cn_snapshot']
status_frame = pd.DataFrame(reg_data['Tem'].tolist(), index=reg_data.index)
reg_data[status_columns] = status_frame
reg_data = reg_data.drop(columns='Tem')


In [138]:
reg_data['is_active_on_twitter'].value_counts(dropna=False)

False    13873164
True       679140
Name: is_active_on_twitter, dtype: int64

In [141]:
# self_promotion is coded using our own heuristic, while active status is coded from the external data.
# A discrepancy is expected because the two heuristics differ.
reg_data.loc[reg_data['self_promotion'] == True]['is_active_on_twitter'].value_counts()

True     285926
False    239966
Name: is_active_on_twitter, dtype: int64

In [140]:
np.median(reg_data.loc[reg_data['is_active_on_twitter']==True, 'follower_cn_snapshot'])

833.0

In [140]:
# Mean self_promotion per publication year, restricted to rows coded as active on Twitter.
active_rows = reg_data.loc[reg_data['is_active_on_twitter'] == True]
for pub_year, year_rows in active_rows.groupby('pub_year'):
    print(pub_year, np.mean(year_rows['self_promotion']))

2013 0.3485364439936241
2014 0.3630606757336035
2015 0.378611831251473
2016 0.3858121338808424
2017 0.42856356092964054
2018 0.471245501357713
