In [1]:
import os
import json
import pickle
import datetime 

import pandas as pd
from tqdm import tqdm

DATA_DIR = '/pool001/jschless/kiran-data/kiran-data/'

# Pull in Friends Data
Store it in a dictionary of the form {username: set(friends of username)} 

Takes roughly 40 seconds to several minutes to run (the run recorded below took about 4.5 minutes).

In [2]:
def get_friends(filename):
    """Load a tab-separated edge list and return each user's set of friends.

    The file is read from ``DATA_DIR``. It is parsed as two tab-separated
    columns without a header row; the columns are named ``user`` and
    ``friend`` on load.

    Parameters
    ----------
    filename : str
        Name of the edge-list file inside ``DATA_DIR``.

    Returns
    -------
    dict
        Mapping of the form ``{user: set(friends of user)}``.
    """
    edges = pd.read_csv(os.path.join(DATA_DIR, filename),  # loading data
                        sep='\t', names=['user', 'friend'])
    # Select the single 'friend' column before aggregating: this avoids
    # handing a full sub-DataFrame to a Python lambda for every user and does
    # not rely on GroupBy.apply operating on the grouping column.
    return (
        edges.groupby('user')['friend']  # group by the user
        .apply(set)                      # get set of all friends
        .to_dict()                       # convert it to a python dictionary
    )

In [3]:
%%time 
# Combine both friend files into one {username: set(friends)} dict.
# If a user appears in both files, the FRIENDS1.txt entry replaces the
# FRIENDS.txt entry (the sets are not unioned).
friends_dict = dict(get_friends('FRIENDS.txt'))
friends_dict.update(get_friends('FRIENDS1.txt'))

CPU times: user 4min 8s, sys: 18.4 s, total: 4min 27s
Wall time: 4min 30s


# Convert Tweet Data into Pipeline Format
For each hashtag, we need to find who saw a tweet from a friend using the hashtag before participating

Also, create mapping of user ids to user names

Takes 1.5 minutes to run

__Errors: 540 of the tweets do not parse properly as JSON. The cause is unclear; the failure is different each time. Because the number is so small, these lines are simply skipped.__

In [4]:
%%time


# Only store the following tweet columns; saves time and memory.
columns_needed = {'author_id', 'screen_name', 'created_at',
                  'date', 'id', 'text', 'trend', 'trend_date',
                  'favorite_count'}

# (key inside tweet['user'], key written to the filtered tweet), in the order
# the fields are added to each filtered tweet.
user_count_fields = (
    ('followers_count', 'n_followers'),
    ('friends_count', 'n_friends'),
    ('statuses_count', 'n_statuses'),
)

tweet_dict = {}   # tweet id -> filtered tweet dict
name_to_id = {}   # screen name -> user id
id_to_name = {}   # user id -> screen name
errors = 0        # number of lines that could not be processed

# Each line of TWEETINFO.txt is parsed as one JSON document.
with open(os.path.join(DATA_DIR, 'TWEETINFO.txt'), 'r') as f:
    for line in f:
        try:
            tweet = json.loads(line)
            user = tweet['user']

            id_to_name[user['id']] = user['screen_name']
            name_to_id[user['screen_name']] = user['id']

            filtered_tweet = {k: v for k, v in tweet.items() if k in columns_needed}

            # Copy the per-user counters when they are present.
            for source_key, target_key in user_count_fields:
                if source_key in user:
                    filtered_tweet[target_key] = user[source_key]

            tweet_dict[tweet['id']] = filtered_tweet
        except (ValueError, KeyError, TypeError):
            # ValueError covers json.JSONDecodeError (malformed line);
            # KeyError / TypeError cover records without the expected
            # 'user' / 'id' / 'screen_name' structure. Such lines are counted
            # and skipped; anything else is a real bug and should surface.
            errors += 1

print("# errors", errors)

# errors 540
CPU times: user 1min 30s, sys: 3.74 s, total: 1min 33s
Wall time: 1min 33s


In [5]:
# Load the hashtag mapping file (tab-separated, no header row).
ht_mapping_path = os.path.join(DATA_DIR, 'hashtag_mapping.txt')
ht_mapping_frame = pd.read_csv(ht_mapping_path, sep='\t', header=None, index_col=1)

# Column 1 is the index, so this yields {column-1 value: column-0 value}.
# Presumably that is {hashtag id: hashtag text} -- verify against the file.
ht_mapping = ht_mapping_frame[0].to_dict()

## Constructing Dictionary of Hashtag Tweets

__Errors: about 21k of the tweets had issues: their tweet_id does not exist in the TWEETINFO.txt file. Many of them appear to belong to private accounts.__

In [6]:
# Create a dictionary of the form {hashtag: list[tweets using hashtag]}.
# Every file in hashtag_data/ is named by a hashtag id and holds one tweet
# link per line.
campaigns = {}
errors = 0  # links whose tweet could not be resolved or parsed
hashtag_dir = os.path.join(DATA_DIR, 'hashtag_data')
for ht_id in tqdm(os.listdir(hashtag_dir)):
    ht = ht_mapping[int(ht_id)]
    tweets = []
    with open(os.path.join(hashtag_dir, ht_id)) as f:
        for link in f:
            tokens = link.split('/')
            # The last path component is the tweet id; int() ignores the
            # trailing newline.
            tweet_id = int(tokens[-1])

            try:
                # Work on a copy: the same tweet can be listed under several
                # hashtags, and each campaign needs its own record so that
                # 'trend' (and fields added later) are not overwritten by
                # whichever hashtag file is processed last.
                tweet = dict(tweet_dict[tweet_id])
                # tokens[3] is used as the author's screen name; presumably
                # links look like https://twitter.com/<user>/status/<id>
                # -- verify against the data.
                tweet['author'] = tokens[3]
                tweet['author_id'] = name_to_id.get(tokens[3], -1)
                tweet['trend'] = ht
                created_at = tweet['created_at']
                if isinstance(created_at, str):
                    # if the created at is not a date, convert it
                    tweet['created_at'] = datetime.datetime.strptime(
                        created_at, '%a %b %d %H:%M:%S +0000 %Y')
                tweets.append(tweet)
            except (KeyError, IndexError, ValueError):
                # KeyError: tweet id not in tweet_dict, or no 'created_at';
                # IndexError: link has fewer than four '/'-separated parts;
                # ValueError: 'created_at' does not match the expected format.
                errors += 1
    campaigns[ht] = tweets
print('# errors', errors)

100%|██████████| 418/418 [00:39<00:00, 10.69it/s]

# errors 20961





# Integration with Original Data

In [7]:
TURKEY_DIR = '/pool001/jschless/turkish_astroturfing'

# Read the original tweet table, parsing its three date columns.
trend_tweets_path = os.path.join(TURKEY_DIR, 'trend_tweets.csv')
df = pd.read_csv(trend_tweets_path,
                 parse_dates=['date', 'trend_date', 'created_at'])

# {trend: list of row dicts for that trend}
old_campaigns = {
    trend: trend_rows.to_dict(orient="records")
    for trend, trend_rows in df.groupby("trend")
}
del df  # the frame is no longer needed once converted to records

# fill in missing author names from Tugrulcan's data
# (authors whose id is not in id_to_name are labelled 'missing')
for campaign_tweets in old_campaigns.values():
    for record in campaign_tweets:
        record['author'] = id_to_name.get(record['author_id'], 'missing')

In [8]:
# Merge the original data into the new data, hashtag by hashtag.
# Only hashtags already present in `campaigns` are extended; the lists are
# extended in place, so re-running this cell appends the old tweets again.
for ht, new_tweets in campaigns.items():
    new_tweets.extend(old_campaigns.get(ht, []))

# Exposure Calculation

In [9]:
# Load the pickled followers dictionary from DATA_DIR.
followers_path = os.path.join(DATA_DIR, 'followers_dict.pkl')
with open(followers_path, 'rb') as followers_file:
    followers_dict = pickle.load(followers_file)

In [12]:
# Flag every tweet as exposed / not exposed, one hashtag at a time.
# NOTE: an earlier variant of this loop also tracked cumulative follower
# exposure through followers_dict ('n_newly_exposed'); that part is disabled.
for ht, tweets in tqdm(campaigns.items()):
    # Authors who have already used this hashtag, in chronological order.
    tweeted = set()
    for tweet in sorted(tweets, key=lambda x: x['created_at']):
        author = tweet['author']
        # A tweet is "exposed" when at least one of the author's friends used
        # the hashtag earlier, i.e. the friend set and the set of earlier
        # authors overlap. isdisjoint() answers that without building the
        # intersection. Authors without an entry in friends_dict are treated
        # as having no friends.
        tweet['exposed'] = not friends_dict.get(author, set()).isdisjoint(tweeted)
        tweeted.add(author)

100%|██████████| 418/418 [01:45<00:00,  3.97it/s]


In [13]:
# Checkpoint: write the campaigns dictionary to disk.
checkpoint_path = os.path.join(DATA_DIR, 'campaigns_3.pkl')
with open(checkpoint_path, 'wb') as checkpoint_file:
    pickle.dump(campaigns, checkpoint_file)


In [14]:
# Reload the campaigns checkpoint so the notebook can resume from here.
# NOTE(review): pickle.load can execute arbitrary code; only load checkpoints
# written by this notebook.
checkpoint_path = os.path.join(DATA_DIR, 'campaigns_3.pkl')
with open(checkpoint_path, 'rb') as checkpoint_file:
    campaigns = pickle.load(checkpoint_file)

In [15]:
%%time

# Build one frame per hashtag and concatenate them in a single call.
# Growing a DataFrame with .append() inside a loop re-copies all accumulated
# rows on every iteration, and DataFrame.append no longer exists in
# pandas >= 2.0. Each frame keeps its own 0..n-1 index, as before.
campaign_frames = [
    pd.DataFrame.from_records(tweets)
    for tweets in campaigns.values()
    if tweets  # hashtags without tweets contribute no rows
]
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(campaign_frames, sort=False) if campaign_frames else pd.DataFrame()

CPU times: user 3min 24s, sys: 17 s, total: 3min 41s
Wall time: 3min 41s


In [16]:
# Preview the first five rows of the combined tweet frame.
df.head(n=5)

Unnamed: 0,created_at,favorite_count,id,text,n_followers,n_friends,n_statuses,author,author_id,trend,exposed,n_newly_exposed,date,trend_date,tweet_type
0,2020-05-02 14:05:16,1.0,1256585550796148738,#MilliGazeteOkuyorum #SesimizBir #Cumartesi #D...,1767.0,2161.0,142612.0,GunesliGuzel,293656352,#ÜniversiteliİşçilereAdalet,False,276.0,NaT,NaT,
1,2019-11-18 17:38:31,10.0,1196482833696645120,Kamuda çalışan üniversiteli işçiler memur stat...,809.0,2573.0,10232.0,yaprakergen,325766266,#ÜniversiteliİşçilereAdalet,False,0.0,NaT,NaT,
2,2019-11-11 17:02:57,,1193937167007068160,@MemurSenKonf Üniversite mezunu 4D'li işçiler ...,1245.0,1310.0,1624.0,Erdemakkusss,965200054352076800,#ÜniversiteliİşçilereAdalet,True,0.0,NaT,NaT,
3,2019-11-11 15:21:30,,1193911634013630466,@_aliyalcin_ Üniversite mezunu 4D'li işçiler o...,1245.0,1310.0,1624.0,Erdemakkusss,965200054352076800,#ÜniversiteliİşçilereAdalet,True,0.0,NaT,NaT,
4,2019-11-11 15:21:17,,1193911580230062080,@SabahMemurlar Üniversite mezunu 4D'li işçiler...,1245.0,1310.0,1624.0,Erdemakkusss,965200054352076800,#ÜniversiteliİşçilereAdalet,True,0.0,NaT,NaT,


# Trending Info

In [19]:
ls /pool001/jschless/turkish_astroturfing

[0m[48;5;10;38;5;21mfollower_info[0m/             [38;5;34mtrend_tweets.csv[0m*
[38;5;27m__MACOSX[0m/                  [38;5;34mtrend_tweets.csv.zip[0m*
[38;5;34mtrend_analysis.csv[0m*        trend_tweets_w_error.csv
[38;5;34mtrend_analysis_top10.csv[0m*  [38;5;34mworld_trend_analysis.csv[0m*
[38;5;34mtrend_tweets_copy.csv[0m*     [38;5;34mworld_trend_analysis_top10.csv[0m*


In [17]:
TURKEY_DIR = '/pool001/jschless/turkish_astroturfing'

# Alternative inputs: 'trend_analysis_top10.csv', 'world_trend_analysis_top10.csv'
trend_file = 'trend_analysis.csv'

# Load the trend table, expose `keyword` under the name `trend` (appended as
# the last column), then drop the columns that are not used afterwards.
trending_info = (
    pd.read_csv(os.path.join(TURKEY_DIR, trend_file),
                parse_dates=['tr_start', 'tr_end', 'lifetime', 'date'])
    .assign(trend=lambda frame: frame.keyword)
    .drop(columns=['date', 'id', 'keyword'])
)

trending_info.head()

Unnamed: 0,tr_start,tr_end,vol,max_rank,lifetime,attack,trend
0,2019-07-14 08:43:19,2019-07-14 10:38:28,69190,15,0 days 01:55:09.000000000,False,"""DEH"""
1,2019-07-13 20:06:51,2019-07-13 20:31:51,70867,13,0 days 00:25:00.000000000,False,"""Deh"""
2,2019-07-13 22:06:46,2019-07-13 22:11:46,70858,23,0 days 00:05:00.000000000,False,"""Deh"""
3,2019-07-13 23:28:42,2019-07-14 00:03:43,71657,24,0 days 00:35:01.000000000,False,"""Deh"""
4,2019-07-14 08:33:21,2019-07-14 08:38:18,68323,15,0 days 00:04:57.000000000,False,"""Deh"""


In [18]:
# Duration of each trending window: end time minus start time.
trending_info['time_trending'] = trending_info['tr_end'] - trending_info['tr_start']

In [19]:
## Tugrulcan's classifier for lexicon tweets

import emoji
import string

def give_emoji_free_text(text):
    """Return `text` without the whitespace-separated tokens that contain an emoji.

    A character counts as an emoji when it is a key of ``emoji.UNICODE_EMOJI``.
    Any token containing at least one such character is dropped entirely; the
    remaining tokens are joined with single spaces.

    NOTE(review): this assumes ``emoji.UNICODE_EMOJI`` is keyed directly by
    emoji characters -- confirm against the installed ``emoji`` version.
    """
    # Emoji characters that actually occur in this text.
    emoji_chars = [ch for ch in text if ch in emoji.UNICODE_EMOJI]

    kept_tokens = []
    for token in text.split():
        if any(emoji_char in token for emoji_char in emoji_chars):
            continue  # token contains an emoji: drop the whole token
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Characters that disqualify a tweet from the "lexicon" class: ASCII
# punctuation except parentheses, the single-character ellipsis, and the ASCII
# digits. Built once here instead of on every call.
_LEXICON_INVALID_CHARS = frozenset(
    (set(string.punctuation) - set('()')) | {'…'} | set(string.digits)
)


def lexicon_classifier(line, trend):
    """Return True when a tweet's text matches the "lexicon" pattern.

    The text is first stripped of emoji-bearing tokens (via
    ``give_emoji_free_text``) and of every occurrence of `trend`. What
    remains is classified as lexicon only if all of the following hold:

    * it is not empty,
    * its first character is not uppercase,
    * it contains no digit and no punctuation other than ``(`` and ``)``,
    * it has between 3 and 10 space-separated tokens (inclusive).

    Parameters
    ----------
    line : str
        Tweet text.
    trend : str
        Trend string to remove from the text before classifying.

    Returns
    -------
    bool
    """
    line = give_emoji_free_text(line)
    # Remove the trend, collapse the double space that removal can leave
    # behind (single pass), and trim the ends.
    line = line.replace(trend, '').replace('  ', ' ').strip()

    if not line or line[0].isupper():
        return False

    if any(char in _LEXICON_INVALID_CHARS for char in line):
        return False

    return 3 <= len(line.split(' ')) <= 10

In [20]:
%%time 

# Attach the trend-level information to every tweet of that trend.
# NOTE(review): trending_info can hold several rows for the same trend, in
# which case a tweet is repeated once per matching row -- confirm this is the
# intended granularity.
mega_df = df.merge(trending_info, on='trend')

# Whole minutes between the tweet and the start of the trending window,
# truncated toward zero.
mega_df["time_since_trending"] = (
    (mega_df.created_at - mega_df.tr_start)
    .apply(lambda delta: int(delta.total_seconds() / 60))
)

# Iterate over the two needed columns directly; DataFrame.apply(axis=1) builds
# a full Series object for every row and dominates the runtime of this cell.
mega_df["lexicon"] = [
    lexicon_classifier(text, trend)
    for text, trend in zip(mega_df.text, mega_df.trend)
]

# True when the author has an entry in friends_dict.
mega_df['follower_data'] = mega_df.author.apply(lambda name: name in friends_dict)

CPU times: user 25min 44s, sys: 1min 3s, total: 26min 48s
Wall time: 26min 48s


In [21]:
# Persist the merged frame now that the expensive steps are done.
mega_df_path = os.path.join(DATA_DIR, 'mega_df_full_follower_final.pkl')
mega_df.to_pickle(mega_df_path)