In [1]:
import os
import json
import pickle
import datetime 

import pandas as pd
from tqdm import tqdm

# Root directory for this notebook's data. The cells below read FRIENDS.txt,
# TWEETINFO.txt, hashtag_mapping.txt and the hashtag_data/ folder from it, and
# write campaigns.pkl and mega_df.pkl back into it.
# Every path is built with os.path.join, so the trailing slash is harmless.
DATA_DIR = '/pool001/jschless/kiran-data/kiran-data/'

# Pull in Friends Data
Store it in a dictionary of the form {username: set(friends of username)} 

Takes roughly a minute to run (about 54 seconds of wall time in the run recorded below).

In [2]:
%%time
# Read the tab-separated (user, friend) pairs and collapse them into
# {user: set of that user's friends}.
friends_dict = (
    pd.read_csv(
        os.path.join(DATA_DIR, 'FRIENDS.txt'),
        sep='\t',
        names=['user', 'friend'],
    )
    .groupby('user')['friend']  # the friend values belonging to each user
    .apply(set)                 # de-duplicate them into a set per user
    .to_dict()                  # Series indexed by user -> plain Python dict
)

CPU times: user 50.6 s, sys: 2.86 s, total: 53.5 s
Wall time: 53.7 s


# Convert Tweet Data into Pipeline Format
For each hashtag, we need to find who saw a tweet from a friend using the hashtag before participating

Also, create mapping of user ids to user names

Takes 1.5 minutes to run

__Errors: 540 tweets fail to parse (any exception raised while reading a line is counted). The failure differs from line to line and the root cause is unknown; because the number is so small, these lines are simply skipped.__

In [3]:
%%time

# Top-level tweet fields that are kept; everything else is dropped to save
# time and memory.
columns_needed = {'author_id', 'screen_name', 'created_at', 'date', 'id', 'text', 'trend', 'trend_date'}

tweet_dict = {}   # tweet id -> filtered tweet
name_to_id = {}   # screen name -> user id
id_to_name = {}   # user id -> screen name
errors = 0        # lines that raised while being parsed or indexed

tweet_info_path = os.path.join(DATA_DIR, 'TWEETINFO.txt')
with open(tweet_info_path, 'r') as f:
    # One JSON-encoded tweet per line.
    for line in f:
        try:
            tweet = json.loads(line)
            user = tweet['user']

            # Both directions of the id <-> screen name mapping.
            id_to_name[user['id']] = user['screen_name']
            name_to_id[user['screen_name']] = user['id']

            # Iterate the tweet (not the wanted set) so the kept fields stay in
            # the tweet's own key order.
            tweet_dict[tweet['id']] = {
                field: value
                for field, value in tweet.items()
                if field in columns_needed
            }
        except Exception:
            # Best effort: count the bad line and move on.
            errors += 1

print("# errors", errors)

# errors 540
CPU times: user 1min 28s, sys: 3.56 s, total: 1min 31s
Wall time: 1min 31s


In [4]:
# Load the hashtag mapping file (tab-separated, no header row).
# The second file column becomes the index and the first column the value, so
# the result is a dict {second-column value: first-column value}.
ht_mapping_table = pd.read_csv(
    os.path.join(DATA_DIR, 'hashtag_mapping.txt'),
    sep='\t',
    header=None,
    index_col=1,
)
ht_mapping = ht_mapping_table[0].to_dict()

## Constructing Dictionary of Hashtag Tweets

__Errors: about 21k of the linked tweets could not be used because their tweet_id does not exist in TWEETINFO.txt. Many of them appear to belong to private accounts.__

In [5]:
# Create a dictionary of the form {hashtag: list[tweets using hashtag]}.
# Each file under hashtag_data/ is named by a hashtag id and holds one tweet
# link per line.
campaigns = {}
errors = 0  # links that could not be parsed or matched to a stored tweet
hashtag_dir = os.path.join(DATA_DIR, 'hashtag_data')
for ht_id in tqdm(os.listdir(hashtag_dir)):
    ht = ht_mapping[int(ht_id)]
    tweets = []
    with open(os.path.join(hashtag_dir, ht_id)) as f:
        for link in f:
            try:
                tokens = link.split('/')
                # The last path segment is the tweet id; int() tolerates the
                # trailing newline. Parsing inside the try means a malformed
                # line is counted as an error instead of aborting the run.
                tweet_id = int(tokens[-1])

                # Work on a copy. The same tweet can be listed under several
                # hashtags; mutating the shared tweet_dict entry would make
                # every campaign hold the *same* dict, so 'trend' (and any
                # field added later, e.g. 'exposed') would be overwritten by
                # whichever campaign is processed last.
                tweet = dict(tweet_dict[tweet_id])

                # presumably links look like
                # https://twitter.com/<screen_name>/status/<id>, which puts the
                # screen name at index 3 -- TODO confirm
                tweet['author'] = tokens[3]
                tweet['author_id'] = name_to_id.get(tokens[3], -1)  # -1: unknown author
                tweet['trend'] = ht
                if isinstance(tweet['created_at'], str):
                    # if the created at is not a date, convert it
                    tweet['created_at'] = datetime.datetime.strptime(
                        tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                tweets.append(tweet)
            except Exception:
                # Best effort: most failures are tweet ids missing from
                # tweet_dict. Count them and keep going.
                errors += 1
    campaigns[ht] = tweets
print('# errors', errors)

100%|██████████| 418/418 [00:39<00:00, 10.60it/s]

# errors 20961





# Integration with Original Data

In [6]:
TURKEY_DIR = '/pool001/jschless/turkish_astroturfing'

df = pd.read_csv(
    os.path.join(TURKEY_DIR, 'trend_tweets.csv'),
    parse_dates=['date', 'trend_date', 'created_at'],
)

# {trend: [one record dict per tweet row of that trend]}
old_campaigns = (
    df.groupby("trend")
      .apply(lambda trend_rows: trend_rows.to_dict(orient="records"))
      .to_dict()
)
del df  # the frame is no longer needed once the records are extracted

# fill in missing author names from Tugrulcan's data
# (ids that are absent from id_to_name get the placeholder 'missing')
for old_tweets in old_campaigns.values():
    for old_tweet in old_tweets:
        old_tweet['author'] = id_to_name.get(old_tweet['author_id'], 'missing')

In [7]:
# Merge original data with the new data: append the old tweets of each hashtag
# to the list built from the new data. Hashtags that only exist in
# old_campaigns are not added.
# NOTE: this extends the lists in place, so re-running the cell appends the old
# tweets a second time.
for ht, new_tweets in campaigns.items():
    new_tweets.extend(old_campaigns.get(ht, []))

# Exposure Calculation

In [10]:
# For every hashtag, walk its tweets in chronological order and flag each tweet
# whose author already had at least one friend using the hashtag.
NO_FRIENDS = frozenset()  # shared default for authors missing from friends_dict
for ht, tweets in tqdm(campaigns.items()):
    tweeted = set()  # authors who have used this hashtag so far
    sorted_tweets = sorted(tweets, key=lambda x: x['created_at'])
    for tweet in sorted_tweets:
        friends = friends_dict.get(tweet['author'], NO_FRIENDS)
        # If the author's friends overlap with the people who have already used
        # the hashtag, the author was EXPOSED before tweeting. isdisjoint
        # answers that without materialising the intersection set.
        tweet['exposed'] = not friends.isdisjoint(tweeted)
        tweeted.add(tweet['author'])

100%|██████████| 418/418 [01:10<00:00,  5.89it/s]


In [11]:
# checkpointing: write campaigns to disk, then read it straight back so the
# in-memory object is the round-tripped one.
campaigns_pkl = os.path.join(DATA_DIR, 'campaigns.pkl')

with open(campaigns_pkl, 'wb') as pkl_out:
    pickle.dump(campaigns, pkl_out)

# pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(campaigns_pkl, 'rb') as pkl_in:
    campaigns = pickle.load(pkl_in)

In [12]:
%%time

# Stack every campaign's tweets into one DataFrame.
# Build one frame per hashtag and concatenate once: growing a frame with
# DataFrame.append inside the loop re-copies all accumulated rows on every
# iteration, and DataFrame.append no longer exists in pandas >= 2.0.
# As before, each campaign keeps its own 0..n-1 index (no ignore_index).
campaign_frames = [
    pd.DataFrame.from_records(campaign_tweets)
    for campaign_tweets in campaigns.values()
]
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(campaign_frames) if campaign_frames else pd.DataFrame()

CPU times: user 2min 57s, sys: 10.5 s, total: 3min 7s
Wall time: 3min 7s


In [13]:
# Preview the first rows of the combined tweet table.
df.head()

Unnamed: 0,created_at,id,text,author,author_id,trend,exposed,date,trend_date,tweet_type
0,2020-05-02 14:05:16,1256585550796148738,#MilliGazeteOkuyorum #SesimizBir #Cumartesi #D...,GunesliGuzel,293656352,#ÜniversiteliİşçilereAdalet,False,NaT,NaT,
1,2019-11-18 17:38:31,1196482833696645120,Kamuda çalışan üniversiteli işçiler memur stat...,yaprakergen,325766266,#ÜniversiteliİşçilereAdalet,False,NaT,NaT,
2,2019-11-11 17:02:57,1193937167007068160,@MemurSenKonf Üniversite mezunu 4D'li işçiler ...,Erdemakkusss,965200054352076800,#ÜniversiteliİşçilereAdalet,True,NaT,NaT,
3,2019-11-11 15:21:30,1193911634013630466,@_aliyalcin_ Üniversite mezunu 4D'li işçiler o...,Erdemakkusss,965200054352076800,#ÜniversiteliİşçilereAdalet,True,NaT,NaT,
4,2019-11-11 15:21:17,1193911580230062080,@SabahMemurlar Üniversite mezunu 4D'li işçiler...,Erdemakkusss,965200054352076800,#ÜniversiteliİşçilereAdalet,True,NaT,NaT,


# Trending Info

In [14]:
TURKEY_DIR = '/pool001/jschless/turkish_astroturfing'

# Input file with the trending intervals; the commented line is an alternative
# input kept for reference.
trend_file = 'trend_analysis_top10.csv'
#trend_file = 'world_trend_analysis_top10.csv'

trending_info = (
    pd.read_csv(
        os.path.join(TURKEY_DIR, trend_file),
        parse_dates=['tr_start', 'tr_end', 'lifetime', 'date'],
    )
    # Expose the keyword under the name `trend` (appended as the last column)
    # so it lines up with the `trend` column of the tweet table.
    .assign(trend=lambda table: table['keyword'])
    .drop(columns=['date', 'id', 'keyword'])
)

trending_info.head()

Unnamed: 0,tr_start,tr_end,vol,max_rank,lifetime,attack,trend
0,2019-06-27 06:14:01,2019-06-27 08:09:03,-1,8,0 days 01:55:02.000000000,False,"""Maçka"""
1,2019-06-20 18:38:28,2019-06-20 22:24:33,14474,1,0 days 03:46:05.000000000,True,#1200ÜcretliAtamasıHaktır
2,2019-06-20 22:29:29,2019-06-20 23:59:37,14495,6,0 days 01:30:08.000000000,False,#1200ücretliatamasıhaktır
3,2019-07-15 04:12:22,2019-07-15 13:02:52,118537,1,0 days 08:50:30.000000000,False,#15TEMMUZDESTANI
4,2019-07-15 13:32:57,2019-07-15 13:47:54,124473,1,0 days 00:14:57.000000000,False,#15TEMMUZDESTANI


In [15]:
# Length of each trending interval: end timestamp minus start timestamp.
trending_info['time_trending'] = trending_info['tr_end'] - trending_info['tr_start']

In [16]:
# Show every trending interval recorded for one example hashtag.
trending_info[trending_info['trend'] == "#ÜniversiteliİşçilereAdalet"]

Unnamed: 0,tr_start,tr_end,vol,max_rank,lifetime,attack,trend,time_trending
1405,2019-07-05 17:32:49,2019-07-05 18:12:53,-1,1,0 days 00:40:04.000000000,True,#ÜniversiteliİşçilereAdalet,00:40:04
1406,2019-07-05 18:22:52,2019-07-05 19:42:43,-1,4,0 days 01:19:51.000000000,True,#ÜniversiteliİşçilereAdalet,01:19:51
1407,2019-07-05 19:52:41,2019-07-05 20:57:40,-1,5,0 days 01:04:59.000000000,True,#ÜniversiteliİşçilereAdalet,01:04:59


In [17]:
# Summary statistics for the trending table.
trending_info.describe()

Unnamed: 0,vol,max_rank,time_trending
count,3787.0,3787.0,3787
mean,21990.98,5.49089,0 days 01:52:58.148930
std,124053.5,3.045631,0 days 02:21:39.023816
min,-1.0,1.0,0 days 00:00:00
25%,-1.0,3.0,0 days 00:10:01
50%,-1.0,6.0,0 days 01:05:01
75%,-1.0,8.0,0 days 02:41:39
max,3382066.0,10.0,0 days 22:15:08


In [18]:
## Tugrulcan's classifier for lexicon tweets

import emoji
import string

def give_emoji_free_text(text):
    """Return `text` without the words that contain an emoji character.

    A character counts as an emoji when it is a key of ``emoji.UNICODE_EMOJI``.
    Every whitespace-separated word containing at least one such character is
    dropped entirely, and the remaining words are re-joined with single spaces
    (so runs of whitespace are collapsed as a side effect).

    NOTE(review): this relies on ``emoji.UNICODE_EMOJI`` being a mapping keyed
    by emoji characters -- confirm the installed `emoji` version exposes it in
    that shape.
    """
    # Emoji characters that actually occur in this text.
    emoji_chars = {char for char in text if char in emoji.UNICODE_EMOJI}

    kept_words = [
        word
        for word in text.split()
        if not any(char in word for char in emoji_chars)
    ]
    return ' '.join(kept_words)

def lexicon_classifier(line, trend):
    """Return True when the tweet text `line` passes the lexicon-tweet checks.

    The text is first cleaned: words containing emoji are removed (via
    give_emoji_free_text), every occurrence of `trend` is deleted, double
    spaces are replaced once by single spaces, and the ends are stripped.

    The cleaned text is accepted only if all of the following hold:
      * it is not empty;
      * its first character is not uppercase;
      * it contains no ASCII punctuation other than '(' and ')', no ellipsis
        character, and no digit;
      * it has between 3 and 10 space-separated tokens, inclusive.
    """
    cleaned = (
        give_emoji_free_text(line)
        .replace(trend, '')
        .replace('  ', ' ')
        .strip()
    )

    if not cleaned or cleaned[0].isupper():
        return False

    # ASCII punctuation except the two parentheses, plus the ellipsis character
    # and the ten digits.
    forbidden_chars = (set(string.punctuation) - {'(', ')'}) | {'…'} | set('0123456789')
    if not forbidden_chars.isdisjoint(cleaned):
        return False

    token_count = len(cleaned.split(' '))
    return 3 <= token_count <= 10

In [19]:
%%time 

# Attach the trending metadata to every tweet of the same trend.
# NOTE(review): trending_info can hold several rows for one trend (one per
# trending interval), in which case this merge repeats the tweet once per
# matching row -- confirm that is intended.
mega_df = df.merge(trending_info, on='trend')

# Whole minutes between the tweet and the start of the trending interval,
# truncated toward zero (negative when the tweet precedes tr_start).
# Kept as a per-value int(total_seconds() / 60) so the rounding is unchanged.
mega_df["time_since_trending"] = (mega_df.created_at - mega_df.tr_start).apply(
    lambda delta: int(delta.total_seconds() / 60))

# Classify each tweet's text against its trend. Iterating the two columns
# directly returns the same booleans as a row-wise DataFrame.apply(axis=1)
# without building a Series object for every row.
mega_df["lexicon"] = [
    lexicon_classifier(text, trend)
    for text, trend in zip(mega_df['text'], mega_df['trend'])
]

# True when friend data exists for the tweet's author.
mega_df['follower_data'] = mega_df.author.apply(lambda x: x in friends_dict)

CPU times: user 10min 42s, sys: 9.44 s, total: 10min 51s
Wall time: 10min 51s


In [20]:
# Save now, after a lot of the heavy lifting is done, so later work can start
# from the pickled frame.
mega_df_path = os.path.join(DATA_DIR, 'mega_df.pkl')
mega_df.to_pickle(mega_df_path)