### Data Preprocessing and Inspection
#### Structure of this notebook:  
**Aggregating the data into a dataset:**  

A. Preprocessing  

**Exploring the data and descriptive statistics:**  

B. Understanding the Likers & Retweeters datasets  
C. Understanding script performance  
D. Understanding user activity

In [None]:
import os
import glob
import json
import pandas as pd
import csv
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
import itertools
from matplotlib import pyplot as plt
from collections import Counter
from ast import literal_eval 

## A. Preprocessing 

#### Prelims: aggregate all likers/retweeters files, including the final-harvest data, into like/retweet–tweet matrices.  
#### Likers/retweeters are listed in columns and tweet IDs in rows.  
#### Entry[tweet_i, liker_j] is 1 if liker_j has liked tweet_i.

In [None]:
# TODO: set this to the name of your pull folder before running the cells below.
# The glob patterns and the final-harvest pickle paths further down are built relative to it.
# NOTE(review): the pattern below says "YYY"; presumably "YYYY" is meant — confirm against the collection script.
my_pull = ' ' # pull folder name: Pull-DD-MM-YYY-hour:minute:second

In [None]:
# List all likers/retweeters CSV files of the pull.
# The patterns end in ``.csv`` so that the ``.pkl`` files written next to the CSVs by the
# conversion cells are not picked up (and passed to pd.read_csv) when this cell is re-run.
# TODO: adjust path to your pull-folder!
filenames_l = glob.glob(os.path.join(my_pull, 'CSVs/Likers_of_alarms/*ikers*.csv'))
filenames_r = glob.glob(os.path.join(my_pull, 'CSVs/Retweeters_of_alarms/*etweeters*.csv'))

In [None]:
# Import all CSVs and convert them to pkl files (written next to the source CSV).
# likers:
for file in filenames_l:
    # the 'likers' column is parsed back into a Python object with literal_eval
    likers = pd.read_csv(file, converters={"likers": literal_eval})
    # Swap only the file extension. str.replace("csv", "pkl") would also rewrite any
    # other "csv" substring in the path (e.g. inside the pull-folder name).
    file = os.path.splitext(file)[0] + ".pkl"
    likers.to_pickle(file)

In [None]:
# retweeters: convert every retweeters CSV to a pkl file written next to it
for file in filenames_r:
    # the 'retweeters' column is parsed back into a Python object with literal_eval
    retweeters = pd.read_csv(file, converters={"retweeters": literal_eval})
    # Swap only the file extension. str.replace("csv", "pkl") would also rewrite any
    # other "csv" substring in the path (e.g. inside the pull-folder name).
    file = os.path.splitext(file)[0] + ".pkl"
    retweeters.to_pickle(file)

In [None]:
# Collect all pkl file names.
# glob returns matches in arbitrary (file-system dependent) order; sorting makes the
# row/column order of the aggregated vote matrices reproducible between runs.
pkl_l = sorted(glob.glob(os.path.join(my_pull, 'CSVs/Likers_of_alarms/*pkl')))  # all likers pkl file names
pkl_r = sorted(glob.glob(os.path.join(my_pull, 'CSVs/Retweeters_of_alarms/*pkl')))  # all retweeters pkl file names

In [None]:
# Read the pkl files and aggregate the votes into one DataFrame:
# tweet ID as row index, user names as column headings, 1 where the user liked the tweet.
#
# The (tweet, user) pairs are first collected in plain dicts and the DataFrame is built
# once at the end. Growing a DataFrame cell by cell with `.at` enlarges it for every
# new row/column, which becomes very slow for large pulls.
liker_votes = {}        # user -> {tweet: 1}; key order = order of first appearance
liker_tweet_order = {}  # dict used as an ordered set of tweets that received a like

for files in pkl_l:

    # tweet id as index (makes data handling and indexing way easier)
    df = pd.read_pickle(files).set_index('Unnamed: 0')

    for tweet, users in df['likers'].items():
        for user in users:
            liker_votes.setdefault(user, {})[tweet] = 1
            liker_tweet_order.setdefault(tweet, None)

# Rows/columns appear in order of first appearance, as with cell-wise insertion;
# tweets without any liker get no row. Missing votes are NaN.
likers_incomplete = pd.DataFrame(
    liker_votes,
    index=list(liker_tweet_order),
    columns=list(liker_votes),
    dtype=float,
)

In [None]:
# Same aggregation for retweeters:
# tweet ID as row index, user names as column headings, 1 where the user retweeted the tweet.
#
# The (tweet, user) pairs are first collected in plain dicts and the DataFrame is built
# once at the end. Growing a DataFrame cell by cell with `.at` enlarges it for every
# new row/column, which becomes very slow for large pulls.
retweeter_votes = {}        # user -> {tweet: 1}; key order = order of first appearance
retweeter_tweet_order = {}  # dict used as an ordered set of tweets that received a retweet

for files in pkl_r:

    # tweet id as index (makes data handling and indexing way easier)
    df = pd.read_pickle(files).set_index('Unnamed: 0')

    for tweet, users in df['retweeters'].items():
        for user in users:
            retweeter_votes.setdefault(user, {})[tweet] = 1
            retweeter_tweet_order.setdefault(tweet, None)

# Rows/columns appear in order of first appearance, as with cell-wise insertion;
# tweets without any retweeter get no row. Missing votes are NaN.
retweeters_incomplete = pd.DataFrame(
    retweeter_votes,
    index=list(retweeter_tweet_order),
    columns=list(retweeter_votes),
    dtype=float,
)

#### Add final harvest data to dataframe

In [None]:
# likes: load the final-harvest data for likers from the pull folder.
# Later cells use its index (tweets), its 'likers' column and its 'like_count' column.
# NOTE(review): unpickling can execute arbitrary code — only load files you produced yourself.
finalharvest_l = pd.read_pickle(os.path.join(my_pull,'likers_final_harvest_complete.pkl'))

In [None]:
# retweets: load the final-harvest data for retweeters from the pull folder.
# Later cells use its index (tweets), its 'retweeters' column and its 'retweet_count' column.
# NOTE(review): unpickling can execute arbitrary code — only load files you produced yourself.
finalharvest_r = pd.read_pickle(os.path.join(my_pull,'retweeters_final_harvest_complete.pkl'))

In [None]:
# Merge the final-harvest likers into the vote matrix: one entry of 1 per (tweet, user) pair.
for tweet, harvested_likers in finalharvest_l['likers'].items():
    for user in harvested_likers:
        likers_incomplete.at[tweet, user] = 1

In [None]:
# Merge the final-harvest retweeters into the vote matrix: one entry of 1 per (tweet, user) pair.
for tweet, harvested_retweeters in finalharvest_r['retweeters'].items():
    for user in harvested_retweeters:
        retweeters_incomplete.at[tweet, user] = 1

In [None]:
# With the final harvest added, the matrices are complete.
# Plain assignment: each *_complete name refers to the same DataFrame object as its
# *_incomplete counterpart (no copy is made).
likers_complete = likers_incomplete
retweeters_complete = retweeters_incomplete

#### Save complete datasets: likers and retweeters

In [None]:
# The dataset is now complete; save it as CSV so it can also be passed on to data analysis.
# The tweet-ID index is written as the first column; section B reads it back as 'Unnamed: 0'.
# TODO: name file (replace the XXX placeholder; section B must load the same file names)
likers_complete.to_csv('like_votes_with_final_harvest_XXX.csv')
retweeters_complete.to_csv('retweets_votes_with_final_harvest_XXX.csv')

## B. Understanding the Likers & Retweeters datasets: Exploring the data/descriptive statistics

In [None]:
# If A. was run, load the like-vote matrix that A. saved (file name must match the one used there).
# The tweet IDs arrive as an ordinary column named 'Unnamed: 0'; the next cell turns it into the index.
likers_complete = pd.read_csv('like_votes_with_final_harvest_XXX.csv')

In [None]:
# Show floats with two decimals and use the tweet-ID column as the index, named 'tweet'.
pd.set_option('display.float_format', '{:.2f}'.format)
likers_complete = likers_complete.set_index('Unnamed: 0').rename_axis('tweet')

In [None]:
# Load the retweet-vote matrix saved in A. (file name must match the one used there).
# The tweet IDs arrive as an ordinary column named 'Unnamed: 0'; the next cell turns it into the index.
retweeters_complete = pd.read_csv('retweets_votes_with_final_harvest_XXX.csv')

In [None]:
# Show floats with two decimals and use the tweet-ID column as the index, named 'tweet'.
pd.set_option('display.float_format', '{:.2f}'.format)
retweeters_complete = retweeters_complete.set_index('Unnamed: 0').rename_axis('tweet')

In [None]:
# Number of all placed likes.
# Row sums give the likes collected per tweet (NaN = no like, skipped); their total is
# computed with the vectorised Series.sum() rather than a Python-level loop over the Series.
numberlikes_rows = likers_complete.sum(axis=1, skipna=True)
numberlikes_rows.sum()

In [None]:
# Number of all placed retweets.
# Row sums give the retweets collected per tweet (NaN = no retweet, skipped); their total is
# computed with the vectorised Series.sum() rather than a Python-level loop over the Series.
numberretweets_rows = retweeters_complete.sum(axis=1, skipna=True)
numberretweets_rows.sum()

In [None]:
# Sanity check: the column sums (likes placed per user) must add up to the same
# total as the row sums computed above.
numberlikes_col = likers_complete.sum(axis=0, skipna=True)
numberlikes_col.sum()

In [None]:
# Sanity check: the column sums (retweets placed per user) must add up to the same
# total as the row sums computed above.
numberretweets_col = retweeters_complete.sum(axis=0, skipna=True)
numberretweets_col.sum()

In [None]:
# How many tweets were subject to data collection in the final harvest?
# These are the tweets with at least `my_likersAtLeast` likes (see the collection parameters).
# The placeholder is None so that the cell is valid Python; it fails with a clear
# message until the parameter has been set.
my_likersAtLeast = None  # TODO: SET PARAMETER HERE
if my_likersAtLeast is None:
    raise ValueError("Set my_likersAtLeast to the threshold used for the final harvest.")
# number of tweets with at least my_likersAtLeast likes
(numberlikes_rows >= my_likersAtLeast).sum()

In [None]:
# How many tweets were subject to data collection in the final harvest?
# These are the tweets with at least `my_retweetersAtLeast` retweets (see the collection parameters).
# The placeholder is None so that the cell is valid Python; it fails with a clear
# message until the parameter has been set.
my_retweetersAtLeast = None  # TODO: SET PARAMETER HERE
if my_retweetersAtLeast is None:
    raise ValueError("Set my_retweetersAtLeast to the threshold used for the final harvest.")
# number of tweets with at least my_retweetersAtLeast retweets
(numberretweets_rows >= my_retweetersAtLeast).sum()

## C. Understanding script performance

### How many likers/retweeters did the script curate? The dataset is restricted to those tweets that were collected one last time in the final harvest

In [None]:
# Restrict the vote matrices to the tweets contained in the final harvest
# (rows are selected by the final-harvest index labels, in that order).
# NOTE: finalharvest_l / finalharvest_r are loaded in section A.
subset_retweeterscomplete = retweeters_complete.loc[finalharvest_r.index]
subset_likerscomplete = likers_complete.loc[finalharvest_l.index]

In [None]:
# Counts reported at the time of the final harvest
likecount = finalharvest_l.loc[:, 'like_count']
retweetcount = finalharvest_r.loc[:, 'retweet_count']

# Number of likers/retweeters the script actually collected per tweet (row sums, NaN skipped)
likerscollected = subset_likerscomplete.sum(axis='columns', skipna=True)
retweeterscollected = subset_retweeterscomplete.sum(axis='columns', skipna=True)

In [None]:
# Absolute number of missed likes/retweets per tweet
# (reported count minus collected users; negative values mean more users were collected than reported)
fig, ax = plt.subplots(figsize=(18, 10))

ax.plot(range(len(likecount)), likecount - likerscollected,
        label='Missed likes', alpha=0.7, linewidth=.5)
ax.plot(range(len(retweetcount)), retweetcount - retweeterscollected,
        label='Missed retweets', alpha=0.9, linewidth=.5)
ax.legend(loc="upper right", prop={'size': 10})

ax.set_xlabel("Tweets")
ax.set_ylabel('Likes/retweeters not collected')

In [None]:
# Share of missed likes/retweets given the total of received likes/retweets per tweet.
# Uses likecount / likerscollected from the cell above; the previously referenced
# natural_likecount / natural_likerscollected are not defined anywhere in this notebook.
fig, ax = plt.subplots(figsize =(20, 10))

ax.plot(list(range(0,len(likecount),1)), ((likecount-likerscollected)/likecount), label = 'Missed likes', alpha = 0.7, linewidth = .5)
ax.plot(list(range(0,len(retweetcount),1)), ((retweetcount-retweeterscollected)/retweetcount), label = 'Missed retweets', alpha = 0.6, linewidth = .5)

ax.legend(loc="upper right", prop={'size': 10})

ax.set_xlabel("Tweets")
ax.set_ylabel('Share of likers/retweeters per tweet not collected')

In [None]:
# Share of missed likes/retweets per tweet (solid lines, left axis), supplemented with the
# total number of likes/retweets each tweet attracted (dotted lines, right axis).
# A broken y-axis (two stacked panels) accommodates outliers.
xvalretweeters = list(range(1,len(retweetcount)+1,1))
xvallikers= list(range(1,len(likecount)+1,1))

SMALL_SIZE = 8
MEDIUM_SIZE = 16
BIGGER_SIZE = 12

plt.rc('font', size=MEDIUM_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend fontsize

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize =(18, 10))

fig.subplots_adjust(hspace=0.05)  # adjust space between axes

share_missed_retweets = (retweetcount-retweeterscollected)/retweetcount
share_missed_likes = (likecount-likerscollected)/likecount

# plot the same data on both axes (the broken axis can accommodate outliers this way)
ax1.plot(xvalretweeters, share_missed_retweets, label = 'retweets', alpha = .8, color='tab:orange', linewidth = 0.5)
ax1.plot(xvallikers, share_missed_likes, label = 'likes', alpha = .6, color = 'tab:blue', linewidth=.5)

# lower panel: retweets are drawn as well, so that retweet outliers in its range are not hidden
ax2.plot(xvalretweeters, share_missed_retweets, label = 'retweets', alpha = .8, color='tab:orange', linewidth = 0.5)
ax2.plot(xvallikers, share_missed_likes, label = 'likes', alpha = 0.6, color = 'tab:blue', linewidth = .5)

# Received likes/retweets on a secondary y-axis. likecount / retweetcount are used directly:
# likesseries / retweetsseries hold the same columns but are only defined further down,
# which made this cell fail on a fresh "Run All".
ax3 = ax1.twinx()
ax3.plot(xvallikers, likecount, label = 'Received likes', alpha = 0.8, color = 'tab:blue', linestyle='dotted')
ax3.plot(xvalretweeters, retweetcount, label = 'Received retweets', alpha = 0.8, color = 'tab:orange', linestyle='dotted')

ax2.set_xlabel("Tweets")

# zoom-in / limit the view to different portions of the data
# NOTE(review): these limits look specific to one dataset — adjust them to your pull.
ax2.set_ylim(-4.09, -4.0)  # outliers only
ax1.set_ylim(-1.3, .7)  # most of the data

# hide the spines between ax1 and ax2
ax1.spines.bottom.set_visible(False)
ax2.spines.top.set_visible(False)
ax3.spines.bottom.set_visible(False)
ax1.xaxis.tick_top()
ax1.tick_params(labeltop=False)  # don't put tick labels at the top
ax3.tick_params(labeltop=False)  # don't put tick labels at the top
ax2.tick_params(labeltop=False)  # don't put tick labels at the top

ax2.xaxis.tick_bottom()

# slanted cut-out markers at the break between the two panels
d = .2  # proportion of vertical to horizontal extent of the slanted line
kwargs = dict(marker=[(-1, -d), (1, d)], markersize=12,
              linestyle="none", color='k', mec='k', mew=1, clip_on=False)
ax1.plot([0, 1], [0, 0], transform=ax1.transAxes, **kwargs)
ax2.plot([0, 1], [1, 1], transform=ax2.transAxes, **kwargs)

ax1.set_ylabel('Share of likes/retweets not collected (lines)')
ax3.set_ylabel('Received likes/retweets (dotted)')

ax1.legend(['Retweets', 'Likes'], loc = 'upper right',  prop={'size': 13})

plt.savefig('howmanymissed.pdf', bbox_inches='tight')

In [None]:
# Per-tweet overview for likes: collected likers, reported like count, their difference
# (positive = missed likes) and the difference as a share of the like count.
d = dict(
    collectedlikers=likerscollected,
    likecount=likecount,
    difference=likecount - likerscollected,
    percent=(likecount - likerscollected) / likecount,
)
inspectlikes = pd.DataFrame(d)
inspectlikes

In [None]:
# Per-tweet overview for retweets: collected retweeters, reported retweet count, their
# difference (positive = missed retweets) and the difference as a share of the retweet count.
d = dict(
    collectedretweeters=retweeterscollected,
    retweetcount=retweetcount,
    difference=retweetcount - retweeterscollected,
    percent=(retweetcount - retweeterscollected) / retweetcount,
)
inspectretweets = pd.DataFrame(d)
inspectretweets

In [None]:
# *retracted likes*
# Share of tweets for which the script collected more than 10 likes too many
# (difference = likecount - collected likers, i.e. difference below -10)
neg = inspectlikes.loc[inspectlikes['difference'].lt(-10)]
neg.shape[0] / inspectlikes.shape[0]

In [None]:
# *retracted retweets*
# Share of tweets for which the script collected more than 10 retweets too many
# (difference = retweetcount - collected retweeters, i.e. difference below -10)
neg = inspectretweets.loc[inspectretweets['difference'].lt(-10)]
neg.shape[0] / inspectretweets.shape[0]

In [None]:
# *likes placed in large, simultaneous batches*
# Share of tweets for which the script missed more than 10 likes (difference above 10)
pos = inspectlikes.loc[inspectlikes['difference'].gt(10)]
pos.shape[0] / inspectlikes.shape[0]

In [None]:
# *retweets placed in large, simultaneous batches*
# Share of tweets for which the script missed more than 10 retweets (difference above 10)
pos = inspectretweets.loc[inspectretweets['difference'].gt(10)]
pos.shape[0] / inspectretweets.shape[0]

In [None]:
# *likes placed in large, simultaneous batches*
# Share of tweets for which the script missed more than 10% of the likes ('percent' above 0.1)
pos = inspectlikes.loc[inspectlikes['percent'].gt(.1)]
pos.shape[0] / inspectlikes.shape[0]

In [None]:
# *retracted likes*
# Share of tweets for which the script collected more than 10% too many likes ('percent' below -0.1)
pos = inspectlikes.loc[inspectlikes['percent'].lt(-.1)]
pos.shape[0] / inspectlikes.shape[0]

In [None]:
# *retweets placed in large, simultaneous batches*
# Share of tweets for which the script misses out on more than 10% of retweets.
# 'percent' is (retweetcount - collected retweeters) / retweetcount, so values above 0.1 are
# missed retweets, not surplus ones.
pos = inspectretweets[inspectretweets['percent'] >.1] 
len(pos)/len(inspectretweets)

In [None]:
# *retracted retweets*
# Share of tweets for which the script collects more than 10% too many retweets.
# 'percent' is (retweetcount - collected retweeters) / retweetcount, so values below -0.1 mean
# more retweeters were collected than the retweet count reports.
pos = inspectretweets[inspectretweets['percent'] <-.1] 
len(pos)/len(inspectretweets)

In [None]:
# Share of tweets for which the script collected "too many" likers (difference below 0)
notzero = inspectlikes.loc[inspectlikes['difference'].lt(0)]
notzero.shape[0] / inspectlikes.shape[0]

In [None]:
# Share of tweets for which the script collected "too many" retweeters (difference below 0)
notzero = inspectretweets.loc[inspectretweets['difference'].lt(0)]
notzero.shape[0] / inspectretweets.shape[0]

In [None]:
# Share of tweets for which the script collected exactly the right number of likers (difference equal to 0)
notzero = inspectlikes.loc[inspectlikes['difference'].eq(0)]
notzero.shape[0] / inspectlikes.shape[0]

In [None]:
# Share of tweets for which the script collected exactly the right number of retweeters (difference equal to 0)
notzero = inspectretweets.loc[inspectretweets['difference'].eq(0)]
notzero.shape[0] / inspectretweets.shape[0]

In [None]:
# Inspect (highly popular) tweets in terms of like count.
# likesseries is also used by the broken-axis figure in this section.
# NOTE(review): .squeeze() leaves a multi-element Series unchanged but turns a
# one-element Series into a scalar.
likesseries = inspectlikes['likecount'].squeeze()
likesseries

In [None]:
# Inspect (highly popular) tweets in terms of retweet count.
# retweetsseries is also used by the broken-axis figure in this section.
# NOTE(review): .squeeze() leaves a multi-element Series unchanged but turns a
# one-element Series into a scalar.
retweetsseries = inspectretweets['retweetcount'].squeeze()
retweetsseries

In [None]:
from statistics import median

In [None]:
# Median like count.
# Series.median() is used instead of statistics.median(series.squeeze()): squeeze() returns a
# scalar for a one-row table, which statistics.median cannot iterate over.
inspectlikes['likecount'].median()

In [None]:
# Median retweet count.
# Series.median() is used instead of statistics.median(series.squeeze()): squeeze() returns a
# scalar for a one-row table, which statistics.median cannot iterate over.
inspectretweets['retweetcount'].median()

## D. Understanding user activity

### How many likes/retweets did the users place? How many unique likers/retweeters are in the dataset? 

In [None]:
# Largest number of likes placed by a single user
# (numberlikes_col holds the column sums of the like matrix, i.e. one total per user)
numberlikes_col.max()

In [None]:
# Largest number of retweets placed by a single user
# (numberretweets_col holds the column sums of the retweet matrix, i.e. one total per user)
numberretweets_col.max()

In [None]:
# Make frequency tables: how many users placed exactly k likes / k retweets.
# value_counts() orders its result by frequency (descending) by default, not by k.
freqlikers = numberlikes_col.value_counts()
# NOTE: only the last expression of a cell is displayed, so this bare name shows nothing.
freqlikers

freqretweeters = numberretweets_col.value_counts()
freqretweeters

In [None]:
# Distinct numbers of likes / retweets placed by a user, in ascending order (np.unique sorts).
x = np.asarray(numberlikes_col)
placedlikes = np.unique(x)

x = np.asarray(numberretweets_col)
placedretweets = np.unique(x)

In [None]:
# Bring things together for the frequency tables.
# placedlikes / placedretweets are sorted ascending by value (np.unique), while
# value_counts() is ordered by frequency. The array is attached positionally, so the
# counts are first sorted by value to line up each frequency with its own number of
# placed likes/retweets.
d = {'placedlikes': placedlikes, 'freqlikers': freqlikers.sort_index()}
freqtable_l = pd.DataFrame(data=d)

d = {'placedretweets': placedretweets, 'freqretweeters': freqretweeters.sort_index()}
freqtable_r = pd.DataFrame(data=d)
freqtable_r

In [None]:
# Number of users (y) per number of placed likes (x).
# The frequency table is called freqtable_l; a bare `freqtable` is not defined in this notebook.
plt.plot(freqtable_l['placedlikes'], freqtable_l['freqlikers'])

In [None]:
# Display the like frequency table (columns 'placedlikes' and 'freqlikers').
freqtable_l

In [None]:
# Display the distinct numbers of placed likes.
freqtable_l['placedlikes']

In [None]:
# Number of x positions needed for the retweet bars of the bar chart below.
# bars_r is defined here because the plotting cell that creates it comes later;
# without this the cell fails on a fresh kernel.
bars_r = freqtable_r['placedretweets'].iloc[0:50]
y_pos_r = np.arange(len(bars_r))
len(y_pos_r)

In [None]:
# Bar chart: share of users per number of placed likes/retweets (lower end of the distribution).
SMALL_SIZE = 8
MEDIUM_SIZE = 16
BIGGER_SIZE = 12

plt.rc('font', size=MEDIUM_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend fontsize

TOP_N = 50  # number of frequency-table rows shown

fig, ax = plt.subplots(figsize =(20, 10))

# x positions 1..len(bars): bars are placed by row position in the frequency table
# NOTE(review): position k equals "k likes placed" only if the table is sorted by value
# and has no gaps among its first TOP_N values — confirm for your data.
bars = freqtable_l['placedlikes'].iloc[0:TOP_N]
y_pos = np.arange(1, len(bars) + 1)

bars_r = freqtable_r['placedretweets'].iloc[0:TOP_N]
y_pos_r = np.arange(1, len(bars_r) + 1)

# shares are relative to all users in the respective table, not only the TOP_N rows shown
share_likers = round(freqtable_l['freqlikers'].iloc[0:TOP_N] / freqtable_l['freqlikers'].sum(), 3)
share_retweeters = round(freqtable_r['freqretweeters'].iloc[0:TOP_N] / freqtable_r['freqretweeters'].sum(), 3)

# Create bars (offset so the two series sit side by side)
ax.bar(y_pos - .3, share_likers, width = 0.45, alpha = .9, label = 'Likes')
ax.bar(y_pos_r + .2, share_retweeters, width = 0.45, alpha = .6, label = 'Retweets')

# Axis names
ax.set_xlabel("Number of likes/retweets placed per user")
ax.set_ylabel("share of likers/retweeters")

ax.legend(loc="upper right", prop={'size': 13})

# Optional TikZ export:
#import tikzplotlib
#tikzplotlib.clean_figure()
#tikzplotlib.save("howmanyplacedlikes_lowerend.tex", axis_height='7cm', axis_width='15cm') 
plt.savefig('howmanyplacedlikes_lowerend.pdf', bbox_inches='tight')



In [None]:
# Other metrics
# Number of users who placed more than 50 likes during the collection period.
# Counted from the per-user totals (numberlikes_col) instead of the positional slice
# `.iloc[51:655]`, which skipped one table row and stopped at a hard-coded bound.
int((numberlikes_col > 50).sum())

In [None]:
# Share of users who placed more than 50 likes during the collection period.
# Computed from the per-user totals instead of the positional slice `.iloc[51:655]`,
# which skipped one table row and stopped at a hard-coded bound.
round((numberlikes_col > 50).mean(), 3)

In [None]:
# Share of users who placed more than 50 retweets during the collection period.
# Computed from the per-user totals instead of the positional slice `.iloc[51:655]`,
# which skipped one table row and stopped at a hard-coded bound.
round((numberretweets_col > 50).mean(), 3)

In [None]:
# Share of users who liked more than one post during the collection period.
# Value-based on the per-user totals, so it does not depend on the row order of the
# frequency table or on the hard-coded upper bound 655.
round((numberlikes_col > 1).mean(), 3)

In [None]:
# Share of users who retweeted more than one post during the collection period.
# Value-based on the per-user totals, so it does not depend on the row order of the
# frequency table or on the hard-coded upper bound 655.
round((numberretweets_col > 1).mean(), 3)

In [None]:
# Share of users who liked more than 2 posts during the collection period.
# Value-based on the per-user totals, so it does not depend on the row order of the
# frequency table or on the hard-coded upper bound 655.
round((numberlikes_col > 2).mean(), 3)

In [None]:
# Share of users who retweeted more than 2 posts during the collection period.
# Value-based on the per-user totals, so it does not depend on the row order of the
# frequency table or on the hard-coded upper bound 655.
round((numberretweets_col > 2).mean(), 3)

In [None]:
# Share of users who liked more than 3 posts during the collection period.
# Value-based on the per-user totals, so it does not depend on the row order of the
# frequency table or on the hard-coded upper bound 655.
round((numberlikes_col > 3).mean(), 3)

In [None]:
# Share of users who retweeted more than 3 posts during the collection period.
# Value-based on the per-user totals, so it does not depend on the row order of the
# frequency table or on the hard-coded upper bound 655.
round((numberretweets_col > 3).mean(), 3)

In [None]:
# Share of users who liked more than 4 posts (5 or more) during the collection period.
# Value-based on the per-user totals, so it does not depend on the row order of the
# frequency table or on the hard-coded upper bound 655.
round((numberlikes_col > 4).mean(), 3)

In [None]:
# Share of users who retweeted more than 4 posts (5 or more) during the collection period.
# Value-based on the per-user totals, so it does not depend on the row order of the
# frequency table or on the hard-coded upper bound 655.
round((numberretweets_col > 4).mean(), 3)

In [None]:
# Number of likers: the frequencies of all like counts add up to the number of users.
# Series.sum() replaces sum(<Series>.iloc), which iterated over the indexer object itself.
freqtable_l['freqlikers'].sum()

In [None]:
# Number of retweeters: the frequencies of all retweet counts add up to the number of users.
# Series.sum() replaces sum(<Series>.iloc), which iterated over the indexer object itself.
freqtable_r['freqretweeters'].sum()