# Data Wrangling

In [1]:
import os
import re
import datetime as dt
from time import time
import random
import math
import html

# data science
import numpy as np
import pandas as pd

In [2]:
# configurations

# Allow multiple outputs for each cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# display all columns
pd.set_option('display.max_columns', None)

# suppress auto-conversion to scientific notation
pd.set_option('display.precision', 6)

# Tweets

Source: https://data.world/balexturner/390-000-metoo-tweets<br>
390,000 tweets under the #MeToo hashtag dated between November 29th and December 25th, 2017

In [3]:
# read tweets
df = pd.read_excel('Data/metoo_tweets_dec2017.xlsx')

In [4]:
df.columns

Index(['text', 'favorited', 'favoriteCount', 'replyToSN', 'created',
       'truncated', 'replyToSID', 'id', 'replyToUID', 'statusSource',
       'screenName', 'retweetCount', 'isRetweet', 'retweeted', 'longitude',
       'latitude'],
      dtype='object')

In [5]:
# 393135 rows, 16 columns
df.shape

(393135, 16)

In [6]:
df.describe(include='all')

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted,longitude,latitude
count,393135,393135,393135.0,49716,393135,393135,41905.0,393135.0,49718.0,393132,393135,393135.0,393135,393135,123.0,123.0
unique,192901,1,,20300,30196,2,,,,1695,222311,,2,1,,
top,RT @LaurenJauregui: Shout out to all the stron...,False,,realDonaldTrump,2017-12-16 13:53:00,False,,,,"<a href=""http://twitter.com/download/iphone"" r...",DulleyTopBooks,,True,False,,
freq,5443,393135,,2062,713,305538,,,,127447,790,,224719,393135,,
first,,,,,2017-11-29 13:52:00,,,,,,,,,,,
last,,,,,2017-12-25 23:59:00,,,,,,,,,,,
mean,,,3.311494,,,,9.398637e+17,9.408284e+17,1.228446e+17,,,717.743994,,,-69.171724,36.231479
std,,,142.558537,,,,1.778519e+16,2292175000000000.0,2.965066e+17,,,1823.582462,,,54.971418,13.226847
min,,,0.0,,,,2410985000.0,9.36e+17,12.0,,,0.0,,,-157.826,-33.797215
25%,,,0.0,,,,9.38862e+17,9.39e+17,23970100.0,,,0.0,,,-100.0,34.051983


In [7]:
df['screenName'].nunique()

222311

In [8]:
df['id'].nunique()

5642

In [9]:
df.columns

Index(['text', 'favorited', 'favoriteCount', 'replyToSN', 'created',
       'truncated', 'replyToSID', 'id', 'replyToUID', 'statusSource',
       'screenName', 'retweetCount', 'isRetweet', 'retweeted', 'longitude',
       'latitude'],
      dtype='object')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 393135 entries, 1 to 393135
Data columns (total 16 columns):
text             393135 non-null object
favorited        393135 non-null bool
favoriteCount    393135 non-null int64
replyToSN        49716 non-null object
created          393135 non-null datetime64[ns]
truncated        393135 non-null bool
replyToSID       41905 non-null float64
id               393135 non-null int64
replyToUID       49718 non-null float64
statusSource     393132 non-null object
screenName       393135 non-null object
retweetCount     393135 non-null int64
isRetweet        393135 non-null bool
retweeted        393135 non-null bool
longitude        123 non-null float64
latitude         123 non-null float64
dtypes: bool(4), datetime64[ns](1), float64(4), int64(3), object(4)
memory usage: 40.5+ MB


In [11]:
df.isnull().sum()
# will look into 'replyToSID', 'id', 'replyToUID', 'replyToSN'

text                  0
favorited             0
favoriteCount         0
replyToSN        343419
created               0
truncated             0
replyToSID       351230
id                    0
replyToUID       343417
statusSource          3
screenName            0
retweetCount          0
isRetweet             0
retweeted             0
longitude        393012
latitude         393012
dtype: int64

First, let's check 'replyToSID', 'id', 'replyToUID'

In [12]:
df[['replyToSID', 'id', 'replyToUID']]

Unnamed: 0,replyToSID,id,replyToUID
1,,936000000000000000,
2,,936000000000000000,2.661498e+08
3,,936000000000000000,
4,,936000000000000000,
5,,936000000000000000,
6,,936000000000000000,
7,,936000000000000000,
8,,936000000000000000,
9,9.360000e+17,936000000000000000,4.969813e+07
10,9.360000e+17,936000000000000000,5.892869e+07


replyToUID has too many missing values. Will drop.

In [13]:
# the 'id' field is not unique (cannot be used to uniquely identify a tweet)
# will drop and create new id
df['id'].nunique() == len(df)

False

In [14]:
# interesting, I'll keep this column
df['replyToSN'].value_counts()

realDonaldTrump    2062
Alyssa_Milano       746
SenGillibrand       662
rosemcgowan         511
TIME                435
thehill             374
SenFranken          313
CNN                 308
FoxNews             271
morningmika         225
funder              221
nytimes             185
benshapiro          184
MiraSorvino         183
Rosie               183
BetteMidler         180
jaketapper          170
MSNBC               168
StaciaRR            152
yashar              148
washingtonpost      145
salmahayek          137
EdanClay            136
EzraFitz_13         132
Morning_Joe         130
Amy_Siskind         128
NBCNews             128
politico            122
krassenstein        119
terrycrews          117
                   ... 
salonpas              1
heatheroverhere       1
Aetna                 1
elmtree916            1
rychemom              1
Real_Infinity95       1
sndorf                1
EricMcCormack         1
NorwellDemocrat       1
deelymac              1
tweetangelarose 

In [15]:
df
# I hope the tweets aren't truncated.. Let's check

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted,longitude,latitude
1,American Harem.. #MeToo https://t.co/HjExLJdGuF,False,0,,2017-11-29 23:59:00,False,,936000000000000000,,"<a href=""http://instagram.com"" rel=""nofollow"">...",ahmediaTV,0,False,False,,
2,@johnconyersjr @alfranken why have you guys ...,False,0,johnconyersjr,2017-11-29 23:59:00,False,,936000000000000000,2.661498e+08,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",JesusPrepper74,0,False,False,,
3,Watched Megan Kelly ask Joe Keery this A.M. if...,False,0,,2017-11-29 23:59:00,True,,936000000000000000,,"<a href=""http://twitter.com/download/android"" ...",DemerisePotvin,0,False,False,,
4,Women have been talking about this crap the en...,False,0,,2017-11-29 23:59:00,False,,936000000000000000,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",TheDawnStott,0,False,False,,
5,.@BetteMidler please speak to this sexual assa...,False,15,,2017-11-29 23:59:00,False,,936000000000000000,,"<a href=""http://twitter.com/#!/download/ipad"" ...",scottygirl2014,11,False,False,,
6,We can't keep turning a blind eye and pretend ...,False,0,,2017-11-29 23:59:00,False,,936000000000000000,,"<a href=""http://instagram.com"" rel=""nofollow"">...",ForEverBrenn,0,False,False,,
7,Jay-Z is saying what I've been saying. DJT's j...,False,3,,2017-11-29 23:59:00,True,,936000000000000000,,"<a href=""http://twitter.com/download/android"" ...",silveriaalison,3,False,False,,
8,Where in the world is @MattLauer Celebrate #MeToo,False,0,,2017-11-29 23:59:00,False,,936000000000000000,,"<a href=""http://twitter.com/download/iphone"" r...",calrican,0,False,False,,
9,@JoyAnnReid Keep it coming ladies! #MeToo,False,0,JoyAnnReid,2017-11-29 23:59:00,False,9.360000e+17,936000000000000000,4.969813e+07,"<a href=""http://twitter.com/download/iphone"" r...",traugott_sarah,0,False,False,,
10,"@RepAdamSmith Workplace is one place, but what...",False,1,RepAdamSmith,2017-11-29 23:58:00,True,9.360000e+17,936000000000000000,5.892869e+07,"<a href=""http://twitter.com/download/iphone"" r...",SchifanoRaelene,0,False,False,,


In [16]:
df.iloc[[2]]
df.iloc[[2]]['text']
# Damn.. it's truncated. Let's see how many are truncated.

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted,longitude,latitude
3,Watched Megan Kelly ask Joe Keery this A.M. if...,False,0,,2017-11-29 23:59:00,True,,936000000000000000,,"<a href=""http://twitter.com/download/android"" ...",DemerisePotvin,0,False,False,,


3    Watched Megan Kelly ask Joe Keery this A.M. if...
Name: text, dtype: object

In [17]:
df['truncated'].value_counts(dropna=False)

False    305538
True      87597
Name: truncated, dtype: int64

In [18]:
87597 / (305538 + 87597)
# Roughly 22% of tweets are truncated. I can work with that.

0.22281658972108817

In [19]:
# make sure tweets contain the #metoo hashtag
df['text'].str.contains('#metoo', case=False).value_counts()

True     338554
False     54581
Name: text, dtype: int64

In [20]:
# 54,581 out of 393,135 tweets do not contain the #metoo hashtag (case insensitive)
# but it appears as though the tweets without #metoo are also discussing relevant topic
# therefore will not drop these tweets - for now
df[df['text'].str.contains('#metoo', case=False) == False]

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted,longitude,latitude
3,Watched Megan Kelly ask Joe Keery this A.M. if...,False,0,,2017-11-29 23:59:00,True,,936000000000000000,,"<a href=""http://twitter.com/download/android"" ...",DemerisePotvin,0,False,False,,
10,"@RepAdamSmith Workplace is one place, but what...",False,1,RepAdamSmith,2017-11-29 23:58:00,True,9.360000e+17,936000000000000000,5.892869e+07,"<a href=""http://twitter.com/download/iphone"" r...",SchifanoRaelene,0,False,False,,
12,Just got scammed out of $25. Bought #TigerW...,False,2,,2017-11-29 23:58:00,True,,936000000000000000,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RDonSteele,1,False,False,,
13,@RepKathleenRice @RepJayapal Calling out @RepJ...,False,0,RepKathleenRice,2017-11-29 23:58:00,True,9.360000e+17,936000000000000000,2.970462e+09,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",_standeliver,0,False,False,,
16,"I was sexually harassed as a ""young Intern"" fo...",False,1,,2017-11-29 23:57:00,True,,936000000000000000,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",FrancescaBiller,3,False,False,,
17,So I have a question. This post was published ...,False,35,,2017-11-29 23:57:00,True,,936000000000000000,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",emilyjoypoetry,7,False,False,,
18,Im pretty sure @DanRather and Brian Williams a...,False,0,,2017-11-29 23:57:00,True,,936000000000000000,,"<a href=""http://twitter.com/download/iphone"" r...",frozenjo,1,False,False,,
20,How can @realDonaldTrump talk about anyone tha...,False,0,,2017-11-29 23:57:00,True,,936000000000000000,,"<a href=""http://twitter.com/download/iphone"" r...",DougsMom204,0,False,False,,
21,"Yes!! WHEN it happens to you, THEN you can dec...",False,2,,2017-11-29 23:57:00,True,,936000000000000000,,"<a href=""http://twitter.com/download/iphone"" r...",page_lie,0,False,False,,
24,men all over this country are seeking ways to ...,False,1,,2017-11-29 23:56:00,True,,936000000000000000,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",justshocked,1,False,False,,


In [21]:
# before exporting, I'm going to assign unique IDs and drop unnecessary columns
df['tweetId'] = np.arange(1, len(df)+1)
df.drop(['id', 'replyToSID','replyToUID'],1,inplace=True)

# reorganize the columns
cols = ['tweetId', 'screenName', 'text', 'replyToSN', 'isRetweet',
        'retweeted', 'retweetCount', 'favorited', 'favoriteCount',
        'truncated', 'created', 'statusSource', 'longitude', 'latitude']

df = df[cols]
df

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude
1,1,ahmediaTV,American Harem.. #MeToo https://t.co/HjExLJdGuF,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
2,2,JesusPrepper74,@johnconyersjr @alfranken why have you guys ...,johnconyersjr,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
3,3,DemerisePotvin,Watched Megan Kelly ask Joe Keery this A.M. if...,,False,False,0,False,0,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
4,4,TheDawnStott,Women have been talking about this crap the en...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
5,5,scottygirl2014,.@BetteMidler please speak to this sexual assa...,,False,False,11,False,15,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",,
6,6,ForEverBrenn,We can't keep turning a blind eye and pretend ...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
7,7,silveriaalison,Jay-Z is saying what I've been saying. DJT's j...,,False,False,3,False,3,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
8,8,calrican,Where in the world is @MattLauer Celebrate #MeToo,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
9,9,traugott_sarah,@JoyAnnReid Keep it coming ladies! #MeToo,JoyAnnReid,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
10,10,SchifanoRaelene,"@RepAdamSmith Workplace is one place, but what...",RepAdamSmith,False,False,0,False,1,True,2017-11-29 23:58:00,"<a href=""http://twitter.com/download/iphone"" r...",,


In [22]:
df['text'].values.tolist()

['American Harem.. #MeToo https://t.co/HjExLJdGuF',
 '@johnconyersjr  @alfranken  why have you guys not resigned yet? Liberal hypocrisy! #MeToo',
 'Watched Megan Kelly ask Joe Keery this A.M. if she can "rub my fingers through your hair", and refer to his body be https://t.co/Q86wfW7DeJ',
 'Women have been talking about this crap the entire time, finally someone listened. #metoo https://t.co/JlK11yhFXc',
 '.@BetteMidler please speak to this sexual assault by @GeraldoRivera during the interview. #MeToo  https://t.co/1iuafGaOmv',
 "We can't keep turning a blind eye and pretend this isn't real. #metoo https://t.co/1dLZcftbSs",
 "Jay-Z is saying what I've been saying. DJT's jobis done. The dialogues have begun:\r racism\r#metoo \rmorality\rhum https://t.co/KJtQBEoonR",
 'Where in the world is @MattLauer Celebrate #MeToo',
 '@JoyAnnReid Keep it coming ladies!  #MeToo',
 '@RepAdamSmith Workplace is one place, but what about our HOMES?! Who protects Citizens when HOA/COA Boards &amp; Attorn h

In [23]:
# need to get rid of /r 
df['text'] = df['text'].str.replace(r'\r', ' ')
df['text'].str.contains(r'\r').value_counts()

False    393135
Name: text, dtype: int64

In [24]:
# unescape '&amp', etc
df['text'] = [html.unescape(i) for i in df['text'].values.tolist()]

In [25]:
# df.to_sql('tweets', con, if_exists='replace', index=False)
df.to_csv('Data/tweets.csv', index=False, encoding='utf-8')

# Unique Users

In [26]:
# uniqueUsers = pd.read_sql('SELECT DISTINCT screenName from tweets', con)
uniqueUsers = df.copy()
uniqueUsers

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude
1,1,ahmediaTV,American Harem.. #MeToo https://t.co/HjExLJdGuF,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
2,2,JesusPrepper74,@johnconyersjr @alfranken why have you guys ...,johnconyersjr,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
3,3,DemerisePotvin,Watched Megan Kelly ask Joe Keery this A.M. if...,,False,False,0,False,0,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
4,4,TheDawnStott,Women have been talking about this crap the en...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
5,5,scottygirl2014,.@BetteMidler please speak to this sexual assa...,,False,False,11,False,15,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",,
6,6,ForEverBrenn,We can't keep turning a blind eye and pretend ...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
7,7,silveriaalison,Jay-Z is saying what I've been saying. DJT's j...,,False,False,3,False,3,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
8,8,calrican,Where in the world is @MattLauer Celebrate #MeToo,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
9,9,traugott_sarah,@JoyAnnReid Keep it coming ladies! #MeToo,JoyAnnReid,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
10,10,SchifanoRaelene,"@RepAdamSmith Workplace is one place, but what...",RepAdamSmith,False,False,0,False,1,True,2017-11-29 23:58:00,"<a href=""http://twitter.com/download/iphone"" r...",,


In [27]:
uniqueUsers = uniqueUsers[['screenName']]
uniqueUsers.drop_duplicates(inplace=True)

In [28]:
uniqueUsers[uniqueUsers['screenName'].isna()]

Unnamed: 0,screenName


In [29]:
df['screenName'].nunique() == len(uniqueUsers)

True

In [30]:
uniqueUsers

Unnamed: 0,screenName
1,ahmediaTV
2,JesusPrepper74
3,DemerisePotvin
4,TheDawnStott
5,scottygirl2014
6,ForEverBrenn
7,silveriaalison
8,calrican
9,traugott_sarah
10,SchifanoRaelene


In [31]:
uniqueUsers.to_csv('Data/users.csv', index=False)

## Hashtags

In [32]:
hashtag = df[['tweetId','screenName', 'text']]

In [33]:
hashtags_extracted = hashtag['text'].str.extractall('(?P<hashtag>#\w+)')
hashtags_extracted.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,hashtag
Unnamed: 0_level_1,match,Unnamed: 2_level_1
1,0,#MeToo
2,0,#MeToo
4,0,#metoo
5,0,#MeToo
6,0,#metoo
7,0,#metoo
8,0,#MeToo
9,0,#MeToo
11,0,#MeToo
12,0,#TigerWoods


Again, I need to reset the second index

In [34]:
hashtags_extracted = hashtags_extracted.reset_index(level=1)
hashtags_extracted.head()

Unnamed: 0,match,hashtag
1,0,#MeToo
2,0,#MeToo
4,0,#metoo
5,0,#MeToo
6,0,#metoo


Great. Now I just need to join the tweetId and merge back to the dataframe 'hashtag'.

In [35]:
hashtags_extracted = hashtags_extracted.join(hashtag['tweetId'])
hashtags_extracted

Unnamed: 0,match,hashtag,tweetId
1,0,#MeToo,1
2,0,#MeToo,2
4,0,#metoo,4
5,0,#MeToo,5
6,0,#metoo,6
7,0,#metoo,7
8,0,#MeToo,8
9,0,#MeToo,9
11,0,#MeToo,11
12,0,#TigerWoods,12


In [36]:
hashtags_extracted['hashtag'] = hashtags_extracted['hashtag'].str.replace(r'#metoo', '#MeToo')
hashtags_extracted['hashtag'] = hashtags_extracted['hashtag'].str.replace(r'#Metoo', '#MeToo')
hashtags_extracted['hashtag'] = hashtags_extracted['hashtag'].str.replace(r'#METOO', '#MeToo')

In [37]:
hashtags_extracted = hashtags_extracted[['tweetId', 'hashtag']]
hashtags_extracted

Unnamed: 0,tweetId,hashtag
1,1,#MeToo
2,2,#MeToo
4,4,#MeToo
5,5,#MeToo
6,6,#MeToo
7,7,#MeToo
8,8,#MeToo
9,9,#MeToo
11,11,#MeToo
12,12,#TigerWoods


In [38]:
hashtag = pd.merge(hashtag, hashtags_extracted, how='right', on='tweetId')
hashtag

Unnamed: 0,tweetId,screenName,text,hashtag
0,1,ahmediaTV,American Harem.. #MeToo https://t.co/HjExLJdGuF,#MeToo
1,2,JesusPrepper74,@johnconyersjr @alfranken why have you guys ...,#MeToo
2,4,TheDawnStott,Women have been talking about this crap the en...,#MeToo
3,5,scottygirl2014,.@BetteMidler please speak to this sexual assa...,#MeToo
4,6,ForEverBrenn,We can't keep turning a blind eye and pretend ...,#MeToo
5,7,silveriaalison,Jay-Z is saying what I've been saying. DJT's j...,#MeToo
6,8,calrican,Where in the world is @MattLauer Celebrate #MeToo,#MeToo
7,9,traugott_sarah,@JoyAnnReid Keep it coming ladies! #MeToo,#MeToo
8,11,BrunusCutis,Is it just me or does the #MeToo <----icon; fr...,#MeToo
9,12,RDonSteele,Just got scammed out of $25. Bought #TigerW...,#TigerWoods


In [39]:
hashtag.drop('text',1,inplace=True)

In [40]:
hashtag.to_csv('Data/hashtags.csv', index=False)

# Association

Here I'm going to create a DataFrame that lists tweets that are:<br>
{category: condition}

Original:
- retweet: isRetweet is True
- reply: 'replyToSN' is not NaN
- mention: contains '@'

Updated:
- retweet
 - 'isRetweet' == True
- reply
 - 'replyToSN' is not NaN
 - 'text' starts with '@'
- mention
 - 'isRetweet' == False
 - 'replyToSN' is NaN
 - 'text' contains but does not start with '@'
- self
 - 'isRetweet' == False
 - 'replyToSN' is NaN
 - 'text' does not contain '@'

in all possible 1-to-1 combinations

In [41]:
# df = pd.read_sql('SELECT tweetId, screenName, text from tweets', con)
df = pd.read_csv('Data/tweets.csv')
df = df[['tweetId', 'screenName', 'text', 'replyToSN', 'isRetweet']]

In [42]:
len(df)
print('reply: "replyToSN" is not NaN')
np.logical_not(df['replyToSN'].isna()).value_counts()
print('isRetweet')
df['isRetweet'].value_counts()
print('contains "RT"')
df['text'].str.contains('RT').value_counts()
print('mention: contains "@"')
df['text'].str.contains('@').value_counts()

393135

reply: "replyToSN" is not NaN


False    343419
True      49716
Name: replyToSN, dtype: int64

isRetweet


True     224719
False    168416
Name: isRetweet, dtype: int64

contains "RT"


True     227089
False    166046
Name: text, dtype: int64

mention: contains "@"


True     297378
False     95757
Name: text, dtype: int64

of 393135 tweets...
- 49716 (12.6%) were created as a response to another user/tweet
- 224719 (57.2%) were labaled as True for 'isRetweet'
- 227089 (57.8%) contained 'RT'
- 297378 (75.6%) contained '@', most/many of which are likely tagging another user

Now, I need to determine how I will classify retweets:
- contains 'RT' followed by a username? or
- 'isRetweet' labeled as True?

Let's look at the 'text' columns deeper

In [43]:
len(df[df['isRetweet'] == True])
df[df['isRetweet'] == True]['text'].str.contains('RT').value_counts()

224719

True     224702
False        17
Name: text, dtype: int64

Of the 224719 tweets where 'isRetweet' is True, all but 17 of them contain 'RT'<br>
Let's look tweets where 'isRetweet' is True, and 'text' does not contain 'RT'

In [44]:
isRetweet = df[df['isRetweet'] == True]
isRetweet[np.logical_not(isRetweet['text'].str.contains('RT'))]

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet
248421,248422,Noone82270905,@Noone82270905's account is temporarily unavai...,,True
265635,265636,Noone82270905,@Noone82270905's account is temporarily unavai...,,True
282493,282494,CharleenRyan4,@CharleenRyan4's account is temporarily unavai...,,True
380535,380536,Femen_France,This Tweet from @Femen_France is temporarily u...,,True
388852,388853,sapiens_no_2_1,This Tweet from @sapiens_no_2_1 has been withh...,,True
389602,389603,PjjPjoje,This Tweet from @PjjPjoje has been withheld in...,,True
389673,389674,sarahconstantin,This Tweet from @sarahconstantin has been with...,,True
389686,389687,isikiniwapo1,This Tweet from @isikiniwapo1 has been withhel...,,True
389732,389733,Me_Too_Us,This Tweet from @Me_Too_Us has been withheld i...,,True
389777,389778,Max2Jets,This Tweet from @Max2Jets has been withheld in...,,True


Interesting, all of them seem to be suspended or tweets have been taken down.<br>
Let's drop these.

In [45]:
# tweetIds that will be flagged
flag = isRetweet[np.logical_not(isRetweet['text'].str.contains('RT'))]['tweetId']
flag

248421    248422
265635    265636
282493    282494
380535    380536
388852    388853
389602    389603
389673    389674
389686    389687
389732    389733
389777    389778
389779    389780
389998    389999
390941    390942
390944    390945
390968    390969
390970    390971
390973    390974
Name: tweetId, dtype: int64

In [46]:
df = pd.read_csv('Data/tweets.csv', encoding='utf-8')

In [47]:
# drop the tweets
len(flag)
len(df)
df.drop(df[df['tweetId'].isin(flag)].index, inplace=True)
len(df)

17

393135

393118

In [48]:
df.to_csv('Data/tweets.csv', index=False, encoding='utf-8')

Now, 'isRetweet' == True will be an indicator for Retweets


## Reply

In [49]:
df = pd.read_csv('Data/tweets.csv', encoding='utf-8')

In [50]:
reply = df[(df['replyToSN'].isnull() == False) & (df['text'].str.startswith('@') == True)][['tweetId','screenName','replyToSN']]
reply

Unnamed: 0,tweetId,screenName,replyToSN
1,2,JesusPrepper74,johnconyersjr
8,9,traugott_sarah,JoyAnnReid
9,10,SchifanoRaelene,RepAdamSmith
12,13,_standeliver,RepKathleenRice
21,22,VirginiusPrimus,CNN
22,23,1mimi4ever,SaveUSA1776
24,25,ProphetPhella,realDonaldTrump
31,32,genesis427427,btweet2all
35,36,nvygrl1,GeraldoRivera
38,39,roadgearsun,samaralynn


In [51]:
reply['associationType'] = 'reply'
reply = reply[['tweetId','screenName','associationType','replyToSN']]
reply.columns = ['tweetId','screenName', 'associationType', 'keyword']
reply

Unnamed: 0,tweetId,screenName,associationType,keyword
1,2,JesusPrepper74,reply,johnconyersjr
8,9,traugott_sarah,reply,JoyAnnReid
9,10,SchifanoRaelene,reply,RepAdamSmith
12,13,_standeliver,reply,RepKathleenRice
21,22,VirginiusPrimus,reply,CNN
22,23,1mimi4ever,reply,SaveUSA1776
24,25,ProphetPhella,reply,realDonaldTrump
31,32,genesis427427,reply,btweet2all
35,36,nvygrl1,reply,GeraldoRivera
38,39,roadgearsun,reply,samaralynn


## Retweet

In [52]:
df = pd.read_csv('Data/tweets.csv')

In [53]:
df

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude
0,1,ahmediaTV,American Harem.. #MeToo https://t.co/HjExLJdGuF,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
1,2,JesusPrepper74,@johnconyersjr @alfranken why have you guys ...,johnconyersjr,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
2,3,DemerisePotvin,Watched Megan Kelly ask Joe Keery this A.M. if...,,False,False,0,False,0,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
3,4,TheDawnStott,Women have been talking about this crap the en...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
4,5,scottygirl2014,.@BetteMidler please speak to this sexual assa...,,False,False,11,False,15,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",,
5,6,ForEverBrenn,We can't keep turning a blind eye and pretend ...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
6,7,silveriaalison,Jay-Z is saying what I've been saying. DJT's j...,,False,False,3,False,3,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
7,8,calrican,Where in the world is @MattLauer Celebrate #MeToo,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
8,9,traugott_sarah,@JoyAnnReid Keep it coming ladies! #MeToo,JoyAnnReid,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
9,10,SchifanoRaelene,"@RepAdamSmith Workplace is one place, but what...",RepAdamSmith,False,False,0,False,1,True,2017-11-29 23:58:00,"<a href=""http://twitter.com/download/iphone"" r...",,


In [54]:
retweet = df[df['isRetweet'] == True][['tweetId','screenName', 'text']]
retweet

Unnamed: 0,tweetId,screenName,text
4997,4998,wordsrox,RT @NancyPelosi: Proud to stand with @RepSpeie...
4998,4999,Objective_ish,RT @NancyPelosi: Proud to stand with @RepSpeie...
5000,5001,criynwa293,RT @jonfavs: NEW POD: Legislative looting and ...
5001,5002,krissymom2,RT @NancyPelosi: Proud to stand with @RepSpeie...
5003,5004,jestor810,RT @NancyPelosi: Proud to stand with @RepSpeie...
5004,5005,LisaD787,RT @earthma23: @davrosz I have a couple of ide...
5006,5007,GBan14,RT @NancyPelosi: Proud to stand with @RepSpeie...
5007,5008,MsHasNoPatience,RT @SarahSpain: Many have been asking when the...
5009,5010,Perspectvz,RT @NancyPelosi: Proud to stand with @RepSpeie...
5010,5011,CarnivoraUrsida,RT @wesley_jordan: Thank you @BetteMidler! Te...


In [55]:
# ensuring that row counts are the same
retweet['text'].str.startswith('RT @').value_counts()

True    224702
Name: text, dtype: int64

In [56]:
retweet['keyword'] = retweet['text'].str.extract(r'RT @(\w+)')

In [57]:
retweet.drop('text',1,inplace=True)
retweet['associationType'] = 'retweet'
retweet = retweet[['tweetId','screenName','associationType','keyword']]
retweet

Unnamed: 0,tweetId,screenName,associationType,keyword
4997,4998,wordsrox,retweet,NancyPelosi
4998,4999,Objective_ish,retweet,NancyPelosi
5000,5001,criynwa293,retweet,jonfavs
5001,5002,krissymom2,retweet,NancyPelosi
5003,5004,jestor810,retweet,NancyPelosi
5004,5005,LisaD787,retweet,earthma23
5006,5007,GBan14,retweet,NancyPelosi
5007,5008,MsHasNoPatience,retweet,SarahSpain
5009,5010,Perspectvz,retweet,NancyPelosi
5010,5011,CarnivoraUrsida,retweet,wesley_jordan


## Mention

In [58]:
mention = df[['tweetId','screenName', 'text', 'isRetweet', 'replyToSN']]

In [59]:
mention = mention[mention['replyToSN'].isnull() & (mention['isRetweet'] == False)]
mention

Unnamed: 0,tweetId,screenName,text,isRetweet,replyToSN
0,1,ahmediaTV,American Harem.. #MeToo https://t.co/HjExLJdGuF,False,
2,3,DemerisePotvin,Watched Megan Kelly ask Joe Keery this A.M. if...,False,
3,4,TheDawnStott,Women have been talking about this crap the en...,False,
4,5,scottygirl2014,.@BetteMidler please speak to this sexual assa...,False,
5,6,ForEverBrenn,We can't keep turning a blind eye and pretend ...,False,
6,7,silveriaalison,Jay-Z is saying what I've been saying. DJT's j...,False,
7,8,calrican,Where in the world is @MattLauer Celebrate #MeToo,False,
10,11,BrunusCutis,Is it just me or does the #MeToo <----icon; fr...,False,
11,12,RDonSteele,Just got scammed out of $25. Bought #TigerW...,False,
13,14,dbehan79,#BlackLivesMatter with #MeToo yup ok,False,


In [60]:
mention = mention[np.logical_not(mention['text'].str.startswith('@')) & mention['text'].str.contains('@')]
mention

Unnamed: 0,tweetId,screenName,text,isRetweet,replyToSN
4,5,scottygirl2014,.@BetteMidler please speak to this sexual assa...,False,
7,8,calrican,Where in the world is @MattLauer Celebrate #MeToo,False,
16,17,emilyjoypoetry,So I have a question. This post was published ...,False,
17,18,frozenjo,Im pretty sure @DanRather and Brian Williams a...,False,
19,20,DougsMom204,How can @realDonaldTrump talk about anyone tha...,False,
29,30,collegeclasses5,Kathleen Kane: The Rise and Fall | Philadelphi...,False,
32,33,page_lie,And @headhntr freaking CONFESSED online last w...,False,
40,41,SWSupportLV,Sex Workers Deserve A Space To Voice #MeToo by...,False,
46,47,BJFrezell,Thinking of @AnnCurry and the unpleasantness M...,False,
52,53,jicReneeMerling,Yet #fakemedia & #Hollywood wants to use #Smok...,False,


In [61]:
mention['text'] = mention['text'].str.replace(r'(RT @\w+)', '')

In [62]:
mention['text'].str.contains('RT @').value_counts()

False    25255
True        12
Name: text, dtype: int64

In [63]:
# 12 of RT's couldn't be removed
mention[mention['text'].str.contains('RT @')]

Unnamed: 0,tweetId,screenName,text,isRetweet,replyToSN
3116,3117,realityshowes,RT @ destiario: Typing in dictatorship #Venezu...,False,
7881,7882,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,
18180,18181,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,
25592,25593,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,
38617,38618,realityshowes,RT @ destiario: ‰Õü‰Õü‰_´Typing in dictatorshi...,False,
52738,52739,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,
54254,54255,realityshowes,RT @ destiario: #Cine Dustin Hoffman Spars Wit...,False,
112149,112150,realityshowes,RT @ destiario: ‚í∏‚í∏‚û•Typing in dictatorshi...,False,
118825,118826,YESWECAN53,"RT @ ""#MT @funder The 16 women who accused Tru...",False,
118826,118827,whyfeedthem53,"RT @ ""#MT @funder The 16 women who accused Tru...",False,


In [64]:
mention['text2'] = mention['text'].str.replace(r'(RT @ \w+)', '')
mention[mention['text'].str.contains('RT @')]

Unnamed: 0,tweetId,screenName,text,isRetweet,replyToSN,text2
3116,3117,realityshowes,RT @ destiario: Typing in dictatorship #Venezu...,False,,: Typing in dictatorship #Venezuela 2017 http...
7881,7882,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,,: â’¸â’¸â_¥Typing in dictatorship #Venezuela 2...
18180,18181,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,,: â’¸â’¸â_¥Typing in dictatorship #Venezuela 2...
25592,25593,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,,: â’¸â’¸â_¥Typing in dictatorship #Venezuela 2...
38617,38618,realityshowes,RT @ destiario: ‰Õü‰Õü‰_´Typing in dictatorshi...,False,,: ‰Õü‰Õü‰_´Typing in dictatorship #Venezuela 2...
52738,52739,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,,: â’¸â’¸â_¥Typing in dictatorship #Venezuela 2...
54254,54255,realityshowes,RT @ destiario: #Cine Dustin Hoffman Spars Wit...,False,,: #Cine Dustin Hoffman Spars With John Oliver ...
112149,112150,realityshowes,RT @ destiario: ‚í∏‚í∏‚û•Typing in dictatorshi...,False,,: ‚í∏‚í∏‚û•Typing in dictatorship #Venezuela 2...
118825,118826,YESWECAN53,"RT @ ""#MT @funder The 16 women who accused Tru...",False,,"RT @ ""#MT @funder The 16 women who accused Tru..."
118826,118827,whyfeedthem53,"RT @ ""#MT @funder The 16 women who accused Tru...",False,,"RT @ ""#MT @funder The 16 women who accused Tru..."


In [65]:
mention[mention['text'].str.contains('RT @')]['text2'].values

array([': Typing in dictatorship #Venezuela 2017  https://t.co/UyIWOlbtX0 #USA #KevinSpacey #Oscars #MeToo',
       ': â’¸â’¸â_¥Typing in dictatorship #Venezuela 2017 â˜› https://t.co/dvPuZdtnlB #USA #KevinSpacey #Oscars #MeToo',
       ': â’¸â’¸â_¥Typing in dictatorship #Venezuela 2017 â˜› https://t.co/dvPuZdtnlB #USA #KevinSpacey #Oscars #MeToo',
       ': â’¸â’¸â_¥Typing in dictatorship #Venezuela 2017 â˜› https://t.co/dvPuZdtnlB #USA #KevinSpacey #Oscars #MeToo',
       ': ‰Õü‰Õü‰_´Typing in dictatorship #Venezuela 2017 ‰÷_ https://t.co/dvPuZdtnlB #USA #KevinSpacey #Oscars #MeToo',
       ': â’¸â’¸â_¥Typing in dictatorship #Venezuela 2017 â˜› https://t.co/dvPuZdtnlB #USA #KevinSpacey #Oscars #MeToo',
       ': #Cine Dustin Hoffman Spars With John Oliver Over Harassment Claims: â€œYouâ€™ve Put Me On Display Hereâ€ùâ€_ https://t.co/pQmEF3JWG4',
       ': ‚í∏‚í∏‚û•Typing in dictatorship #Venezuela 2017 ‚òõ https://t.co/dvPuZdtnlB #USA #KevinSpacey #Oscars #MeToo',
       'RT @ "#MT @f

In [66]:
# I'll just remove "RT @ ( TarekFatah )" and "RT @ "#MT @funder" manually
mention['text2'] = mention['text2'].str.replace(r'RT @ \( TarekFatah \)', '')
mention['text2'] = mention['text2'].str.replace(r'RT @ \"#MT @funder', '')
mention[mention['text'].str.contains('RT @')]

Unnamed: 0,tweetId,screenName,text,isRetweet,replyToSN,text2
3116,3117,realityshowes,RT @ destiario: Typing in dictatorship #Venezu...,False,,: Typing in dictatorship #Venezuela 2017 http...
7881,7882,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,,: â’¸â’¸â_¥Typing in dictatorship #Venezuela 2...
18180,18181,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,,: â’¸â’¸â_¥Typing in dictatorship #Venezuela 2...
25592,25593,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,,: â’¸â’¸â_¥Typing in dictatorship #Venezuela 2...
38617,38618,realityshowes,RT @ destiario: ‰Õü‰Õü‰_´Typing in dictatorshi...,False,,: ‰Õü‰Õü‰_´Typing in dictatorship #Venezuela 2...
52738,52739,realityshowes,RT @ destiario: â’¸â’¸â_¥Typing in dictatorshi...,False,,: â’¸â’¸â_¥Typing in dictatorship #Venezuela 2...
54254,54255,realityshowes,RT @ destiario: #Cine Dustin Hoffman Spars Wit...,False,,: #Cine Dustin Hoffman Spars With John Oliver ...
112149,112150,realityshowes,RT @ destiario: ‚í∏‚í∏‚û•Typing in dictatorshi...,False,,: ‚í∏‚í∏‚û•Typing in dictatorship #Venezuela 2...
118825,118826,YESWECAN53,"RT @ ""#MT @funder The 16 women who accused Tru...",False,,The 16 women who accused Trump of sexual assa...
118826,118827,whyfeedthem53,"RT @ ""#MT @funder The 16 women who accused Tru...",False,,The 16 women who accused Trump of sexual assa...


In [67]:
mention.drop('text',1,inplace=True)

In [68]:
mentioned = mention['text2'].str.extractall('@(?P<mentioned>\w+)')
mentioned

Unnamed: 0_level_0,Unnamed: 1_level_0,mentioned
Unnamed: 0_level_1,match,Unnamed: 2_level_1
4,0,BetteMidler
4,1,GeraldoRivera
7,0,MattLauer
16,0,CTmagazine
17,0,DanRather
19,0,realDonaldTrump
29,0,phillymag
32,0,headhntr
40,0,TheNudeReporter
46,0,AnnCurry


Great, extraction worked. However, the returned dataframe has a multi-level index.
I just need to reset the second index

In [69]:
mentioned = mentioned.reset_index(level=1)
mentioned.head()

Unnamed: 0,match,mentioned
4,0,BetteMidler
4,1,GeraldoRivera
7,0,MattLauer
16,0,CTmagazine
17,0,DanRather


In [70]:
mention[mention['tweetId'] == 2]
mention[mention['tweetId'] == 2]['text2'].values
# check that it worked

Unnamed: 0,tweetId,screenName,isRetweet,replyToSN,text2


array([], dtype=object)

Great. Now I just need to join the tweetId and merge back to the dataframe 'mention'.

In [71]:
mentioned = mentioned.join(mention['tweetId'])
mentioned

Unnamed: 0,match,mentioned,tweetId
4,0,BetteMidler,5
4,1,GeraldoRivera,5
7,0,MattLauer,8
16,0,CTmagazine,17
17,0,DanRather,18
19,0,realDonaldTrump,20
29,0,phillymag,30
32,0,headhntr,33
40,0,TheNudeReporter,41
46,0,AnnCurry,47


In [72]:
mentioned = mentioned[['tweetId', 'mentioned']]
mentioned

Unnamed: 0,tweetId,mentioned
4,5,BetteMidler
4,5,GeraldoRivera
7,8,MattLauer
16,17,CTmagazine
17,18,DanRather
19,20,realDonaldTrump
29,30,phillymag
32,33,headhntr
40,41,TheNudeReporter
46,47,AnnCurry


In [73]:
mention.head()

Unnamed: 0,tweetId,screenName,isRetweet,replyToSN,text2
4,5,scottygirl2014,False,,.@BetteMidler please speak to this sexual assa...
7,8,calrican,False,,Where in the world is @MattLauer Celebrate #MeToo
16,17,emilyjoypoetry,False,,So I have a question. This post was published ...
17,18,frozenjo,False,,Im pretty sure @DanRather and Brian Williams a...
19,20,DougsMom204,False,,How can @realDonaldTrump talk about anyone tha...


Now I need to drop 'text2' from 'mention' and merge the dataframe with 'mentioned'

In [74]:
mention.drop('text2',1,inplace=True)
mention = pd.merge(mention, mentioned, how='right', on='tweetId')
mention

Unnamed: 0,tweetId,screenName,isRetweet,replyToSN,mentioned
0,5,scottygirl2014,False,,BetteMidler
1,5,scottygirl2014,False,,GeraldoRivera
2,8,calrican,False,,MattLauer
3,17,emilyjoypoetry,False,,CTmagazine
4,18,frozenjo,False,,DanRather
5,20,DougsMom204,False,,realDonaldTrump
6,30,collegeclasses5,False,,phillymag
7,33,page_lie,False,,headhntr
8,41,SWSupportLV,False,,TheNudeReporter
9,47,BJFrezell,False,,AnnCurry


In [75]:
mention.drop(['isRetweet', 'replyToSN'],1,inplace=True)
mention.columns = ['tweetId', 'screenName', 'keyword']
mention['associationType'] = 'mention'
mention = mention[['tweetId','screenName','associationType','keyword']]
mention

Unnamed: 0,tweetId,screenName,associationType,keyword
0,5,scottygirl2014,mention,BetteMidler
1,5,scottygirl2014,mention,GeraldoRivera
2,8,calrican,mention,MattLauer
3,17,emilyjoypoetry,mention,CTmagazine
4,18,frozenjo,mention,DanRather
5,20,DougsMom204,mention,realDonaldTrump
6,30,collegeclasses5,mention,phillymag
7,33,page_lie,mention,headhntr
8,41,SWSupportLV,mention,TheNudeReporter
9,47,BJFrezell,mention,AnnCurry


## Self

In [76]:
self = df[['tweetId','screenName', 'text', 'isRetweet', 'replyToSN']]

In [77]:
self = self[self['replyToSN'].isnull() & (self['isRetweet'] == False)]
self

Unnamed: 0,tweetId,screenName,text,isRetweet,replyToSN
0,1,ahmediaTV,American Harem.. #MeToo https://t.co/HjExLJdGuF,False,
2,3,DemerisePotvin,Watched Megan Kelly ask Joe Keery this A.M. if...,False,
3,4,TheDawnStott,Women have been talking about this crap the en...,False,
4,5,scottygirl2014,.@BetteMidler please speak to this sexual assa...,False,
5,6,ForEverBrenn,We can't keep turning a blind eye and pretend ...,False,
6,7,silveriaalison,Jay-Z is saying what I've been saying. DJT's j...,False,
7,8,calrican,Where in the world is @MattLauer Celebrate #MeToo,False,
10,11,BrunusCutis,Is it just me or does the #MeToo <----icon; fr...,False,
11,12,RDonSteele,Just got scammed out of $25. Bought #TigerW...,False,
13,14,dbehan79,#BlackLivesMatter with #MeToo yup ok,False,


In [78]:
self = self[self['text'].str.contains('@') == False]
self

Unnamed: 0,tweetId,screenName,text,isRetweet,replyToSN
0,1,ahmediaTV,American Harem.. #MeToo https://t.co/HjExLJdGuF,False,
2,3,DemerisePotvin,Watched Megan Kelly ask Joe Keery this A.M. if...,False,
3,4,TheDawnStott,Women have been talking about this crap the en...,False,
5,6,ForEverBrenn,We can't keep turning a blind eye and pretend ...,False,
6,7,silveriaalison,Jay-Z is saying what I've been saying. DJT's j...,False,
10,11,BrunusCutis,Is it just me or does the #MeToo <----icon; fr...,False,
11,12,RDonSteele,Just got scammed out of $25. Bought #TigerW...,False,
13,14,dbehan79,#BlackLivesMatter with #MeToo yup ok,False,
14,15,european_4,Why are not the men haters from #Metoo marchin...,False,
15,16,FrancescaBiller,"I was sexually harassed as a ""young Intern"" fo...",False,


In [79]:
self.drop(['isRetweet', 'replyToSN', 'text'],1,inplace=True)

In [80]:
self

Unnamed: 0,tweetId,screenName
0,1,ahmediaTV
2,3,DemerisePotvin
3,4,TheDawnStott
5,6,ForEverBrenn
6,7,silveriaalison
10,11,BrunusCutis
11,12,RDonSteele
13,14,dbehan79
14,15,european_4
15,16,FrancescaBiller


In [81]:
self['associationType'] = 'self'
self['keyword'] = np.nan
self

Unnamed: 0,tweetId,screenName,associationType,keyword
0,1,ahmediaTV,self,
2,3,DemerisePotvin,self,
3,4,TheDawnStott,self,
5,6,ForEverBrenn,self,
6,7,silveriaalison,self,
10,11,BrunusCutis,self,
11,12,RDonSteele,self,
13,14,dbehan79,self,
14,15,european_4,self,
15,16,FrancescaBiller,self,


## Merge associations

In [82]:
associations = pd.concat([mention, retweet, reply, self])
len(associations) == len(self) + len(mention) + len(retweet) + len(reply)

True

In [83]:
# sort by tweetId
associations.sort_values('tweetId', inplace=True)
associations = associations.reset_index(drop=True)

# before exporting, I'm going to assign unique IDs again
associations['associationId'] = np.arange(1, len(associations)+1)

# reorder columns
associations = associations[['associationId', 'tweetId', 'screenName', 'associationType', 'keyword']]
associations

Unnamed: 0,associationId,tweetId,screenName,associationType,keyword
0,1,1,ahmediaTV,self,
1,2,2,JesusPrepper74,reply,johnconyersjr
2,3,3,DemerisePotvin,self,
3,4,4,TheDawnStott,self,
4,5,5,scottygirl2014,mention,BetteMidler
5,6,5,scottygirl2014,mention,GeraldoRivera
6,7,6,ForEverBrenn,self,
7,8,7,silveriaalison,self,
8,9,8,calrican,mention,MattLauer
9,10,9,traugott_sarah,reply,JoyAnnReid


In [84]:
associations.to_csv('Data/associations.csv', index=False)

## Complete users dataset

In [85]:
users = pd.read_csv('Data/users.csv')

In [86]:
# Extract all unique users that were mentioned/retweeted/responded, that are not in the users.csv file
unique = associations[associations['associationType'] != 'hashtag']['keyword'].unique()
unique = pd.DataFrame(unique, columns = ['screenName'])

In [87]:
np.logical_not(unique['screenName'].isin(users['screenName'])).value_counts()

False    21628
True     18693
Name: screenName, dtype: int64

There are 25329 users that were referenced in the tweets, whose tweets are not in our dataset.<br>
This is great because about 50% of tweets references users that are already in the data, which enables us to visualize the virality.<br>
Now let's add these users to the csv file

In [88]:
unique = unique[np.logical_not(unique['screenName'].isin(users['screenName']))]
unique
# users that are not included in our dataset as part of users.csv

Unnamed: 0,screenName
0,
1,johnconyersjr
2,BetteMidler
3,GeraldoRivera
4,MattLauer
5,JoyAnnReid
6,RepAdamSmith
7,RepKathleenRice
9,DanRather
10,realDonaldTrump


In [89]:
print(f'{len(users)} - # of unique users already in our dataset')
print(f'{len(unique)} - # of unique users not in dataset')
print(f'{len(users) + len(unique)} - # of unique users in dataset after adding these new users')

users = pd.concat([users, unique])
users

222311 - # of unique users already in our dataset
18693 - # of unique users not in dataset
241004 - # of unique users in dataset after adding these new users


Unnamed: 0,screenName
0,ahmediaTV
1,JesusPrepper74
2,DemerisePotvin
3,TheDawnStott
4,scottygirl2014
5,ForEverBrenn
6,silveriaalison
7,calrican
8,traugott_sarah
9,SchifanoRaelene


In [90]:
users.drop_duplicates(inplace=True)
users.to_csv('Data/users.csv', index=False)

## Update: found a NaN in the downloaded data

In [91]:
import pandas as pd

In [92]:
associations = pd.read_csv('Data/associations.csv')
users = pd.read_csv('Data/users.csv')

In [93]:
df = pd.read_csv('Data/tweets.csv')

In [94]:
associations.isnull().sum()

associationId          0
tweetId                0
screenName             0
associationType        0
keyword            93305
dtype: int64

In [95]:
users.isnull().sum()

screenName    1
dtype: int64

In [96]:
df.isnull().sum()

tweetId               0
screenName            0
text                  0
replyToSN        343402
isRetweet             0
retweeted             0
retweetCount          0
favorited             0
favoriteCount         0
truncated             0
created               0
statusSource          3
longitude        392995
latitude         392995
dtype: int64

In [97]:
# we need to drop NaN values in users

In [98]:
users[users['screenName'].isnull()]

Unnamed: 0,screenName
222311,


In [99]:
len(users)

241004

In [100]:
users.dropna(subset=['screenName'],inplace=True)
len(users)

241003

In [101]:
users[users['screenName'].isnull()]

Unnamed: 0,screenName


In [102]:
users.to_csv('Data/users.csv', index=False)

# Interactions

In [103]:
associations = pd.read_csv('Data/associations.csv')
associations

Unnamed: 0,associationId,tweetId,screenName,associationType,keyword
0,1,1,ahmediaTV,self,
1,2,2,JesusPrepper74,reply,johnconyersjr
2,3,3,DemerisePotvin,self,
3,4,4,TheDawnStott,self,
4,5,5,scottygirl2014,mention,BetteMidler
5,6,5,scottygirl2014,mention,GeraldoRivera
6,7,6,ForEverBrenn,self,
7,8,7,silveriaalison,self,
8,9,8,calrican,mention,MattLauer
9,10,9,traugott_sarah,reply,JoyAnnReid


Just noticed that for every user that received a reply, that user is also included as 'mention'.<br>
I did some quick research on the types of tweets that we are dealing with<br>
<hr>
Source: https://www.adweek.com/digital/reply-mention/

**Tweets That Start With @Username**

Any tweet that starts with @username on Twitter is interpreted as a reply – that is, Twitter assumes you’re sending this message directly to that person and will place it in their feed (and mentions folder) accordingly.

It’s still a public message (as opposed to a private direct message), and is visible when anyone visits the sender’s profile page. However (and this is the kicker), **if you start a message with @username, it won’t appear in the standard home stream of anyone else in your network** unless both of you (the sender and the recipient) are being followed.

**Tweets That Place @Username Somewhere Else**

If a given @username is included in a tweet anywhere else but at the very start, Twitter interprets this differently – as a mention instead of a reply.

What this means is that the tweet will be immediately sent to the home stream of everyone who is following you, and not just the person that was mentioned.

Put literally anything ahead of the @ symbol on a tweet and it isn’t a reply. This is why you see some users placing a full stop before the @username (i.e., .@username), often when they’ve been asked to respond to a given question multiple times by different people, as this allows them to mass-broadcast a “reply” to everybody while also (seemingly) responding directly to the last person who made the enquiry.

A kind of faux reply, if you will. I don’t recommend it. If you want to make an open tweet, then make an open tweet (and tag at the end if necessary). Don’t try and disguise it as something else. It’s confusing enough for newcomers as it is.
<hr>
First, I want to focus on analyzing how the movement grew and spread, so we will exclude replies for now.

In [104]:
tweets = pd.read_csv('Data/tweets.csv')
users = pd.read_csv('Data/users.csv')

In [105]:
tweets

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude
0,1,ahmediaTV,American Harem.. #MeToo https://t.co/HjExLJdGuF,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
1,2,JesusPrepper74,@johnconyersjr @alfranken why have you guys ...,johnconyersjr,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
2,3,DemerisePotvin,Watched Megan Kelly ask Joe Keery this A.M. if...,,False,False,0,False,0,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
3,4,TheDawnStott,Women have been talking about this crap the en...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
4,5,scottygirl2014,.@BetteMidler please speak to this sexual assa...,,False,False,11,False,15,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",,
5,6,ForEverBrenn,We can't keep turning a blind eye and pretend ...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
6,7,silveriaalison,Jay-Z is saying what I've been saying. DJT's j...,,False,False,3,False,3,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
7,8,calrican,Where in the world is @MattLauer Celebrate #MeToo,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
8,9,traugott_sarah,@JoyAnnReid Keep it coming ladies! #MeToo,JoyAnnReid,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
9,10,SchifanoRaelene,"@RepAdamSmith Workplace is one place, but what...",RepAdamSmith,False,False,0,False,1,True,2017-11-29 23:58:00,"<a href=""http://twitter.com/download/iphone"" r...",,


In [106]:
tweets[tweets['tweetId'] == 2]['text'].values

array(['@johnconyersjr  @alfranken  why have you guys not resigned yet? Liberal hypocrisy! #MeToo'],
      dtype=object)

In [107]:
len(tweets[np.logical_not(tweets['replyToSN'].isnull())]) == len(tweets[tweets['text'].str.startswith('@')])

False

In [108]:
# tweets where 'replyToSN' and the first character of the text did not agree 
# (reply that didn't start with "@", vice versa)
tweets[np.logical_not(tweets['replyToSN'].isnull()) != tweets['text'].str.startswith('@')]

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude
34,35,ThalesLives,8. But how soon before this snowball of sexual...,ThalesLives,False,False,5,False,29,True,2017-11-29 23:55:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
105,106,LilMissRightie,Forget #MeToo for now or for just a moment and...,LilMissRightie,False,False,13,False,26,True,2017-11-29 23:45:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
200,201,NoHopeCitizen,@MLauer picks on @POTUS for a discussion with ...,,False,False,0,False,0,True,2017-11-29 23:34:00,"<a href=""http://twitter.com/download/iphone"" r...",,
205,206,Unitynow8,@LanaDelRaytheon Also were do I bet on how man...,,False,False,0,False,3,False,2017-11-29 23:33:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
229,230,EzraFitz_13,RT lucas_megatron: Meghan_Trainor just this On...,EzraFitz_13,False,False,0,False,0,False,2017-11-29 23:30:00,"<a href=""http://www.botize.com"" rel=""nofollow""...",,
308,309,frenchcori,"#MeToo reminds me of that awkward, uncomfortab...",frenchcori,False,False,0,False,1,True,2017-11-29 23:23:00,"<a href=""http://twitter.com/download/iphone"" r...",,
359,360,b9AcE,"Astonishing that AFAIK, there hasn't been any ...",b9AcE,False,False,1,False,0,True,2017-11-29 23:17:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
390,391,Webbedspiderrob,"For most of these accusations, there is little...",Webbedspiderrob,False,False,0,False,1,True,2017-11-29 23:15:00,"<a href=""http://twitter.com/download/iphone"" r...",,
391,392,ejgibbens,Next up on the #metoo playlist - Fuck You by L...,ejgibbens,False,False,0,False,0,False,2017-11-29 23:15:00,"<a href=""http://twitter.com/download/iphone"" r...",,
426,427,StuckonSW,America: The #MeToo movement has been a waters...,StuckonSW,False,False,0,False,2,True,2017-11-29 23:11:00,"<a href=""http://twitter.com/download/iphone"" r...",,


Seems like there are 2 rare cases 
* 1. users can reply to tweets without beginning their tweets with username of the intended audience (person being replied to)
* 2. users can begin their tweets with "@" followed by the username of the intended audience, and still not be recognized as a reply

In [109]:
# case #1
tweets[np.logical_not(tweets['replyToSN'].isnull()) & np.logical_not(tweets['text'].str.startswith('@'))]

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude
34,35,ThalesLives,8. But how soon before this snowball of sexual...,ThalesLives,False,False,5,False,29,True,2017-11-29 23:55:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
105,106,LilMissRightie,Forget #MeToo for now or for just a moment and...,LilMissRightie,False,False,13,False,26,True,2017-11-29 23:45:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
229,230,EzraFitz_13,RT lucas_megatron: Meghan_Trainor just this On...,EzraFitz_13,False,False,0,False,0,False,2017-11-29 23:30:00,"<a href=""http://www.botize.com"" rel=""nofollow""...",,
308,309,frenchcori,"#MeToo reminds me of that awkward, uncomfortab...",frenchcori,False,False,0,False,1,True,2017-11-29 23:23:00,"<a href=""http://twitter.com/download/iphone"" r...",,
359,360,b9AcE,"Astonishing that AFAIK, there hasn't been any ...",b9AcE,False,False,1,False,0,True,2017-11-29 23:17:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
390,391,Webbedspiderrob,"For most of these accusations, there is little...",Webbedspiderrob,False,False,0,False,1,True,2017-11-29 23:15:00,"<a href=""http://twitter.com/download/iphone"" r...",,
391,392,ejgibbens,Next up on the #metoo playlist - Fuck You by L...,ejgibbens,False,False,0,False,0,False,2017-11-29 23:15:00,"<a href=""http://twitter.com/download/iphone"" r...",,
426,427,StuckonSW,America: The #MeToo movement has been a waters...,StuckonSW,False,False,0,False,2,True,2017-11-29 23:11:00,"<a href=""http://twitter.com/download/iphone"" r...",,
485,486,BooksbySSJaxon,For we cannot tarry here; We must fight my dar...,BooksbySSJaxon,False,False,0,False,1,True,2017-11-29 23:03:00,"<a href=""http://twitter.com/download/iphone"" r...",,
549,550,AnaMardoll,"I'm just incredibly tired of people saying ""oh...",AnaMardoll,False,False,28,False,142,True,2017-11-29 22:57:00,"<a href=""http://twitter.com/download/android"" ...",,


In [110]:
# case #2
tweets[tweets['replyToSN'].isnull() & tweets['text'].str.startswith('@')]

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude
200,201,NoHopeCitizen,@MLauer picks on @POTUS for a discussion with ...,,False,False,0,False,0,True,2017-11-29 23:34:00,"<a href=""http://twitter.com/download/iphone"" r...",,
205,206,Unitynow8,@LanaDelRaytheon Also were do I bet on how man...,,False,False,0,False,3,False,2017-11-29 23:33:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
544,545,lurvejennifer,@MLauer I'm beside myself with anger and disgu...,,False,False,0,False,0,True,2017-11-29 22:57:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,
1503,1504,22stacyb,"@MLauer ""Fuck, marry, or kill"" really? You're ...",,False,False,0,False,0,False,2017-11-29 21:03:00,"<a href=""http://twitter.com/download/iphone"" r...",,
1786,1787,PattyParas,@Steve4276 @BillOReilly You could be wearing a...,,False,False,0,False,0,False,2017-11-29 20:23:00,"<a href=""http://twitter.com/download/iphone"" r...",,
2355,2356,justshocked,@MLauer #creeper how many did you rape?? #MeTo...,,False,False,0,False,0,False,2017-11-29 19:12:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
2392,2393,roseyrosy,@MLauer Speaking of facts: Youre out of a job....,,False,False,2,False,13,True,2017-11-29 19:07:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",,
2863,2864,HYD_MRA,@MLauer How does #feminism taste now? Not pret...,,False,False,0,False,0,True,2017-11-29 18:11:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
2991,2992,AlysseStewart,@MLauer Looks like they needed some different ...,,False,False,0,False,7,False,2017-11-29 17:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
4039,4040,peasantbarbie,@ everyone who posted or liked #metoo posts an...,,False,False,0,False,9,True,2017-11-29 15:45:00,"<a href=""http://twitter.com/download/iphone"" r...",,


There are 
* 2702 tweets for case #1
* 131 tweets for case #2

Quickly skimming the case #2 of tweets that started with "@" but were not recognized as replies,
- there are some tweets that replied to an account that is/was unavailable
- there are tweets with misformatted replies ("@@..", "@#...", "@ username", "@null")

Let's see if any #2 tweets were also retweets, or retweeted

In [111]:
# none of the tweets that we consider as case #2 were retweeted
# makes sense, as they were intended to be resplies to a particular user (in most cases)
tweets[tweets['replyToSN'].isnull() & tweets['text'].str.startswith('@') & tweets['retweeted'] == True]

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude


In [112]:
# of the tweets that we consider as case #2, 3 of them were also retweets, but also to announce suspension of the user
tweets[tweets['replyToSN'].isnull() & tweets['text'].str.startswith('@') & tweets['isRetweet'] == True]

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude


I'm going to drop these and save the data as V2

Rationale:
- case #1 : Since they were recognized as "replies" by the algorithm, I will assume that these tweets only appeared in the timeline of the recipient, plus anyone that follow both the sender and recipient. Since we are interested in how the movement grew, we will exclude these from V2 dataset
- case #2 : We will also exclude these from V2 dataset since many of them seem to have been falsely labeled as "not a reply"

In [113]:
tweets

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude
0,1,ahmediaTV,American Harem.. #MeToo https://t.co/HjExLJdGuF,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
1,2,JesusPrepper74,@johnconyersjr @alfranken why have you guys ...,johnconyersjr,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
2,3,DemerisePotvin,Watched Megan Kelly ask Joe Keery this A.M. if...,,False,False,0,False,0,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
3,4,TheDawnStott,Women have been talking about this crap the en...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
4,5,scottygirl2014,.@BetteMidler please speak to this sexual assa...,,False,False,11,False,15,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/#!/download/ipad"" ...",,
5,6,ForEverBrenn,We can't keep turning a blind eye and pretend ...,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://instagram.com"" rel=""nofollow"">...",,
6,7,silveriaalison,Jay-Z is saying what I've been saying. DJT's j...,,False,False,3,False,3,True,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/android"" ...",,
7,8,calrican,Where in the world is @MattLauer Celebrate #MeToo,,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
8,9,traugott_sarah,@JoyAnnReid Keep it coming ladies! #MeToo,JoyAnnReid,False,False,0,False,0,False,2017-11-29 23:59:00,"<a href=""http://twitter.com/download/iphone"" r...",,
9,10,SchifanoRaelene,"@RepAdamSmith Workplace is one place, but what...",RepAdamSmith,False,False,0,False,1,True,2017-11-29 23:58:00,"<a href=""http://twitter.com/download/iphone"" r...",,


In [114]:
dropped = tweets[np.logical_not(tweets['replyToSN'].isnull()) != tweets['text'].str.startswith('@')]

In [115]:
len(tweets)
tweets.drop(tweets[np.logical_not(tweets['replyToSN'].isnull()) != tweets['text'].str.startswith('@')].index, inplace=True)
len(tweets)

393118

390288

Original: 393135 rows<br>
V2: 390302 rows

In [116]:
tweets.to_csv('Data/tweets.csv', index=False, encoding='utf-8')

We will also have to update the other data sets to reflect the change

In [117]:
dropped

Unnamed: 0,tweetId,screenName,text,replyToSN,isRetweet,retweeted,retweetCount,favorited,favoriteCount,truncated,created,statusSource,longitude,latitude
34,35,ThalesLives,8. But how soon before this snowball of sexual...,ThalesLives,False,False,5,False,29,True,2017-11-29 23:55:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
105,106,LilMissRightie,Forget #MeToo for now or for just a moment and...,LilMissRightie,False,False,13,False,26,True,2017-11-29 23:45:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
200,201,NoHopeCitizen,@MLauer picks on @POTUS for a discussion with ...,,False,False,0,False,0,True,2017-11-29 23:34:00,"<a href=""http://twitter.com/download/iphone"" r...",,
205,206,Unitynow8,@LanaDelRaytheon Also were do I bet on how man...,,False,False,0,False,3,False,2017-11-29 23:33:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
229,230,EzraFitz_13,RT lucas_megatron: Meghan_Trainor just this On...,EzraFitz_13,False,False,0,False,0,False,2017-11-29 23:30:00,"<a href=""http://www.botize.com"" rel=""nofollow""...",,
308,309,frenchcori,"#MeToo reminds me of that awkward, uncomfortab...",frenchcori,False,False,0,False,1,True,2017-11-29 23:23:00,"<a href=""http://twitter.com/download/iphone"" r...",,
359,360,b9AcE,"Astonishing that AFAIK, there hasn't been any ...",b9AcE,False,False,1,False,0,True,2017-11-29 23:17:00,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",,
390,391,Webbedspiderrob,"For most of these accusations, there is little...",Webbedspiderrob,False,False,0,False,1,True,2017-11-29 23:15:00,"<a href=""http://twitter.com/download/iphone"" r...",,
391,392,ejgibbens,Next up on the #metoo playlist - Fuck You by L...,ejgibbens,False,False,0,False,0,False,2017-11-29 23:15:00,"<a href=""http://twitter.com/download/iphone"" r...",,
426,427,StuckonSW,America: The #MeToo movement has been a waters...,StuckonSW,False,False,0,False,2,True,2017-11-29 23:11:00,"<a href=""http://twitter.com/download/iphone"" r...",,


In [118]:
associations[associations['tweetId'].isin(dropped['tweetId'])]

Unnamed: 0,associationId,tweetId,screenName,associationType,keyword
