In [1]:
import numpy as np
import pandas as pd
import re
from fuzzywuzzy import fuzz
import string
import matplotlib.pyplot as plt
from pandarallel import pandarallel
pd.set_option('display.max_rows', 500)

## Import Data

In [2]:
# Online-harassment dataset: tab-separated file, read with the Latin-1 codec
r_golbeck = pd.read_csv(
    'Golbeck/onlineHarassmentDataset.tsv',
    sep='\t',
    encoding='latin-1',
)
r_golbeck.head()

Unnamed: 0,ID,Code,Tweet
0,1,H,@Grumpy_P_Sloth @deanesmay feminists argue for...
1,2,H,1948Army of England helped the Jews to occupy ...
2,3,H,Dutch Leader Says Europe to collapse In 6 Week...
3,4,H,RT @__DeLay: The next day the Romans and the J...
4,5,H,RT @Bakersman_Joe: When Hitler Invited The Jew...


## Cleaning
#### Apply the same cleaning steps as in the data build notebook

In [3]:
from preprocessTwitter import tokenize #Python version of Ruby preprocessing script

def clean_tweets(tweet):
    """Run a fixed, ordered series of regex substitutions over one tweet string.

    The steps mark retweets and URLs with special tokens, drop quote characters,
    split 'anti'/'non' prefixes and a set of known run-together hashtag terms
    into separate words, and finally strip surrounding whitespace.

    Order matters: later rules operate on the output of earlier ones (e.g. the
    generic 'livesmatter' rule runs after the more specific variants).

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet text.
    """
    ci = re.IGNORECASE
    # (pattern, replacement, flags) -- applied strictly top to bottom
    rules = (
        #Assign special char to RTs
        ('^RT:? ', '<rt> ', 0),
        #Split out RT and user mention when overlapping
        ('RT@', '<rt> @', 0),
        #Split out 'via' and user mention when overlapping
        ('via@', 'via @', 0),
        #Assign special char to URLs (Python script missing some cases that I do not)
        (r'https?: ?/ ?/t(.co( )?)?/?[A-Za-z0-9é]*|https?:\\/\\/t.co\\/[A-Za-z0-9]*é?|https?(: / /)?é?', '<url>', 0),
        (r'<url>é|<url> ?\.c[oé]*$', '<url>', ci),
        #Get rid of double and single quotes (admittedly loses some context here)
        (r'"', '', 0),
        (r"'", '', 0),
        #Capture 'anti' or 'non' prefix as its own word
        ('anti-', 'anti ', ci),
        ('#anti', '#anti ', ci),
        (' anti', ' anti ', ci),
        ('non-', 'non ', ci),
        #Put ampersand in recognizable format
        ('&amp;?', '&', ci),
        #Clean some known hateful terms that are in the data (e.g. "whitepower") to extract meaning from hashtags
        ('#white', '#white ', ci),
        ('waronwhite', 'war on white', ci),
        ('whitegenocide', 'white genocide', ci),
        ('whitelivesmatter', 'white lives matter', ci),
        ('blacklivesmatter', 'black lives matter', ci),
        ('livesmatter', 'lives matter', ci),
        ('alllive', 'all live', ci),
        ('bluelive', 'blue live', ci),
        ('whitepower', 'white power', ci),
        ('#fuck', '#fuck ', ci),
        ('fuckniggers', 'fuck niggers', ci),
        ('sjw', 'social justice warrior', ci),
    )
    for pattern, replacement, flags in rules:
        tweet = re.sub(pattern, replacement, tweet, flags=flags)
    #Strip whitespace
    return tweet.strip()

In [4]:
#Build the token string in three chained passes over the raw tweet text
r_golbeck['tokens'] = (
    r_golbeck['Tweet']
    .apply(clean_tweets)  #1. Apply my own cleaning steps
    .apply(tokenize)  #2. Then apply the ones implemented here: https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
    .apply(lambda text: re.sub('><', '> <', text))  #3. Separate any special characters that are overlapping/touching
)
r_golbeck.head()

Unnamed: 0,ID,Code,Tweet,tokens
0,1,H,@Grumpy_P_Sloth @deanesmay feminists argue for...,<user> <user> feminists argue for raising mini...
1,2,H,1948Army of England helped the Jews to occupy ...,<number>army of england helped the jews to occ...
2,3,H,Dutch Leader Says Europe to collapse In 6 Week...,dutch leader says europe to collapse in <numbe...
3,4,H,RT @__DeLay: The next day the Romans and the J...,<rt> <user> : the next day the romans and the ...
4,5,H,RT @Bakersman_Joe: When Hitler Invited The Jew...,<rt> <user> : when hitler invited the jews to ...


## Fuzzy Matching
#### Apply fuzzy matching to tweets to search for duplicate tweets with differing labels

In [29]:
#Pair every tweet with every tweet (Cartesian join); pandas suffixes the two sides with _x / _y
tweet_columns = r_golbeck[['ID', 'Code', 'tokens']]
fuzz_similarity = pd.merge(tweet_columns, tweet_columns, how='cross')
print(fuzz_similarity.shape)
fuzz_similarity.head()

(414529600, 6)


Unnamed: 0,ID_x,Code_x,tokens_x,ID_y,Code_y,tokens_y
0,1,H,<user> <user> feminists argue for raising mini...,1,H,<user> <user> feminists argue for raising mini...
1,1,H,<user> <user> feminists argue for raising mini...,2,H,<number>army of england helped the jews to occ...
2,1,H,<user> <user> feminists argue for raising mini...,3,H,dutch leader says europe to collapse in <numbe...
3,1,H,<user> <user> feminists argue for raising mini...,4,H,<rt> <user> : the next day the romans and the ...
4,1,H,<user> <user> feminists argue for raising mini...,5,H,<rt> <user> : when hitler invited the jews to ...


In [41]:
#Keep only pairs made of two different tweets (different IDs) whose labels disagree
labels_differ = fuzz_similarity['Code_x'].ne(fuzz_similarity['Code_y'])
ids_differ = fuzz_similarity['ID_x'].ne(fuzz_similarity['ID_y'])
fuzz_similarity = fuzz_similarity[labels_differ & ids_differ]
print(fuzz_similarity.shape)
fuzz_similarity.head()

(159341338, 6)


Unnamed: 0,ID_x,Code_x,tokens_x,ID_y,Code_y,tokens_y
37,1,H,<user> <user> feminists argue for raising mini...,38,N,"for the record , i see tax havens as the next ..."
38,1,H,<user> <user> feminists argue for raising mini...,39,N,"this is our <hashtag> israel , this is for the..."
39,1,H,<user> <user> feminists argue for raising mini...,40,N,<user> <user> <user> >reading the nt <allcap...
40,1,H,<user> <user> feminists argue for raising mini...,41,N,<user> go for if its a quasi free socitey for ...
41,1,H,<user> <user> feminists argue for raising mini...,42,N,<user> throw a chaw in before you go mudding <...


#### Compute fuzzy ratio for each pair of tweets in parallel (value of 100 = exact same string)
Note: This process will take several hours even with 8 worker processes running in parallel

In [42]:
# Initialise pandarallel (with a progress bar) and score every remaining tweet pair.
# To track how long this takes, `import time` and wrap the parallel_apply call
# in two time.time() readings, then print their difference.
pandarallel.initialize(progress_bar=True)
ratios = fuzz_similarity.parallel_apply(
    lambda pair: fuzz.ratio(pair.tokens_x, pair.tokens_y),
    axis=1,
)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=19917668), Label(value='0 / 199176…

#### After some exploration, settled on a fuzzy score of 85 as the minimum threshold for two tweets to be considered "duplicates".
Here are the examples with that minimum score:

In [83]:
#Show the tweet pairs that score exactly 85.
#Filter on `ratios` itself instead of a `score` column: that column is only attached
#to fuzz_similarity in a later cell, so indexing it here fails on a fresh
#top-to-bottom run. `ratios` was produced by applying over fuzz_similarity's rows,
#so it is expected to share the frame's index (TODO confirm if the frame is re-indexed).
at_threshold = ratios == 85
#Filter first, then add the score column, so only the handful of matching rows is copied
fuzz_similarity[at_threshold].assign(score=ratios[at_threshold])

Unnamed: 0,ID_x,Code_x,tokens_x,ID_y,Code_y,tokens_y,score
4140712,204,H,<user> untag you fucking nigger,7633,N,<user> i want to you fucking nigger,85
4948749,244,H,<user> : <hashtag> white lives matter <url>,1270,N,<rt> <user> : <hashtag> white lives mattermore,85
4955619,244,H,<user> : <hashtag> white lives matter <url>,8140,N,<user> <user> <hashtag> white lives matter,85
4956089,244,H,<user> : <hashtag> white lives matter <url>,8610,N,<user> <user> <hashtag> white lives matter,85
4956287,244,H,<user> : <hashtag> white lives matter <url>,8808,N,<user> <user> <hashtag> white lives matter,85
4956299,244,H,<user> : <hashtag> white lives matter <url>,8820,N,<user> <user> <hashtag> white lives matter,85
6023014,296,H,<user> : gas the jews <url>,16815,N,<rt> <user> : eat the jews <url>,85
8883453,437,H,<user> yes you fucking nigger,6494,N,<user> \nlmao you fucking nigger,85
9914511,487,N,<user> <hashtag> white lives matter,19552,H,<hashtag> white lives mattertoo,85
10094810,496,H,<user> : i set fire to the jews was also a goo...,16611,N,<rt> <user> : i set fire to the jews was also ...,85


In [63]:
#Store the fuzzy ratios from the parallel scoring step as a `score` column on the pair frame
fuzz_similarity['score'] = ratios

In [79]:
#Limit to only pairs of tweets at or above the duplicate threshold.
#The comparison is inclusive because 85 was settled on as the *minimum* score for two
#tweets to count as duplicates; a strict `> 85` would drop the pairs scoring exactly 85.
over85 = fuzz_similarity[fuzz_similarity['score'] >= 85]
#Get the IDs of all tweets that appear in these rows, on either side of a pair
#(so that they can be dropped in data build notebook); np.unique also sorts them
duplicate_IDs = np.unique(np.append(over85['ID_x'], over85['ID_y']))

## Export for use in main notebook

In [81]:
#Write the duplicate tweet IDs to a .npy file (path is relative to the working directory)
np.save('duplicate_fuzzy_ids.npy', duplicate_IDs)