In [135]:
# Import libraries
import glob
import os
import time

import numpy as np
import pandas as pd
import psutil
from fuzzywuzzy import process, fuzz
from textdistance import levenshtein

# import os
# import boto3
# import statistics
# import plotly.graph_objects as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# init_notebook_mode(connected=False)
# from nltk.tokenize import sent_tokenize, word_tokenize
# import matplotlib.pyplot as plt
# import seaborn as sns
# from sklearn.preprocessing import MinMaxScaler
# import umap
# from sklearn.neighbors import NearestNeighbors
# from sklearn.cluster import DBSCAN

import warnings
warnings.filterwarnings('ignore')

### Load cleaned data

In [115]:
# Snapshot of system memory usage (total / available / used / free), taken before the comments file is read
psutil.virtual_memory()

svmem(total=8461193216, available=3370409984, percent=60.2, used=5090783232, free=3370409984)

In [117]:
# Load only the `commentor` column from the cleaned comments file
comments_csv = 'data/cleaned/comments.csv'
commentors = pd.read_csv(comments_csv, usecols=['commentor'])

### Resolve commentors suspected to be the same individual

In [118]:
# Count the distinct usernames in the data
commentors['commentor'].nunique()

165718

In [119]:
# Example: distinct usernames containing "All Eyez On Me" -- possibly one commentor under two names
eyez_mask = commentors['commentor'].str.contains("All Eyez On Me")
commentors.loc[eyez_mask, 'commentor'].unique()

array(['"All Eyez On Me" in theaters NOW!!!',
       '"All Eyez On Me" in theaters june 16 2017'], dtype=object)

In [121]:
# Second example: distinct usernames containing "Native Americans"
native_mask = commentors['commentor'].str.contains("Native Americans")
commentors.loc[native_mask, 'commentor'].unique()

array(['*********************************************************************************** The Native Americans are not r*dsk*ns. They are human beings like you and me. FTTR! - Fail To The Racists!',
       '*********************************************************************************** The Native Americans are not r*dsk*ns. They are human beings like you and me. FTTR! - Fail To The Racists! ***********************************************************',
       'The Native Americans are not r*dsk*ns. They are human beings like you and me. FTTR! - Fail To The Racists!',
       '*********************************************************************** The Native Americans are not r*dsk*ns. They are human beings like you and me. FTTR! - Fail To The Racists! ***********************************************************************',
       'The Native Americans are not r*dsk*ns. They are human beings. FTTR! - Fail To The Racists!',
       'The Native Americans are not r*dsk*ns! They are hum

In [35]:
# Score the two "All Eyez On Me" usernames against each other with fuzzywuzzy's partial ratio
first_name = '"All Eyez On Me" in theaters NOW!!!'
second_name = '"All Eyez On Me" in theaters june 16 2017'
fuzz.partial_ratio(first_name, second_name)

83

In [79]:
# Build an alphabetically sorted, single-column frame of the distinct usernames
unique_commentors = (
    pd.DataFrame({'commentor': commentors['commentor'].unique()})
    .sort_values(by='commentor')
    .reset_index(drop=True)
)
# Keep the first 1000 usernames as a small working sample
df = unique_commentors.iloc[:1000]

In [None]:
%%time

# Create df of unique commentor's usernames
unique_commentors = pd.DataFrame(commentors.commentor.unique())
unique_commentors.columns = ['commentor']
unique_commentors = unique_commentors.sort_values(by=['commentor']).reset_index(drop=True)

increment = 1000
for i in range(0,round(unique_commentors.shape[0]), increment):
    start_time = time.time()
    print(i)
    df = unique_commentors[i:i+increment]

    # Compare usernames to find potential split entities
    potential_matches = list()
    for name in df.commentor:
        if len(name) < 10:
            continue
        for other_name in df.commentor:
            if name != other_name and len(other_name) > 10:
                score = fuzz.partial_ratio(name, other_name)
                if score >= 83:
                    potential_matches.append([name,other_name,score])

    potential_matches_df = pd.DataFrame(potential_matches, columns=['username1','username2','score'])
    potential_matches_df.to_csv(f'tmp_entity_resolution/{i}.csv', header=False, index=False)
    print(time.time() - start_time)

In [127]:
# Read in resolved usernames
# Each chunk file holds header-less rows of (username, other username, partial-ratio score).
# glob returns paths in arbitrary order, so sort them (lexicographically) to keep the
# row order of the combined frame reproducible from run to run.
resolved_usernames_files = sorted(glob.glob('tmp_entity_resolution/*.csv'))
if not resolved_usernames_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError("No CSV files found in 'tmp_entity_resolution/'; run the matching cell first.")

data = [
    pd.read_csv(filename, names=['commentor1','commentor2','score'])
    for filename in resolved_usernames_files
]

# Stack all chunks into one frame with a fresh 0..n-1 index
resolved_usernames = pd.concat(data, axis=0, ignore_index=True)
resolved_usernames.shape

(54628, 3)

In [202]:
# Calculate various ratios
# Adds four fuzzywuzzy similarity scores for every candidate pair, averages a subset of
# them, and saves the pairs whose WRatio reaches the high-confidence threshold.
HIGH_CONFIDENCE_WRATIO = 88  # minimum WRatio for a pair to count as high confidence

# Pair up the two username columns once and reuse the pairs for every scorer,
# rather than running a separate row-wise DataFrame.apply for each metric.
username_pairs = list(zip(resolved_usernames['commentor1'], resolved_usernames['commentor2']))
scorers = [
    ('ratio', fuzz.ratio),
    ('partial_token_set_ratio', fuzz.partial_token_set_ratio),
    ('token_sort_ratio', fuzz.token_sort_ratio),
    ('WRatio', fuzz.WRatio),
]
for column, scorer in scorers:
    resolved_usernames[column] = [scorer(name1, name2) for name1, name2 in username_pairs]

# avg_score averages the stored `score` with three of the new metrics; WRatio is not included
resolved_usernames['avg_score'] = resolved_usernames[['score', 'ratio','partial_token_set_ratio','token_sort_ratio']].mean(axis=1)

high_confidence_matches = resolved_usernames[resolved_usernames.WRatio >= HIGH_CONFIDENCE_WRATIO]

# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs('tmp_resolved_entities', exist_ok=True)
high_confidence_matches.to_csv('tmp_resolved_entities/high_confidence_matches.csv', header=True, index=False)

In [201]:
# Example of one potential commentor under several usernames:
# the first high-confidence pairs whose first username is 'joetoronto'
is_joetoronto = high_confidence_matches['commentor1'] == 'joetoronto'
high_confidence_matches.loc[is_joetoronto].head()

Unnamed: 0,commentor1,commentor2,score,ratio,partial_token_set_ratio,token_sort_ratio,avg_score,WRatio
53816,joetoronto,joetoronto and billswillnevermove in a nacho cheese match,100,30,100,30,65.0,90
53817,joetoronto,joetoronto has 50 plus names on here,100,43,100,43,71.5,90
53818,joetoronto,joetoronto has 50 plus names on here including Lawrence,100,31,100,31,65.5,90
53819,joetoronto,joetoronto has 50 plus names on here including ee00ee,100,32,100,32,66.0,90
53820,joetoronto,joetoronto has an unhealthy obsession with Mack and Trubisky,100,29,100,29,64.5,90


In [222]:
# Build a lookup from each duplicate username to the username it resolves to
resolved_commentors = {}
match_pairs = zip(high_confidence_matches['commentor1'], high_confidence_matches['commentor2'])
for resolved_name, duplicate_name in match_pairs:
    # Skip the pair if either username has already been recorded as a duplicate
    if duplicate_name in resolved_commentors or resolved_name in resolved_commentors:
        continue
    resolved_commentors[duplicate_name] = resolved_name

In [None]:
# Apply mapping to original data
# Reload the cleaned comments file, this time with every column (no `usecols` filter),
# rebinding `commentors` to the full frame.
# NOTE(review): this cell only reloads and displays the data; `resolved_commentors`
# is not applied to it here.
commentors = pd.read_csv('data/cleaned/comments.csv')
commentors