In [1]:
from pathlib import Path
import pickle
import re
from functools import partial

from tqdm import tqdm, tqdm_notebook
from multiprocessing import cpu_count, Pool

import numpy as np
import pandas as pd

import spacy
import ftfy
import yake
from nltk.tokenize import TweetTokenizer

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.options.display.max_colwidth = 200 # default - 50

In [3]:
DATA_PATH = Path('../data')
RANDOM_SEED = 17

tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




**Load Data**

In [4]:
train_df = pd.read_csv(DATA_PATH/'train.csv')
valid_df = pd.read_csv(DATA_PATH/'valid.csv')
test_df = pd.read_csv(DATA_PATH/'test.csv')

In [5]:
text_cols = ['title', 'text']

**Clean Data**

In [6]:
def parallelize(data, func, cores=None):
    if cores is None:
        cores = cpu_count()
    partitions = cores
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

**Fix unicode characters**

In [7]:
def parallel_fix_text(s, normalization='NFC'):
    return s.progress_apply(fix_text)

def fix_text(doc):
    return ftfy.fix_text(doc)

Experiments on sample data

In [8]:
sample_df = train_df['text'].sample(frac=1.0)
sample_df.shape

(24871,)

In [11]:
train_df['title'].progress_apply(fix_text)

HBox(children=(IntProgress(value=0, max=24871), HTML(value='')))

TypeError: object of type 'float' has no len()

In [9]:
# clean_df = sample_df.progress_apply(ftfy.fix_text, normalization='NFKC')
clean_df = parallelize(sample_df, parallel_fix_text)
clean_df = clean_df.rename('clean_comment_text')











TypeError: object of type 'float' has no len()

Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Process ForkPoolWorker-4:
Process ForkPoolWorker-8:
Process ForkPoolWorker-5:
Process ForkPoolWorker-6:
Process ForkPoolWorker-7:
Process ForkPoolWorker-3:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)

In [188]:
diff_df = pd.concat([clean_df, sample_df], axis=1)
diff_df = diff_df[diff_df['clean_comment_text'] != diff_df['comment_text']]
diff_df.shape

(56554, 2)

In [189]:
diff_df.head()

Unnamed: 0,clean_comment_text,comment_text
1792214,This is how these things impact all of us when they are brought into our political discourse.\n\n If these folks want to tell us that public policy isn't necessary to stop people from killing fell...,This is how these things impact all of us when they are brought into our political discourse.\n\n If these folks want to tell us that public policy isn’t necessary to stop people from killing fell...
643621,"I am not a politician, so I don't need to report it, but I also participated in a Pacifica Institute trip to Turkey. The article stated, ""It's puzzling that state legislators who rarely get invol...","I am not a politician, so I don't need to report it, but I also participated in a Pacifica Institute trip to Turkey. The article stated, “It’s puzzling that state legislators who rarely get invol..."
651697,"No, it appears preaching is your specialty.\n\nI won't go into this with you again. I get no pleasure from besting a fool.","No, it appears preaching is your specialty.\n\nI won’t go into this with you again. I get no pleasure from besting a fool."
988929,"The false dictionary of the White Supremacist, Old South CONfederacy, Neo-CONservative, Neo-dogma determined to destroy the nation to re-establish the Old South Plantation system now called Corpor...","The false dictionary of the White Supremacist, Old South CONfederacy, Neo-CONservative, Neo-dogma determined to destroy the nation to re-establish the Old South Plantation system now called Corpor..."
1515445,"If it ever sees the light of a legislator's laptop, it will bear about as much resemblance to the original as an unretouched Maxim cover does to the model who posed for it.","If it ever sees the light of a legislator’s laptop, it will bear about as much resemblance to the original as an unretouched Maxim cover does to the model who posed for it."


In [190]:
top_diff_df = (diff_df['clean_comment_text'].str.len() - diff_df['comment_text'].str.len()).abs().sort_values(ascending=False).head(10)
top_diff_df

826201    25
131633    22
76263     10
115751    10
77657      8
88374      7
96710      6
148735     6
124405     6
105658     6
dtype: int64

In [204]:
diff_df.loc[top_diff_df.head(10).index].values

array([['That bridge is incredibly dangerous, a relic of the past.  A tight, narrow turn, almost any speed above 30 could have been "excessive speed" for  a truck  like this.  But a professional truck driver should have been aware of  that.\nFortunately there are now definite plans to replace it!\nIn the meantime enjoy  sweet little Scottsburg,  once a major shipping metropolis.',
        'That bridge is incredibly dangerous, a relic of the past.  A tight, narrow turn, almost any speed above 30 could have been "excessive \x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10\x10speed" for  a truck  like this.  But a professional truck driver should have been aware of  that.\nFortunately there are now definite plans to replace it!\nIn the meantime enjoy  sweet little Scottsburg,  once a major shipping metropolis.'],
       ["GARBAGE ! FOR DECADES FOAMING AT THE MOUTH RIGHT WING NUTJOBS HAVE PERSECUTED THE CLINTONS AND THEY GOT SQUAT. AMAZINGLY E

Apply on full data

In [194]:
ftfy_train_df = parallelize(train_df['comment_text'], parallel_fix_text)
ftfy_train_df.head()

0                 This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!
1    Thank you!! This would make my life a lot less anxiety-inducing. Keep it up, and don't let anyone get in your way!
2                                This is such an urgent design problem; kudos to you for taking it on. Very impressive!
3                                  Is this something I'll be able to install on my site? When will you be releasing it?
4                                                                                  haha you guys are a bunch of losers.
Name: comment_text, dtype: object

In [195]:
ftfy_test_df = parallelize(test_df['comment_text'], parallel_fix_text)
ftfy_test_df.head()

0                                                Jeff Sessions is another one of Trump's Orwellian choices. He believes and has believed his entire career the exact opposite of what the position requires.
1    I actually inspected the infrastructure on Grand Chief Stewart Philip's home Penticton First Nation in both 2010 and 2013.  Exactly Zero projects that had been identified in previous inspection re...
2    No it won't . That's just wishful thinking on democrats fault .   For the 100 th time , Walker cited the cost of drug users treatment as being lost with Obamacare .  I laugh every time I hear a li...
3    Instead of wringing our hands and nibbling the periphery of the issue, how about we face the actual issue head on? I would support a city ordinance against loitering, and applaud city councilors w...
4    how many of you commenters have garbage piled high in your yard, bald tires, dead batteries, rotten pallets, car parts, blah blah blah. this town is a pigpen. drive around and

In [196]:
((ftfy_train_df != train_df['comment_text']).sum(),
 (ftfy_test_df != test_df['comment_text']).sum())

(56554, 2943)

**Save/Load Clean Data**

**Tokenize Data (Spacy)**

In [236]:
def parallel_tokenize(s):
    return s.progress_apply(tokenize)

def tokenize(doc):
    return ' '.join([token.text for token in nlp(doc)])

In [199]:
nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger', 'ner'])

In [200]:
spacy_ftfy_train_df = parallelize(ftfy_train_df, parallel_tokenize)
spacy_ftfy_train_df.head()

0                This is so cool . It 's like , ' would you want your mother to read this ? ? ' Really great idea , well done !
1    Thank you ! ! This would make my life a lot less anxiety - inducing . Keep it up , and do n't let anyone get in your way !
2                                     This is such an urgent design problem ; kudos to you for taking it on . Very impressive !
3                                       Is this something I 'll be able to install on my site ? When will you be releasing it ?
4                                                                                         haha you guys are a bunch of losers .
Name: comment_text, dtype: object

In [201]:
spacy_ftfy_test_df = parallelize(ftfy_test_df, parallel_tokenize)
spacy_ftfy_test_df.head()

0                                             Jeff Sessions is another one of Trump 's Orwellian choices . He believes and has believed his entire career the exact opposite of what the position requires .
1    I actually inspected the infrastructure on Grand Chief Stewart Philip 's home Penticton First Nation in both 2010 and 2013 .   Exactly Zero projects that had been identified in previous inspection...
2    No it wo n't . That 's just wishful thinking on democrats fault .    For the 100 th time , Walker cited the cost of drug users treatment as being lost with Obamacare .   I laugh every time I hear ...
3    Instead of wringing our hands and nibbling the periphery of the issue , how about we face the actual issue head on ? I would support a city ordinance against loitering , and applaud city councilor...
4    how many of you commenters have garbage piled high in your yard , bald tires , dead batteries , rotten pallets , car parts , blah blah blah . this town is a pigpen . drive aro

**Save/Load Clean Data**

**Tokenize Data (nltk)**

In [238]:
tokenizer = TweetTokenizer(reduce_len=True)
tokenizer

<nltk.tokenize.casual.TweetTokenizer at 0x7f0dac2225c0>

In [239]:
def parallel_tokenize(s):
    return s.progress_apply(tokenize)

def tokenize(doc):
    return ' '.join(tokenizer.tokenize(doc))

In [240]:
tweet_ftfy_train_df = parallelize(ftfy_train_df, parallel_tokenize)
tweet_ftfy_train_df.head()

0              This is so cool . It's like , ' would you want your mother to read this ? ? ' Really great idea , well done !
1    Thank you ! ! This would make my life a lot less anxiety-inducing . Keep it up , and don't let anyone get in your way !
2                                  This is such an urgent design problem ; kudos to you for taking it on . Very impressive !
3                                     Is this something I'll be able to install on my site ? When will you be releasing it ?
4                                                                                      haha you guys are a bunch of losers .
Name: comment_text, dtype: object

In [241]:
tweet_ftfy_test_df = parallelize(ftfy_test_df, parallel_tokenize)
tweet_ftfy_test_df.head()

0                                              Jeff Sessions is another one of Trump's Orwellian choices . He believes and has believed his entire career the exact opposite of what the position requires .
1    I actually inspected the infrastructure on Grand Chief Stewart Philip's home Penticton First Nation in both 2010 and 2013 . Exactly Zero projects that had been identified in previous inspection re...
2    No it won't . That's just wishful thinking on democrats fault . For the 100 th time , Walker cited the cost of drug users treatment as being lost with Obamacare . I laugh every time I hear a liber...
3    Instead of wringing our hands and nibbling the periphery of the issue , how about we face the actual issue head on ? I would support a city ordinance against loitering , and applaud city councilor...
4    how many of you commenters have garbage piled high in your yard , bald tires , dead batteries , rotten pallets , car parts , blah blah blah . this town is a pigpen . drive aro

**Save/Load Clean Data**

In [243]:
with open(DATA_PATH/'tweet_ftfy_nfkc_train.pkl', 'wb') as  f:
    pickle.dump(tweet_ftfy_train_df, f)

with open(DATA_PATH/'tweet_ftfy_nfkc_test.pkl', 'wb') as  f:
    pickle.dump(tweet_ftfy_test_df, f)

In [244]:
with open(DATA_PATH/'tweet_ftfy_nfkc_train.pkl', 'rb') as  f:
    tweet_ftfy_train_df = pickle.load(f)

with open(DATA_PATH/'tweet_ftfy_nfkc_test.pkl', 'rb') as  f:
    tweet_ftfy_test_df = pickle.load(f)

In [247]:
check_df = tweet_ftfy_train_df.str.split()
check_df.head()

0              [This, is, so, cool, ., It's, like, ,, ', would, you, want, your, mother, to, read, this, ?, ?, ', Really, great, idea, ,, well, done, !]
1    [Thank, you, !, !, This, would, make, my, life, a, lot, less, anxiety-inducing, ., Keep, it, up, ,, and, don't, let, anyone, get, in, your, way, !]
2                                          [This, is, such, an, urgent, design, problem, ;, kudos, to, you, for, taking, it, on, ., Very, impressive, !]
3                                             [Is, this, something, I'll, be, able, to, install, on, my, site, ?, When, will, you, be, releasing, it, ?]
4                                                                                                        [haha, you, guys, are, a, bunch, of, losers, .]
Name: comment_text, dtype: object

In [249]:
check_df[check_df.str.len() == 1]

7                                                                                                                                                 [FFFUUU]
269                                            [www.businessinsider.com/it-turns-out-that-smoking-marijuana-may-actually-make-you-a-better-driver-2011-12]
276                                                                                                                                                   [:(]
316                                                                                                                                                   [;)]
375                                                                                                                                                  [log]
436                                                                                                                                                   [No]
437                                                                   

**Keyword Extraction**

In [224]:
text_content = " Jeff Sessions is another one of Trump 's Orwellian choices . He believes and has believed his entire career the exact opposite of what the position requires ."
simple_kwextractor = yake.KeywordExtractor(n=2)
keywords = simple_kwextractor.extract_keywords(text_content)

In [225]:
keywords

[('jeff sessions', 0.008293207122436794),
 ('orwellian choices', 0.008293207122436794),
 ('position requires', 0.015380821171891606),
 ('entire career', 0.02570861714399338),
 ('exact opposite', 0.02570861714399338),
 ('trump', 0.057012387690331526),
 ('sessions', 0.08596317751626563),
 ('orwellian', 0.08596317751626563),
 ('jeff', 0.09568045026443411),
 ('choices', 0.09568045026443411),
 ('requires', 0.09568045026443411),
 ('believed', 0.15831692877998726),
 ('entire', 0.15831692877998726),
 ('career', 0.15831692877998726),
 ('exact', 0.15831692877998726),
 ('opposite', 0.15831692877998726),
 ('position', 0.15831692877998726)]