In [1]:
%matplotlib inline
import gc
import os
import pickle
import re
import shutil
import string

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
import spacy
import textacy
import keras
from tqdm import tqdm, tqdm_notebook, tnrange, trange

Using TensorFlow backend.


In [2]:
# Setting the monitor interval to 0 switches off tqdm's background monitor thread.
tqdm.monitor_interval = 0
# Register tqdm with pandas so that `progress_apply` is available on the
# Series used further down when cleaning the comments.
tqdm_notebook().pandas()




In [3]:
# Load the spaCy English pipeline once; `clean_text` calls `nlp(...)` on every comment.
nlp = spacy.load('en_core_web_md')

In [4]:
# Read the zipped training CSV. `id` and `comment_text` stay strings, the six
# label columns are parsed as booleans, the frame is indexed by `id`, and
# NA detection is disabled so every comment is kept as literal text.
train_raw = pd.read_csv(
    'RAW/train.csv.zip',
    compression='zip',
    index_col='id',
    na_filter=False,
    dtype=dict(
        id=str,
        comment_text=str,
        **{
            label: bool
            for label in (
                'toxic',
                'severe_toxic',
                'obscene',
                'threat',
                'insult',
                'identity_hate',
            )
        }
    ),
)

In [6]:
# Preview the first five training rows.
train_raw.head(5)

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
22256635,"Nonsense? kiss off, geek. what I said is true...",True,False,False,False,False,False
27450690,"""\n\n Please do not vandalize pages, as you di...",False,False,False,False,False,False
54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",False,False,False,False,False,False
77493077,Asking some his nationality is a Racial offenc...,False,False,False,False,False,False
79357270,The reader here is not going by my say so for ...,False,False,False,False,False,False


In [8]:
# Matches a lowercase "i" that stands alone as a word, i.e. is delimited by
# whitespace or the string boundaries. Lookarounds are used instead of
# consuming the neighbouring whitespace so that back-to-back pronouns
# ("i i") are both matched; a consuming pattern swallows the separator
# and misses the second one.
self_pronoun_re = re.compile(r"(?<!\S)i(?!\S)")

def handle_match(m_o):
    """Replacement callback for ``self_pronoun_re``: capitalise the pronoun.

    Args:
        m_o: the match object produced by ``self_pronoun_re``. The pattern
            matches only the letter itself, so surrounding whitespace is
            untouched and does not need to be re-emitted.

    Returns:
        The replacement text ``"I"``.
    """
    return "I"

def clean_text(text):
    """Normalise one raw comment into a space-joined string of lemmas.

    Pipeline:
      1. Round-trip the text through the ``unicode-escape`` codec.
      2. Normalise whitespace, then run textacy's ``preprocess_text`` with
         the flags listed in the call below.
      3. Re-capitalise the standalone pronoun "i", which step 2 lowercased
         (presumably so spaCy recognises it as a pronoun; confirm).
      4. Run the spaCy pipeline and keep each token's lemma. Tokens whose
         lemma is the ``-PRON-`` placeholder contribute their ``norm_``
         form instead, so the actual pronoun is preserved.

    Every token is kept. An earlier version printed any other lemma
    starting with "-" and stopped, silently dropping the rest of the comment.

    Args:
        text: the raw comment string.

    Returns:
        The cleaned comment as a single string of tokens joined by spaces.
    """
    norm_escapes = text.encode("unicode-escape").decode('unicode-escape')
    norm_whitespc = textacy.preprocess.normalize_whitespace(norm_escapes)
    prepped = textacy.preprocess.preprocess_text(
        norm_whitespc,
        no_contractions=True,
        no_currency_symbols=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        lowercase=True,
        fix_unicode=True,
        no_punct=True,
        transliterate=True,
        no_numbers=True,
    )
    prepped = self_pronoun_re.sub(handle_match, prepped)
    keep_tokens = [
        # "-PRON-" is a placeholder lemma, so fall back to the token's norm.
        token.norm_ if token.lemma_ == "-PRON-" else token.lemma_
        for token in nlp(prepped)
    ]
    return " ".join(keep_tokens)

In [9]:
# Clean every training comment (with a progress bar) and store the result in a new column.
train_raw['cleaned_comments'] = train_raw['comment_text'].progress_apply(clean_text)




In [10]:
# Preview the first five training rows, now including `cleaned_comments`.
train_raw.head(5)

Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comments
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
22256635,"Nonsense? kiss off, geek. what I said is true...",True,False,False,False,False,False,nonsense kiss off geek what I say be true I wi...
27450690,"""\n\n Please do not vandalize pages, as you di...",False,False,False,False,False,False,please do not vandalize page as you do with th...
54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",False,False,False,False,False,False,point of interest I remove the point of intere...
77493077,Asking some his nationality is a Racial offenc...,False,False,False,False,False,False,ask some his nationality be a racial offence w...
79357270,The reader here is not going by my say so for ...,False,False,False,False,False,False,the reader here be not go by my say so for eth...


In [13]:
# The output directory may not exist yet when the notebook is run top to
# bottom on a fresh checkout, so create it before writing.
os.makedirs('CLEAN', exist_ok=True)
# Persist the training frame, including `cleaned_comments`, as a gzip-compressed pickle.
train_raw.to_pickle('CLEAN/train.P.gz', compression='gzip')

In [None]:
# del train_raw

In [14]:
# Run a full garbage-collection pass; the cell output is the number of
# unreachable objects found. (`gc` is imported with the other modules in
# the notebook's first cell.)
gc.collect()

2597

In [5]:
# Read the zipped test CSV. Both columns stay strings, the frame is indexed
# by `id`, and NA detection is disabled so every comment is kept as literal text.
test_raw = pd.read_csv(
    'RAW/test.csv.zip',
    compression='zip',
    index_col='id',
    na_filter=False,
    dtype=dict(id=str, comment_text=str),
)

In [7]:
# Preview the first five test rows.
test_raw.head(5)

Unnamed: 0_level_0,comment_text
id,Unnamed: 1_level_1
6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
6102620,::Kentuckiana is colloquial. Even though the ...
14563293,"Hello fellow Wikipedians,\nI have just modifie..."
21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
22982444,== [WIKI_LINK: Talk:Celts] ==


In [None]:
# Clean every test comment with the same `clean_text` pipeline used for the
# training set. (An earlier variant built this column with an explicit
# `tnrange` list comprehension instead of `progress_apply`.)
test_raw['cleaned_comments'] = test_raw.comment_text.progress_apply(clean_text)

In [12]:
# test_cleaned = [None] * test_raw.shape[0]
# for i in tnrange(test_raw.shape[0]):
#     test_cleaned[i] = clean_text(test_raw.iloc[i].comment_text)







KeyboardInterrupt: 

In [15]:
# Preview the first five test rows, now including `cleaned_comments`.
test_raw.head(5)

Unnamed: 0_level_0,comment_text,cleaned_comments
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...,orphaned nonfree medium image41cd1jboevl ss500...
6102620,::Kentuckiana is colloquial. Even though the ...,kentuckiana be colloquial even though the area...
14563293,"Hello fellow Wikipedians,\nI have just modifie...",hello fellow wikipedian \n I have just modify ...
21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2...",akc suspension the morning call feb number num...
22982444,== [WIKI_LINK: Talk:Celts] ==,wikilink talkcelt


In [16]:
# Make sure the output directory exists. `exist_ok=True` makes this
# idempotent without a separate check-then-create step.
os.makedirs("CLEAN", exist_ok=True)

In [17]:
# The frames were previously exported with `train_raw.to_csv('CLEAN/train.csv')`
# and `test_raw.to_csv('CLEAN/test.csv')`; they are now written as pickles
# instead. Remove any CSVs left over from those runs, tolerating their
# absence so the cell also works on a fresh checkout where they never existed.
for stale_csv in ('CLEAN/train.csv', 'CLEAN/test.csv'):
    try:
        os.remove(stale_csv)
    except FileNotFoundError:
        pass  # nothing to clean up

In [19]:
# Persist the test frame, including `cleaned_comments`, as a gzip-compressed pickle.
test_raw.to_pickle('CLEAN/test.P.gz', compression='gzip')

In [None]:
# del test_raw