In [2]:
%matplotlib inline
import gc
import os
import pickle
import re
import shutil
import string

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
import sklearn as sk
import spacy
import textacy
import keras
from tqdm import tqdm, tqdm_notebook, tnrange, trange

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Set tqdm's class-level monitor interval to 0.
# NOTE(review): presumably this disables tqdm's background monitor thread — confirm against the tqdm docs.
tqdm.monitor_interval = 0
# Hook tqdm into pandas; the `progress_apply` calls later in this notebook rely on this registration.
tqdm_notebook().pandas()




In [4]:
# Load the spaCy 'en_core_web_md' model once; clean_text() runs every comment through this shared `nlp` object.
nlp = spacy.load('en_core_web_md')

In [5]:
# Read the zipped training CSV. The id and the comment text stay strings,
# the six label columns are parsed as booleans, and NA detection is switched
# off (na_filter=False) so comment text is never converted to NaN.
train_raw = pd.read_csv(
    'RAW/train.csv.zip',
    compression='zip',
    na_filter=False,
    dtype={
        'id': str,
        'comment_text': str,
        **{
            label: bool
            for label in (
                'toxic',
                'severe_toxic',
                'obscene',
                'threat',
                'insult',
                'identity_hate',
            )
        },
    },
)

In [6]:
# Preview the first five training rows.
train_raw.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,False,False,False,False,False,False
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,False,False,False,False,False,False
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",False,False,False,False,False,False
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",False,False,False,False,False,False
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",False,False,False,False,False,False


In [9]:
# Matches a standalone lowercase "i" (the first-person pronoun after the text
# has been lowercased). Lookarounds are used so the surrounding whitespace is
# asserted but not consumed: a pattern that swallows the trailing space cannot
# match the next pronoun in runs such as "i i", leaving every other one
# lowercase. The raw string avoids the invalid "\s" escape-sequence warning.
self_pronoun_re = re.compile(r"(?<!\S)i(?!\S)")


def handle_match(m_o):
    """Replacement callback for ``self_pronoun_re``.

    Args:
        m_o: match object for a standalone "i". The match covers only the
            pronoun itself, so no surrounding whitespace has to be restored.

    Returns:
        The capitalised pronoun ``"I"``.
    """
    return "I"

def clean_text(text):
    """Normalise one raw comment into a space-joined string of lemmas.

    The text is whitespace-normalised and passed through textacy's
    ``preprocess_text`` (lowercasing, punctuation removal, transliteration,
    contraction expansion, and URL/e-mail/phone/number/currency replacement),
    the standalone pronoun "i" is re-capitalised, and the result is run
    through the spaCy pipeline ``nlp``. Whitespace tokens are dropped and
    every remaining token contributes its lemma.

    Tokens whose lemma starts with "-" (spaCy's "-PRON-" placeholder for
    pronouns) contribute their normalised surface form instead. Previously any
    other "-"-prefixed lemma was printed and processing stopped, silently
    discarding the rest of the comment; such tokens now also fall back to the
    normalised form so no text is lost.

    Args:
        text: the raw comment as a ``str``.

    Returns:
        A single string with the kept tokens joined by one space.
    """
    # NOTE(review): this encode/decode round trip appears to return the input
    # unchanged; confirm whether the intent was to decode literal escape
    # sequences such as a backslash followed by "n".
    norm_escapes = text.encode("unicode-escape").decode('unicode-escape')
    norm_whitespc = textacy.preprocess.normalize_whitespace(norm_escapes)
    prepped = textacy.preprocess.preprocess_text(
        norm_whitespc,
        fix_unicode=True,
        lowercase=True,
        transliterate=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=True,
        no_currency_symbols=True,
        no_punct=True,
        no_contractions=True,
    )
    # Lowercasing also lowered the pronoun "I"; restore it before parsing.
    prepped = re.sub(self_pronoun_re, handle_match, prepped)
    doc = nlp(prepped)
    keep_tokens = [
        tok.norm_ if tok.lemma_.startswith("-") else tok.lemma_
        for tok in doc
        if not tok.is_space
    ]
    return " ".join(keep_tokens)

In [23]:
# Spot check: tokens written as digits come back as the placeholder "number", spelled-out numbers are kept.
clean_text("10 ten one hundred 111")

'number ten one hundred number'

In [10]:
# Clean every training comment (with a progress bar) and keep the result in a new column.
train_raw['cleaned_comments'] = train_raw['comment_text'].progress_apply(clean_text)

In [11]:
# Preview the first five rows, now including the cleaned text.
train_raw.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned_comments
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,False,False,False,False,False,False,explanation why the edit make under my usernam...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,False,False,False,False,False,False,daww he match this background colour I be seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",False,False,False,False,False,False,hey man I be really not try to edit war its ju...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",False,False,False,False,False,False,more I can not make any real suggestion on imp...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",False,False,False,False,False,False,you sir be my hero any chance you remember wha...


In [12]:
# Make sure the output directory exists before writing: nothing earlier in the
# notebook creates it, so on a fresh run the write would otherwise fail.
os.makedirs('CLEAN', exist_ok=True)
train_raw.to_pickle('CLEAN/train.P.gz', compression='gzip')

In [13]:
# del train_raw

In [14]:
# Force a garbage-collection pass before the test set is loaded.
gc.collect()

9301

In [15]:
# Read the zipped test CSV; both columns stay strings and NA detection is
# switched off (na_filter=False) so comment text is never converted to NaN.
test_raw = pd.read_csv(
    'RAW/test.csv.zip',
    compression='zip',
    na_filter=False,
    dtype=dict.fromkeys(('id', 'comment_text'), str),
)

In [16]:
# Preview the first five test rows.
test_raw.head(5)

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [17]:
# Clean every test comment (with a progress bar) and keep the result in a new column.
test_raw['cleaned_comments'] = test_raw['comment_text'].progress_apply(clean_text)

In [18]:
# test_cleaned = [None] * test_raw.shape[0]
# for i in tnrange(test_raw.shape[0]):
#     test_cleaned[i] = clean_text(test_raw.iloc[i].comment_text)

In [19]:
# Preview the first five test rows, now including the cleaned text.
test_raw.head(5)

Unnamed: 0,id,comment_text,cleaned_comments
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,yo bitch ja rule be more succesful then you wi...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,from rfc the title be fine as it be imo
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",source zawe ashton on lapland
3,00017563c3f7919a,":If you have a look back at the source, the in...",if you have a look back at the source the info...
4,00017695ad8997eb,I don't anonymously edit articles at all.,I do not anonymously edit article at all


In [20]:
# Create the output directory if needed. exist_ok=True makes this a single
# idempotent call instead of a separate exists-check followed by mkdir.
os.makedirs("CLEAN", exist_ok=True)

In [21]:
# Persist the test frame (raw text plus cleaned_comments) as a gzip-compressed pickle.
test_raw.to_pickle('CLEAN/test.P.gz', compression='gzip')

In [22]:
# Force a garbage-collection pass; the displayed value is the count returned by gc.collect().
gc.collect()

41045