In [55]:
import pandas as pd
import numpy as np
import nltk
import pickle
from contextlib import contextmanager
import copy
import os
import re
import string
import time
import warnings

In [56]:
@contextmanager
def timer(msg):
    """Context manager that reports how long the wrapped block took.

    Prints ``[msg] start.`` on entry and ``[msg] done in X.XX min.`` on a
    normal exit, where the duration is wall-clock time in minutes.  If the
    wrapped block raises, the exception propagates and no "done" line is
    printed.

    Parameters
    ----------
    msg : label inserted between the square brackets of both messages.
    """
    started_at = time.time()
    print('[{}] start.'.format(msg))
    yield
    minutes = (time.time() - started_at) / 60
    print('[{}] done in {:.2f} min.'.format(msg, minutes))

In [57]:
# Lower-case contractions (and a few informal spellings) mapped to their
# expanded form.  Keys are matched as plain substrings by the regex built in
# _get_misspell, so the text is expected to be lower-cased before lookup.
misspell_dict = {"aren't": "are not", "can't": "cannot", "couldn't": "could not",
                 "didn't": "did not", "doesn't": "does not", "don't": "do not",
                 "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                 "he'd": "he would", "he'll": "he will", "he's": "he is",
                 "i'd": "I had", "i'll": "I will", "i'm": "I am", "isn't": "is not",
                 "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
                 "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
                 "she'd": "she would", "she'll": "she will", "she's": "she is",
                 "shouldn't": "should not", "that's": "that is", "there's": "there is",
                 "they'd": "they would", "they'll": "they will", "they're": "they are",
                 "they've": "they have", "we'd": "we would", "we're": "we are",
                 "weren't": "were not", "we've": "we have", "what'll": "what will",
                 "what're": "what are", "what's": "what is", "what've": "what have",
                 "where's": "where is", "who'd": "who would", "who'll": "who will",
                 "who're": "who are", "who's": "who is", "who've": "who have",
                 "won't": "will not", "wouldn't": "would not", "you'd": "you would",
                 "you'll": "you will", "you're": "you are", "you've": "you have",
                 # "'re" is a generic fallback, hence the leading space in its value.
                 # "we'll" previously expanded to " will", silently dropping the pronoun.
                 "'re": " are", "wasn't": "was not", "we'll": "we will", "tryin'": "trying"}

def _get_misspell(misspell_dict):
    misspell_re = re.compile('(%s)' % '|'.join(misspell_dict.keys()))
    return misspell_dict, misspell_re

def replace_typical_misspell(text):
    """Expand every contraction listed in ``misspell_dict`` found in ``text``.

    The pattern and lookup table come from ``_get_misspell``; each match is
    substituted by the dictionary value stored under the matched string.
    Returns the rewritten string.
    """
    lookup, pattern = _get_misspell(misspell_dict)
    return pattern.sub(lambda found: lookup[found.group(0)], text)
    
# Single characters that clean_text surrounds with spaces so they end up as
# separate tokens.  Besides ASCII punctuation the list holds typographic
# quotes/dashes, currency and math symbols, box-drawing and block glyphs,
# full-width CJK punctuation, and a handful of accented letters
# ('Â', 'à', 'â', 'é', 'è', 'Ã', 'ï', 'Ø').
# NOTE(review): the accented letters look like they target mis-decoded
# (mojibake) text rather than real punctuation - confirm before reusing this
# list on text where those letters are legitimate.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
          '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©', '^',
          '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█',
          '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶',
          '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼',
          '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
          'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪',
          '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']

def clean_text(x):
    """Pad every punctuation/symbol character in ``x`` with one space per side.

    Parameters
    ----------
    x : any object; it is converted with ``str(x)`` first.

    Returns
    -------
    str in which each occurrence of a character from ``puncts`` or
    ``string.punctuation`` is replaced by ``' <char> '``.
    """
    x = str(x)
    # ``puncts`` and ``string.punctuation`` overlap heavily.  Iterating the
    # plain concatenation padded the shared characters twice (',' became
    # '  ,  ').  dict.fromkeys drops the duplicates while keeping first-seen
    # order, so every character is padded exactly once.
    for punct in dict.fromkeys(puncts + list(string.punctuation)):
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x

def clean_numbers(x):
    """Return ``x`` with each run of consecutive digits replaced by one space."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub(' ', x)

In [58]:
# Location of the training CSV.  The default is the original author's local
# path; set the TOXICITY_TRAIN_CSV environment variable to point the notebook
# at the file on another machine without editing this cell.
train_data = os.environ.get('TOXICITY_TRAIN_CSV', '/Users/kelvin/Toxicity/data/train.csv')

In [64]:
def load_and_prec(path=None):
    """Load the training CSV and normalise its ``comment_text`` column.

    Processing order: lower-case, expand contractions
    (``replace_typical_misspell``), pad punctuation (``clean_text``), replace
    digit runs with a space (``clean_numbers``), strip surrounding whitespace,
    and finally substitute the placeholder ``'_##_'`` for comments that ended
    up empty or missing.

    Parameters
    ----------
    path : optional CSV location.  When omitted, the module-level
        ``train_data`` path is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame with the cleaned ``comment_text`` column; all other
    columns are left as read from the file.
    """
    train = pd.read_csv(train_data if path is None else path)
    comments = (
        train['comment_text']
        # A missing comment is a float NaN, which the regex-based cleaners
        # cannot handle; treat it as blank so it receives the placeholder.
        .fillna('')
        .str.lower()
        .apply(replace_typical_misspell)
        .apply(clean_text)
        .apply(clean_numbers)
        .str.strip()
    )
    # Assign the result back explicitly.  Calling replace(..., inplace=True)
    # on the ``train['comment_text']`` selection is a chained assignment that
    # does not update ``train`` when pandas copy-on-write is active.
    train['comment_text'] = comments.replace('', np.nan).fillna('_##_')
    return train

In [65]:
# Run the load-and-clean pipeline once; timer() prints the elapsed minutes.
with timer('load data'):
    train = load_and_prec()

[load data] start.
[load data] done in 2.57 min.


In [67]:
with timer('load data'):
    # Text preprocessing steps - remove tokens containing digits, capital
    # letters and punctuation.

    # Character class built once instead of on every call.
    punct_class = '[%s]' % re.escape(string.punctuation)

    def alphabetic(x):
        """Replace every token that contains a digit with a single space."""
        # Raw string: '\w' and '\d' are invalid escape sequences in a plain
        # string literal and trigger a warning on recent Python versions.
        return re.sub(r'\w*\d\w*', ' ', x)

    def punc_lower(x):
        """Lower-case ``x`` and replace ASCII punctuation with spaces."""
        return re.sub(punct_class, ' ', x.lower())

    # text
    text = train['comment_text'].map(alphabetic).map(punc_lower)

# Preview the result.  As the cell's last top-level expression it is rendered
# by the notebook; inside the ``with`` block its value was discarded.
text.head()

[load data] start.
[load data] done in 1.71 min.


In [68]:
# Store the cleaned text back on the frame.
train["comment_text"] = text
# Shorten the four longest column names in place.
train.rename(
    columns={
        'homosexual_gay_or_lesbian': 'gay_lesbian',
        'intellectual_or_learning_disability': 'learning_disability',
        'psychiatric_or_mental_illness': 'mental_illness',
        'other_race_or_ethnicity': 'other_race',
    },
    inplace=True,
)

In [69]:
with timer('labels'):
    # Binary label: rows whose ``target`` is at least 0.5 are 'toxic'.
    # (The bare ``train.target[train.target >= 0.5]`` expression that used to
    # precede this line computed a filter and threw it away, so it is dropped.)
    train['class_target'] = np.where(train['target'] >= 0.5, 'toxic', 'non_toxic')

# Class balance in percent.  Kept as the cell's last top-level expression so
# the notebook renders it; inside the ``with`` block the value was discarded.
train.class_target.value_counts(normalize=True)*100

[labels] start.
[labels] done in 0.01 min.


In [70]:
# Write the preprocessed frame to the current working directory, without the index column.
train.to_csv('preprocessed_data.csv', index=False)

In [71]:
# Also pickle the frame (file name has no extension) to the working directory.
with open('preprocessed_data', mode='wb') as f:
    pickle.dump(train, f)

In [72]:
# Reload the pickled frame and time the read.
# NOTE(review): pickle.load can execute arbitrary code, so only load a
# 'preprocessed_data' file that this notebook itself produced.
with timer('load pickle'):
    with open('preprocessed_data', mode='rb') as f:
        data = pickle.load(f)

[load pickle] start.
[load pickle] done in 0.29 min.


In [73]:
# Sanity check: (rows, columns) of the reloaded frame.
data.shape

(1804874, 46)

In [76]:
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis