In [1]:
import pandas as pd
import numpy as np
import nltk
import pickle
from contextlib import contextmanager
import copy
import os
import re
import string
import time
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
from spacy.lang.en import English

from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
@contextmanager
def timer(msg):
    """Print a start line, run the wrapped block, then print the elapsed time.

    Args:
        msg: label shown in square brackets in both printed lines.

    Yields:
        None. Use as ``with timer('label'): ...``.

    The elapsed time is reported in minutes. It is printed even when the
    wrapped block raises, and the exception then propagates unchanged.
    """
    # perf_counter is monotonic, so the measurement is not skewed by
    # wall-clock adjustments that happen during a long-running cell.
    t0 = time.perf_counter()
    print(f'[{msg}] start.')
    try:
        yield
    finally:
        # Without the finally clause a failing block would skip this report.
        elapsed_time = time.perf_counter() - t0
        print(f'[{msg}] done in {elapsed_time / 60:.2f} min.')

In [3]:
# Load the comments dataset from a CSV file located in the working directory.
train_short = pd.read_csv('cleaned_short.csv')

In [4]:
# Keep only the 'target' and 'comment_text' columns used by the cells below.
small_df = train_short[['target', 'comment_text']]

In [5]:
# Preview the comment_text column.
small_df.comment_text

0         this is a great story      man      i wonder i...
1         yet call out all muslims for the acts of a few...
2         because the people who drive cars more are the...
3         mormons have had a complicated relationship wi...
4                   i am doing the same thing              
                                ...                        
235082    xi and his comrades must be smirking over trum...
235083    my thought exactly       the only people he ha...
235084    i agree      bill g\nthe vote     buying has b...
235085    no      the probability of dying may be very  ...
235086    nah      i am too boring to parody       this ...
Name: comment_text, Length: 235087, dtype: object

In [6]:
with timer('sent_to_words'):
    def sent_to_words(sentences):
        """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

        Every item is converted with ``str()`` first. ``deacc=True`` strips
        accent marks from the tokens.
        """
        return (
            gensim.utils.simple_preprocess(str(text), deacc=True)
            for text in sentences
        )

    # Materialize one token list per comment.
    data_words = [tokens for tokens in sent_to_words(small_df.comment_text)]

    # Show the first tokenized comment as a sanity check.
    print(data_words[:1])

[sent_to_words] start.
[['this', 'is', 'great', 'story', 'man', 'wonder', 'if', 'the', 'person', 'who', 'yelled', 'shut', 'the', 'fuck', 'up', 'at', 'him', 'ever', 'heard', 'it']]
[sent_to_words] done in 0.73 min.


In [7]:
with timer('bigram-trigram'):
    # Both phrase models are trained with the same detection settings.
    phrase_params = {'min_count': 5, 'threshold': 10}

    # The trigram model is trained on the corpus as transformed by the bigram model.
    bigram = gensim.models.Phrases(data_words, **phrase_params)
    trigram = gensim.models.Phrases(bigram[data_words], **phrase_params)

    # Phraser wrappers: a faster way to apply the learned phrases to a sentence.
    bigram_mod, trigram_mod = (
        gensim.models.phrases.Phraser(model) for model in (bigram, trigram)
    )

[bigram-trigram] start.
[bigram-trigram] done in 3.04 min.


In [8]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: iterable of documents; each one is passed through ``str()``
            and ``simple_preprocess`` before filtering.

    Returns:
        A list with one list of surviving tokens per input document.
    """
    # stop_words is a module-level list; a set gives constant-time membership
    # tests instead of scanning the whole list once per token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level bigram phraser ``bigram_mod`` to every document.

    Returns one transformed document per entry of ``texts``, in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Both phrasers are module-level objects. Returns one transformed document
    per entry of ``texts``, in input order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with allowed POS tags.

    POS tag reference: https://spacy.io/api/annotation

    Args:
        texts: iterable of token lists; each list is joined with single
            spaces and run through the module-level spaCy pipeline ``nlp``.
        allowed_postags: POS tags (compared against ``token.pos_``) to keep.
            The default is a tuple so it cannot be mutated between calls.

    Returns:
        A list with one list of lemmas (``token.lemma_``) per input document.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the texts through the pipeline in batches, which avoids
    # the per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [9]:
# NLTK Stop words
import nltk
from nltk.corpus import stopwords

# NLTK's English stop-word list followed by a few extra tokens to drop.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [10]:
# No spacy.load() call here: its return value was never stored, so it only cost
# load time. The pipeline used for lemmatization is loaded into `nlp` in the
# 'make-trigrams' cell.
# NOTE(review): `parser` is not referenced in the cells visible here; confirm it is still needed.
parser = English()

In [11]:
# NOTE(review): the timer label says 'trigram-lemmatization', but this cell only removes stop words.
with timer('trigram-lemmatization'):
    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

[trigram-lemmatization] start.
[trigram-lemmatization] done in 0.80 min.


## Save the simply preprocessed sentences

In [15]:
# Inspect the first document after stop-word removal.
data_words_nostops[:1]

[['great',
  'story',
  'man',
  'wonder',
  'person',
  'yelled',
  'shut',
  'fuck',
  'ever',
  'heard']]

In [22]:
# Rebuild one plain-text string per comment from its stop-word-filtered tokens.
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Build the detokenizer once and reuse it, instead of constructing a new
# instance for every sentence.
detokenizer = TreebankWordDetokenizer()
sentences = [detokenizer.detokenize(tokens) for tokens in data_words_nostops]

In [48]:
# Binary label: 1 where target >= 0.5, otherwise 0.
y = (small_df['target'].to_numpy() >= 0.5).astype(int)

In [65]:
# Pair each binary label with its detokenized comment text, one row per comment.
cleaned_data = pd.DataFrame({'y': y, 'comment_text': sentences})

In [66]:
# Preview the first rows of the label/text frame.
cleaned_data.head()

Unnamed: 0,y,comment_text
0,0,great story man wonder person yelled shut fuck...
1,1,yet call muslims acts get pilloried okay smear...
2,0,people drive cars ones cause wear tear roads p...
3,0,mormons complicated relationship federal law
4,0,thing


In [67]:
# Write the label/text frame to CSV without the index column.
cleaned_data.to_csv('cleaned_short_simple.csv', index=False)

## Continue preprocessing

In [35]:
with timer('make-trigrams'):
    # Form Trigrams: apply the bigram phraser and then the trigram phraser.
    # Calling make_bigrams here would leave the trained trigram model unused,
    # even though this result is named and saved as trigram tokens.
    data_words_trigrams = make_trigrams(data_words_nostops)
    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])

[make-trigrams] start.
[make-trigrams] done in 0.36 min.


In [40]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns ``None``.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

In [41]:
# Extra tokens to drop in addition to NLTK's English stop words.
additional_stops = ['&lt;', '&gt;', 'lt;#&gt', 'lt;', '&gt', 'know', 'just', 'txt', 'like', 'ok', 'come', 'want', 'did', 'got']
# Stop-word set used by prepare_text_for_lda_lemma.
en_stop = set(nltk.corpus.stopwords.words('english')) | set(additional_stops)
def prepare_text_for_lda_lemma(tokens):
    """Filter and lemmatize one tokenized document.

    Drops tokens of two characters or fewer and tokens found in ``en_stop``,
    then maps each remaining token through ``get_lemma``.
    """
    return [
        get_lemma(candidate)
        for candidate in tokens
        if len(candidate) > 2 and candidate not in en_stop
    ]

In [43]:
with timer('text_data'):
    # Filter and lemmatize every phrase-merged document, preserving order.
    text_data = [prepare_text_for_lda_lemma(doc_tokens) for doc_tokens in data_words_trigrams]

[text_data] start.
[text_data] done in 1.01 min.


In [44]:
# Number of processed documents.
len(text_data)

235087

In [45]:
import json

# Persist the processed token lists as JSON.
# NOTE(review): the file name ends in .csv, but the content written here is JSON.
with open('preprocess_trigram_token.csv', mode='w') as token_file:
    json.dump(text_data, token_file)

In [46]:
# Read the token lists back from the JSON file written in the previous cell.
with open('preprocess_trigram_token.csv', mode='r') as token_file:
    text_tokens = json.loads(token_file.read())