# NLP Classification

Workflow for supervised learning on text data

In [252]:
import pandas as pd

from sklearn.pipeline import Pipeline

# ignore warning messages (sklearn has a ton)
import warnings
warnings.filterwarnings('ignore')

In [3]:
# combine all daasets
usc_df = pd.read_csv('../data/usc.csv')
stats_df = pd.read_csv('../data/statistics.csv')
office_df = pd.read_csv('../data/dundermifflin.csv')
overwatch_df = pd.read_csv('../data/overwatch.csv')
dutch_df = pd.read_csv('../data/dutch.csv')

In [13]:
all_df = pd.concat([office_df, overwatch_df])

In [14]:
print(all_df.shape)
all_df.head()

(89241, 6)


Unnamed: 0.1,Unnamed: 0,title,id,subreddit,body,comment
0,0,Should I call you Jimothy?,ay2o5j,DunderMifflin,,I read somewhere that most people who think th...
1,1,Should I call you Jimothy?,ay2o5j,DunderMifflin,,I got Oscar Martinez... Michael am I gay?
2,2,Should I call you Jimothy?,ay2o5j,DunderMifflin,,That is correct.
3,3,Should I call you Jimothy?,ay2o5j,DunderMifflin,,Am I the only one who took slight pride in get...
4,4,Should I call you Jimothy?,ay2o5j,DunderMifflin,,You got: Creed Bratton\nYou're very mysterious...


In [15]:
all_df.subreddit.value_counts()

Overwatch        47774
DunderMifflin    41467
Name: subreddit, dtype: int64

# Preprocess Text

As a first step, we'll tokenize the documents, remove stopwords, etc.

Gensim requires that we create a `gensim.corpora.Dictionary` object for our text.
This object expects a list of lists, (each nested list itself a list of tokens), so we'll ensure our preprocessing functions output our data as such:

In [159]:
import re
def clean_token(token):
    c_token = re.sub("[^A-Za-z']+", ' ', str(token))
    # lower-case and strip whitespace
    c_token = c_token.lower().strip()
    # remove stopwords
    if c_token in stop_words:
        return ''
    return c_token

def clean_comment(comment):
    if not isinstance(comment, str):
        return ['']
    cleaned_comment = [
        clean_token(token) 
        for token in comment.split()
    ]
    # remove non-empty strings
#     cleaned_comment = [com for com in cleaned_comment 
#                       if com != '']
    return cleaned_comment

stop_words = ['fuck', 'hey']

In [211]:
all_df['clean_comment'] = all_df['comment'].apply(lambda x: ' '.join(clean_comment(x)))

In [212]:
all_df['clean_comment']

0        i read somewhere that most people who think th...
1                    i got oscar martinez michael am i gay
2                                          that is correct
3        am i the only one who took slight pride in get...
4        you got creed bratton you're very mysterious a...
5        damn i just took a test and i'm a phyllis i gu...
6        god i just took the quiz for the first time ev...
7                                identity theft is a crime
8                                          daryll  love it
9        if you take a quiz and get dwight and keep ret...
10                                                 or ryan
11       i just took it and apparently i m pam makes sense
12             lmao this  year old woman is stanley hudson
13                                     apparently i m andy
14       you re even more of a michael if you keep reta...
15       apparently i'm a meredith i'm not sure how i f...
16       if you take a what character from the office a.

## Create embeddings

Before throwing our into ml models, we need to create embeddings.
This can be done in several ways, with complexity differening depending on the desired metod

In [163]:


from gensim.sklearn_api import TfIdfTransformer

from gensim.corpora import Dictionary

from sklearn.linear_model import SGDClassifier

from gensim.sklearn_api import LdaSeqTransformer

In [164]:
from sklearn.model_selection import train_test_split

In [194]:
# def make_label(x):
#     if x == 'Overwatch':
#         return 1
#     else:
#         return 0

# all_df['label'] = all_df['subreddit'].apply(lambda x: make_label(x))

In [226]:
# all_df['com'] = all_df['clean_comment'].apply(lambda x: ' '.join(x))
X = all_df['clean_comment']
y = all_df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## Basic Example: sklearn

In [233]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# init model w/ default params
tfidf  = TfidfVectorizer()

corpus_train = tfidf.fit_transform(X_train)
corpus_test = tfidf.transform(X_test)

In [234]:
from sklearn.ensemble import RandomForestClassifier

# select classifier, and fit
rf = RandomForestClassifier()
rf.fit(corpus_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [235]:
# test model and view accuracy
from sklearn.metrics import accuracy_score
y_preds = rf.predict(corpus_test)
accuracy_score(y_preds, y_test)

0.7612146565569791

## sklearn pipelines

Idea: throw everything together into one object. Treat this object itself as a single classifier.

In [236]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

def create_pipeline(vectorizer, estimator, reducer=False):
    """
    Create pipeline with optional dimensionality-reduction.
    """
    steps = [
        ('vectorizer', vectorizer)
    ]
    if reducer:
        steps.append(('reducer', TruncatedSVD()))
    steps.append(('classifier', estimator))
    return Pipeline(steps)


In [240]:
pipe = create_pipeline(CountVectorizer(), SGDClassifier(), reducer=False)

pipe.fit(X_train, y_train)
y_preds = pipe.predict(X_test)
print(accuracy_score(y_preds, y_test))



0.7927389534232249


Defining a pipeline allows us to iterate through many different model + vector combinations.
Also, the code becomes more concise, and more expressive

In [249]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

models = []
for vectorizer in (CountVectorizer(), TfidfVectorizer()):
    for estimator in (LogisticRegression, SGDClassifier, RandomForestClassifier):
        models.append(create_pipeline(vectorizer, estimator(), reducer=False))
        models.append(create_pipeline(vectorizer, estimator(), reducer=True))

print(models[1:4])

[Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]), Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...m_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))]), Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=

In [253]:
scores = []
for model in models:
    model_name = str(type(model.named_steps['classifier'])).split('.')[-1]
    
    if 'reducer' in model.named_steps:
        acc_print = 'Accuracy of {} with dimensionality reduction: {}'
    else:
        acc_print = 'Accuracy of {} without dimensionality reduction: {}'
        
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    scores.append(accuracy)
    print(acc_print.format(model_name, accuracy))

Accuracy of LogisticRegression'> without dimensionality reduction: 0.8130952825607889
Accuracy of LogisticRegression'> with dimensionality reduction: 0.5350913233481492
Accuracy of SGDClassifier'> without dimensionality reduction: 0.792440144922123
Accuracy of SGDClassifier'> with dimensionality reduction: 0.5350913233481492
Accuracy of RandomForestClassifier'> without dimensionality reduction: 0.7626713479998506
Accuracy of RandomForestClassifier'> with dimensionality reduction: 0.5468569080790349
Accuracy of LogisticRegression'> without dimensionality reduction: 0.812385612370672
Accuracy of LogisticRegression'> with dimensionality reduction: 0.5350913233481492
Accuracy of SGDClassifier'> without dimensionality reduction: 0.796362006499085
Accuracy of SGDClassifier'> with dimensionality reduction: 0.5350913233481492
Accuracy of RandomForestClassifier'> without dimensionality reduction: 0.7576663056063945
Accuracy of RandomForestClassifier'> with dimensionality reduction: 0.5257162066

## A bit more advanced - using embeddings not provided by sklearn

Let's say you want to embed your text-data using a different embedding method, one not provided by scikit-learn. However, you also still want to be able to use those awesome `sklearn.Pipeline` objects. 

What can ya do???

Wrap your own sklearn transformer!

In [193]:
# fit dictionary to training data
dictionary = Dictionary(X_train)

# create bag-of-words for training & testing data
corpus = [dictionary.doc2bow(text) for text in X_train]
corpus_test = [dictionary.doc2bow(text) for text in X_test]

TypeError: doc2bow expects an array of unicode tokens on input, not a single string

In [168]:
tfidf = TfIdfTransformer(id2word=dictionary)

In [169]:
all_df.isnull().sum()

Unnamed: 0           0
title                0
id                   0
subreddit            0
body             85847
comment              1
clean_comment        0
label                0
dtype: int64

In [170]:
lda = LdaSeqTransformer(id2word=dictionary, num_topics=5)

In [171]:
tfidf.fit(corpus)

TfIdfTransformer(dictionary=None,
         id2word=<gensim.corpora.dictionary.Dictionary object at 0x000002463519D208>,
         normalize=True, pivot=None, slope=0.65, smartirs='ntc',
         wglobal=<function df2idf at 0x00000246098F8C80>,
         wlocal=<function identity at 0x000002460DA759D8>)

In [172]:
tcorp = tfidf.transform(corpus)

In [179]:
tc2 = [i for i in tcorp if len(i) > 0]
print(len(tc2))
print(len(y_train[3:]))

62465
62465


In [152]:
sgd = SGDClassifier()

In [181]:
sgd.fit(tc2, y_train[3:])



ValueError: setting an array element with a sequence.

In [124]:
all_df.head()

Unnamed: 0.1,Unnamed: 0,title,id,subreddit,body,comment,clean_comment,label
0,0,Should I call you Jimothy?,ay2o5j,DunderMifflin,,I read somewhere that most people who think th...,"[i, read, somewhere, that, most, people, who, ...",0
1,1,Should I call you Jimothy?,ay2o5j,DunderMifflin,,I got Oscar Martinez... Michael am I gay?,"[i, got, oscar, martinez, michael, am, i, gay]",0
2,2,Should I call you Jimothy?,ay2o5j,DunderMifflin,,That is correct.,"[that, is, correct]",0
3,3,Should I call you Jimothy?,ay2o5j,DunderMifflin,,Am I the only one who took slight pride in get...,"[am, i, the, only, one, who, took, slight, pri...",0
4,4,Should I call you Jimothy?,ay2o5j,DunderMifflin,,You got: Creed Bratton\nYou're very mysterious...,"[you, got, creed, bratton, you're, very, myste...",0
