In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

import sklearn.feature_extraction as ftex
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag of Words, Bag of Popcorn

This notebook works through the Kaggle [Bag of Words, Bag of Popcorn](https://www.kaggle.com/c/word2vec-nlp-tutorial/data) competition.

The goal is to apply NLP feature pre-processing (using scikit-learn and NLTK) to build the best possible classifier with a feature pipeline, and to use grid search to select the final model.

# 1. Importing Data

In [None]:
# Read the tab-separated labeled training set, then keep the raw review text
# in a separate "ogreview" column so it stays available after "review" is
# cleaned further down.
df = pd.read_csv("data/labeledTrainData.tsv", sep="\t")
df["ogreview"] = df["review"]


# 2. Data Cleaning 

In [2]:
# Literal substrings to substitute while cleaning, applied in insertion order
# by `cleanup`. "..." is listed ahead of ".." so longer runs of dots are
# collapsed first.
to_replace = {
    "<br /><br />": "",  # HTML line breaks embedded in the review text
    "...": ".",
    "..": ".",
    '"': "",
    "'s": "",
}
def cleanup(series, dic):
    """Apply every literal replacement in ``dic`` to a string Series.

    Parameters
    ----------
    series : pandas.Series
        Text values to clean (accessed through the ``.str`` accessor).
    dic : dict
        Maps each substring to its replacement. Entries are applied one after
        another in the dict's iteration order, with regex matching disabled.

    Returns
    -------
    pandas.Series
        The Series after all replacements have been applied.
    """
    for old, new in dic.items():
        series = series.str.replace(old, new, regex=False)
    return series
# cleanup(df.review, to_replace)[0]

# Pull the first, second-to-last and third-to-last "."-separated segments out
# of the review text (taken before lower-casing/cleanup). The split is done
# once and reused; reviews with too few segments get NaN for the missing ones.
sentences = df.review.str.split(".")
df["line1"] = sentences.str[0]
df["end1"] = sentences.str[-2]
df["end2"] = sentences.str[-3]

# Normalise the full review: lower-case, then apply the literal replacements.
df.review = df.review.str.lower()
df.review = cleanup(df.review, to_replace)

# Drop every character that is not an ASCII letter, digit or space from all
# text columns. This runs column-wise: the earlier row-wise apply (axis=1)
# built a Series per row, which gives the same values but is far slower.
text_cols = ["review", "line1", "end1", "end2"]
df[text_cols] = df[text_cols].apply(
    lambda col: col.str.replace('[^a-zA-Z0-9 ]', '', regex=True)
)


In [3]:

# Text handed to the vectorizer. Only the cleaned review is used; the variant
# that also appended the extracted sentences is kept for reference:
#df["text"]=df.review +" "+ df.line1+" "+df.end1+" "+df.end2
df["text"] = df["review"]

# 90% of the rows go to the cross-validated search, 10% are held out.
x, x_holdout, y, y_holdout = train_test_split(
    df["text"],
    df["sentiment"],
    train_size=0.9,
    random_state=0,
)

# TF-IDF features with English stop words removed and document-frequency
# bounds of min_df=5 / max_df=0.9.
idf = ftex.text.TfidfVectorizer(
    stop_words="english",
    min_df=5,
    max_df=0.9,
    strip_accents="ascii",
)
# vect=CountVectorizer(stop_words="english", min_df=5, max_df=0.9, strip_accents="ascii")

# Running logs filled in by the modeling cell: estimator labels and CV scores.
e = []
s = []




# 3. Modeling

In [52]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD

# Model pipeline: TF-IDF features followed by one classifier. The alternative
# steps that were tried are left commented out so they can be swapped back in.
pipeline = Pipeline(
    steps=[
        ("vect", idf),
        # ('darth', add_vader),

        ("clf", MultinomialNB()),
        # ('knn', KNeighborsClassifier()),
        # ('dtree', DecisionTreeClassifier()),
        # ('line_svc', SVC(kernel="linear")), #too slow

        # ('tsvd', TruncatedSVD()), #for svc models
        # ('svc', SVC()),

        # ('randomf', RandomForestClassifier())
    ]
)

# pipeline.fit(x, y)
# pipeline.score(x_holdout, y_holdout)

# Search space for GridSearchCV, keyed as "<pipeline step>__<parameter>".
# Every entry is currently commented out, so the grid is empty and the search
# evaluates the pipeline once with its default settings.
params_grid = {
    # -- vectorizer --
    # 'vect__min_df' : [2], #np.arange(2,6)
    # 'vect__ngram_range': [(1,2)], #(1,2) 0.871

    # -- classifiers --
    # 'clf__alpha': [1], #[0.1, 1, 10, 100]
    # 'knn__n_neighbors': [8], #range(5,10)
    # 'dtree__random_state':[0]
    # 'line_svc__C': [ 0.025] #too slow

    # -- SVD + SVC --
    # 'tsvd__n_components': [200], #for svc models
    # 'svc__gamma': [2],
    # 'svc__C' :[1]
}

# 5-fold cross-validated grid search over `params_grid`, scored by accuracy.
grid = GridSearchCV(pipeline, param_grid=params_grid, scoring='accuracy', cv=5, n_jobs=5, verbose=3)
grid.fit(x, y)

print(grid.best_params_)
print(grid.best_score_)

# Log the winning model next to its CV score. The label is taken from the
# final pipeline step itself rather than from line 3 of the pipeline's printed
# repr: that line depends on how the repr happens to wrap, so it could name a
# non-final step (or not exist at all) for other pipeline layouts.
final_step = grid.best_estimator_.steps[-1]
e.append(repr(final_step))
s.append(grid.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
{}
0.8617333333333332


In [53]:
# Score the fitted search on the held-out split (x_holdout / y_holdout were set
# aside by train_test_split and never seen during grid.fit).
grid.score(x_holdout, y_holdout)

0.8736

In [31]:
import sys


# Always import NLTK and build the analyzer. Previously this whole section was
# skipped whenever `nltk` was already present in sys.modules, which left `sid`
# undefined and made `vader` fail with NameError.
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Only the lexicon download is conditional: fetch it when it is not on disk.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()
    
def vader(x):
    """Score every review in ``x`` with the module-level VADER analyzer ``sid``.

    Parameters
    ----------
    x : pandas.Series
        Review texts; each value is passed to ``sid.polarity_scores``.

    Returns
    -------
    pandas.DataFrame
        One row per review (same index as ``x``) and one column per key of
        the score dictionaries returned by the analyzer.
    """
    # Build the frame in one step from the list of score dicts. Expanding a
    # Series of dicts with ``.apply(pd.Series)`` yields the same table but
    # constructs a Series per row, which is very slow on many reviews.
    records = [sid.polarity_scores(review) for review in x]
    scores = pd.DataFrame(records, index=x.index)
    return scores



class add_vader:
    """Pipeline step that appends VADER sentiment scores as extra feature columns.

    The step receives an already-vectorized feature matrix, so it cannot see
    the raw text. It looks the text up in the notebook-level ``x`` /
    ``x_holdout`` Series by matching the number of rows.

    Limitation: when the row count matches neither Series (for example the
    cross-validation folds inside ``GridSearchCV``) the input is returned
    unchanged and no sentiment columns are added.
    """

    def __init__(self) -> None:
        # __init__ must return None; the earlier ``return self`` made every
        # ``add_vader()`` call raise TypeError.
        pass

    def get_params(self, deep=True):
        """Return this step's parameters (none), so sklearn can clone it."""
        return {}

    def set_params(self, **params):
        """Accept no parameters; present so sklearn can treat this as an estimator."""
        if params:
            raise ValueError(f"add_vader takes no parameters, got {sorted(params)}")
        return self

    def fit(self, input, y=None):
        """No-op fit; ``y`` is accepted because Pipeline calls ``fit(X, y)``."""
        print("Fit called")
        return self  # returning self lets the pipeline chain ``fit(...).transform(...)``

    def transform(self, input):
        """Return ``input`` with the VADER score columns stacked on the right.

        ``input`` is returned untouched when its row count matches neither
        ``x`` nor ``x_holdout``.
        """
        # Local import: ``hstack`` was never imported anywhere in the notebook.
        from scipy.sparse import csr_matrix, hstack

        n_rows = input.shape[0]
        if n_rows == x.shape[0]:
            vader_scores = vader(x)
        elif n_rows == x_holdout.shape[0]:
            vader_scores = vader(x_holdout)
        else:
            return input

        # Convert the score table to a sparse block explicitly so it can be
        # stacked next to the (typically sparse) vectorizer output.
        score_block = csr_matrix(vader_scores.to_numpy(dtype=float))
        return hstack((input, score_block))




#vader(x_holdout)

In [42]:
# Score the training reviews with VADER.
vd = vader(x)

# Shift the compound score by +1 and halve it, which maps values in [-1, 1]
# onto [0, 1].
vd["compound"] = (vd["compound"] + 1) / 2

#0.7 accuracy score

# 4. Comparing the Different Models

In [None]:
# Recorded scores per model. Each key is the estimator line as it appeared in
# the fitted pipeline's printed repr (leading indentation included).
estim = {
    "                ('clf', MultinomialNB(alpha=1))])": 0.876,
    "                ('knn', KNeighborsClassifier(n_neighbors=8))])": 0.7891111111111111,
    "                ('dtree', DecisionTreeClassifier(random_state=0))])": 0.7053777777777778,
    "                ('tsvd', TruncatedSVD(n_components=200)),": 0.8637777777777776,
    "                ('randomf', RandomForestClassifier())])": 0.8478666666666667,
}