In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression



In [2]:
# Row cap used while developing, to keep computation times manageable.
# Set to None once the model is ready, to train on the full dataset.
DEV_SAMPLE_SIZE = 5000

df = pd.read_csv('IMDB Dataset.csv')

# Binary target: 1 for 'positive' reviews, 0 for anything else.
# A vectorised comparison replaces the per-row Python lambda.
df['label'] = (df.sentiment == 'positive').astype('int64')

if DEV_SAMPLE_SIZE is not None:
    df = df[:DEV_SAMPLE_SIZE]

df.head(20)

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
5,"Probably my all-time favorite movie, a story o...",positive,1
6,I sure would like to see a resurrection of a u...,positive,1
7,"This show was an amazing, fresh & innovative i...",negative,0
8,Encouraged by the positive comments about this...,negative,0
9,If you like original gut wrenching laughter yo...,positive,1


In [3]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the reviews for evaluation. The fixed random_state makes the
# split (and every number derived from it) reproducible across kernel restarts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df.review, df.label, test_size=0.30, random_state=42
)

### Token Exploration

In [4]:
from nltk.stem.snowball import SnowballStemmer

def preprocess_document(doc):
    """Return the pre-processed form of a single document.

    Placeholder implementation: no pre-processing steps are applied yet, so
    the document is handed back unchanged.

    Parameters
    ----------
    doc
        The raw document to process.

    Returns
    -------
    The very same object that was passed in.
    """
    # TODO: add our pre-processing steps here.
    return doc

# Run every raw review through preprocess_document, keeping the train and
# test splits separate. Both results are plain Python lists.
X_train_preprocessed = list(map(preprocess_document, X_train_raw))
X_test_preprocessed = list(map(preprocess_document, X_test_raw))

### Pre-processing

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [5]:
from nltk.corpus import stopwords 

# NLTK's English stop-word list, de-duplicated.
stop_words = set(stopwords.words('english'))

# Bag-of-words vocabulary over unigrams and bigrams.
cv = CountVectorizer(
    # Recent scikit-learn releases validate this parameter and accept only
    # 'english', a list, or None -- a set is rejected -- so pass a list.
    # Sorting keeps the order deterministic.
    stop_words=sorted(stop_words),
    min_df=0.01,  # float => drop terms found in fewer than 1% of documents
    max_df=0.9,   # float => drop terms found in more than 90% of documents
    ngram_range=(1, 2),
)
# The vocabulary is learned from the training split only.
cv.fit(X_train_preprocessed)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.9, max_features=None, min_df=0.01,
                ngram_range=(1, 2), preprocessor=None,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [6]:
# Encode both splits as document-term count matrices, using the vocabulary
# that `cv` was already fitted with (train first, then test).
X_train, X_test = (
    cv.transform(documents)
    for documents in (X_train_preprocessed, X_test_preprocessed)
)

In [7]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(cv, 'get_feature_names_out'):
    vocabulary_terms = cv.get_feature_names_out()
else:
    vocabulary_terms = cv.get_feature_names()

# Column-wise sum of the document-term matrix: one total per vocabulary term,
# flattened from a 1 x n_terms result into a 1-D array.
token_totals = np.asarray(X_train.sum(axis=0)).ravel()

# The 50 terms with the largest totals in the training split.
(
    pd.DataFrame({'token': vocabulary_terms, 'records': token_totals})
    .sort_values('records', ascending=False)
    .head(50)
)

Unnamed: 0,token,records
193,br,14226
198,br br,7124
1060,movie,6347
589,film,5552
1145,one,3760
918,like,2782
694,good,2103
501,even,1752
1806,would,1740
1634,time,1725


### Choose a model

In [8]:
### model.fit(X_train, y_train) ...



### Evaluate Results

In [9]:
### model.score(X_test, y_test) ...



### How quick NLP can be...

Bear in mind, though, that this shortcut hides the complexity of data preparation and preprocessing,
and will ultimately result in a poor model...

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [13]:
# NOTE: this cell rebinds df, X_train, X_test, y_train and y_test, replacing
# the objects of the same names created earlier in the notebook.
df = pd.read_csv('IMDB Dataset.csv')[:10000]

# Binary target (1 = 'positive', 0 = anything else) via a vectorised
# comparison, and a fixed random_state so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.review,
    (df.sentiment == 'positive').astype('int64'),
    test_size=0.30,
    random_state=42,
)

# Raw text in, prediction out: vectorisation and classification in one estimator.
pipeline = Pipeline(steps=[
    ('Vectorizer', CountVectorizer(stop_words='english', min_df=0.01, max_df=0.9, ngram_range=(1, 2))),
    ('Classifier', LogisticRegression(solver='newton-cg')),
]).fit(X_train, y_train)

# Mean accuracy on the held-out 30%.
pipeline.score(X_test, y_test)

0.8263333333333334

In [12]:
# Look the steps up by the names given when the pipeline was built rather
# than by position.
vectorizer = pipeline.named_steps['Vectorizer']
classifier = pipeline.named_steps['Classifier']

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    tokens = vectorizer.get_feature_names_out()
else:
    tokens = vectorizer.get_feature_names()

# One coefficient per token, ordered from most negative to most positive.
# A stable sort plus a fresh 0..n-1 index reproduces the ordering and
# numbering that sorting the (token, coef) pairs produced.
(
    pd.DataFrame({'token': tokens, 'coef': classifier.coef_[0]})
    .sort_values('coef', kind='mergesort')
    .reset_index(drop=True)
)

Unnamed: 0,token,coef
0,waste,-2.819026
1,awful,-2.466558
2,worst,-1.985432
3,avoid,-1.900649
4,cheap,-1.837641
...,...,...
1684,excellent,1.741983
1685,don want,1.752338
1686,beautifully,1.869398
1687,perfectly,2.009997
