In [17]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

import string
import numpy as np
import pandas as pd

# import data

In [2]:
df = pd.read_csv('../raw_data/nlp/reviews.csv')
df.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


# text pre-cleaning

In [11]:
# basic cleaning

def basic_cleaning(sentence):
    
    # remove punctuation
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '')
    
    # strip sentences
    sentence = sentence.strip()
    
    # lowercase
    sentence = sentence.lower()
    
    # remove digits
    sentence = ''.join(char for char in sentence if not char.isdigit())
    
    # tokenize
    sentence = word_tokenize(sentence)
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    without_stop = [word for word in sentence if not word in stop_words]
    
    return ' '.join(without_stop)

In [12]:
df['clean_reviews'] = df['reviews'].apply(basic_cleaning)
df.head()

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couples go church party drink dr...
1,neg,the happy bastard's quick movie review \ndamn ...,happy bastards quick movie review damn yk bug ...
2,neg,it is movies like these that make a jaded movi...,movies like make jaded movie viewer thankful i...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest camelot warner bros first featurelength ...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis mentally unstable man undergoing psyc...


# bag-of-words modeling

In [13]:
# vectorize
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df.clean_reviews)
y = df.target

In [14]:
model = MultinomialNB()
model.fit(X,y)
model.score(X,y)

0.978

# N-gram model

In [15]:
vectorizer = CountVectorizer(ngram_range=(2,2))
X = vectorizer.fit_transform(df.clean_reviews)
y = df.target

model = MultinomialNB()
model.fit(X,y)
model.score(X,y)

1.0

# Model tuning

## countvectorizer

In [22]:
# pipeline

pipeline = Pipeline([
    ('vector',CountVectorizer()),
    ('nb',MultinomialNB())
])

# param_grid

params = {
    'vector__ngram_range':((1,1),(2,2)),
    'nb__alpha':(0.1,1.0)
}

# grid_search

search = GridSearchCV(pipeline,params,cv=5,n_jobs=-1,
                      verbose=1,scoring='accuracy',
                      refit=True)

# fit

X = df.reviews
y = df.target
search.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    8.3s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vector', CountVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 1.0),
                         'vector__ngram_range': ((1, 1), (2, 2))},
             scoring='accuracy', verbose=1)

In [25]:
# to get valid params
pipeline.get_params();

# best_params
search.best_params_

# best_score
search.best_score_

0.8320000000000001

## tfidfvectorizer

In [26]:
# pipeline

pipeline = Pipeline([
    ('vector',TfidfVectorizer()),
    ('nb',MultinomialNB())
])

# param_grid

params = {
    'vector__ngram_range':((2,2),(3,3)),
    'nb__alpha':(0.1,0.2,0.3,0.4)
}

# grid_search

search = GridSearchCV(pipeline,params,cv=5,n_jobs=-1,
                      verbose=1,scoring='accuracy',
                      refit=True)

# fit

X = df.reviews
y = df.target
search.fit(X,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   31.7s finished


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vector', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 0.2, 0.3, 0.4),
                         'vector__ngram_range': ((2, 2), (3, 3))},
             scoring='accuracy', verbose=1)

In [28]:
# best_params
search.best_params_

# best_score
search.best_score_

0.836