# Vectorizer Tuning

In [59]:
# Import libraries
import pandas as pd
import string
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [8]:
# Load the movie-review dataset from a pickled pandas object.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary
# code -- only load "reviews_3" if it comes from a trusted source.
data = pd.read_pickle("reviews_3")
data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lowercase the text.

In [35]:
# Build a normalized copy of each review: delete every character listed in
# string.punctuation, then lowercase. The raw `reviews` column is left untouched,
# so re-running this cell always produces the same `clean_reviews`.
# NOTE(review): punctuation is deleted, not replaced by a space, so tokens that
# were only separated by punctuation get fused ("glory--starring" -> "glorystarring").
punctuation_table = str.maketrans('', '', string.punctuation)
data['clean_reviews'] = (
    data['reviews']
    .str.translate(punctuation_table)
    .str.lower()
)

# Features (cleaned text) and labels (target column) for the model.
X = data.clean_reviews
y = data.target

## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [42]:
# Two-step pipeline: TF-IDF features feeding a multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])

# Search space, addressed as <step name>__<parameter>:
# two n-gram ranges for the vectorizer and two alpha values for the model.
parameters = dict(
    tfidf__ngram_range=((1, 1), (2, 2)),
    nb__alpha=(0.1, 1),
)

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
# refit=True retrains the best candidate once the search is finished.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 1),
                         'tfidf__ngram_range': ((1, 1), (2, 2))},
             scoring='accuracy', verbose=1)

In [43]:
# Parameter combination selected by the grid search (scoring='accuracy', cv=5)
grid_search.best_params_

{'nb__alpha': 0.1, 'tfidf__ngram_range': (2, 2)}

In [44]:
# Cross-validated accuracy obtained by the selected parameter combination
grid_search.best_score_

0.8394999999999999

In [49]:
# Pipeline whose first step is only a slot: the grid search fills 'vect'
# with each candidate vectorizer in turn.
pipeline = Pipeline(steps=[
    ('vect', None),  # placeholder for vectorizer
    ('nb', MultinomialNB()),
])

# Search space: the vectorizer itself is a hyperparameter, alongside its
# n-gram range and the alpha value of the Naive Bayes model.
parameters = dict(
    vect=[TfidfVectorizer(), CountVectorizer()],
    vect__ngram_range=((1, 1), (2, 2)),
    nb__alpha=(0.1, 1),
)

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', None),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 1),
                         'vect': [TfidfVectorizer(),
                                  CountVectorizer(ngram_range=(2, 2))],
                         'vect__ngram_range': ((1, 1), (2, 2))},
             scoring='accuracy', verbose=1)

In [50]:
# Parameter combination (including the chosen vectorizer) selected by the grid search
grid_search.best_params_

{'nb__alpha': 0.1,
 'vect': CountVectorizer(ngram_range=(2, 2)),
 'vect__ngram_range': (2, 2)}

In [51]:
# Cross-validated accuracy obtained by the selected parameter combination
grid_search.best_score_

0.8400000000000001

## Further cleaning: stop-word removal and lemmatization

In [56]:
# Build the stop-word lookup once; a frozenset gives constant-time membership
# tests instead of rebuilding and scanning a list for every token of every review.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text, stop_words=_ENGLISH_STOPWORDS):
    """Return `text` with English stop words removed.

    Parameters
    ----------
    text : str
        Text to filter; it is split into tokens with `word_tokenize`.
    stop_words : collection of str, optional
        Tokens to drop. Defaults to NLTK's English stop-word list.

    Returns
    -------
    str
        The remaining tokens, in their original order, joined by single spaces.
    """
    return " ".join(
        word for word in word_tokenize(text) if word not in stop_words
    )


# NOTE(review): punctuation was already stripped from `clean_reviews`, so
# contractions that lost their apostrophe may no longer match the stop-word list.
data['clean_reviews'] = data['clean_reviews'].apply(remove_stopwords)

In [60]:
def lemmatize_text(text):
    """Lemmatize every token of `text` and return them joined by single spaces.

    Tokens come from `word_tokenize`; each one is passed to
    `WordNetLemmatizer.lemmatize` with its default arguments.
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, word_tokenize(text))
    return " ".join(lemmas)

# Overwrite the cleaned column with its lemmatized version.
data['clean_reviews'] = data['clean_reviews'].apply(lemmatize_text)

In [61]:
# Inspect the frame: `clean_reviews` sits next to the raw `reviews` for comparison
data

Unnamed: 0,target,reviews,clean_reviews
0,neg,"plot : two teen couples go to a church party ,...",plot two teen couple go church party drink dri...
1,neg,the happy bastard's quick movie review \ndamn ...,happy bastard quick movie review damn y2k bug ...
2,neg,it is movies like these that make a jaded movi...,movie like make jaded movie viewer thankful in...
3,neg,""" quest for camelot "" is warner bros . ' firs...",quest camelot warner bros first featurelength ...
4,neg,synopsis : a mentally unstable man undergoing ...,synopsis mentally unstable man undergoing psyc...
...,...,...,...
1995,pos,wow ! what a movie . \nit's everything a movie...,wow movie everything movie funny dramatic inte...
1996,pos,"richard gere can be a commanding actor , but h...",richard gere commanding actor he always great ...
1997,pos,"glory--starring matthew broderick , denzel was...",glorystarring matthew broderick denzel washing...
1998,pos,steven spielberg's second epic film on world w...,steven spielberg second epic film world war ii...


In [62]:
# Re-bind features and labels: `clean_reviews` was reassigned above, so the
# previous X still points at the old column.
X = data['clean_reviews']
y = data['target']

In [63]:
# Pipeline with an empty vectorizer slot; the grid search supplies the
# candidate vectorizers through the 'vect' parameter.
pipeline = Pipeline(steps=[
    ('vect', None),  # placeholder for vectorizer
    ('nb', MultinomialNB()),
])

# Search space: vectorizer type, its n-gram range, and the Naive Bayes alpha.
parameters = dict(
    vect=[TfidfVectorizer(), CountVectorizer()],
    vect__ngram_range=((1, 1), (2, 2)),
    nb__alpha=(0.1, 1),
)

# Exhaustive 5-fold cross-validated search scored on accuracy, using all cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)

# X is whatever `data.clean_reviews` held when it was last bound.
grid_search.fit(X, y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vect', None),
                                       ('nb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'nb__alpha': (0.1, 1),
                         'vect': [TfidfVectorizer(), CountVectorizer()],
                         'vect__ngram_range': ((1, 1), (2, 2))},
             scoring='accuracy', verbose=1)

In [64]:
# Parameter combination (including the chosen vectorizer) selected by the grid search
grid_search.best_params_

{'nb__alpha': 1, 'vect': TfidfVectorizer(), 'vect__ngram_range': (1, 1)}

In [65]:
# Cross-validated accuracy obtained by the selected parameter combination
grid_search.best_score_

0.8215

⚠️ Please push the exercise once you are done 🙃

## 🏁 