# Vectorizer Tuning

In [1]:
import pandas as pd

data = pd.read_csv("reviews.csv")

data.head()

Unnamed: 0,target,reviews
0,neg,"plot : two teen couples go to a church party ,..."
1,neg,the happy bastard's quick movie review \ndamn ...
2,neg,it is movies like these that make a jaded movi...
3,neg,""" quest for camelot "" is warner bros . ' firs..."
4,neg,synopsis : a mentally unstable man undergoing ...


The dataset is made up of positive and negative movie reviews.

## Preprocessing

👇 Remove punctuation and lower case the text.

In [2]:
def clean_text():
    from nltk.tokenize import RegexpTokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    ser = []
    for d in data['reviews']:
        m = tokenizer.tokenize(d)
        m = [' '.join(m)]
        ser.append(m)
    df1 = pd.DataFrame(ser,columns=['sub'])
    return df1

In [4]:
clean_text()
data['clean_text'] = clean_text()
data = data.drop(columns=['reviews'], axis=1)
data

Unnamed: 0,target,clean_text
0,neg,plot two teen couples go to a church party dri...
1,neg,the happy bastard s quick movie review damn th...
2,neg,it is movies like these that make a jaded movi...
3,neg,quest for camelot is warner bros first feature...
4,neg,synopsis a mentally unstable man undergoing ps...
...,...,...
1995,pos,wow what a movie it s everything a movie can b...
1996,pos,richard gere can be a commanding actor but he ...
1997,pos,glory starring matthew broderick denzel washin...
1998,pos,steven spielberg s second epic film on world w...


## Tuning

👇 Tune a vectorizer of your choice (or try both!) and a MultinomialNB model simultaneously.

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import os, json, codecs, nltk  
import numpy as np  
from sklearn.feature_extraction.text import TfidfVectorizer,  CountVectorizer,TfidfTransformer  
from sklearn.model_selection import GridSearchCV  
from time import time  
# Create Pipeline
pipeline = Pipeline([('vec', CountVectorizer(encoding='cp874', preprocessor=data.clean_text, token_pattern=None)),('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
parameters = {  
'vec__max_df': (0.5, 0.625, 0.75, 0.875, 1.0),  
'vec__max_features': (None, 5000, 10000, 20000),  
'vec__min_df': (1, 5, 10, 20, 50),  
'tfidf__use_idf': (True, False),  
'tfidf__sublinear_tf': (True, False),  
'vec__binary': (True, False),  
'tfidf__norm': ('l1', 'l2'),  
'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001)  
}  
# Set parameters to search (model and vectorizer)
X = data.clean_text
y = data.target
if __name__ == "__main__":  
        grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=2)  
        t0 = time()  
        grid_search.fit(X, y)  
        print("done in {0}s".format(time() - t0))  
        print("Best score: {0}".format(grid_search.best_score_))  
        print("Best parameters set:")  
        best_parameters = grid_search.best_estimator_.get_params()  
        for param_name in sorted(list(parameters.keys())):  
            print("\t{0}: {1}".format(param_name, best_parameters[param_name]))


Fitting 5 folds for each of 9600 candidates, totalling 48000 fits


KeyboardInterrupt: 

⚠️ Please push the exercise once you are done 🙃

## 🏁 