In [1]:
import re
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Make sure the NLTK stop-word corpus is available locally before
# nltk_stopwords.words(...) is called further down in this script.
nltk.download('stopwords')

class StemTokenizer:
    """Callable tokenizer that removes stop words and stems what is left.

    An instance can be called with a document string and returns a list of
    stemmed tokens, which makes it usable wherever a ``tokenizer`` callable
    is expected (this file passes it as ``vect__tokenizer``).

    Attributes:
        tokenizer: compiled regex matching runs of two or more word
            characters; single-character tokens are never produced.
        stopwords: set of tokens to discard (exact, case-sensitive match).
        ps: English Snowball stemmer applied to every surviving token.
    """

    def __init__(self, stopwords):
        """Build the tokenizer.

        Args:
            stopwords: iterable of words to drop; copied into a set so
                membership tests are O(1).
        """
        # Raw string: "\w" in a normal literal is an invalid escape sequence
        # and triggers a DeprecationWarning/SyntaxWarning on modern Python.
        self.tokenizer = re.compile(r"\w\w+")
        self.stopwords = set(stopwords)
        self.ps = SnowballStemmer('english')

    def __call__(self, doc):
        """Split ``doc`` into tokens, drop stop words, and stem the rest.

        Args:
            doc: the text to tokenize.

        Returns:
            List of stemmed tokens in document order. Stop words are
            filtered *before* stemming, so the comparison is made against
            the raw token text.
        """
        tokens = self.tokenizer.findall(doc)
        return [self.ps.stem(t) for t in tokens if t not in self.stopwords]

    def __repr__(self):
        """Return a short, stable description instead of a memory address."""
        return "{}(stopwords=<{} words>)".format(
            type(self).__name__, len(self.stopwords)
        )

# Two-stage text-classification pipeline: TF-IDF features -> classifier.
pipeline = Pipeline(
    steps=[
        ("vect", TfidfVectorizer()),
        ("clf", LinearSVC()),
    ]
)

# Two independent sub-grids: the first varies only the tokenizer (custom
# stemming tokenizer vs. the vectorizer's default), the second varies only
# the classifier. They are searched separately, not as a cross product.
tokenizer_grid = {
    "vect__tokenizer": [StemTokenizer(nltk_stopwords.words("english")), None],
}
classifier_grid = {
    "clf": [LinearSVC(), RandomForestClassifier(), MLPClassifier()],
}
param_grid = [tokenizer_grid, classifier_grid]

grid_search_cv = GridSearchCV(estimator=pipeline, param_grid=param_grid)

# Load the data and pick out the text column and the label column.
dataset = pd.read_csv("data/our_dataset.csv")
X = dataset.loc[:, "reviewText"]
y = dataset.loc[:, "overall"]

# Run the search and dump the raw per-candidate results.
grid_search_cv.fit(X, y)
print(grid_search_cv.cv_results_)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rishabhtiwari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'mean_fit_time': array([2.03970613, 0.21286254, 0.2120625 , 1.01177521, 9.25585032]), 'std_fit_time': array([0.04141676, 0.00637323, 0.00502842, 0.01320117, 0.79885224]), 'mean_score_time': array([0.50517521, 0.04158273, 0.04055715, 0.06107683, 0.04522324]), 'std_score_time': array([0.03452487, 0.00271368, 0.00217356, 0.00257197, 0.00236044]), 'param_vect__tokenizer': masked_array(data=[<__main__.StemTokenizer object at 0x7f9cebe32820>,
                   None, --, --, --],
             mask=[False, False,  True,  True,  True],
       fill_value='?',
            dtype=object), 'param_clf': masked_array(data=[--, --, LinearSVC(), RandomForestClassifier(),
                   MLPClassifier()],
             mask=[ True,  True, False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'vect__tokenizer': <__main__.StemTokenizer object at 0x7f9cebe32820>}, {'vect__tokenizer': None}, {'clf': LinearSVC()}, {'clf': RandomForestClassifier()}, {'clf': MLPClassifier()}], 