In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

import re
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv('datasets/googleplaystore_user_reviews.csv')
data.dropna(inplace=True)

In [3]:
# data['Sentiment'] = data['Sentiment'].map({'Positive': 1, 'Neutral': 0, 'Negative': -1}).astype(int)
X = data['Translated_Review']
Y = data['Sentiment']

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [5]:

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [6]:
text_clf.fit(x_train, y_train)  

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [7]:
y_pred = text_clf.predict(x_test)

In [8]:
np.mean(y_pred == y_test)

0.6840769436281058

In [9]:
text_clf.predict(['This is a good thing'])

array(['Positive'], dtype='<U8')

In [10]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred,
    target_names=y_test))
                                        
metrics.confusion_matrix(y_test, y_pred)

             precision    recall  f1-score   support

   Positive       0.96      0.18      0.31      1653
    Neutral       1.00      0.04      0.07      1049
   Negative       0.67      1.00      0.80      4784

avg / total       0.78      0.68      0.59      7486



  .format(len(labels), len(target_names))


array([[ 302,    0, 1351],
       [   8,   38, 1003],
       [   3,    0, 4781]])

In [11]:
from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import RandomizedSearchCV

# from scipy.stats import randint as sp_randint
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# parameters = {"max_depth": [3, None],
#               "max_features": sp_randint(1, 11),
#               "min_samples_split": sp_randint(2, 11),
#               "bootstrap": [True, False],
#               "criterion": ["gini", "entropy"]}

In [12]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)

In [13]:
gs_clf.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (0.01, 0.001)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [14]:
y_pred = gs_clf.predict(x_test)
np.mean(y_pred == y_test)

0.8072401816724553