In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # Plotting labelled data
from nltk.corpus import stopwords # dealing with stop words

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

In [3]:
df = pd.read_csv("C:/Users/Jatin/Downloads/Major Project/Dataset/Sentiment Analysis/HotelReview1to5.csv", encoding = "ISO-8859-1")
df.head()

Unnamed: 0,S.No.,Review,Rating
0,1,nice hotel expensive parking got good deal sta...,4
1,2,ok nothing special charge diamond member hilto...,2
2,3,nice rooms not 4* experience hotel monaco seat...,3
3,4,unique \tgreat stay \twonderful time hotel mon...,5
4,5,great stay great stay \twent seahawk game awes...,5


In [4]:
df.drop("S.No.", axis = 1, inplace = True)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,unique \tgreat stay \twonderful time hotel mon...,5
4,great stay great stay \twent seahawk game awes...,5


In [5]:
grouped = df.groupby("Rating")
equal_df = pd.DataFrame(columns = ["Review", "Rating"])

for name, group in grouped:
    group = group.sample(n = 1421)
    equal_df = equal_df.append(group)
    
equal_df.reset_index(inplace = True, drop = True)
equal_df.head()

Unnamed: 0,Review,Rating
0,shame magnificent building lousy caretakers do...,1
1,worst inclusive resort punta cana \tresort not...,1
2,inclusive runs guaranteed \tparty 6 77 yr. old...,1
3,not return park plaza extremely unhappy stay p...,1
4,not families pictures expedia san juan web sit...,1


In [6]:
equal_df["Rating"].value_counts()

5    1421
4    1421
3    1421
2    1421
1    1421
Name: Rating, dtype: int64

## Splitting dataset

In [7]:
xtrain, xtest, ytrain, ytest = train_test_split(equal_df.Review, equal_df.Rating, test_size = 0.25, random_state = 42)
ytrain = ytrain.astype('int')
ytest = ytest.astype('int')

## Preprocessing function

In [8]:
import re
import string
table = str.maketrans("", "", string.punctuation + "\t")

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def preprocess(text):
    # removing punctuations
    no_punc = text.translate(table)
    
    # removing numbers
    no_num = re.sub("[0-9]+", "", no_punc)
    
    # lowercasing, stopwords removal
    # porter stemming
    return " ".join([stemmer.stem(word) for word in no_num.split()])


## Making pipeline

In [9]:
from nltk import word_tokenize
from nltk.sentiment.util import mark_negation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline



### Count Vectorizer

In [10]:
clf1 = Pipeline([
    ('vectorizer1', CountVectorizer(analyzer="word",
                                   tokenizer=word_tokenize,         # ! Comment line to include mark_negation and uncomment next line
                                   #tokenizer=lambda text: mark_negation(word_tokenize(text)), 
                                   preprocessor = preprocess,
                                   stop_words = stopwords.words('english'), ngram_range = (1,2),
                                   lowercase = True,
                                   max_features = 3000) ),
    ('classifier1', MultinomialNB(alpha = 0.5))
])

In [11]:
clf1.fit(xtrain, ytrain)

Pipeline(memory=None,
     steps=[('vectorizer1', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=1,
        ngram_range=(1, 2),
        preprocessor=<function preprocess ...     vocabulary=None)), ('classifier1', MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True))])

In [12]:
clf1.score(xtest, ytest)

0.526167698368036

In [13]:
predictions1 = clf1.predict(xtest)

In [14]:
from sklearn.metrics import classification_report

In [15]:
print (classification_report(predictions1,ytest))

             precision    recall  f1-score   support

          1       0.72      0.69      0.71       393
          2       0.43      0.47      0.45       329
          3       0.32      0.40      0.36       282
          4       0.47      0.36      0.41       415
          5       0.68      0.68      0.68       358

avg / total       0.53      0.53      0.53      1777



### Tfidf Vectorizer

In [16]:
clf2 = Pipeline([
    ('vectorizer2', TfidfVectorizer(analyzer="word",
                                   tokenizer=word_tokenize,         # ! Comment line to include mark_negation and uncomment next line
                                   #tokenizer=lambda text: mark_negation(word_tokenize(text)), 
                                   preprocessor = preprocess,
                                   stop_words = stopwords.words('english'), ngram_range = (1,2),
                                   lowercase = True,
                                   max_features = 3000) ),
    ('classifier2', MultinomialNB(alpha = 0.5))
])

In [17]:
clf2.fit(xtrain, ytrain)

Pipeline(memory=None,
     steps=[('vectorizer2', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=3000, min_df=1,
        ngram_range=(1, 2), norm='l2',
        preprocessor=<function ...rue, vocabulary=None)), ('classifier2', MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True))])

In [19]:
clf2.score(xtest, ytest)

0.5323579065841305

## Saving classifiers

In [20]:
import pickle

In [21]:
# dumping the count vectorizer pipeline with pickle
cv_senti_analyser_path = "./cv_senti_pipeline.pkl"

# Open the file to save as pkl file
cv_senti_analyser_pkl = open(cv_senti_analyser_path, 'wb')
pickle.dump(clf1, cv_senti_analyser_pkl)

#close the pickle file
cv_senti_analyser_pkl.close()

In [22]:
# dumping the count vectorizer pipeline with pickle
tf_senti_analyser_path = "./tf_senti_pipeline.pkl"

# Open the file to save as pkl file
tf_senti_analyser_pkl = open(tf_senti_analyser_path, 'wb')
pickle.dump(clf2, tf_senti_analyser_pkl)

#close the pickle file
tf_senti_analyser_pkl.close()