In [422]:
# import pandas and pickle
import pandas as pd
import pickle

In [423]:
# Load the hotel-review dataset from its CSV file into a DataFrame.
reviews_csv = '../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv'
trip = pd.read_csv(reviews_csv)

In [424]:
# Keep only the clearly polarised reviews: 5-star rows become the positive
# class, 1- and 2-star rows the negative class; 3- and 4-star rows are dropped.
# Rows and columns are selected in a single .loc call and the result is copied,
# so the column assignment below writes to an independent frame rather than to
# a slice of the loaded data (the pattern behind SettingWithCopyWarning).
# NOTE: this cell overwrites `trip`. Once 'Rating' holds string labels the
# filter matches nothing, so re-run the loading cell before re-running this one.
trip = trip.loc[trip['Rating'].isin([1, 2, 5]), ['Review', 'Rating']].copy()

# Replace the numeric rating with a string label: 'Pos' for 5, 'Neg' otherwise.
trip['Rating'] = trip['Rating'].eq(5).map({True: 'Pos', False: 'Neg'})

In [425]:
# Filtering rows leaves gaps in the row labels, so renumber them from 0.
# The previous labels are kept in a new 'index' column.
trip = trip.reset_index()
trip.head()

Unnamed: 0,index,Review,Rating
0,1,ok nothing special charge diamond member hilto...,Neg
1,3,"unique, great stay, wonderful time hotel monac...",Pos
2,4,"great stay great stay, went seahawk game aweso...",Pos
3,5,love monaco staff husband stayed hotel crazy w...,Pos
4,6,"cozy stay rainy city, husband spent 7 nights m...",Pos


In [426]:
#Data cleaning and preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [428]:
# Collects one cleaned review string per processed review.
corpus = list()
# WordNet lemmatizer used by the preprocessing loop (bound to the name `ps`).
ps = WordNetLemmatizer()

In [429]:
# Text preprocessing. For every review:
#   1. replace every character that is not an ASCII letter with a space,
#   2. lower-case the text and split it on whitespace,
#   3. drop English stop words and lemmatize the remaining words,
#   4. join the words back into one string and append it to `corpus`.

# Built once, outside the loop, instead of being re-evaluated for every word;
# a set also makes the membership test below a hash lookup.
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so re-running this cell does not append every
# review a second time.
corpus = []

# Iterate over the reviews in `trip` directly. The loop previously ran over
# len(tweets), but no `tweets` exists in this notebook, so it raised NameError
# on a fresh kernel.
for raw_review in trip['Review']:
    words = re.sub('[^a-zA-Z]', " ", raw_review).lower().split()
    kept = [ps.lemmatize(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(kept))

In [452]:
# Bag-of-words features: learn the vocabulary from the cleaned corpus and
# encode every review as a row of token counts.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(raw_documents=corpus)

In [453]:
# Binary target: 1 for reviews labelled 'Pos', 0 for reviews labelled 'Neg'.
# The label is compared by name and cast to int explicitly. Taking the second
# column of pd.get_dummies by position depends on the column ordering, and the
# dtype of the dummy columns differs between pandas versions.
y = (trip['Rating'] == 'Pos').astype(int).values

In [454]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=3,
)

In [455]:
# Fit a multinomial Naive Bayes classifier on the training split.
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Score on the held-out split. This is not the last expression of the cell,
# so the value is computed but not displayed.
spam_detect_model.score(X_test, y_test)

# Predicted labels for the held-out split, used by the metrics that follow.
y_pred = spam_detect_model.predict(X_test)

In [456]:
# Compare the true labels with the predictions on the held-out split.
# The confusion matrix counts every (true label, predicted label) combination;
# with two classes it is a 2x2 table.
from sklearn.metrics import confusion_matrix

confusion_m = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [457]:
# Accuracy of the predictions against the true labels of the held-out split.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [458]:
# Display the accuracy score as the cell output.
accuracy

0.9539527302363489

In [461]:
# Sanity-check the trained model on a hand-written review.
message = "We had a terrible experience at the hotel. The food was not good and waiter sounded very wrong"

# Clean the message the same way the training reviews were cleaned (letters
# only, lower-case, stop words removed, lemmatized). Without this, words whose
# lemmatized form differs from the raw form (e.g. plurals) are missing from
# the vectorizer's vocabulary and are silently ignored.
english_stopwords = set(stopwords.words('english'))
words = re.sub('[^a-zA-Z]', " ", message).lower().split()
cleaned_message = ' '.join(
    ps.lemmatize(word) for word in words if word not in english_stopwords
)

data = [cleaned_message]
# The classifier was fitted on the vectorizer's sparse output, so the single
# row is passed on as-is instead of being converted to a dense array.
vect = cv.transform(data)
my_prediction = spam_detect_model.predict(vect)

# predict() returns one label per input row; read the single label explicitly
# rather than comparing the whole array with 1.
if my_prediction[0] == 1:
    print("Positive")
else:
    print("Negative")
    

Negative


In [462]:
# Persist the fitted vectorizer and the trained classifier so they can be
# loaded elsewhere without retraining.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of being left open.
# NOTE(review): the name 'tranform.pkl' (sic) is kept unchanged because any
# code that loads the file would have to use the same name; confirm before
# renaming.
with open('tranform.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

filename = 'nlp_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(spam_detect_model, model_file)