In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv("tripadvisor_hotel_reviews.csv")

Cleaning The Data

In [3]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [4]:
df['Rating'].value_counts()

5    9054
4    6039
3    2184
2    1793
1    1421
Name: Rating, dtype: int64

In [5]:
df_neg=df.loc[df['Rating']<3]
df_neg=df_neg.reset_index(drop=True)

In [6]:
df_five = df.loc[df['Rating'] == 5]
df_five = df_five.reset_index(drop = True)

In [7]:
print(len(df_five))

9054


In [8]:
df_pos = df_five.loc[:len(df_neg)]

In [9]:
df_all = pd.concat([df_neg,df_pos],axis = 0)
df_all = df_all.reset_index(drop = True)

In [10]:
df_all.head()

Unnamed: 0,Review,Rating
0,ok nothing special charge diamond member hilto...,2
1,"poor value stayed monaco seattle july, nice ho...",2
2,horrible customer service hotel stay february ...,1
3,disappointed say anticipating stay hotel monac...,2
4,great location need internally upgrade advanta...,2


In [11]:
len(df_all)

6429

 Create a Sentiment Column

In [12]:
df_all['Sentiment'] = np.where(df_all['Rating'] == 5, "Positive","Negative")

In [13]:
df_all.head()

Unnamed: 0,Review,Rating,Sentiment
0,ok nothing special charge diamond member hilto...,2,Negative
1,"poor value stayed monaco seattle july, nice ho...",2,Negative
2,horrible customer service hotel stay february ...,1,Negative
3,disappointed say anticipating stay hotel monac...,2,Negative
4,great location need internally upgrade advanta...,2,Negative


In [14]:
df_all.tail()

Unnamed: 0,Review,Rating,Sentiment
6424,perfect hotel hotel does not really need glowi...,5,Positive
6425,perfect hotel small hotel comfortable perfect ...,5,Positive
6426,ordinary location extraordinary hotel know lov...,5,Positive
6427,"classy indulgence awesome experience, staff n'...",5,Positive
6428,first-rate experience stay library hotel wife ...,5,Positive


In [15]:
df_all = df_all.sample(frac = 1)
df_all = df_all.reset_index(drop = True)

In [16]:
df_all.head()

Unnamed: 0,Review,Rating,Sentiment
0,"worst receptionists hotel not awarded stars, y...",2,Negative
1,"great holiday riu bambu, wife thoroughly enjoy...",5,Positive
2,"nice, ca n't add reviewer st. paul wrote, magn...",5,Positive
3,"love place, wow, stayed hotel nights easter we...",5,Positive
4,"n't consider staying, stayed 2 nights oct. 2 3...",5,Positive


Split into test and train examples

In [37]:
from sklearn.model_selection import train_test_split
x_train,x_test, y_train,y_test = train_test_split(df_all.Review,df_all.Sentiment,test_size=0.3, random_state=5, stratify=None)

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
x_train_vec  = v.fit_transform(x_train)
x_test_vec = v.transform(x_test )

Use a classification Model

In [39]:
from sklearn.svm import SVC 
clf_svc = SVC(kernel = "rbf",random_state=45)
clf_svc.fit(x_train_vec,y_train)

Test Accuracy

In [40]:
clf_svc.score(x_test_vec,y_test)

0.9341627786417833

In [41]:
from sklearn.metrics import f1_score
f1_score(y_test,clf_svc.predict(x_test_vec),average = None)

array([0.93490518, 0.93340325])

In [47]:
rev = ["good place"]
rev_vec = v.transform(rev)
clf_svc.predict(rev_vec)

array(['Negative'], dtype=object)

In [43]:
rev = ["Absolutely Hated this place. Bad Food"]
rev_vec = v.transform(rev)
clf_svc.predict(rev_vec)

array(['Negative'], dtype=object)

In [44]:
import pickle
pickle.dump(clf_svc, open('trained_model.pkl', 'wb'))

In [45]:
import pickle
pickle.dump(v, open('vectorizer.pkl', 'wb'))