# Sentiment Analysis on Hotel Reviews

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('HotelReview.csv')

In [3]:
df.drop(['categories','latitude','longitude','province','reviews.date','reviews.dateAdded','city','country','name','postalCode','reviews.doRecommend','reviews.id','reviews.title','reviews.userCity','reviews.username','reviews.userProvince'],axis =1,inplace = True)

In [4]:
df['reviews.rating'].value_counts()

5.0     12671
4.0      9755
3.0      5708
1.0      3068
2.0      2979
0.0       337
10.0      101
9.6        53
9.2        49
4.3        26
4.8        25
8.3        25
7.9        24
7.5        24
8.8        24
4.2        22
3.4        17
4.1        14
6.3        14
7.1        13
4.5        13
6.7        12
5.4         9
4.6         7
3.1         7
5.8         6
4.4         6
3.3         6
4.7         6
4.9         6
2.4         4
2.5         3
3.8         3
3.7         2
3.2         2
3.9         2
2.8         1
1.3         1
9.5         1
1.5         1
6.0         1
2.9         1
7.0         1
Name: reviews.rating, dtype: int64

In [5]:
df['reviews.rating'].isnull().sum()

862

In [6]:
df['reviews.rating'].fillna((df['reviews.rating'].mean()),inplace=True)

In [7]:
df['reviews.rating'].isnull().sum()

0

In [8]:
sentiment = []
for i in range(len(df['reviews.rating'])):
    if (df['reviews.rating'].iloc[i].astype(float)<3.00):
        sentiment.append('Negative')
    elif (df['reviews.rating'].iloc[i].astype(float)==3.00):
        sentiment.append('Neutral')
    else:
        sentiment.append('Positive')

In [9]:
df['Sentiment']=sentiment

In [10]:
df['Sentiment'].value_counts()

Positive    23809
Negative     6395
Neutral      5708
Name: Sentiment, dtype: int64

In [11]:
df

Unnamed: 0,address,reviews.rating,reviews.text,Sentiment
0,Riviera San Nicol 11/a,4.0,Pleasant 10 min walk along the sea front to th...,Positive
1,Riviera San Nicol 11/a,5.0,Really lovely hotel. Stayed on the very top fl...,Positive
2,Riviera San Nicol 11/a,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Positive
3,Riviera San Nicol 11/a,5.0,We stayed here for four nights in October. The...,Positive
4,Riviera San Nicol 11/a,5.0,We stayed here for four nights in October. The...,Positive
...,...,...,...,...
35907,1088 Powdermill Rd,5.0,Best of the Best!!!! My family and I stayed in...,Positive
35908,1088 Powdermill Rd,5.0,Amazing Vacation in Beautiful Cabin We stayed ...,Positive
35909,1088 Powdermill Rd,5.0,My husband and I stayed at Tree Tops which is ...,Positive
35910,3747 29th St S E,0.0,to share your opinion of this businesswith YP ...,Negative


In [12]:
df.drop(['address','reviews.rating'],axis=1,inplace=True)

In [13]:
df

Unnamed: 0,reviews.text,Sentiment
0,Pleasant 10 min walk along the sea front to th...,Positive
1,Really lovely hotel. Stayed on the very top fl...,Positive
2,Ett mycket bra hotell. Det som drog ner betyge...,Positive
3,We stayed here for four nights in October. The...,Positive
4,We stayed here for four nights in October. The...,Positive
...,...,...
35907,Best of the Best!!!! My family and I stayed in...,Positive
35908,Amazing Vacation in Beautiful Cabin We stayed ...,Positive
35909,My husband and I stayed at Tree Tops which is ...,Positive
35910,to share your opinion of this businesswith YP ...,Negative


## Removing Punctuations 

In [14]:
import re
review_pun = []

for i in range(len(df['reviews.text'])):
    review_pun.append(re.sub(r'[^\w\s]','',str(df['reviews.text'].iloc[i])))

In [15]:
df['reviews.text'] = review_pun

In [16]:
df


Unnamed: 0,reviews.text,Sentiment
0,Pleasant 10 min walk along the sea front to th...,Positive
1,Really lovely hotel Stayed on the very top flo...,Positive
2,Ett mycket bra hotell Det som drog ner betyget...,Positive
3,We stayed here for four nights in October The ...,Positive
4,We stayed here for four nights in October The ...,Positive
...,...,...
35907,Best of the Best My family and I stayed in the...,Positive
35908,Amazing Vacation in Beautiful Cabin We stayed ...,Positive
35909,My husband and I stayed at Tree Tops which is ...,Positive
35910,to share your opinion of this businesswith YP ...,Negative


## Removing Stop Words

In [17]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

updated = []

for i in range(len(df['reviews.text'])):

    rev = str(df['reviews.text'].iloc[i])

    stop_words = set(stopwords.words('english'))

    word_tokens = word_tokenize(rev)

    filtered_sentence = [w for w in word_tokens if not w in stop_words]

    filtered_sentence = []

    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)

    updated.append(filtered_sentence)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
after_stopwords = []
for i in range(len(df['reviews.text'])):
               
    separator = ' '
    after_stopwords.append(separator.join(updated[i]))

In [19]:
df['reviews.text'] = after_stopwords

In [20]:
df

Unnamed: 0,reviews.text,Sentiment
0,Pleasant 10 min walk along sea front Water Bus...,Positive
1,Really lovely hotel Stayed top floor surprised...,Positive
2,Ett mycket bra hotell Det som drog ner betyget...,Positive
3,We stayed four nights October The hotel staff ...,Positive
4,We stayed four nights October The hotel staff ...,Positive
...,...,...
35907,Best Best My family I stayed Grin Bear It cabi...,Positive
35908,Amazing Vacation Beautiful Cabin We stayed lon...,Positive
35909,My husband I stayed Tree Tops Bearfoot Resort ...,Positive
35910,share opinion businesswith YP visitors across ...,Negative


## Stemming of Words

In [21]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [22]:
after_stemming = []
for j in range(len(df['reviews.text'])):
    string = ''
    x=word_tokenize(after_stopwords[j])
    for i in range(len(x)):
        string=string+ps.stem(x[i])+' '
    after_stemming.append(string)

In [23]:
df['reviews.text'] = after_stemming

In [24]:
df

Unnamed: 0,reviews.text,Sentiment
0,pleasant 10 min walk along sea front water bu ...,Positive
1,realli love hotel stay top floor surpris jacuz...,Positive
2,ett mycket bra hotel det som drog ner betyget ...,Positive
3,We stay four night octob the hotel staff welco...,Positive
4,We stay four night octob the hotel staff welco...,Positive
...,...,...
35907,best best My famili I stay grin bear It cabin ...,Positive
35908,amaz vacat beauti cabin We stay long weekend r...,Positive
35909,My husband I stay tree top bearfoot resort man...,Positive
35910,share opinion businesswith YP visitor across u...,Negative


## Training of Model

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

tvec = TfidfVectorizer()
clf = RandomForestClassifier(n_estimators = 60)

In [26]:
from sklearn.model_selection import train_test_split
model = Pipeline([('vec',tvec),('cfl',clf)])
X_train,X_test,Y_train,Y_test = train_test_split(df['reviews.text'],df['Sentiment'],test_size=0.2)

In [27]:
model.fit(X_train,Y_train)

Pipeline(memory=None,
         steps=[('vec',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

In [28]:
model.predict([str(df['reviews.text'].iloc[4])])

array(['Positive'], dtype=object)

In [29]:
pred = model.predict(X_test)

In [39]:
from sklearn.metrics import accuracy_score
accuracy_score(pred,Y_test)

0.7314492551858555

In [38]:
model.predict(['Hotel was not good'])

array(['Negative'], dtype=object)

In [33]:
model.predict(['Hotel was good'])

array(['Positive'], dtype=object)

In [34]:
model.predict(['Hotel was OK'])

array(['Neutral'], dtype=object)

In [40]:
model.predict(['bad'])

array(['Negative'], dtype=object)

In [42]:
model.predict(['bad quality'])

array(['Negative'], dtype=object)

In [43]:
model.predict(['No Problem'])

array(['Positive'], dtype=object)

In [44]:
model.predict(['Staff was bad'])

array(['Negative'], dtype=object)