In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import pickle

In [2]:
##adding the dataset
df = pd.read_csv('hotel_reviews.csv')

In [3]:
##displaying the dataset

In [4]:
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [5]:
df.shape

(20491, 2)

## data cleaning

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [7]:
##drop cols
df.drop(columns=['Rating'],inplace=True)

In [8]:
df.sample(5)

Unnamed: 0,Review
10618,"big thumbs meaning write review months, attend..."
15536,"fabulous great experience, wow, best hotel wor..."
15164,worthy just come night stay paragon say expect...
16653,"beautiful property amazing staff, stayed hotel..."
12848,bad hotel bad hotel horrible arrived friday ju...


In [9]:
# renaming the cols
df.rename(columns={'Review':'text'},inplace=True)
df.sample(10)

Unnamed: 0,text
7603,wonderful marriott great vacation location nic...
8916,"paris gem family stayed rooms nights august, r..."
9080,"best bed, spent 3 nights end, got excellent ra..."
4380,paris magical indulgent overnight stay turned ...
8590,"clean room staff friendly helpful, communal ar..."
5288,"large bathroom, solo female australian travell..."
10691,"good location bigger rooms spacious nice, quie..."
12582,worth money stayed seven nights feb 2008. temp...
10578,"recommended hotel paris, great hotel weekend p..."
19525,loved hotel mela hotel mela great boutique hot...


In [10]:
# delete a range of rows - index values 10-20
df.drop(labels=range(501, 20491), axis=0,inplace=True)

In [11]:
df.shape

(501, 1)

In [12]:
# missing values
df.isnull().sum()

text    0
dtype: int64

In [13]:
# check for duplicate values
df.duplicated().sum()

0

In [14]:
df.shape

(501, 1)

## data preprocessing

In [15]:
##steps to follow
##Lower case
##Tokenization
##Removing special characters
##Removing stop words and punctuation
##Stemming

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [18]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
        
    
    return y

In [20]:
transform_text('hi How are you %% ? nitesh')

['hi', 'nitesh']

In [21]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('loving')

'love'

In [22]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    
    y = []
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
            
    text = y[:]
    y.clear()
    
    for i in text:
        y.append(ps.stem(i))
    
            
    return " ".join(y)

In [23]:
df['text'][11]

"nice value seattle stayed 4 nights late 2007. looked comparable hilton marriott westin area points/miles n't gave monaco shot, pleasantly surprised nice room service quick tasty bed especially comfortable unlike people nothing positive things say service, downside relatively small exercise room treadmill heck east coast time ran person 5 days, definitely stay,  "

In [24]:
##displaying the transformed text in table
df['transformed_text'] = df['text'].apply(transform_text)

In [25]:
df.sample(5)

Unnamed: 0,text,transformed_text
344,price right stayed travelodge mid-late march 2...,price right stay travelodg march book onlin pa...
457,"not stay, room did not microwave refridgerator...",stay room microwav refridger price paid balcon...
275,nice hotel nice location stayed hotel andra ni...,nice hotel nice locat stay hotel andra night q...
332,"disappointed, booking expedia got assigned roo...",disappoint book expedia got assign room averag...
338,cold pool nice room cold pool expensive parkin...,cold pool nice room cold pool expens park damp...


## prediction of newdata


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=2000)
import pickle
tfidf = pickle.load(open('vectorizer.pkl', "rb"))

In [27]:
X_fresh = tfidf.fit_transform(df['transformed_text']).toarray()

In [28]:
X_fresh.shape

(501, 2000)

## predictions via sentiment classifier


In [29]:
import joblib
mnb= joblib.load('model1')

In [30]:
y_pred = mnb.predict(X_fresh)
print(y_pred)

[0 0 0 1 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0
 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 1 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 

In [31]:
df['predicted_label'] = y_pred.tolist()
df.head()

Unnamed: 0,text,transformed_text,predicted_label
0,nice hotel expensive parking got good deal sta...,nice hotel expens park got good deal stay hote...,0
1,ok nothing special charge diamond member hilto...,ok noth special charg diamond member hilton de...,0
2,nice rooms not 4* experience hotel monaco seat...,nice room 4 experi hotel monaco seattl good ho...,0
3,"unique, great stay, wonderful time hotel monac...",uniqu great stay wonder time hotel monaco loca...,1
4,"great stay great stay, went seahawk game aweso...",great stay great stay went seahawk game awesom...,0


In [32]:
df.to_csv("c3_Predicted_Sentiments_Fresh_Dump.tsv", sep='\t', encoding='UTF-8', index=False)