In [1]:
### Warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
### Reading data.
import pandas as pd
hotel_df=pd.read_csv("E://Live Project Dataset//hotel-reviews.csv")
hotel_df.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [3]:
hotel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38932 entries, 0 to 38931
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   User_ID       38932 non-null  object
 1   Description   38932 non-null  object
 2   Browser_Used  38932 non-null  object
 3   Device_Used   38932 non-null  object
 4   Is_Response   38932 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB


In [4]:
hotel_df.describe().transpose()

Unnamed: 0,count,unique,top,freq
User_ID,38932,38932,id22408,1
Description,38932,38932,I read some reviews on this hotel and can't be...,1
Browser_Used,38932,11,Firefox,7367
Device_Used,38932,3,Desktop,15026
Is_Response,38932,2,happy,26521


### Cleaning Data.


In [5]:
### Finding null values.
count=hotel_df.isnull().sum().sort_values(ascending=False)
print(count)

Is_Response     0
Device_Used     0
Browser_Used    0
Description     0
User_ID         0
dtype: int64


In [6]:
percentage=((hotel_df.isnull().sum()/len(hotel_df)*100)).sort_values(ascending=False)
print(percentage)

Is_Response     0.0
Device_Used     0.0
Browser_Used    0.0
Description     0.0
User_ID         0.0
dtype: float64


In [7]:
### Concatenate both data.
missing_data=pd.concat([count,percentage],axis=1,keys=['Count','Percentage'])
print(missing_data)

              Count  Percentage
Is_Response       0         0.0
Device_Used       0         0.0
Browser_Used      0         0.0
Description       0         0.0
User_ID           0         0.0


In [8]:
hotel_df.columns

Index(['User_ID', 'Description', 'Browser_Used', 'Device_Used', 'Is_Response'], dtype='object')

In [9]:
## Dropping unwanted columns in data.
hotel_df.drop(columns=['User_ID','Browser_Used','Device_Used'],inplace=True)

In [10]:
### Cleaning text data.like removing square brackets,number and punctuations.
import re
import string

def text_clean_1(text):
    ## converting tect into lower case.
    text=text.lower()
    ## Removing bigger brackets.
    text=re.sub('\[.*?\]','',text)
    ## removing the punctuations.
    text=re.sub('[%s]'% re.escape(string.punctuation),'',text)
    ## removing the digits.
    text=re.sub('\w*\d\w*','',text)
    return text

cleaned1=lambda x:text_clean_1(x)

In [11]:
### Updated clean text.
hotel_df['cleaned_description']=pd.DataFrame(hotel_df.Description.apply(cleaned1))
hotel_df.head()

Unnamed: 0,Description,Is_Response,cleaned_description
0,The room was kind of clean but had a VERY stro...,not happy,the room was kind of clean but had a very stro...
1,I stayed at the Crown Plaza April -- - April -...,not happy,i stayed at the crown plaza april april th...
2,I booked this hotel through Hotwire at the low...,not happy,i booked this hotel through hotwire at the low...
3,Stayed here with husband and sons on the way t...,happy,stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...,not happy,my girlfriends and i stayed here to celebrate ...


In [12]:
def text_clean_2(text):
    text=re.sub('[''""_]','',text)
    text=re.sub('\n','',text)
    return text

cleaned2=lambda x:text_clean_2(x)
   

In [13]:
hotel_df['cleaned_description_new']=pd.DataFrame(hotel_df['cleaned_description'].apply(cleaned2))
hotel_df.head()

Unnamed: 0,Description,Is_Response,cleaned_description,cleaned_description_new
0,The room was kind of clean but had a VERY stro...,not happy,the room was kind of clean but had a very stro...,the room was kind of clean but had a very stro...
1,I stayed at the Crown Plaza April -- - April -...,not happy,i stayed at the crown plaza april april th...,i stayed at the crown plaza april april th...
2,I booked this hotel through Hotwire at the low...,not happy,i booked this hotel through hotwire at the low...,i booked this hotel through hotwire at the low...
3,Stayed here with husband and sons on the way t...,happy,stayed here with husband and sons on the way t...,stayed here with husband and sons on the way t...
4,My girlfriends and I stayed here to celebrate ...,not happy,my girlfriends and i stayed here to celebrate ...,my girlfriends and i stayed here to celebrate ...


### model training.

In [14]:
from sklearn.model_selection import train_test_split

## creating independent and dependent vaiable.
independent_var=hotel_df.cleaned_description_new
dependent_var=hotel_df.Is_Response

x_train,x_test,y_train,y_test=train_test_split(independent_var,dependent_var,test_size=0.1,random_state=25)


In [15]:
### using TF-IDF vector.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

tvec=TfidfVectorizer()
clf2=LogisticRegression(solver='lbfgs')

from sklearn.pipeline import Pipeline

In [16]:
## creating model.
model=Pipeline([('vectoizer',tvec),('classifier',clf2)])
model.fit(x_train,y_train)

Pipeline(steps=[('vectoizer', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

In [18]:
### Calculating  the confusion matrix.
prediction=model.predict(x_test)

from sklearn.metrics import confusion_matrix
cm=confusion_matrix(prediction,y_test)
print(cm)

[[2479  291]
 [ 171  953]]


### Model prediction

In [21]:
from sklearn.metrics import accuracy_score,precision_score,recall_score
print("Accuracy :",accuracy_score(prediction,y_test))
print("Precision :",precision_score(prediction,y_test,average='weighted'))
print("Recall :",recall_score(prediction,y_test,average='weighted'))

Accuracy : 0.8813559322033898
Precision : 0.8865761025483881
Recall : 0.8813559322033898


In [34]:
### Tring with new test data.
example=['I am not happy']
result=model.predict(example)
result

array(['not happy'], dtype=object)