# ***Import Libraries***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
import folium
from folium.plugins import HeatMap
import plotly.express as px
%matplotlib inline
pd.set_option('display.max_columns', 32)
plt.style.use('fivethirtyeight')


# ***Raed Data***

In [None]:
df=pd.read_csv('/content/hotel_booking.csv',sep=';')

In [None]:
len(df)

1000

In [None]:
df2 = pd.read_csv('/content/id.csv')   # The second file with a single column

# Ensure that the single column in df2 will be added as the first column
# If df2 has only one column, get its name
df2_column_name = df2.columns[0]

# Concatenate df2 as the first column of df
# We need to reset the index of df2 to make sure the rows align properly
df = pd.concat([df2, df], axis=1)

In [None]:
emp_id = pd.read_csv('/content/id_copy.csv')

In [None]:
df = pd.concat([df, emp_id], axis=1)

In [None]:
df.head(5)

Unnamed: 0,BookingID,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID
0,1,Resort Hotel,0,342,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,...,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,01/07/15,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322,1
1,2,Resort Hotel,0,737,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,...,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,01/07/15,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157,2
2,3,Resort Hotel,0,7,2015,July,27,1,0,1,1,0,0,BB,GBR,Direct,...,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,02/07/15,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734,3
3,4,Resort Hotel,0,13,2015,July,27,1,0,1,1,0,0,BB,GBR,Corporate,...,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,02/07/15,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677,4
4,5,Resort Hotel,0,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,...,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,03/07/15,Linda Hines,LHines@verizon.com,713-226-5883,************5498,5


# ***Data Checks to Perform***


*   Check Missing values
*   Check Duplicates
*   Check data type
*   Check the number of unique values of each column
*   Check statistics of the dataset
*   Check various categories present in the different categorical columns



# ***Exploratory Data Analysis 📊***

In [None]:
df.head(5)

Unnamed: 0,BookingID,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID
0,1,Resort Hotel,0,342,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,...,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,01/07/15,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322,1
1,2,Resort Hotel,0,737,2015,July,27,1,0,0,2,0,0,BB,PRT,Direct,...,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,01/07/15,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157,2
2,3,Resort Hotel,0,7,2015,July,27,1,0,1,1,0,0,BB,GBR,Direct,...,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,02/07/15,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734,3
3,4,Resort Hotel,0,13,2015,July,27,1,0,1,1,0,0,BB,GBR,Corporate,...,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,02/07/15,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677,4
4,5,Resort Hotel,0,14,2015,July,27,1,0,2,2,0,0,BB,GBR,Online TA,...,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,03/07/15,Linda Hines,LHines@verizon.com,713-226-5883,************5498,5


In [None]:
df.tail()

Unnamed: 0,BookingID,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID
995,996,Resort Hotel,1,122,2015,August,33,9,2,4,2,0,0,HB,PRT,Online TA,...,0,No Deposit,240.0,,0,Transient,166.0,0,2,Canceled,27/05/15,Christina Collins,Christina_Collins@yandex.com,629-364-3798,************4977,996
996,997,Resort Hotel,1,41,2015,August,33,9,2,4,2,0,0,BB,PRT,Online TA,...,0,No Deposit,240.0,,0,Transient,202.0,0,2,Canceled,17/07/15,Katherine Parsons,KatherineParsons@gmail.com,837-549-5190,************5190,997
997,998,Resort Hotel,1,41,2015,August,33,9,2,4,2,0,0,BB,PRT,Online TA,...,0,No Deposit,240.0,,0,Transient,172.0,0,2,Canceled,17/07/15,Christian Bailey,Christian.Bailey@gmail.com,880-193-6472,************5125,998
998,999,Resort Hotel,0,81,2015,August,33,9,2,4,2,1,1,FB,ESP,Direct,...,0,No Deposit,250.0,,0,Transient,277.0,1,1,Check-Out,15/08/15,Mr. William Bell,Mr..B@mail.com,401-188-7246,************6274,999
999,1000,Resort Hotel,0,59,2015,August,33,9,2,5,2,0,0,BB,GBR,Offline TA/TO,...,0,No Deposit,243.0,,0,Transient,104.0,0,0,Check-Out,16/08/15,James Haynes,Haynes.James@hotmail.com,711-535-8794,************8511,1000


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   BookingID                       1000 non-null   int64  
 1   hotel                           1000 non-null   object 
 2   is_canceled                     1000 non-null   int64  
 3   lead_time                       1000 non-null   int64  
 4   arrival_date_year               1000 non-null   int64  
 5   arrival_date_month              1000 non-null   object 
 6   arrival_date_week_number        1000 non-null   int64  
 7   arrival_date_day_of_month       1000 non-null   int64  
 8   stays_in_weekend_nights         1000 non-null   int64  
 9   stays_in_week_nights            1000 non-null   int64  
 10  adults                          1000 non-null   int64  
 11  children                        1000 non-null   int64  
 12  babies                          100

In [None]:
df.describe()

Unnamed: 0,BookingID,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,empID
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,904.0,12.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,0.322,57.812,2015.0,29.463,13.614,1.328,3.482,1.994,0.213,0.024,0.0,0.0,0.0,0.196,212.25,165.583333,0.0,131.95076,0.138,0.788,500.5
std,288.819436,0.467477,56.67531,0.0,1.609727,9.00089,1.076836,2.16665,0.337754,0.638779,0.159529,0.0,0.0,0.0,0.560266,68.295183,62.794699,0.0,45.794436,0.347961,0.881949,288.819436
min,1.0,0.0,0.0,2015.0,27.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,110.0,0.0,0.0,0.0,0.0,1.0
25%,250.75,0.0,22.75,2015.0,28.0,6.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,240.0,110.0,0.0,105.6575,0.0,0.0,250.75
50%,500.5,0.0,50.0,2015.0,29.0,12.0,1.5,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,240.0,149.0,0.0,127.565,0.0,1.0,500.5
75%,750.25,1.0,81.0,2015.0,31.0,20.0,2.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,240.0,193.5,0.0,159.05,0.0,1.0,750.25
max,1000.0,1.0,737.0,2015.0,33.0,31.0,6.0,15.0,4.0,10.0,2.0,0.0,0.0,0.0,5.0,306.0,270.0,0.0,280.74,2.0,4.0,1000.0


In [None]:
df.shape

(1000, 38)

In [None]:
df.columns

Index(['BookingID', 'hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'name', 'email',
       'phone-number', 'credit_card', 'empID'],
      dtype='object')

In [None]:
df.describe(include = 'object')

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date,name,email,phone-number,credit_card
count,1000,1000,1000,999,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000
unique,1,2,3,34,6,3,8,9,1,4,3,108,992,999,1000,944
top,Resort Hotel,July,BB,PRT,Online TA,TA/TO,A,A,No Deposit,Transient,Check-Out,17/07/15,Jessica Anderson,James_B@hotmail.com,669-792-1661,************1563
freq,1000,842,744,644,575,788,484,356,1000,873,678,49,2,2,1,3


In [None]:
df['reservation_status_date']= pd.to_datetime(df['reservation_status_date'])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   BookingID                       1000 non-null   int64         
 1   hotel                           1000 non-null   object        
 2   is_canceled                     1000 non-null   int64         
 3   lead_time                       1000 non-null   int64         
 4   arrival_date_year               1000 non-null   int64         
 5   arrival_date_month              1000 non-null   object        
 6   arrival_date_week_number        1000 non-null   int64         
 7   arrival_date_day_of_month       1000 non-null   int64         
 8   stays_in_weekend_nights         1000 non-null   int64         
 9   stays_in_week_nights            1000 non-null   int64         
 10  adults                          1000 non-null   int64         
 11  child

In [None]:
df.rename(columns = {
    'adults': 'no_of_adults',
    'children': 'no_of_children',
    'stays_in_weekend_nights': 'no_of_weekend_nights',
    'stays_in_week_nights': 'no_of_week_nights',
    'meal': 'type_of_meal_plan',
    'required_car_parking_spaces': 'required_car_parking_space',
    'reserved_room_type': 'room_type_reserved',
    'lead_time': 'lead_time',
    'arrival_date_year': 'arrival_year',
    'arrival_date_month': 'arrival_month',
    'arrival_date_day_of_month': 'arrival_day',
    'market_segment': 'market_segment_type',
    'is_repeated_guest': 'repeated_guest',
    'previous_cancellations': 'no_of_previous_cancellations',
    'previous_bookings_not_canceled': 'no_of_previous_bookings_not_canceled',
    'adr': 'avg_price_per_room',
    'total_of_special_requests': 'no_of_special_requests',
    'is_canceled': 'booking_status'
}, inplace=True)

In [None]:
df['arrival_month'] = df['arrival_month'].map({
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
})

In [None]:
#df['reservation_date'] = df['reservation_status_date'].dt.strftime('%Y-%m-%d')

In [None]:
df.head(10)

Unnamed: 0,BookingID,hotel,booking_status,lead_time,arrival_year,arrival_month,arrival_date_week_number,arrival_day,no_of_weekend_nights,no_of_week_nights,no_of_adults,no_of_children,babies,type_of_meal_plan,country,market_segment_type,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,avg_price_per_room,required_car_parking_space,no_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID
0,1,Resort Hotel,0,342,2015,7,27,1,0,0,2,0,0,BB,PRT,Direct,...,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-01-07,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322,1
1,2,Resort Hotel,0,737,2015,7,27,1,0,0,2,0,0,BB,PRT,Direct,...,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-01-07,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157,2
2,3,Resort Hotel,0,7,2015,7,27,1,0,1,1,0,0,BB,GBR,Direct,...,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-02-07,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734,3
3,4,Resort Hotel,0,13,2015,7,27,1,0,1,1,0,0,BB,GBR,Corporate,...,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-02-07,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677,4
4,5,Resort Hotel,0,14,2015,7,27,1,0,2,2,0,0,BB,GBR,Online TA,...,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-03-07,Linda Hines,LHines@verizon.com,713-226-5883,************5498,5
5,6,Resort Hotel,0,14,2015,7,27,1,0,2,2,0,0,BB,GBR,Online TA,...,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-03-07,Jasmine Fletcher,JFletcher43@xfinity.com,190-271-6743,************9263,6
6,7,Resort Hotel,0,0,2015,7,27,1,0,2,2,0,0,BB,PRT,Direct,...,0,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-03-07,Dylan Rangel,Rangel.Dylan@comcast.net,420-332-5209,************6994,7
7,8,Resort Hotel,0,9,2015,7,27,1,0,2,2,0,0,FB,PRT,Direct,...,0,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-03-07,William Velez,Velez_William@mail.com,286-669-4333,************8729,8
8,9,Resort Hotel,1,85,2015,7,27,1,0,3,2,0,0,BB,PRT,Online TA,...,0,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-06-05,Steven Murphy,Steven.Murphy54@aol.com,341-726-5787,************3639,9
9,10,Resort Hotel,1,75,2015,7,27,1,0,3,2,0,0,HB,PRT,Offline TA/TO,...,0,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22,Michael Moore,MichaelMoore81@outlook.com,316-648-6176,************9190,10


In [None]:
df['type_of_meal_plan'] = \
df['type_of_meal_plan'].map({
        'BB': 'Bed and Breakfass', 'HB': 'Half Board', 'SC': 'Self Catering', 'FB': 'Full Board'
    })

In [None]:
df.tail(5)

Unnamed: 0,BookingID,hotel,booking_status,lead_time,arrival_year,arrival_month,arrival_date_week_number,arrival_day,no_of_weekend_nights,no_of_week_nights,no_of_adults,no_of_children,babies,type_of_meal_plan,country,market_segment_type,...,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,avg_price_per_room,required_car_parking_space,no_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID
995,996,Resort Hotel,1,122,2015,8,33,9,2,4,2,0,0,Half Board,PRT,Online TA,...,0,No Deposit,240.0,,0,Transient,166.0,0,2,Canceled,2015-05-27,Christina Collins,Christina_Collins@yandex.com,629-364-3798,************4977,996
996,997,Resort Hotel,1,41,2015,8,33,9,2,4,2,0,0,Bed and Breakfass,PRT,Online TA,...,0,No Deposit,240.0,,0,Transient,202.0,0,2,Canceled,2015-07-17,Katherine Parsons,KatherineParsons@gmail.com,837-549-5190,************5190,997
997,998,Resort Hotel,1,41,2015,8,33,9,2,4,2,0,0,Bed and Breakfass,PRT,Online TA,...,0,No Deposit,240.0,,0,Transient,172.0,0,2,Canceled,2015-07-17,Christian Bailey,Christian.Bailey@gmail.com,880-193-6472,************5125,998
998,999,Resort Hotel,0,81,2015,8,33,9,2,4,2,1,1,Full Board,ESP,Direct,...,0,No Deposit,250.0,,0,Transient,277.0,1,1,Check-Out,2015-08-15,Mr. William Bell,Mr..B@mail.com,401-188-7246,************6274,999
999,1000,Resort Hotel,0,59,2015,8,33,9,2,5,2,0,0,Bed and Breakfass,GBR,Offline TA/TO,...,0,No Deposit,243.0,,0,Transient,104.0,0,0,Check-Out,2015-08-16,James Haynes,Haynes.James@hotmail.com,711-535-8794,************8511,1000


# ***Data Pre-processing***

# check missing data

In [None]:
df.isnull().sum()

Unnamed: 0,0
BookingID,0
hotel,0
booking_status,0
lead_time,0
arrival_year,0
arrival_month,0
arrival_date_week_number,0
arrival_day,0
no_of_weekend_nights,0
no_of_week_nights,0


In [None]:
max_null_val=df.isnull().sum().idxmax()
print('The maximum number of Null data is in column *** ',max_null_val,' ***')

The maximum number of Null data is in column ***  company  ***


In [None]:
#remove company column
df=df.drop(['company'], axis = 1)

In [None]:
#remove null rows in country column
df = df.dropna(subset=['country'])

In [None]:
#filling null values with zero in agent column
df['agent'] = df['agent'].fillna(0)

In [None]:
df.isnull().sum()

Unnamed: 0,0
BookingID,0
hotel,0
booking_status,0
lead_time,0
arrival_year,0
arrival_month,0
arrival_date_week_number,0
arrival_day,0
no_of_weekend_nights,0
no_of_week_nights,0


In [None]:
df.head(5)

Unnamed: 0,BookingID,hotel,booking_status,lead_time,arrival_year,arrival_month,arrival_date_week_number,arrival_day,no_of_weekend_nights,no_of_week_nights,no_of_adults,no_of_children,babies,type_of_meal_plan,country,market_segment_type,...,assigned_room_type,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,avg_price_per_room,required_car_parking_space,no_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID
0,1,Resort Hotel,0,342,2015,7,27,1,0,0,2,0,0,Bed and Breakfass,PRT,Direct,...,C,3,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-01-07,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322,1
1,2,Resort Hotel,0,737,2015,7,27,1,0,0,2,0,0,Bed and Breakfass,PRT,Direct,...,C,4,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-01-07,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157,2
2,3,Resort Hotel,0,7,2015,7,27,1,0,1,1,0,0,Bed and Breakfass,GBR,Direct,...,C,0,No Deposit,0.0,0,Transient,75.0,0,0,Check-Out,2015-02-07,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734,3
3,4,Resort Hotel,0,13,2015,7,27,1,0,1,1,0,0,Bed and Breakfass,GBR,Corporate,...,A,0,No Deposit,304.0,0,Transient,75.0,0,0,Check-Out,2015-02-07,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677,4
4,5,Resort Hotel,0,14,2015,7,27,1,0,2,2,0,0,Bed and Breakfass,GBR,Online TA,...,A,0,No Deposit,240.0,0,Transient,98.0,0,1,Check-Out,2015-03-07,Linda Hines,LHines@verizon.com,713-226-5883,************5498,5


In [None]:
#Check for duplicate rows
df.duplicated().sum()

0

In [None]:
#Check data types before Processing
df.dtypes

Unnamed: 0,0
BookingID,int64
hotel,object
booking_status,int64
lead_time,int64
arrival_year,int64
arrival_month,int64
arrival_date_week_number,int64
arrival_day,int64
no_of_weekend_nights,int64
no_of_week_nights,int64


In [None]:
#  columns are merged into one
df['arrival_date'] = df['arrival_year'].map(str) + '-' +df['arrival_month'].map(str) + '-' \
                       + df['arrival_day'].map(str)

In [None]:
df.head(10)

Unnamed: 0,BookingID,hotel,booking_status,lead_time,arrival_year,arrival_month,arrival_date_week_number,arrival_day,no_of_weekend_nights,no_of_week_nights,no_of_adults,no_of_children,babies,type_of_meal_plan,country,market_segment_type,...,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,avg_price_per_room,required_car_parking_space,no_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID,arrival_date
0,1,Resort Hotel,0,342,2015,7,27,1,0,0,2,0,0,Bed and Breakfass,PRT,Direct,...,3,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-01-07,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322,1,2015-7-1
1,2,Resort Hotel,0,737,2015,7,27,1,0,0,2,0,0,Bed and Breakfass,PRT,Direct,...,4,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-01-07,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157,2,2015-7-1
2,3,Resort Hotel,0,7,2015,7,27,1,0,1,1,0,0,Bed and Breakfass,GBR,Direct,...,0,No Deposit,0.0,0,Transient,75.0,0,0,Check-Out,2015-02-07,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734,3,2015-7-1
3,4,Resort Hotel,0,13,2015,7,27,1,0,1,1,0,0,Bed and Breakfass,GBR,Corporate,...,0,No Deposit,304.0,0,Transient,75.0,0,0,Check-Out,2015-02-07,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677,4,2015-7-1
4,5,Resort Hotel,0,14,2015,7,27,1,0,2,2,0,0,Bed and Breakfass,GBR,Online TA,...,0,No Deposit,240.0,0,Transient,98.0,0,1,Check-Out,2015-03-07,Linda Hines,LHines@verizon.com,713-226-5883,************5498,5,2015-7-1
5,6,Resort Hotel,0,14,2015,7,27,1,0,2,2,0,0,Bed and Breakfass,GBR,Online TA,...,0,No Deposit,240.0,0,Transient,98.0,0,1,Check-Out,2015-03-07,Jasmine Fletcher,JFletcher43@xfinity.com,190-271-6743,************9263,6,2015-7-1
6,7,Resort Hotel,0,0,2015,7,27,1,0,2,2,0,0,Bed and Breakfass,PRT,Direct,...,0,No Deposit,0.0,0,Transient,107.0,0,0,Check-Out,2015-03-07,Dylan Rangel,Rangel.Dylan@comcast.net,420-332-5209,************6994,7,2015-7-1
7,8,Resort Hotel,0,9,2015,7,27,1,0,2,2,0,0,Full Board,PRT,Direct,...,0,No Deposit,303.0,0,Transient,103.0,0,1,Check-Out,2015-03-07,William Velez,Velez_William@mail.com,286-669-4333,************8729,8,2015-7-1
8,9,Resort Hotel,1,85,2015,7,27,1,0,3,2,0,0,Bed and Breakfass,PRT,Online TA,...,0,No Deposit,240.0,0,Transient,82.0,0,1,Canceled,2015-06-05,Steven Murphy,Steven.Murphy54@aol.com,341-726-5787,************3639,9,2015-7-1
9,10,Resort Hotel,1,75,2015,7,27,1,0,3,2,0,0,Half Board,PRT,Offline TA/TO,...,0,No Deposit,15.0,0,Transient,105.5,0,0,Canceled,2015-04-22,Michael Moore,MichaelMoore81@outlook.com,316-648-6176,************9190,10,2015-7-1


In [None]:
#df['reservation_status_date'] = df['reservation_status_date'].apply(lambda x: x.replace(month=7, day=1))

In [None]:
df['arrival_date'] = pd.to_datetime(df['arrival_date'], errors='coerce').dt.strftime('%Y-%m-%d')

In [None]:
#df['arrival_date'] = pd.to_datetime(df['arrival_date'], format='%Y-%B-%d')
#df['arrival_date'] = df['arrival_date'].dt.strftime('%Y-%m-%d')

In [None]:
df.tail(20)

Unnamed: 0,BookingID,hotel,booking_status,lead_time,arrival_year,arrival_month,arrival_date_week_number,arrival_day,no_of_weekend_nights,no_of_week_nights,no_of_adults,no_of_children,babies,type_of_meal_plan,country,market_segment_type,...,booking_changes,deposit_type,agent,days_in_waiting_list,customer_type,avg_price_per_room,required_car_parking_space,no_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID,arrival_date
980,981,Resort Hotel,0,98,2015,8,32,8,2,5,2,0,0,Half Board,PRT,Offline TA/TO,...,0,No Deposit,127.0,0,Transient,138.0,0,0,Check-Out,2015-08-15,Joel Ball,JBall98@hotmail.com,804-266-6202,************4096,981,2015-08-08
981,982,Resort Hotel,1,91,2015,8,32,8,2,5,2,0,0,Bed and Breakfass,PRT,Online TA,...,0,No Deposit,240.0,0,Transient,134.0,0,1,Canceled,2015-05-14,Kyle Lynch,Kyle_Lynch69@att.com,192-714-9819,************7438,982,2015-08-08
982,983,Resort Hotel,1,58,2015,8,32,8,2,6,1,0,0,Bed and Breakfass,PRT,Direct,...,1,No Deposit,250.0,0,Transient,173.0,0,0,Canceled,2015-07-20,Patricia Coleman,Patricia.Coleman@outlook.com,264-195-0004,************2862,983,2015-08-08
983,984,Resort Hotel,0,61,2015,8,32,8,4,9,2,0,0,Bed and Breakfass,FRA,Online TA,...,0,No Deposit,240.0,0,Transient,165.69,1,0,Check-Out,2015-08-21,Eileen Rivera,EileenRivera@hotmail.com,155-629-6379,************5560,984,2015-08-08
984,985,Resort Hotel,0,0,2015,8,33,9,1,0,2,0,0,Bed and Breakfass,PRT,Direct,...,0,No Deposit,0.0,0,Transient,195.0,0,0,Check-Out,2015-10-08,Jose Bradley,Jose.Bradley@zoho.com,529-511-6356,************3859,985,2015-08-09
985,986,Resort Hotel,0,5,2015,8,33,9,1,0,2,0,0,Bed and Breakfass,PRT,Online TA,...,0,No Deposit,306.0,0,Transient,127.31,0,0,Check-Out,2015-10-08,Mark Williams,Mark_Williams86@xfinity.com,114-150-8445,************1818,986,2015-08-09
986,987,Resort Hotel,0,2,2015,8,33,9,1,0,2,0,0,Bed and Breakfass,PRT,Online TA,...,0,No Deposit,240.0,0,Transient,134.0,0,1,Check-Out,2015-10-08,Nathaniel Lewis,NLewis84@verizon.com,669-909-5201,************4758,987,2015-08-09
987,988,Resort Hotel,1,9,2015,8,33,9,2,0,2,1,0,Bed and Breakfass,PRT,Direct,...,0,No Deposit,250.0,0,Transient,207.0,0,2,Canceled,2015-01-08,Toni Watkins,Watkins_Toni87@verizon.com,498-324-5863,************7077,988,2015-08-09
988,989,Resort Hotel,1,10,2015,8,33,9,2,0,2,0,0,Bed and Breakfass,PRT,Online TA,...,0,No Deposit,240.0,0,Transient,211.5,0,1,Canceled,2015-07-30,Marie Mitchell,MMitchell@protonmail.com,400-376-9213,************5286,989,2015-08-09
989,990,Resort Hotel,0,62,2015,8,33,9,2,1,2,0,0,Bed and Breakfass,FRA,Online TA,...,1,No Deposit,240.0,0,Transient,210.33,1,2,Check-Out,2015-12-08,Helen Pierce,Helen_P@protonmail.com,297-405-9734,************1749,990,2015-08-09


In [None]:
df['arrival_date']= pd.to_datetime(df['arrival_date'])

In [None]:
#Check data types before Processing.  2015-08-09
df.dtypes

Unnamed: 0,0
BookingID,int64
hotel,object
booking_status,int64
lead_time,int64
arrival_year,int64
arrival_month,int64
arrival_date_week_number,int64
arrival_day,int64
no_of_weekend_nights,int64
no_of_week_nights,int64


In [None]:
# Calculating total guests for each record
df['Total Guests'] = df['no_of_adults'] + df['no_of_children']


In [None]:
df.head(10)

Unnamed: 0,BookingID,hotel,booking_status,lead_time,arrival_year,arrival_month,arrival_date_week_number,arrival_day,no_of_weekend_nights,no_of_week_nights,no_of_adults,no_of_children,babies,type_of_meal_plan,country,market_segment_type,...,deposit_type,agent,days_in_waiting_list,customer_type,avg_price_per_room,required_car_parking_space,no_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID,arrival_date,Total Guests
0,1,Resort Hotel,0,342,2015,7,27,1,0,0,2,0,0,Bed and Breakfass,PRT,Direct,...,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-01-07,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322,1,2015-07-01,2
1,2,Resort Hotel,0,737,2015,7,27,1,0,0,2,0,0,Bed and Breakfass,PRT,Direct,...,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-01-07,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157,2,2015-07-01,2
2,3,Resort Hotel,0,7,2015,7,27,1,0,1,1,0,0,Bed and Breakfass,GBR,Direct,...,No Deposit,0.0,0,Transient,75.0,0,0,Check-Out,2015-02-07,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734,3,2015-07-01,1
3,4,Resort Hotel,0,13,2015,7,27,1,0,1,1,0,0,Bed and Breakfass,GBR,Corporate,...,No Deposit,304.0,0,Transient,75.0,0,0,Check-Out,2015-02-07,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677,4,2015-07-01,1
4,5,Resort Hotel,0,14,2015,7,27,1,0,2,2,0,0,Bed and Breakfass,GBR,Online TA,...,No Deposit,240.0,0,Transient,98.0,0,1,Check-Out,2015-03-07,Linda Hines,LHines@verizon.com,713-226-5883,************5498,5,2015-07-01,2
5,6,Resort Hotel,0,14,2015,7,27,1,0,2,2,0,0,Bed and Breakfass,GBR,Online TA,...,No Deposit,240.0,0,Transient,98.0,0,1,Check-Out,2015-03-07,Jasmine Fletcher,JFletcher43@xfinity.com,190-271-6743,************9263,6,2015-07-01,2
6,7,Resort Hotel,0,0,2015,7,27,1,0,2,2,0,0,Bed and Breakfass,PRT,Direct,...,No Deposit,0.0,0,Transient,107.0,0,0,Check-Out,2015-03-07,Dylan Rangel,Rangel.Dylan@comcast.net,420-332-5209,************6994,7,2015-07-01,2
7,8,Resort Hotel,0,9,2015,7,27,1,0,2,2,0,0,Full Board,PRT,Direct,...,No Deposit,303.0,0,Transient,103.0,0,1,Check-Out,2015-03-07,William Velez,Velez_William@mail.com,286-669-4333,************8729,8,2015-07-01,2
8,9,Resort Hotel,1,85,2015,7,27,1,0,3,2,0,0,Bed and Breakfass,PRT,Online TA,...,No Deposit,240.0,0,Transient,82.0,0,1,Canceled,2015-06-05,Steven Murphy,Steven.Murphy54@aol.com,341-726-5787,************3639,9,2015-07-01,2
9,10,Resort Hotel,1,75,2015,7,27,1,0,3,2,0,0,Half Board,PRT,Offline TA/TO,...,No Deposit,15.0,0,Transient,105.5,0,0,Canceled,2015-04-22,Michael Moore,MichaelMoore81@outlook.com,316-648-6176,************9190,10,2015-07-01,2


In [None]:
print(df.apply(lambda col: col.unique()))

BookingID                               [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
hotel                                                                      [Resort Hotel]
booking_status                                                                     [0, 1]
lead_time                               [342, 737, 7, 13, 14, 0, 9, 85, 75, 23, 35, 68...
arrival_year                                                                       [2015]
arrival_month                                                                      [7, 8]
arrival_date_week_number                                     [27, 28, 29, 30, 31, 32, 33]
arrival_day                             [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
no_of_weekend_nights                                                   [0, 1, 2, 4, 3, 6]
no_of_week_nights                              [0, 1, 2, 3, 4, 5, 11, 8, 10, 6, 7, 15, 9]
no_of_adults                                                                 [2, 1, 3, 4]
no_of_chil

In [None]:
#drop columns that has only 0 value
df = df.drop(['repeated_guest', 'no_of_previous_cancellations','no_of_previous_bookings_not_canceled','days_in_waiting_list'], axis=1)

In [None]:
print(df.apply(lambda col: col.unique()))

BookingID                     [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
hotel                                                            [Resort Hotel]
booking_status                                                           [0, 1]
lead_time                     [342, 737, 7, 13, 14, 0, 9, 85, 75, 23, 35, 68...
arrival_year                                                             [2015]
arrival_month                                                            [7, 8]
arrival_date_week_number                           [27, 28, 29, 30, 31, 32, 33]
arrival_day                   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
no_of_weekend_nights                                         [0, 1, 2, 4, 3, 6]
no_of_week_nights                    [0, 1, 2, 3, 4, 5, 11, 8, 10, 6, 7, 15, 9]
no_of_adults                                                       [2, 1, 3, 4]
no_of_children                                                    [0, 1, 2, 10]
babies                                  

In [None]:
print(df['arrival_date'].to_string(index=False))

2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-01
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-02
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03
2015-07-03

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 999 entries, 0 to 999
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   BookingID                   999 non-null    int64         
 1   hotel                       999 non-null    object        
 2   booking_status              999 non-null    int64         
 3   lead_time                   999 non-null    int64         
 4   arrival_year                999 non-null    int64         
 5   arrival_month               999 non-null    int64         
 6   arrival_date_week_number    999 non-null    int64         
 7   arrival_day                 999 non-null    int64         
 8   no_of_weekend_nights        999 non-null    int64         
 9   no_of_week_nights           999 non-null    int64         
 10  no_of_adults                999 non-null    int64         
 11  no_of_children              999 non-null    int64         
 12 

In [None]:
df.head(5)

Unnamed: 0,BookingID,hotel,booking_status,lead_time,arrival_year,arrival_month,arrival_date_week_number,arrival_day,no_of_weekend_nights,no_of_week_nights,no_of_adults,no_of_children,babies,type_of_meal_plan,country,market_segment_type,...,booking_changes,deposit_type,agent,customer_type,avg_price_per_room,required_car_parking_space,no_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,empID,arrival_date,Total Guests
0,1,Resort Hotel,0,342,2015,7,27,1,0,0,2,0,0,Bed and Breakfass,PRT,Direct,...,3,No Deposit,0.0,Transient,0.0,0,0,Check-Out,2015-01-07,Ernest Barnes,Ernest.Barnes31@outlook.com,669-792-1661,************4322,1,2015-07-01,2
1,2,Resort Hotel,0,737,2015,7,27,1,0,0,2,0,0,Bed and Breakfass,PRT,Direct,...,4,No Deposit,0.0,Transient,0.0,0,0,Check-Out,2015-01-07,Andrea Baker,Andrea_Baker94@aol.com,858-637-6955,************9157,2,2015-07-01,2
2,3,Resort Hotel,0,7,2015,7,27,1,0,1,1,0,0,Bed and Breakfass,GBR,Direct,...,0,No Deposit,0.0,Transient,75.0,0,0,Check-Out,2015-02-07,Rebecca Parker,Rebecca_Parker@comcast.net,652-885-2745,************3734,3,2015-07-01,1
3,4,Resort Hotel,0,13,2015,7,27,1,0,1,1,0,0,Bed and Breakfass,GBR,Corporate,...,0,No Deposit,304.0,Transient,75.0,0,0,Check-Out,2015-02-07,Laura Murray,Laura_M@gmail.com,364-656-8427,************5677,4,2015-07-01,1
4,5,Resort Hotel,0,14,2015,7,27,1,0,2,2,0,0,Bed and Breakfass,GBR,Online TA,...,0,No Deposit,240.0,Transient,98.0,0,1,Check-Out,2015-03-07,Linda Hines,LHines@verizon.com,713-226-5883,************5498,5,2015-07-01,2


In [None]:
cols = [col for col in df.columns if col != 'empID'] + ['empID']
df = df[cols]

In [None]:
len(df)

999

In [None]:
df.tail(5)

Unnamed: 0,BookingID,hotel,booking_status,lead_time,arrival_year,arrival_month,arrival_date_week_number,arrival_day,no_of_weekend_nights,no_of_week_nights,no_of_adults,no_of_children,babies,type_of_meal_plan,country,market_segment_type,...,booking_changes,deposit_type,agent,customer_type,avg_price_per_room,required_car_parking_space,no_of_special_requests,reservation_status,reservation_status_date,name,email,phone-number,credit_card,arrival_date,Total Guests,empID
995,996,Resort Hotel,1,122,2015,8,33,9,2,4,2,0,0,Half Board,PRT,Online TA,...,0,No Deposit,240.0,Transient,166.0,0,2,Canceled,2015-05-27,Christina Collins,Christina_Collins@yandex.com,629-364-3798,************4977,2015-08-09,2,996
996,997,Resort Hotel,1,41,2015,8,33,9,2,4,2,0,0,Bed and Breakfass,PRT,Online TA,...,0,No Deposit,240.0,Transient,202.0,0,2,Canceled,2015-07-17,Katherine Parsons,KatherineParsons@gmail.com,837-549-5190,************5190,2015-08-09,2,997
997,998,Resort Hotel,1,41,2015,8,33,9,2,4,2,0,0,Bed and Breakfass,PRT,Online TA,...,0,No Deposit,240.0,Transient,172.0,0,2,Canceled,2015-07-17,Christian Bailey,Christian.Bailey@gmail.com,880-193-6472,************5125,2015-08-09,2,998
998,999,Resort Hotel,0,81,2015,8,33,9,2,4,2,1,1,Full Board,ESP,Direct,...,0,No Deposit,250.0,Transient,277.0,1,1,Check-Out,2015-08-15,Mr. William Bell,Mr..B@mail.com,401-188-7246,************6274,2015-08-09,3,999
999,1000,Resort Hotel,0,59,2015,8,33,9,2,5,2,0,0,Bed and Breakfass,GBR,Offline TA/TO,...,0,No Deposit,243.0,Transient,104.0,0,0,Check-Out,2015-08-16,James Haynes,Haynes.James@hotmail.com,711-535-8794,************8511,2015-08-09,2,1000


In [None]:
len(df)

999

In [None]:
# Save the cleaned dataset to the new CSV file.
df.to_csv('cleaned_hotel_booking.csv', index=False)

# **sentiment** **analysis**

In [None]:
#read data
data=pd.read_csv('/content/hotel_Reviews.tsv',sep='\t')

In [None]:
data.insert(0, 'revID', range(1, len(data) + 1))

In [None]:
data.head(5)

Unnamed: 0,revID,Review,Liked
0,1,Wow... Loved this place.,1
1,2,Crust is not good.,0
2,3,Not tasty and the texture was just nasty.,0
3,4,Stopped by during the late May bank holiday of...,1
4,5,The selection on the menu was great and so wer...,1


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   revID   1000 non-null   int64 
 1   Review  1000 non-null   object
 2   Liked   1000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 23.6+ KB


In [None]:
data['Liked'].value_counts()

Unnamed: 0_level_0,count
Liked,Unnamed: 1_level_1
1,500
0,500


In [None]:
data['Review'].apply(len)

Unnamed: 0,Review
0,24
1,18
2,41
3,87
4,59
...,...
995,66
996,24
997,50
998,91


In [None]:
data['Review'].apply(len).max()

149

In [None]:
#pre-processing

#import string
#string.punctuation

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
print(stopwords.words('english')) #list of words that we can remove from our text

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
s=data['Review'][0]
s

'Wow... Loved this place.'

In [None]:
import re
re.sub('[^a-zA-Z]'," ",s)

'Wow    Loved this place '

In [None]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [None]:
corpus = []

for i in range(len(data)):
  s = re.sub('[^a-zA-Z]'," ",data['Review'][i])
  s = s.lower()
  s = s.split() #tokniziation
  s=[word for word in s if not word in stopwords.words('english')] #remove stop word
  s=' '.join(s) #convert to string
  s = ps.stem(s) #remove suffixes
  corpus.append(s)

corpus

['wow loved plac',
 'crust good',
 'tasty texture nasti',
 'stopped late may bank holiday rick steve recommendation lov',
 'selection menu great pric',
 'getting angry want damn pho',
 'honeslty taste fresh',
 'potatoes like rubber could tell made ahead time kept warm',
 'fries great',
 'great touch',
 'service prompt',
 'would go back',
 'cashier care ever say still ended wayyy overpr',
 'tried cape cod ravoli chicken cranberry mmmm',
 'disgusted pretty sure human hair',
 'shocked signs indicate cash',
 'highly recommend',
 'waitress little slow servic',
 'place worth time let alone vega',
 'like',
 'burrittos blah',
 'food amaz',
 'service also cut',
 'could care less interior beauti',
 'perform',
 'right red velvet cake ohhh stuff good',
 'never brought salad ask',
 'hole wall great mexican street tacos friendly staff',
 'took hour get food tables restaurant food luke warm sever running around like totally overwhelm',
 'worst salmon sashimi',
 'also combos like burger fries beer dec

In [None]:
type(corpus)

list

In [None]:
# prompt: convert corpus to dataframe

corpus_df = pd.DataFrame(corpus, columns=['Customer_Review'])
corpus_df


Unnamed: 0,Customer_Review
0,wow loved plac
1,crust good
2,tasty texture nasti
3,stopped late may bank holiday rick steve recom...
4,selection menu great pric
...,...
995,think food flavor texture lack
996,appetite instantly gon
997,overall impressed would go back
998,whole experience underwhelming think go ninja ...


In [None]:
# prompt: append corpus_df to data in one dataframe and remove review column

# Append corpus_df to data
data_ = pd.concat([data, corpus_df], axis=1)
# Remove 'Review' column
data_ = data_.drop('Review', axis=1)
data_.head()

Unnamed: 0,revID,Liked,Customer_Review
0,1,1,wow loved plac
1,2,0,crust good
2,3,0,tasty texture nasti
3,4,1,stopped late may bank holiday rick steve recom...
4,5,1,selection menu great pric


In [None]:
data_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   revID            1000 non-null   int64 
 1   Liked            1000 non-null   int64 
 2   Customer_Review  1000 non-null   object
dtypes: int64(2), object(1)
memory usage: 23.6+ KB


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [None]:
x=cv.fit_transform(corpus).toarray()

In [None]:
#use Naivebase algorithm

In [None]:
y=data_['Liked']
y

Unnamed: 0,Liked
0,1
1,0
2,0
3,1
4,1
...,...
995,0
996,0
997,0
998,0


In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [None]:
X_train.shape

(700, 1994)

# **Model Bulding & Training**

In [None]:
from sklearn.naive_bayes import MultinomialNB #for multi dim
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# **Prediction**

In [None]:
y_pred=classifier.predict(X_test)

In [None]:
y_pred

array([0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1])

In [None]:
y_test.values

array([1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1])

# **Evaluation **

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix:\n",cm)

Confusion Matrix:
 [[107  45]
 [ 30 118]]


In [None]:
# Accuracy, Precision and Recall
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
score1 = accuracy_score(y_test,y_pred)
score2 = precision_score(y_test,y_pred)
score3= recall_score(y_test,y_pred)
print("\n")
print("Accuracy is ",round(score1*100,2),"%")
print("Precision is ",round(score2,2))
print("Recall is ",round(score3,2))



Accuracy is  75.0 %
Precision is  0.72
Recall is  0.8


In [None]:

#df2_column_name = df2.columns[0]

# Concatenate df2 as the first column of df
# We need to reset the index of df2 to make sure the rows align properly
data__ = pd.concat([data_, df2], axis=1)


In [None]:
data__.head()

In [None]:
# Save the cleaned dataset to the new CSV file.
data__.to_csv('cleaned_review.csv', index=False)