In [91]:
# installing required libraries
# 
# pip install pandas numpy scipy matplotlib plotly seaborn


In [92]:
# importing relevant librarise for the project
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly as pl
import seaborn as sns
import re

In [93]:
# importing data
df = pd.read_csv("hotel_bookings.csv", na_values=pd.NA)
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,7/1/2015
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,7/1/2015
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,7/2/2015
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,7/2/2015
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,7/3/2015


In [94]:
df.isna().sum().sort_values(ascending=False)

company                           112593
agent                              16340
country                              488
children                               4
arrival_date_month                     0
arrival_date_week_number               0
hotel                                  0
is_canceled                            0
stays_in_weekend_nights                0
arrival_date_day_of_month              0
adults                                 0
stays_in_week_nights                   0
babies                                 0
meal                                   0
lead_time                              0
arrival_date_year                      0
distribution_channel                   0
market_segment                         0
previous_bookings_not_canceled         0
is_repeated_guest                      0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
previous_cancellations                 0
deposit_type    

In [95]:
df.columns

Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date'],
      dtype='object')

In [96]:
object_data = df.select_dtypes(include="object").columns.tolist()
numerical_data = df.select_dtypes(exclude="object").columns.tolist()
print(object_data, numerical_data)

['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status', 'reservation_status_date'] ['is_canceled', 'lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'children', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'agent', 'company', 'days_in_waiting_list', 'adr', 'required_car_parking_spaces', 'total_of_special_requests']


In [97]:
def clean_obj(data, col):
    # Define emoji pattern
    emoji_pattern = re.compile("[" 
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002700-\U000027BF"  # dingbats
        u"\U000024C2-\U0001F251"  # enclosed characters
        "]+", flags=re.UNICODE)

    # Normalize text and handle common missing entries
    data[col] = data[col].astype(str).str.strip().str.lower().replace(["n/a", "na", "", "null", "none"], pd.NA)

    # Remove emojis
    data[col] = data[col].apply(lambda x: emoji_pattern.sub(r'', x) if pd.notna(x) else x)

    # Remove hashtags, mentions, special characters
    data[col] = data[col].replace(r'#\w+', '', regex=True)
    data[col] = data[col].replace(r'@[\w]+', '', regex=True)
    data[col] = data[col].replace(r'[^\w\s]', '', regex=True)

    # Re-check for empty strings after cleaning
    data[col] = data[col].astype(str).str.strip().replace(["", "n/a", "na", "null", "none"], pd.NA)

    return data

def clean_num(data, col):
    data[col] = pd.to_numeric(data[col], errors='coerce')  # Convert to numeric
    data[col] = data[col].fillna(data[col].mean())         # Fill missing values with mean
    return data

def clean_data(data, cols):
    for col in cols:
        if col not in data.columns:
            print(f"⚠️ Column '{col}' is not present in DataFrame. Skipping...")
            continue

        if pd.api.types.is_object_dtype(data[col]):
            data = clean_obj(data, col)

        elif pd.api.types.is_numeric_dtype(data[col]):
            data = clean_num(data, col)

        else:
            print(f"ℹ️ Column '{col}' is neither object nor numeric. Skipping...")

    return data


In [98]:
# handling date time serise columns converting them to date time serise for easy analysis
# print(pd.to_datetime(df['arrival_date_year']))

df = clean_data(df, df.columns)



In [99]:
df[object_data]

Unnamed: 0,hotel,arrival_date_month,meal,country,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type,reservation_status,reservation_status_date
0,resort hotel,july,bb,prt,direct,direct,c,c,no deposit,transient,checkout,712015
1,resort hotel,july,bb,prt,direct,direct,c,c,no deposit,transient,checkout,712015
2,resort hotel,july,bb,gbr,direct,direct,a,c,no deposit,transient,checkout,722015
3,resort hotel,july,bb,gbr,corporate,corporate,a,a,no deposit,transient,checkout,722015
4,resort hotel,july,bb,gbr,online ta,tato,a,a,no deposit,transient,checkout,732015
...,...,...,...,...,...,...,...,...,...,...,...,...
119385,city hotel,august,bb,bel,offline tato,tato,a,a,no deposit,transient,checkout,962017
119386,city hotel,august,bb,fra,online ta,tato,e,e,no deposit,transient,checkout,972017
119387,city hotel,august,bb,deu,online ta,tato,d,d,no deposit,transient,checkout,972017
119388,city hotel,august,bb,gbr,online ta,tato,a,a,no deposit,transient,checkout,972017


In [100]:
df.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [101]:
# check the size of the data set
df.shape

(119390, 32)

In [102]:
df.describe()

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,agent,company,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests
count,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0,119390.0
mean,0.370416,104.011416,2016.156554,27.165173,15.798241,0.927599,2.500302,1.856403,0.10389,0.007949,0.031912,0.087118,0.137097,0.221124,86.693382,189.266735,2.321149,101.831122,0.062518,0.571363
std,0.482918,106.863097,0.707476,13.605138,8.780829,0.998613,1.908286,0.579261,0.398555,0.097436,0.175767,0.844336,1.497437,0.652306,102.915247,31.411012,17.594721,50.53579,0.245291,0.792798
min,0.0,0.0,2015.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,-6.38,0.0,0.0
25%,0.0,18.0,2016.0,16.0,8.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,189.266735,0.0,69.29,0.0,0.0
50%,0.0,69.0,2016.0,28.0,16.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0,189.266735,0.0,94.575,0.0,0.0
75%,1.0,160.0,2017.0,38.0,23.0,2.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,152.0,189.266735,0.0,126.0,0.0,1.0
max,1.0,737.0,2017.0,53.0,31.0,19.0,50.0,55.0,10.0,10.0,1.0,26.0,72.0,21.0,535.0,543.0,391.0,5400.0,8.0,5.0


In [103]:
# check for missing values
df.isna().sum().sort_values(ascending=False)

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
agent                             0
company                           0
days_in_waiting_list              0
customer_type                     0
adr                         