<a href="https://colab.research.google.com/github/harshinshah29/133AProjectGroup17/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [3]:
# FOR LOWERCASE P
try:
  df = pd.read_csv('/content/drive/MyDrive/ECE 133A project/Datasets/hotel_booking.csv')
  df.info()
except Exception as e:
  df = pd.read_csv('/content/drive/MyDrive/ECE 133A Project/Datasets/hotel_booking.csv')
  df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78290 entries, 0 to 78289
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     78290 non-null  int64  
 1   hotel                           78290 non-null  object 
 2   lead_time                       78290 non-null  int64  
 3   arrival_date_month              78290 non-null  object 
 4   stays_in_weekend_nights         78290 non-null  int64  
 5   stays_in_week_nights            78290 non-null  int64  
 6   adults                          78290 non-null  int64  
 7   children                        78287 non-null  float64
 8   babies                          78290 non-null  int64  
 9   meal                            78290 non-null  object 
 10  country                         78290 non-null  object 
 11  previous_cancellations          78290 non-null  int64  
 12  previous_bookings_not_canceled  

In [4]:
#dropping unnecessary fields
df = df.drop(columns=['name', 'email', 'phone-number'])

In [5]:
#one hot encoding
categorical_cols = ['hotel', 'arrival_date_month', 'meal', 'country', 'reserved_room_type', 'deposit_type', 'customer_type']
ohe = OneHotEncoder(drop='first', sparse_output=False)
df_encoded = pd.DataFrame(ohe.fit_transform(df[categorical_cols]))
df_encoded.columns = ohe.get_feature_names_out(categorical_cols)
df.drop(columns=categorical_cols, inplace=True)
df = pd.concat([df, df_encoded], axis=1)

In [6]:
#handling missing values
df['children'].fillna(0, inplace=True)

In [7]:
#augment features
df['total_stay'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

df['total_guests'] = df['adults'] + df['children'] + df['babies']

df['total_previous_bookings'] = df['previous_cancellations'] + df['previous_bookings_not_canceled']

df['total_cost'] = df['adr'] / df['total_stay']

df['adults_ratio'] = df['adults'] / df['total_guests']

df['cars_to_guests'] = df['required_car_parking_spaces'] / df['total_guests']

df['guests_to_cost'] = df['total_guests'] + df['total_cost']

In [8]:
#drop nonsensical entries
df = df[df['total_stay'] != 0]
df = df[df['total_guests'] != 0]

In [9]:
# Check for NaN or inf values
nan_mask = df.isna()

inf_mask = df.applymap(np.isinf)

nan_or_inf_mask = nan_mask | inf_mask

df[nan_or_inf_mask.any(axis=1)]

Unnamed: 0,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,total_stay,total_guests,total_previous_bookings,total_cost,adults_ratio,cars_to_guests,guests_to_cost


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77640 entries, 0 to 78289
Data columns (total 55 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   is_canceled                     77640 non-null  int64  
 1   lead_time                       77640 non-null  int64  
 2   stays_in_weekend_nights         77640 non-null  int64  
 3   stays_in_week_nights            77640 non-null  int64  
 4   adults                          77640 non-null  int64  
 5   children                        77640 non-null  float64
 6   babies                          77640 non-null  int64  
 7   previous_cancellations          77640 non-null  int64  
 8   previous_bookings_not_canceled  77640 non-null  int64  
 9   booking_changes                 77640 non-null  int64  
 10  days_in_waiting_list            77640 non-null  int64  
 11  adr                             77640 non-null  float64
 12  required_car_parking_spaces     

## Standardizing Features

In [11]:
numerical_cols = ['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'babies', 'children', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'adr','required_car_parking_spaces','total_of_special_requests', 'total_stay', 'total_guests', 'total_previous_bookings','total_cost','adults_ratio', 'cars_to_guests', 'guests_to_cost' ]
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
df.describe()

Unnamed: 0,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,total_stay,total_guests,total_previous_bookings,total_cost,adults_ratio,cars_to_guests,guests_to_cost
count,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,...,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0,77640.0
mean,0.408836,-1.8852630000000002e-17,9.142609e-17,-6.225486e-17,7.358016e-17,-4.8046750000000005e-17,-5.033469e-17,-4.484363e-18,-1.972205e-17,-3.349545e-17,...,0.005165,0.730848,0.223712,-1.830352e-17,-2.037182e-16,-9.517832e-18,7.074311000000001e-17,-1.30797e-15,1.038725e-16,4.420301e-17
std,0.491622,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,...,0.071682,0.443522,0.416734,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006,1.000006
min,0.0,-0.9658937,-0.9013631,-1.325806,-3.007659,-0.2419117,-0.08281927,-0.1232062,-0.10038,-0.3432837,...,0.0,0.0,0.0,-0.9404342,-1.273818,-0.1362777,-1.093208,-8.817322,-0.2429311,-1.099351
25%,0.0,-0.816469,-0.9013631,-0.7855467,0.256819,-0.2419117,-0.08281927,-0.1232062,-0.10038,-0.3432837,...,0.0,0.0,0.0,-0.538843,0.07996452,-0.1362777,-0.5515229,0.2605143,-0.2429311,-0.5502695
50%,0.0,-0.3330363,0.11397,-0.2452876,0.256819,-0.2419117,-0.08281927,-0.1232062,-0.10038,-0.3432837,...,0.0,1.0,0.0,-0.1372517,0.07996452,-0.1362777,-0.2650676,0.2605143,-0.2429311,-0.2632497
75%,1.0,0.5283528,1.129303,0.2949714,0.256819,-0.2419117,-0.08281927,-0.1232062,-0.10038,-0.3432837,...,0.0,1.0,0.0,0.2643395,0.07996452,-0.1362777,0.2749804,0.2605143,-0.2429311,0.2758485
max,1.0,5.265993,15.34397,20.28456,86.76548,26.76704,95.03683,29.13323,41.21099,28.91497,...,1.0,1.0,1.0,21.14708,71.83042,37.23407,133.8722,0.2605143,13.03335,133.7001


In [12]:
import numpy as np
# Check for NaN or inf values
nan_mask = df.isna()


inf_mask = df.applymap(np.isinf)

nan_or_inf_mask = nan_mask | inf_mask

df[nan_or_inf_mask.any(axis=1)]

Unnamed: 0,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,total_stay,total_guests,total_previous_bookings,total_cost,adults_ratio,cars_to_guests,guests_to_cost


## SVD

In [13]:
import numpy as np
U, s, Vt = np.linalg.svd(df, full_matrices=False)

In [14]:
first_component = np.abs(Vt[0, :])

top_features_indices = np.argsort(first_component)[-8:]

top_feature_names = df.columns[top_features_indices]

top_feature_names

Index(['children', 'lead_time', 'adr', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'total_stay', 'guests_to_cost', 'total_cost'],
      dtype='object')

# K Means Algorithm

In [15]:
# Finding Best K Value

# sil = []
# kmax = 15

# dissimilarity not be defined for a single cluster so minimum number of clusters is 2
# for k in range(2, kmax+1):
#   kmeans = KMeans(n_clusters = k).fit(df)
#   labels = kmeans.labels_
#   sil.append(silhouette_score(df, labels, metric = 'euclidean'))


In [16]:
# k_opt = sil.index(max(sil)) + 2

# k_values = np.arange(2,kmax + 1,1)

# plt.plot(k_values,sil)
# plt.ylabel("Silhouette Score")
# plt.xlabel("K value")
# plt.show()

# print("Best K Value: ", k_opt)
# print("Best Silhouette Score: ", max(sil))

In [17]:
# from yellowbrick.cluster import KElbowVisualizer

# # Initialize the KElbowVisualizer with the KMeans estimator and a range of K values
# Elbow_M = KElbowVisualizer(KMeans(), k=kmax)
# # Fit the visualizer
# Elbow_M.fit(df)
# # Display the Elbow Method plot
# Elbow_M.show()

In [18]:
# Fitting Dataset to K Means

kmeans = KMeans(n_clusters=6, random_state=0, n_init="auto").fit(df)

kmeans.labels_

array([3, 3, 3, ..., 2, 0, 1], dtype=int32)

# Correlation Matrix

In [19]:
# calculate de-meaned vector
def de_mean_vector(vec):
  n = len(vec)
  avg = sum(vec)/n
  avg_vec = [avg] * n
  return vec - avg_vec

In [20]:
# find correlation coefficient
def correlation_coefficient(A, B):
  demeaned_a, demeaned_b = de_mean_vector(A), de_mean_vector(B)
  return (np.dot(demeaned_a, demeaned_b) / (np.linalg.norm(demeaned_a) * np.linalg.norm(demeaned_b)))

In [21]:
df.head()

Unnamed: 0,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,previous_cancellations,previous_bookings_not_canceled,booking_changes,...,customer_type_Group,customer_type_Transient,customer_type_Transient-Party,total_stay,total_guests,total_previous_bookings,total_cost,adults_ratio,cars_to_guests,guests_to_cost
0,0,-0.930735,0.11397,-0.245288,0.256819,-0.241912,-0.082819,-0.123206,-0.10038,-0.343284,...,0.0,1.0,0.0,-0.137252,0.079965,-0.136278,-0.452497,0.260514,-0.242931,-0.450437
1,1,0.545932,-0.901363,-0.245288,-1.37542,-0.241912,-0.082819,-0.123206,-0.10038,-0.343284,...,0.0,0.0,1.0,-0.538843,-1.273818,-0.136278,0.109792,0.260514,-0.242931,0.086166
2,0,-0.930735,1.129303,-0.785547,-1.37542,-0.241912,-0.082819,-0.123206,-0.10038,-0.343284,...,0.0,1.0,0.0,-0.137252,-1.273818,-0.136278,-0.5358,0.260514,-0.242931,-0.558589
3,1,-0.368195,-0.901363,-0.245288,0.256819,-0.241912,-0.082819,-0.123206,-0.10038,-0.343284,...,0.0,0.0,1.0,-0.538843,0.079965,-0.136278,-1.077264,0.260514,-0.242931,-1.074393
4,1,0.343769,1.129303,0.835231,1.889058,-0.241912,-0.082819,-0.123206,-0.10038,-0.343284,...,0.0,1.0,0.0,1.067522,1.433747,-0.136278,-0.378775,0.260514,-0.242931,-0.351851


In [22]:
# n_cols = len(df.columns)
# correlation_matrix = np.zeros(shape=(n_cols, n_cols))

# for i in range(n_cols):
#   for j in range(i+1, n_cols):
#     A, B = df[df.columns[i]], df[df.columns[j]]
#     rho = correlation_coefficient(A, B)
#     correlation_matrix[i][j] = rho
#     correlation_matrix[j][i] = rho

# # correlation coefficient of a column with itself will equal 1
# for i in range(n_cols):
#   correlation_matrix[i][i] = 1

#   print("Entries with correlation coefficient >= 0.7:")
# for i in range(n_cols):
#     for j in range(i+1, n_cols):
#         if correlation_matrix[i][j] >= 0.6:
#             print(f"{df.columns[i]} and {df.columns[j]}: {correlation_matrix[i][j]}")

# correlation_matrix

In [23]:

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score
from joblib import dump


In [24]:
def model_and_score(model, df, n_splits = 5):
  # X is features without target
    X = df.drop(columns=['is_canceled'])

  #  y is target
    y = df['is_canceled']

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    rms_errors = []
    accuracies = []

    best_fold_model = None
    best_fold_indices = None
    highest_accuracy = 0

    for i, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        rms_error = np.sqrt(mean_squared_error(y_test, y_pred))
        rms_errors.append(rms_error)

        accuracy = accuracy_score(y_test, y_pred)
        accuracies.append(accuracy)

        if accuracy > highest_accuracy:
            highest_accuracy = accuracy
            best_fold_model = model
            best_fold_indices = (test_index, y_pred)

        dump(model, f'model_fold_{i+1}.joblib')

        print(f"Fold {i+1}: RMS Error = {rms_error}, Accuracy = {accuracy}")

    y_test_best_fold, y_pred_best_fold = y.iloc[best_fold_indices[0]], best_fold_indices[1]
    best_fold_confusion_matrix = confusion_matrix(y_test_best_fold, y_pred_best_fold)

    print(f"Best Fold Confusion Matrix:\n{best_fold_confusion_matrix}")

    return rms_errors, best_fold_confusion_matrix


## Linear Model

In [25]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
rms_errors, best_fold_confusion_matrix = model_and_score(model,df)

Fold 1: RMS Error = 0.43178397770217125, Accuracy = 0.8135625965996909
Fold 2: RMS Error = 0.4384442779341161, Accuracy = 0.8077666151468316
Fold 3: RMS Error = 0.43527484847459325, Accuracy = 0.8105358062854199
Fold 4: RMS Error = 0.4394712527208779, Accuracy = 0.8068650180319423
Fold 5: RMS Error = 0.4328267557755624, Accuracy = 0.8126609994848016
Best Fold Confusion Matrix:
[[8482  741]
 [2154 4151]]


## Feature Engineering

In [26]:
kmeans = KMeans(n_clusters=6, random_state=0, n_init="auto").fit(df)

labels = kmeans.labels_

In [27]:
df1 = df

df1['k_group'] = labels

df_array = []

for i in range(6):
  df_array.append(df1[df1['k_group'] == i])
  df_array[i].drop(columns=['k_group'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_array[i].drop(columns=['k_group'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_array[i].drop(columns=['k_group'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_array[i].drop(columns=['k_group'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_array[i].dro

In [28]:
model_strat = LogisticRegression(max_iter=1000)


for i in range(6):
  print('For Group ' + str(i) + ':')
  rms_errors, best_fold_confusion_matrix = model_and_score(model_strat,df_array[i])

For Group 0:
Fold 1: RMS Error = 0.4697923432911375, Accuracy = 0.779295154185022
Fold 2: RMS Error = 0.46754518663176264, Accuracy = 0.7814014984574702
Fold 3: RMS Error = 0.4791832334052704, Accuracy = 0.7703834288232702
Fold 4: RMS Error = 0.47176795844020797, Accuracy = 0.7774349933891582
Fold 5: RMS Error = 0.4754900513089478, Accuracy = 0.7739092111062142
Best Fold Confusion Matrix:
[[1408   71]
 [ 425  365]]
For Group 1:
Fold 1: RMS Error = 0.4564844883206178, Accuracy = 0.7916219119226638
Fold 2: RMS Error = 0.4457701912114778, Accuracy = 0.8012889366272825
Fold 3: RMS Error = 0.42919753763947605, Accuracy = 0.8157894736842105
Fold 4: RMS Error = 0.4506845947848193, Accuracy = 0.7968833960236432
Fold 5: RMS Error = 0.4476939539579823, Accuracy = 0.799570123589468
Best Fold Confusion Matrix:
[[997 147]
 [196 522]]
For Group 2:
Fold 1: RMS Error = 0.22378307432268488, Accuracy = 0.9499211356466877
Fold 2: RMS Error = 0.22986760693765204, Accuracy = 0.9471608832807571
Fold 3: RMS 

## Regularization

In [30]:
C_values = np.logspace(-1, 2, num=4)


for c in C_values:
  print ('Model ran on L2 Regularization with C value: ', c)
  model_with_reg = LogisticRegression(penalty='l2', C=c, max_iter=1000, random_state=42)

  rms_errors, best_fold_confusion_matrix = model_and_score(model_with_reg, df)

Model ran on L2 Regularization with C value:  0.1
Fold 1: RMS Error = 0.4311869722625486, Accuracy = 0.8140777949510561
Fold 2: RMS Error = 0.4388113305542278, Accuracy = 0.8074446161772282
Fold 3: RMS Error = 0.4353488182004182, Accuracy = 0.8104714064914992
Fold 4: RMS Error = 0.43925138882682624, Accuracy = 0.8070582174137043
Fold 5: RMS Error = 0.4328267557755624, Accuracy = 0.8126609994848016
Best Fold Confusion Matrix:
[[8488  735]
 [2152 4153]]
Model ran on L2 Regularization with C value:  1.0
Fold 1: RMS Error = 0.43163480375482677, Accuracy = 0.8136913961875322
Fold 2: RMS Error = 0.43925138882682624, Accuracy = 0.8070582174137043
Fold 3: RMS Error = 0.4349048111310989, Accuracy = 0.8108578052550232
Fold 4: RMS Error = 0.4396177675616615, Accuracy = 0.8067362184441009
Fold 5: RMS Error = 0.4328267557755624, Accuracy = 0.8126609994848016
Best Fold Confusion Matrix:
[[8483  740]
 [2153 4152]]
Model ran on L2 Regularization with C value:  10.0
Fold 1: RMS Error = 0.43141094611832

In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.model_selection import cross_val_predict
from joblib import dump

# Define the Random Forest classifier
model_nonlinear = RandomForestClassifier(n_estimators=100, random_state=42)


# Train models using cross-validation and save parameters
rms_errors_nonlinear, confusion_matrices_nonlinear = model_and_score(model_nonlinear, df)

# Overall performance metrics
print("Overall Performance Metrics:")
print("Mean RMS Error:", np.mean(rms_errors_nonlinear))
print("Mean Confusion Matrix:")
print(np.mean(confusion_matrices_nonlinear, axis=0))



Fold 1: RMS Error = 0.34404479537724747, Accuracy = 0.881633178773828
Fold 2: RMS Error = 0.3480459917378168, Accuracy = 0.8788639876352395
Fold 3: RMS Error = 0.34739777461480065, Accuracy = 0.8793147861926842
Fold 4: RMS Error = 0.3494309792699702, Accuracy = 0.8778979907264297
Fold 5: RMS Error = 0.3483234297984022, Accuracy = 0.8786707882534776
Best Fold Confusion Matrix:
[[8589  634]
 [1204 5101]]
Overall Performance Metrics:
Mean RMS Error: 0.34744859415964746
Mean Confusion Matrix:
[4896.5 2867.5]
