<a href="https://colab.research.google.com/github/ihebbennaceur/british-air-ways/blob/main/forage_58_ml_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE

# Load the dataset
df = pd.read_csv('https://cdn.theforage.com/vinternships/companyassets/tMjbs76F526fF5v3G/L3MQ8f6cYSkfoukmz/1667814300249/customer_booking.csv', encoding='latin-1')
df.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [6]:
df.columns

Index(['num_passengers', 'sales_channel', 'trip_type', 'purchase_lead',
       'length_of_stay', 'flight_hour', 'flight_day', 'route',
       'booking_origin', 'wants_extra_baggage', 'wants_preferred_seat',
       'wants_in_flight_meals', 'flight_duration', 'booking_complete'],
      dtype='object')

In [7]:
#stats
print("Number of rows : {}".format(df.shape[0]))
print()
print("Basics statistics: ")
data_desc = df.describe(include='all')
display(data_desc)
print()

Number of rows : 50000

Basics statistics: 


Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
count,50000.0,50000,50000,50000.0,50000.0,50000.0,50000,50000,50000,50000.0,50000.0,50000.0,50000.0,50000.0
unique,,2,3,,,,7,799,104,,,,,
top,,Internet,RoundTrip,,,,Mon,AKLKUL,Australia,,,,,
freq,,44382,49497,,,,8102,2680,17872,,,,,
mean,1.59124,,,84.94048,23.04456,9.06634,,,,0.66878,0.29696,0.42714,7.277561,0.14956
std,1.020165,,,90.451378,33.88767,5.41266,,,,0.470657,0.456923,0.494668,1.496863,0.356643
min,1.0,,,0.0,0.0,0.0,,,,0.0,0.0,0.0,4.67,0.0
25%,1.0,,,21.0,5.0,5.0,,,,0.0,0.0,0.0,5.62,0.0
50%,1.0,,,51.0,17.0,9.0,,,,1.0,0.0,0.0,7.57,0.0
75%,2.0,,,115.0,28.0,13.0,,,,1.0,1.0,1.0,8.83,0.0





In [8]:
#missin values
print(df.isnull().any().value_counts())
print("Percentage of missing values: ")
display(100*df.isnull().sum()/df.shape[0])

False    14
dtype: int64
Percentage of missing values: 


num_passengers           0.0
sales_channel            0.0
trip_type                0.0
purchase_lead            0.0
length_of_stay           0.0
flight_hour              0.0
flight_day               0.0
route                    0.0
booking_origin           0.0
wants_extra_baggage      0.0
wants_preferred_seat     0.0
wants_in_flight_meals    0.0
flight_duration          0.0
booking_complete         0.0
dtype: float64

In [9]:
# Correlation matrix
corr_matrix = df.corr().round(2)
import plotly.figure_factory as ff
fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())
fig.show()

  corr_matrix = df.corr().round(2)


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   sales_channel          50000 non-null  object 
 2   trip_type              50000 non-null  object 
 3   purchase_lead          50000 non-null  int64  
 4   length_of_stay         50000 non-null  int64  
 5   flight_hour            50000 non-null  int64  
 6   flight_day             50000 non-null  object 
 7   route                  50000 non-null  object 
 8   booking_origin         50000 non-null  object 
 9   wants_extra_baggage    50000 non-null  int64  
 10  wants_preferred_seat   50000 non-null  int64  
 11  wants_in_flight_meals  50000 non-null  int64  
 12  flight_duration        50000 non-null  float64
 13  booking_complete       50000 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.3+ 

In [11]:
# Replace 0/1 with no/yes for specific columns
binary_to_categorical_cols = ['wants_extra_baggage', 'wants_preferred_seat', 'wants_in_flight_meals']
df[binary_to_categorical_cols] = df[binary_to_categorical_cols].replace({0: 'no', 1: 'yes'})

#feature ing
df['total_flight_duration'] = df['flight_duration'] * df['num_passengers']


In [12]:
# Define features and target variable
numeric_features = ['num_passengers', 'purchase_lead', 'length_of_stay', 'flight_hour', 'flight_duration']
categorical_features = ['sales_channel', 'trip_type', 'flight_day', 'route', 'booking_origin', 'wants_extra_baggage', 'wants_preferred_seat', 'wants_in_flight_meals']
target_variable = 'booking_complete'

In [13]:
# Separate target variable Y from features X
X = df.drop(target_variable, axis=1)
Y = df.loc[:, target_variable]


In [14]:
# Split dataset into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0, stratify=Y)

In [15]:

# Define transformers for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])


In [16]:
# Apply transformers to respective features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [17]:
# Preprocessings on train set
X_train = preprocessor.fit_transform(X_train)

# Preprocessings on test set
X_test = preprocessor.transform(X_test)


Found unknown categories in columns [3, 4] during transform. These unknown categories will be encoded as all zeros



In [18]:

# Perform grid search for XGBoost
classifier_xgboost = XGBClassifier()

# Grid of values to be tested
params_xgboost = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300]
}
gridsearch_xgboost = GridSearchCV(classifier_xgboost, param_grid=params_xgboost, cv=5, n_jobs=-1)
gridsearch_xgboost.fit(X_train, Y_train)

# Get the best model and hyperparameters for XGBoost
best_model_xgboost = gridsearch_xgboost.best_estimator_
best_params_xgboost = gridsearch_xgboost.best_params_

# Predict on the test set using XGBoost
Y_test_pred_xgboost = best_model_xgboost.predict(X_test)



In [19]:
# Evaluate the XGBoost model using F1-score and accuracy
f1_test_xgboost = f1_score(Y_test, Y_test_pred_xgboost)
test_accuracy_xgboost = accuracy_score(Y_test, Y_test_pred_xgboost)


In [20]:
# Generate a classification report for XGBoost
classification_rep_xgboost = classification_report(Y_test, Y_test_pred_xgboost)

print("Best Hyperparameters (XGBoost):", best_params_xgboost)
print("F1-score on Test Set (XGBoost):", f1_test_xgboost)
print("Test Accuracy (XGBoost):", test_accuracy_xgboost)
print("\nClassification Report (XGBoost):\n", classification_rep_xgboost)

Best Hyperparameters (XGBoost): {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100}
F1-score on Test Set (XGBoost): 0.0950446791226645
Test Accuracy (XGBoost): 0.8514666666666667

Classification Report (XGBoost):
               precision    recall  f1-score   support

           0       0.86      0.99      0.92     12757
           1       0.53      0.05      0.10      2243

    accuracy                           0.85     15000
   macro avg       0.70      0.52      0.51     15000
weighted avg       0.81      0.85      0.80     15000



In [None]:
# # Make predictions on a new dataset (X_new)
# Y_new_pred_xgboost = best_model_xgboost.predict(X_new)