In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import  f1_score, classification_report, confusion_matrix

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ML/8.project1/canceli/data/train_data.csv')
data

Unnamed: 0,Created,CancelTime,DepartureTime,BillID,TicketID,ReserveStatus,UserID,Male,Price,CouponDiscount,...,Domestic,VehicleType,VehicleClass,TripReason,Vehicle,Cancel,HashPassportNumber_p,HashEmail,BuyerMobile,NationalCode
0,2022-07-26 13:33:20.457,,2022-07-26 16:30:00,38428546,7445571.0,3,,True,1180000.0,0.0,...,1,VIPمانیتوردار-شارژراختصاصی تخت شو مارال (جدید)...,True,Work,Bus,0,,,302222356019,330024570
1,2022-10-27 23:07:01.837,2022-10-27 23:26:39.070,2022-10-29 09:45:00,39768762,7762719.0,5,,False,1050000.0,0.0,...,1,classicus 2+2,True,Int,Bus,1,,,900764168521,995520696
2,2022-09-12 11:01:13.607,,2022-10-03 18:35:00,39128001,2327596.0,5,800398.0,False,4674000.0,0.0,...,1,فوکر 100,False,Int,Plane,0,,1c44d7a76b52341fa12dcfa993138576befcc9ebf01d14...,749804783291,979382950
3,2022-08-08 17:43:35.840,,2022-08-08 22:30:00,38606546,7495440.0,3,,True,1200000.0,0.0,...,1,VIPدرسا+مانیتوردار+شارژراختصاصی+پذیرایی,True,Work,Bus,0,,,781396205677,911237229
4,2022-11-01 15:12:56.823,,2022-11-03 11:30:00,39822185,2356902.0,5,,True,6222000.0,0.0,...,1,,False,Work,Plane,0,,bb38b345aec02255e31d178492907175c5984f2a1f5b59...,524576220177,727496008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101012,2022-10-27 21:41:35.803,,2022-11-05 20:15:00,39767774,3082282.0,2,,False,7200000.0,0.0,...,1,5ستاره بيزينس سلامت فدك,,Int,Train,0,,,395081863564,789320493
101013,2022-09-24 15:51:11.993,,2022-10-21 09:15:00,39319207,3026516.0,2,151423.0,False,4292000.0,0.0,...,1,4ستاره4تخته سپهر,,Int,Train,0,,3f28ed65a16d629747e4d27fab100b2b082fcbdf7ec831...,130026405332,866503410
101014,2022-08-12 13:46:20.480,,2022-08-12 23:59:00,38653461,7508988.0,3,,True,1320000.0,0.0,...,1,VIP 2+1 / مانیتوردار / سیستم تهویه مطبوع / تخ...,True,Int,Bus,0,,,784949357488,881677039
101015,2022-06-20 15:15:37.780,,2022-06-25 10:10:00,37880627,1057205.0,5,641744.0,True,13000000.0,0.0,...,1,,False,Work,Plane,0,,750690ca99468f159eff6ad928cec4339b089af8c2e6dc...,766602541733,403505466


In [26]:
# splitting the dataset
train_data, test_data = train_test_split(data, test_size = 0.2)

In [27]:
# checking for missing data
print(train_data.isna().sum())
print(test_data.isna().sum())

Created                     0
CancelTime              68565
DepartureTime               0
BillID                      0
TicketID                    0
ReserveStatus               0
UserID                  46852
Male                        0
Price                       0
CouponDiscount              0
From                        0
To                          0
Domestic                    0
VehicleType              6109
VehicleClass            30634
TripReason                  0
Vehicle                     0
Cancel                      0
HashPassportNumber_p    80122
HashEmail               46400
BuyerMobile                 0
NationalCode                0
dtype: int64
Created                     0
CancelTime              17140
DepartureTime               0
BillID                      0
TicketID                    0
ReserveStatus               0
UserID                  11663
Male                        0
Price                       0
CouponDiscount              0
From                       

In [28]:
# convert date columns to datetime format and calculate the days until departure
train_data['Created'] = pd.to_datetime(train_data['Created'])
train_data['DepartureTime'] = pd.to_datetime(train_data['DepartureTime'])
train_data['DaysUntilDeparture'] = (train_data['DepartureTime'] - train_data['Created']).dt.days

test_data['Created'] = pd.to_datetime(test_data['Created'])
test_data['DepartureTime'] = pd.to_datetime(test_data['DepartureTime'])
test_data['DaysUntilDeparture'] = (test_data['DepartureTime'] - test_data['Created']).dt.days

In [29]:
# group by 'BillID', calculate unique 'Male' values, and create a 'HasFamily' feature based on the count,
# then merge it back to both train and test datasets
grouped = train_data.groupby('BillID')['Male'].nunique()
grouped = grouped.reset_index()
grouped['HasFamily'] = grouped['Male'].apply(lambda x: 1 if x > 1 else 0)

train_data = train_data.merge(grouped[['BillID', 'HasFamily']], on = 'BillID', how = 'left')

grouped = test_data.groupby('BillID')['Male'].nunique()
grouped = grouped.reset_index()
grouped['HasFamily'] = grouped['Male'].apply(lambda x: 1 if x > 1 else 0)

test_data = test_data.merge(grouped[['BillID', 'HasFamily']], on = 'BillID', how = 'left')

In [30]:
# group by 'BillID', calculate the unique 'TicketID' values, count the tickets,
# and merge the 'TicketCount' feature back to both train and test datasets
grouped = train_data.groupby('BillID')['TicketID'].unique()
grouped = grouped.reset_index()
grouped['TicketCount'] = grouped['TicketID'].apply(lambda x: len(x))

train_data = train_data.merge(grouped[['BillID', 'TicketCount']], on = 'BillID', how = 'left')

grouped = test_data.groupby('BillID')['TicketID'].unique()
grouped = grouped.reset_index()
grouped['TicketCount'] = grouped['TicketID'].apply(lambda x: len(x))

test_data = test_data.merge(grouped[['BillID', 'TicketCount']], on = 'BillID', how = 'left')

In [32]:
# fill missing values in 'VehicleType' with the most frequent value and drop irrelevant columns from both train and test datasets.
VehicleType_most_frequent = train_data['VehicleType'].mode()[0]
train_data['VehicleType'] = train_data['VehicleType'].fillna(VehicleType_most_frequent)

test_data['VehicleType'] = test_data['VehicleType'].fillna(VehicleType_most_frequent)

train_data = train_data.drop(columns = ['Created', 'DepartureTime', 'CancelTime', 'BillID', 'TicketID', 'UserID', 'Male', 'VehicleClass', 'HashPassportNumber_p', 'HashEmail', 'BuyerMobile', 'NationalCode'])
test_data = test_data.drop(columns = ['Created', 'DepartureTime', 'CancelTime', 'BillID', 'TicketID', 'UserID', 'Male', 'VehicleClass', 'HashPassportNumber_p', 'HashEmail', 'BuyerMobile', 'NationalCode'])

In [33]:
# encoding the categorical features
unique_values = pd.concat([train_data['From'], train_data['To'], test_data['From'], test_data['To']]).unique()

label_encoder = LabelEncoder()
label_encoder.fit(unique_values)

train_data['From'] = label_encoder.transform(train_data['From'])
train_data['To'] = label_encoder.transform(train_data['To'])

test_data['From'] = label_encoder.transform(test_data['From'])
test_data['To'] = label_encoder.transform(test_data['To'])



unique_values = pd.concat([train_data['VehicleType'], test_data['VehicleType']]).unique()

label_encoder = LabelEncoder()
label_encoder.fit(unique_values)

train_data['VehicleType'] = label_encoder.transform(train_data['VehicleType'])
test_data['VehicleType'] = label_encoder.transform(test_data['VehicleType'])



unique_values = pd.concat([train_data['Vehicle'], test_data['Vehicle']]).unique()

label_encoder = LabelEncoder()
label_encoder.fit(unique_values)

train_data['Vehicle'] = label_encoder.transform(train_data['Vehicle'])
test_data['Vehicle'] = label_encoder.transform(test_data['Vehicle'])


label_encoder = LabelEncoder()
train_data['TripReason'] = label_encoder.fit_transform(train_data['TripReason'])
test_data['TripReason'] = label_encoder.transform(test_data['TripReason'])

In [34]:
# standardize features
std_scaler = StandardScaler()

y_train = train_data['Cancel']
X_train = train_data.drop(columns = ['Cancel'])

train_data_scaled = std_scaler.fit_transform(X_train)

train_data_scaled_df = pd.DataFrame(train_data_scaled, columns = X_train.columns)
train_data_scaled_df['Cancel'] = y_train.values
train_data = train_data_scaled_df.copy()



y_test = test_data['Cancel']
X_test = test_data.drop(columns = ['Cancel'])

test_data_scaled = std_scaler.transform(X_test)

test_data_scaled_df = pd.DataFrame(test_data_scaled, columns = X_test.columns)
test_data_scaled_df['Cancel'] = y_test.values
test_data = test_data_scaled_df.copy()

In [35]:
train_data

Unnamed: 0,ReserveStatus,Price,CouponDiscount,From,To,Domestic,VehicleType,TripReason,Vehicle,DaysUntilDeparture,HasFamily,TicketCount,Cancel
0,1.704898,7.154196,-0.112933,1.038509,1.243294,-11.166073,0.830795,-1.127238,-0.287722,-0.680957,-0.449665,-0.505504,0
1,-0.152223,-0.318478,-0.112933,-1.198562,1.958894,0.089557,1.254069,-1.127238,-1.000832,0.303139,-0.449665,-0.505504,0
2,-1.080783,-0.447648,-0.112933,0.779953,-0.699047,0.089557,-0.819345,-1.127238,1.138498,-0.434933,-0.449665,-0.505504,0
3,1.704898,0.876695,-0.112933,1.038509,0.391390,0.089557,-0.819345,-1.127238,0.425388,-0.311921,2.223877,0.030792,0
4,-0.152223,-0.099884,-0.112933,-0.636484,-1.198830,0.089557,0.142451,0.887124,-1.000832,-0.680957,-0.449665,0.030792,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
80808,-1.080783,-0.447648,-0.112933,0.779953,-0.699047,0.089557,-0.819345,0.887124,1.138498,0.426151,-0.449665,-0.505504,0
80809,-1.080783,-0.447648,-0.112933,0.779953,-0.699047,0.089557,-0.819345,-1.127238,1.138498,0.057115,2.223877,0.030792,0
80810,-1.080783,-0.396406,-0.112933,0.971060,-0.767199,0.089557,-0.856015,0.887124,1.138498,-0.434933,2.223877,0.030792,0
80811,-0.152223,-0.256023,-0.112933,0.071735,1.004761,0.089557,1.254069,-1.127238,-1.000832,-0.188909,-0.449665,-0.505504,0


In [36]:
test_data

Unnamed: 0,ReserveStatus,Price,CouponDiscount,From,To,Domestic,VehicleType,TripReason,Vehicle,DaysUntilDeparture,HasFamily,TicketCount,Cancel
0,-0.152223,-0.324156,-0.112933,-0.636484,-1.266983,0.089557,1.783162,0.887124,-1.000832,-0.680957,-0.449665,-0.505504,0
1,-1.080783,-0.126640,-0.112933,-0.636484,1.686284,0.089557,-0.823536,-1.127238,1.138498,-0.311921,-0.449665,-0.505504,0
2,0.776338,-0.094135,-0.112933,-0.636484,0.993402,0.089557,-0.839251,-1.127238,1.138498,-0.188909,-0.449665,-0.505504,1
3,0.776338,0.151642,-0.112933,-1.198562,0.993402,0.089557,-0.775341,-1.127238,1.138498,2.886392,-0.449665,-0.505504,1
4,-0.152223,-0.285831,-0.112933,1.443206,-0.699047,0.089557,1.563143,0.887124,-1.000832,-0.557945,-0.449665,-0.505504,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20199,-1.080783,0.479250,-0.112933,-0.434136,-0.699047,0.089557,-0.756482,0.887124,1.138498,-0.434933,-0.449665,-0.505504,0
20200,-0.152223,-0.238989,-0.112933,-1.322219,1.004761,0.089557,0.708213,-1.127238,-1.000832,-0.680957,-0.449665,0.030792,0
20201,-0.152223,-0.102722,-0.112933,1.386998,-0.699047,0.089557,1.909935,0.887124,-1.000832,-0.680957,-0.449665,-0.505504,0
20202,-0.152223,-0.329834,-0.112933,-1.378427,-0.699047,0.089557,1.185968,0.887124,-1.000832,0.057115,-0.449665,-0.505504,0


In [37]:
# splitting the dataset
X_train = train_data.drop(columns = ['Cancel'])
X_test = test_data.drop(columns = ['Cancel'])
y_train = train_data['Cancel']
y_test = test_data['Cancel']

In [38]:
# modeling
model = XGBClassifier(n_estimators = 150)
model.fit(X_train, y_train)

In [39]:
# predicting on the train and test sets
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [40]:
# model evaluation
score_1 = f1_score(y_train, y_train_pred)
print(f"f1 score on Training Data: {score_1}")

score_2 = f1_score(y_test, y_test_pred)
print(f"f1 score on Test Data: {score_2}")

print(f"\nClassification Report: {classification_report(y_train, y_train_pred)}")

print(f"\nConfusion Matrix: {confusion_matrix(y_train, y_train_pred)}")

f1 score on Training Data: 0.9620306805661497
f1 score on Test Data: 0.9410150891632373

Classification Report:               precision    recall  f1-score   support

           0       0.99      1.00      0.99     68575
           1       1.00      0.93      0.96     12238

    accuracy                           0.99     80813
   macro avg       0.99      0.96      0.98     80813
weighted avg       0.99      0.99      0.99     80813


Confusion Matrix: [[68566     9]
 [  887 11351]]
