In [8]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import  f1_score, classification_report, confusion_matrix

In [2]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at a named mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [38]:
# Read the raw training table from the CSV file and display it.
TRAIN_CSV_PATH = '/content/drive/My Drive/Colab Notebooks/ML/8.project1/tafrihi/data/train_data.csv'
data = pd.read_csv(TRAIN_CSV_PATH)
data

Unnamed: 0,Created,CancelTime,DepartureTime,BillID,TicketID,ReserveStatus,UserID,Male,Price,CouponDiscount,...,Domestic,VehicleType,VehicleClass,TripReason,Vehicle,Cancel,HashPassportNumber_p,HashEmail,BuyerMobile,NationalCode
0,2022-10-23 09:38:49.110,,2022-11-02 23:59:00,39710203,1091777.0,5,122885.0,True,6600000.0,34425.0,...,1,,False,Work,Plane,0,,66c7f29e3b92f3b77e20830ac29e7758037a53d2238a5b...,764974891906,477368495
1,2022-08-15 14:51:43.160,,2022-08-18 04:15:00,38689463,1070902.0,5,876925.0,True,9500000.0,0.0,...,1,,False,Int,Plane,0,,b24634843858a4175d03422aa9e7211ec3b9f3ce4c481c...,27479149496,15987669
2,2022-09-20 17:25:27.250,,2022-09-21 11:00:00,39245173,7624237.0,3,916640.0,False,2000000.0,0.0,...,1,VIP 2+1,True,Work,Bus,0,,,323657282999,667640412
3,2022-06-25 11:32:53.980,,2022-06-26 08:30:00,37957585,2867547.0,2,,False,40000.0,0.0,...,1,3 ستاره اتوبوسي,,Int,Train,0,,,169459057632,392476186
4,2022-06-01 11:30:53.633,,2022-06-02 23:00:00,37584530,7212559.0,3,,True,1130000.0,0.0,...,1,اسکانیا تک صندلی ۳۱نفره,True,Int,Bus,0,,,408595008421,79497837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101012,2022-06-01 00:20:14.280,,2022-06-04 12:10:00,37579327,1050781.0,5,,True,5900000.0,0.0,...,1,بوئینگ 737,False,Int,Plane,0,,,605105141718,103215806
101013,2022-10-29 20:54:31.330,,2022-11-01 15:30:00,39789479,3085407.0,2,403095.0,True,926500.0,0.0,...,1,4 ستاره اتوبوسي نگين,,Work,Train,0,,,414997568556,193262890
101014,2022-09-03 17:57:22.067,,2022-09-13 09:30:00,38991563,2322052.0,5,528307.0,True,30000000.0,0.0,...,0,,False,Int,InternationalPlane,0,47b8f2d9b5de7e0e0e7234c18a1aa0c4b35798e6cb46b4...,a4dcb7941ee3c8f7b1fc6a171015692bc961d65a84ad47...,99460830937,34732401
101015,2022-09-29 13:15:51.303,,2022-09-29 17:30:00,39406503,7664730.0,3,797946.0,True,980000.0,0.0,...,1,25 نفره (VIP),True,Work,Bus,0,,718bc52c3e88520531463b385998a1193e2821d518b60b...,487489926847,458338866


In [39]:
# splitting the dataset
# A fixed seed makes the 80/20 shuffle reproducible across kernel restarts;
# without it every run produces a different split and different scores.
SPLIT_SEED = 42
# NOTE(review): this is a row-wise split, so rows sharing a BillID can land on
# both sides of it — consider a group-aware split if that matters.
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = SPLIT_SEED)

In [40]:
# Report the per-column count of missing values, training split first, then test.
for split_frame in (train_data, test_data):
    missing_per_column = split_frame.isna().sum()
    print(missing_per_column)

Created                     0
CancelTime              68589
DepartureTime               0
BillID                      0
TicketID                    0
ReserveStatus               0
UserID                  46772
Male                        0
Price                       0
CouponDiscount              0
From                        0
To                          0
Domestic                    0
VehicleType              6041
VehicleClass            30716
TripReason                  0
Vehicle                     0
Cancel                      0
HashPassportNumber_p    80121
HashEmail               46300
BuyerMobile                 0
NationalCode                0
dtype: int64
Created                     0
CancelTime              17102
DepartureTime               0
BillID                      0
TicketID                    0
ReserveStatus               0
UserID                  11702
Male                        0
Price                       0
CouponDiscount              0
From                       

In [41]:
# Parse the 'Created' and 'DepartureTime' columns as datetimes, then store the
# day component of their difference as 'DaysUntilDeparture' (same steps for both splits).
for frame in (train_data, test_data):
    for timestamp_col in ('Created', 'DepartureTime'):
        frame[timestamp_col] = pd.to_datetime(frame[timestamp_col])
    lead_time = frame['DepartureTime'] - frame['Created']
    frame['DaysUntilDeparture'] = lead_time.dt.days

In [42]:
def _with_has_family(frame):
    """Return ``frame`` left-merged with a per-bill 'HasFamily' flag.

    'HasFamily' is 1 when the rows of a 'BillID' contain more than one distinct
    'Male' value, and 0 otherwise. The flag is computed from ``frame`` alone.
    """
    genders_per_bill = frame.groupby('BillID')['Male'].nunique().reset_index()
    genders_per_bill['HasFamily'] = (genders_per_bill['Male'] > 1).astype('int64')
    return frame.merge(genders_per_bill[['BillID', 'HasFamily']], on = 'BillID', how = 'left')


# each split derives the flag from its own rows only
train_data = _with_has_family(train_data)
test_data = _with_has_family(test_data)

In [43]:
def _with_ticket_count(frame):
    """Return ``frame`` left-merged with a per-bill 'TicketCount' column.

    'TicketCount' is the number of distinct 'TicketID' values among the rows of
    each 'BillID' (a missing id, if present, counts as one value). The count is
    computed from ``frame`` alone.
    """
    tickets_per_bill = (
        frame.groupby('BillID')['TicketID']
        .nunique(dropna = False)
        .rename('TicketCount')
        .reset_index()
    )
    return frame.merge(tickets_per_bill, on = 'BillID', how = 'left')


# each split counts tickets over its own rows only
train_data = _with_ticket_count(train_data)
test_data = _with_ticket_count(test_data)

In [44]:
# Impute 'VehicleType' with the training split's most frequent value (the same
# value is reused for the test split), then remove the columns listed below
# from both splits.
DROPPED_COLUMNS = [
    'Created', 'DepartureTime', 'CancelTime', 'BillID', 'TicketID', 'UserID',
    'Male', 'VehicleClass', 'HashPassportNumber_p', 'HashEmail', 'BuyerMobile',
    'NationalCode',
]

VehicleType_most_frequent = train_data['VehicleType'].mode()[0]

train_data = (
    train_data
    .assign(VehicleType = train_data['VehicleType'].fillna(VehicleType_most_frequent))
    .drop(columns = DROPPED_COLUMNS)
)
test_data = (
    test_data
    .assign(VehicleType = test_data['VehicleType'].fillna(VehicleType_most_frequent))
    .drop(columns = DROPPED_COLUMNS)
)

In [45]:
# encoding the categorical features
def _label_encode_together(frames, columns):
    """Label-encode ``columns`` in every frame of ``frames`` with one shared encoder.

    The encoder is fitted on the union of the values found in all the given
    columns of all the given frames, so the same raw value maps to the same
    integer code everywhere. The frames are modified in place.

    Returns the fitted ``LabelEncoder``.
    """
    observed = pd.concat([frame[col] for frame in frames for col in columns]).unique()
    encoder = LabelEncoder().fit(observed)
    for frame in frames:
        for col in columns:
            frame[col] = encoder.transform(frame[col])
    return encoder


both_splits = (train_data, test_data)

# 'From' and 'To' share one encoder, so a value gets the same code in either column
_label_encode_together(both_splits, ['From', 'To'])
_label_encode_together(both_splits, ['VehicleType'])
# `label_encoder` ends up bound to the 'Vehicle' encoder, as before
label_encoder = _label_encode_together(both_splits, ['Vehicle'])

# Binary target encoding. Plain column assignment replaces the column, so the
# target is stored as int64; `.loc[:, col] = ...` writes into the existing
# object-dtype column in recent pandas and would leave the target as `object`.
TRIP_REASON_CODES = {'Work': 0, 'Int': 1}
for frame in both_splits:
    trip_reason = frame['TripReason'].map(TRIP_REASON_CODES)
    # `map` turns any label missing from the mapping into NaN; fail loudly here
    # instead of letting NaN targets flow into the later steps.
    unmapped = frame.loc[trip_reason.isna(), 'TripReason'].unique()
    if len(unmapped):
        raise ValueError(f"Unexpected TripReason values: {list(unmapped)}")
    frame['TripReason'] = trip_reason.astype('int64')

In [46]:
# standardize features
# Separate the target from the features for both splits first.
y_train = train_data['TripReason']
X_train = train_data.drop(columns = ['TripReason'])
y_test = test_data['TripReason']
X_test = test_data.drop(columns = ['TripReason'])

# The scaler is fitted on the training features only; the test features are
# transformed with those same fitted statistics.
std_scaler = StandardScaler()
std_scaler.fit(X_train)

# Rebuild each frame from the scaled array and re-attach the unscaled target
# as the last column.
train_data = pd.DataFrame(
    std_scaler.transform(X_train), columns = X_train.columns
).assign(TripReason = y_train.values)

test_data = pd.DataFrame(
    std_scaler.transform(X_test), columns = X_test.columns
).assign(TripReason = y_test.values)

In [47]:
# display the current contents of the training frame
train_data

Unnamed: 0,ReserveStatus,Price,CouponDiscount,From,To,Domestic,VehicleType,Vehicle,Cancel,DaysUntilDeparture,HasFamily,TicketCount,TripReason
0,0.784229,-0.349400,-0.111045,-0.639568,-0.018442,0.089697,-0.819590,1.138135,2.369785,0.059904,-0.449426,-0.506352,0
1,-0.147427,-0.143491,-0.111045,1.689625,-1.384395,0.089697,0.197089,-0.999088,-0.421979,-0.675144,-0.449426,-0.506352,0
2,-1.079083,-0.008732,-0.111045,1.037001,-0.701419,0.089697,-0.801567,1.138135,-0.421979,-0.307620,-0.449426,0.034015,1
3,-0.147427,-0.093203,-0.111045,1.003245,-0.701419,0.089697,1.040965,-0.999088,-0.421979,-0.552636,-0.449426,-0.506352,0
4,-1.079083,-0.053720,-0.111045,1.037001,-0.701419,0.089697,-0.824890,1.138135,-0.421979,0.549936,-0.449426,-0.506352,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
80808,-1.079083,0.175702,-0.111045,-0.819602,0.994639,0.089697,-0.824890,1.138135,-0.421979,-0.552636,2.225058,0.574381,1
80809,-1.079083,-0.098707,-0.111045,-1.134662,-0.701419,0.089697,-0.783545,1.138135,-0.421979,2.265049,-0.449426,-0.506352,1
80810,1.715885,0.764820,-0.111045,-0.707081,-0.701419,0.089697,0.814094,0.425727,-0.421979,1.039969,2.225058,0.034015,1
80811,1.715885,0.942187,-0.111045,-1.202175,-0.883546,0.089697,-0.820650,0.425727,2.369785,0.182412,-0.449426,0.034015,0


In [48]:
# display the current contents of the test frame
test_data

Unnamed: 0,ReserveStatus,Price,CouponDiscount,From,To,Domestic,VehicleType,Vehicle,Cancel,DaysUntilDeparture,HasFamily,TicketCount,TripReason
0,-0.147427,-0.271249,-0.111045,1.442078,-0.701419,0.089697,1.553015,-0.999088,-0.421979,-0.675144,-0.449426,-0.506352,1
1,1.715885,-0.355516,-0.111045,-0.639568,-0.211952,0.089697,0.681576,-0.999088,2.369785,0.182412,-0.449426,-0.506352,1
2,-0.147427,-0.220961,-0.111045,-0.279500,-0.701419,0.089697,1.668571,-0.999088,-0.421979,-0.675144,-0.449426,-0.506352,0
3,-1.079083,-0.082126,-0.111045,1.037001,-0.701419,0.089697,-0.783545,1.138135,-0.421979,-0.430128,-0.449426,0.034015,1
4,-0.147427,-0.365030,-0.111045,-0.639568,0.892193,0.089697,0.681576,-0.999088,-0.421979,-0.675144,-0.449426,-0.506352,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20199,-0.147427,-0.176110,-0.111045,-0.639568,0.402726,0.089697,-0.117774,-0.999088,-0.421979,-0.552636,-0.449426,-0.506352,1
20200,1.715885,7.786932,-0.111045,-0.639568,-1.350246,-11.148609,0.781229,-0.286680,-0.421979,0.059904,-0.449426,-0.506352,0
20201,-0.147427,-0.191060,-0.111045,-0.819602,-0.052591,0.089697,2.236809,-0.999088,-0.421979,-0.675144,-0.449426,-0.506352,0
20202,-1.079083,-0.154432,-0.111045,1.037001,-0.701419,0.089697,-0.840793,1.138135,-0.421979,-0.675144,-0.449426,-0.506352,1


In [49]:
# Separate the 'TripReason' target from the feature columns in each split.
X_train, y_train = train_data.drop(columns = ['TripReason']), train_data['TripReason']
X_test, y_test = test_data.drop(columns = ['TripReason']), test_data['TripReason']

In [50]:
# Cast both target vectors to integers so the labels have a consistent numeric dtype.
y_train, y_test = (labels.astype(int) for labels in (y_train, y_test))

In [57]:
# modeling
# Fit an XGBoost classifier with its default hyper-parameters on the training split.
model = XGBClassifier()
model.fit(X_train, y_train)

In [58]:
# Predict labels for the training features and for the test features, in that order.
y_train_pred, y_test_pred = (model.predict(features) for features in (X_train, X_test))

In [59]:
# model evaluation
# F1 on the split the model was fitted on, and on the other split.
score_1 = f1_score(y_train, y_train_pred)
score_2 = f1_score(y_test, y_test_pred)

print(
    f"f1 score on Training Data: {score_1}",
    f"f1 score on Test Data: {score_2}",
    sep = "\n",
)

f1 score on Training Data: 0.7949628868161273
f1 score on Test Data: 0.6974900924702774
