# 📥 Import Libraries & Dataset

In [269]:
# %pip install pandas
# %pip install numpy
# %pip install scikit-learn==1.5.2
# %pip install xgboost
# %pip install catboost



## 📦 Libraries

In [270]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb # type: ignore
from sklearn.model_selection import RandomizedSearchCV, train_test_split #type: ignore
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report #type: ignore
from catboost import CatBoostClassifier #type: ignore

## 🗃️ Dataset

In [271]:
# Load the training data directly from its Google Drive download link
url = "https://drive.google.com/uc?id=1wnDUJCke0araT3A7SAk8ZLTIbV00R3ev"
df = pd.read_csv(filepath_or_buffer=url)

# 📰 Description

## 📍 Shape

In [272]:
# Report the number of rows and columns of the raw frame
print("Jumlah baris: ", len(df.index))
print("Jumlah kolom: ", len(df.columns))

Jumlah baris:  8000
Jumlah kolom:  21


## 📍 Structure

In [273]:
# Print column names, non-null counts and dtypes of the training frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     8000 non-null   object 
 1   VendorID               8000 non-null   float64
 2   lpep_pickup_datetime   8000 non-null   object 
 3   lpep_dropoff_datetime  8000 non-null   object 
 4   store_and_fwd_flag     8000 non-null   object 
 5   RatecodeID             8000 non-null   float64
 6   PULocationID           8000 non-null   int64  
 7   DOLocationID           8000 non-null   int64  
 8   passenger_count        8000 non-null   float64
 9   trip_distance          8000 non-null   float64
 10  fare_amount            8000 non-null   float64
 11  extra                  8000 non-null   float64
 12  mta_tax                8000 non-null   float64
 13  tip_amount             8000 non-null   float64
 14  tolls_amount           8000 non-null   float64
 15  ehai

## 📍 Columns

In [274]:
# Show the column labels of the training frame
df.columns

Index(['ID', 'VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'trip_type', 'congestion_surcharge', 'payment_type'],
      dtype='object')

Berikut ini adalah deskripsi kolom pada dataset:
- **VendorID**: Kode yang menunjukkan penyedia LPEP yang menyediakan catatan.
  - *1 = Creative Mobile Technologies, LLC*
  - *2 = VeriFone Inc*
- **lpep_pickup_datetime**: Tanggal dan waktu ketika meteran dinyalakan.
- **lpep_dropoff_datetime**: Tanggal dan waktu ketika meteran dimatikan.
- **store_and_fwd_flag**: Flag yang menunjukkan apakah catatan perjalanan disimpan dalam memori kendaraan sebelum dikirim ke vendor.
  - *Y = perjalanan yang disimpan dan diteruskan*
  - *N = perjalanan yang tidak disimpan dan diteruskan*
- **RatecodeID**: Kode tarif akhir yang berlaku pada akhir perjalanan.
  - *1 = Tarif standar*
  - *2 = JFK*
  - *3 = Newark*
  - *4 = Nassau atau Westchester*
  - *5 = Tarif negosiasi*
  - *6 = Perjalanan grup*
- **PULocationID**: Zona Taksi TLC di mana meteran dinyalakan.
- **DOLocationID**: Zona Taksi TLC di mana meteran dimatikan.
- **passenger_count**: Jumlah penumpang di kendaraan. Ini adalah nilai yang dimasukkan oleh sopir.
- **trip_distance**: Jarak perjalanan yang telah dilalui dalam mil yang dilaporkan oleh meteran.
- **fare_amount**: Biaya waktu dan jarak yang dihitung oleh meteran.
- **extra**: Biaya tambahan dan surcharge (misalnya, biaya tambahan $0,50 dan $1 untuk jam sibuk dan malam hari).
- **mta_tax**: Pajak MTA sebesar 0,50 dolar yang otomatis dikenakan berdasarkan tarif meteran yang digunakan.
- **tip_amount**: Jumlah tip. Field ini otomatis terisi untuk tip kartu kredit; tip tunai tidak termasuk.
- **tolls_amount**: Jumlah total semua tol yang dibayar selama perjalanan.
- **improvement_surcharge**: Biaya perbaikan sebesar $0,30 yang dikenakan pada awal perjalanan. Biaya ini mulai diterapkan pada tahun 2015.
- **total_amount**: Jumlah total yang dibebankan kepada penumpang (tidak termasuk tip tunai).
- **payment_type**: Kode numerik yang menunjukkan bagaimana penumpang membayar perjalanan.
  - *1 = Kartu kredit*
  - *2 = Tunai*
- **trip_type**: Jenis perjalanan.
  - *1 = Inner city*
  - *2 = Outer city*
- **congestion_surcharge**: Jumlah total yang dikumpulkan untuk surcharge kemacetan NYC selama perjalanan.

## 📍 Overview

In [275]:
# Preview the first rows of the raw training frame
df.head()

Unnamed: 0,ID,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,...,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,trip_type,congestion_surcharge,payment_type
0,T03315,2.0,2021-07-02 17:19:11,2021-07-02 17:40:02,N,1.0,152,142,1.0,4.3,...,1.0,0.5,5.0,0.0,,0.3,27.55,1.0,2.75,1.0
1,T07720,2.0,2021-07-05 22:48:04,2021-07-05 23:12:02,N,1.0,93,107,2.0,13.52,...,0.5,0.5,12.15,6.55,,0.3,60.75,1.0,2.75,1.0
2,T09695,2.0,2021-07-07 06:29:43,2021-07-07 06:38:42,N,1.0,74,75,1.0,1.55,...,0.0,0.5,0.0,0.0,,0.3,8.8,1.0,0.0,2.0
3,T08802,2.0,2021-07-06 15:33:29,2021-07-06 15:43:25,N,1.0,226,129,1.0,1.22,...,0.0,0.5,0.0,0.0,,0.3,8.8,1.0,0.0,2.0
4,T01413,2.0,2021-07-01 17:09:07,2021-07-01 18:03:12,N,1.0,55,86,1.0,14.23,...,1.0,0.5,0.0,2.45,,0.3,49.75,1.0,0.0,1.0


In [276]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,VendorID,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,trip_type,congestion_surcharge,payment_type
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,0.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,1.83325,1.176375,98.700125,134.694875,1.270125,3.459532,14.995941,0.330781,0.475875,1.277159,0.35139,,0.296887,18.237415,1.037875,0.5995,1.427625
std,0.372776,0.813541,64.792549,77.750038,0.917885,4.601964,14.543702,0.626111,0.113664,2.476832,1.503395,,0.038558,16.126335,0.190906,1.136344,0.517972
min,1.0,1.0,3.0,1.0,0.0,0.0,-25.0,-1.0,-0.5,-1.14,0.0,,-0.3,-28.55,1.0,-2.75,1.0
25%,2.0,1.0,55.0,74.0,1.0,1.03,7.0,0.0,0.5,0.0,0.0,,0.3,8.8,1.0,0.0,1.0
50%,2.0,1.0,75.0,135.0,1.0,1.93,10.2,0.0,0.5,0.0,0.0,,0.3,13.3,1.0,0.0,1.0
75%,2.0,1.0,129.0,212.0,1.0,3.79,16.5,0.5,0.5,2.04,0.0,,0.3,20.8,1.0,0.0,2.0
max,2.0,5.0,265.0,265.0,6.0,37.6,156.0,4.5,0.5,47.88,13.75,,0.3,207.48,2.0,2.75,4.0


# 🛠️ Data Pre-Processing

## 📍 Duplicate Data

In [277]:
# Count rows that are exact duplicates of an earlier row
print("Jumlah duplikasi data: ", df.duplicated(keep="first").sum())

Jumlah duplikasi data:  0


## 📍 Missing Values

In [278]:
# A column is reported when more than half of its rows are missing
MISSING_THRESHOLD = 0.5 * len(df)

# Per-column missing / present counts and the missing share in percent
missing_counts = df.isna().sum().sort_values(ascending=False)
present_counts = df.notna().sum()
missing_percentage = missing_counts.div(len(df)).mul(100).round(2)

# The three Series are aligned on the column names when the frame is built
missing_data_summary = pd.DataFrame({
    'Missing Values': missing_counts,
    'Present Values': present_counts,
    'Missing Percent': missing_percentage
})

# Show only the columns above the threshold
missing_data_summary.loc[missing_data_summary['Missing Values'].gt(MISSING_THRESHOLD)]

Unnamed: 0,Missing Values,Present Values,Missing Percent
ehail_fee,8000,0,100.0


In [279]:
# ehail_fee is the column flagged by the missing-value summary; remove it
df = df.drop(columns="ehail_fee")

In [280]:
# Keep only rows whose payment_type is 1 or 2.
# .copy() detaches the filtered result from the original frame, so the
# column assignments in the following cells write to an independent frame
# instead of triggering pandas' SettingWithCopyWarning.
df = df[df['payment_type'].isin([1, 2])].copy()

# 🛠️ Feature Engineering

## 📍 Encoding

In [281]:
# Fit the label encoder on store_and_fwd_flag, then replace the column with
# its integer codes; the fitted encoder is reused later for the Kaggle frame
label_encoder = LabelEncoder().fit(df["store_and_fwd_flag"])

df["store_and_fwd_flag"] = label_encoder.transform(df["store_and_fwd_flag"])

In [282]:
# Parse the raw timestamp strings into datetime columns
df['pickup_datetime'] = df['lpep_pickup_datetime'].pipe(pd.to_datetime)
df['dropoff_datetime'] = df['lpep_dropoff_datetime'].pipe(pd.to_datetime)

In [283]:
# Derive hour, day-of-week and month from both timestamps
_pickup = df['pickup_datetime'].dt
_dropoff = df['dropoff_datetime'].dt

df['pickup_hour'] = _pickup.hour
df['pickup_day'] = _pickup.dayofweek
df['pickup_month'] = _pickup.month
df['dropoff_hour'] = _dropoff.hour
df['dropoff_day'] = _dropoff.dayofweek
df['dropoff_month'] = _dropoff.month

In [284]:
# Trip duration in minutes: drop-off minus pick-up, seconds divided by 60
df['trip_duration'] = (
    df['dropoff_datetime']
    .sub(df['pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

In [285]:
def encode_cyclical(df, feature, max_val):
    """Add sine/cosine encodings of a periodic column to ``df`` in place.

    Parameters
    ----------
    df : DataFrame holding ``feature``; two columns are added to it.
    feature : name of the column to encode.
    max_val : period used to map the values onto the unit circle
        (this notebook passes 24 for hours and 12 for months).

    Returns
    -------
    None. ``<feature>_sin`` and ``<feature>_cos`` are written onto ``df``.
    """
    # Angle on the unit circle for every value of the feature
    angle = df[feature] * (2 * np.pi) / max_val
    df[f'{feature}_sin'] = np.sin(angle)
    df[f'{feature}_cos'] = np.cos(angle)

In [286]:
# Cyclical encoding of the hour (period 24) and month (period 12) columns
for _feature, _period in (
    ('pickup_hour', 24),
    ('dropoff_hour', 24),
    ('pickup_month', 12),
    ('dropoff_month', 12),
):
    encode_cyclical(df, _feature, _period)

In [287]:
# Remove the raw hour/month columns (their sin/cos encodings remain) together
# with the original and parsed timestamp columns
df = df.drop(columns=[
    'pickup_hour',
    'pickup_month',
    'dropoff_hour',
    'dropoff_month',
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'pickup_datetime',
    'dropoff_datetime',
])

In [288]:
# Preview the frame after the cyclical encoding and column clean-up
df.head()

Unnamed: 0,ID,VendorID,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,...,dropoff_day,trip_duration,pickup_hour_sin,pickup_hour_cos,dropoff_hour_sin,dropoff_hour_cos,pickup_month_sin,pickup_month_cos,dropoff_month_sin,dropoff_month_cos
0,T03315,2.0,0,1.0,152,142,1.0,4.3,18.0,1.0,...,4,20.85,-0.965926,-0.258819,-0.965926,-0.258819,-0.5,-0.866025,-0.5,-0.866025
1,T07720,2.0,0,1.0,93,107,2.0,13.52,38.0,0.5,...,0,23.966667,-0.5,0.8660254,-0.258819,0.9659258,-0.5,-0.866025,-0.5,-0.866025
2,T09695,2.0,0,1.0,74,75,1.0,1.55,8.0,0.0,...,2,8.983333,1.0,6.123234000000001e-17,1.0,6.123234000000001e-17,-0.5,-0.866025,-0.5,-0.866025
3,T08802,2.0,0,1.0,226,129,1.0,1.22,8.0,0.0,...,1,9.933333,-0.707107,-0.7071068,-0.707107,-0.7071068,-0.5,-0.866025,-0.5,-0.866025
4,T01413,2.0,0,1.0,55,86,1.0,14.23,45.5,1.0,...,3,54.083333,-0.965926,-0.258819,-1.0,-1.83697e-16,-0.5,-0.866025,-0.5,-0.866025


In [289]:
# One-hot encode the pick-up / drop-off weekday; the first level is dropped
# and the indicator columns are stored as integers
pickup_day_one_hot = pd.get_dummies(
    df['pickup_day'], prefix='pickup_day', drop_first=True, dtype=int
)
dropoff_day_one_hot = pd.get_dummies(
    df['dropoff_day'], prefix='dropoff_day', drop_first=True, dtype=int
)

In [290]:
# Append the weekday indicator columns to the right of the frame
df = pd.concat((df, pickup_day_one_hot, dropoff_day_one_hot), axis="columns")

In [291]:
# The integer weekday columns are replaced by their one-hot indicators
df = df.drop(columns=['pickup_day', 'dropoff_day'])

In [292]:
# Preview the frame after the weekday one-hot encoding
df.head()

Unnamed: 0,ID,VendorID,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,...,pickup_day_3,pickup_day_4,pickup_day_5,pickup_day_6,dropoff_day_1,dropoff_day_2,dropoff_day_3,dropoff_day_4,dropoff_day_5,dropoff_day_6
0,T03315,2.0,0,1.0,152,142,1.0,4.3,18.0,1.0,...,0,1,0,0,0,0,0,1,0,0
1,T07720,2.0,0,1.0,93,107,2.0,13.52,38.0,0.5,...,0,0,0,0,0,0,0,0,0,0
2,T09695,2.0,0,1.0,74,75,1.0,1.55,8.0,0.0,...,0,0,0,0,0,1,0,0,0,0
3,T08802,2.0,0,1.0,226,129,1.0,1.22,8.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,T01413,2.0,0,1.0,55,86,1.0,14.23,45.5,1.0,...,1,0,0,0,0,0,1,0,0,0


# 🛠️ Persiapan Dataset Testing (Kaggle)

In [293]:
# Load the Kaggle test data from its Google Drive download link
# (this reuses and overwrites the `url` name from the training load)
url = "https://drive.google.com/uc?id=192s2gBVRe7_5kTDkoIRM6Tls7SPNsnvf"
df_kaggle = pd.read_csv(filepath_or_buffer=url)

In [294]:
# Preview the first rows of the raw Kaggle frame
df_kaggle.head()

Unnamed: 0.1,Unnamed: 0,ID,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,...,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,trip_type,congestion_surcharge,payment_type
0,0,T00876,1.0,2021-07-01 13:41:08,2021-07-01 14:36:53,N,2.0,74,132,1.0,...,0.0,0.5,10.0,6.55,,0.3,69.35,1.0,0.0,1.0
1,1,T01498,2.0,2021-07-01 18:33:34,2021-07-01 19:20:39,N,5.0,82,143,1.0,...,0.0,0.0,2.46,6.55,,0.3,34.08,1.0,2.75,1.0
2,2,T08153,2.0,2021-07-06 09:37:59,2021-07-06 09:46:37,N,1.0,74,75,1.0,...,0.0,0.5,2.08,0.0,,0.3,10.38,1.0,0.0,1.0
3,3,T03394,2.0,2021-07-02 18:24:34,2021-07-02 18:39:57,N,1.0,244,50,1.0,...,1.0,0.5,4.91,0.0,,0.3,29.46,1.0,2.75,1.0
4,4,T09784,2.0,2021-07-07 08:30:59,2021-07-07 08:40:24,N,1.0,7,129,1.0,...,0.0,0.5,0.0,0.0,,0.3,10.8,1.0,0.0,2.0


In [295]:
# Remove the leftover index column and ehail_fee from the Kaggle frame
df_kaggle = df_kaggle.drop(['Unnamed: 0', 'ehail_fee'], axis=1)

In [296]:
# Print column names, non-null counts and dtypes of the Kaggle frame
df_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     400 non-null    object 
 1   VendorID               400 non-null    float64
 2   lpep_pickup_datetime   400 non-null    object 
 3   lpep_dropoff_datetime  400 non-null    object 
 4   store_and_fwd_flag     400 non-null    object 
 5   RatecodeID             400 non-null    float64
 6   PULocationID           400 non-null    int64  
 7   DOLocationID           400 non-null    int64  
 8   passenger_count        400 non-null    float64
 9   fare_amount            400 non-null    float64
 10  extra                  400 non-null    float64
 11  mta_tax                400 non-null    float64
 12  tip_amount             400 non-null    float64
 13  tolls_amount           400 non-null    float64
 14  improvement_surcharge  400 non-null    float64
 15  total_

In [297]:
# Work on an independent copy so df_kaggle keeps the ID column for later use
processed_df_kaggle = df_kaggle.copy(deep=True)

In [298]:
# Reuse the encoder fitted on the training frame (transform only, no refit)
processed_df_kaggle["store_and_fwd_flag"] = processed_df_kaggle["store_and_fwd_flag"].pipe(label_encoder.transform)

In [299]:
# Display the Kaggle frame after label encoding
# NOTE(review): this renders the whole frame; a .head() preview would keep the output shorter
processed_df_kaggle

Unnamed: 0,ID,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,trip_type,congestion_surcharge,payment_type
0,T00876,1.0,2021-07-01 13:41:08,2021-07-01 14:36:53,0,2.0,74,132,1.0,52.00,0.0,0.5,10.00,6.55,0.3,69.35,1.0,0.00,1.0
1,T01498,2.0,2021-07-01 18:33:34,2021-07-01 19:20:39,0,5.0,82,143,1.0,22.02,0.0,0.0,2.46,6.55,0.3,34.08,1.0,2.75,1.0
2,T08153,2.0,2021-07-06 09:37:59,2021-07-06 09:46:37,0,1.0,74,75,1.0,7.50,0.0,0.5,2.08,0.00,0.3,10.38,1.0,0.00,1.0
3,T03394,2.0,2021-07-02 18:24:34,2021-07-02 18:39:57,0,1.0,244,50,1.0,20.00,1.0,0.5,4.91,0.00,0.3,29.46,1.0,2.75,1.0
4,T09784,2.0,2021-07-07 08:30:59,2021-07-07 08:40:24,0,1.0,7,129,1.0,10.00,0.0,0.5,0.00,0.00,0.3,10.80,1.0,0.00,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,T07978,2.0,2021-07-06 08:58:05,2021-07-06 09:24:58,0,1.0,75,136,1.0,25.00,0.0,0.5,3.00,0.00,0.3,28.80,1.0,0.00,1.0
396,T09337,2.0,2021-07-06 18:15:57,2021-07-06 18:16:50,0,1.0,193,193,1.0,0.00,0.0,0.0,0.00,0.00,0.0,0.00,1.0,0.00,1.0
397,T03284,1.0,2021-07-02 17:52:27,2021-07-02 17:59:55,0,1.0,95,95,1.0,6.50,1.0,0.5,1.70,0.00,0.3,10.00,1.0,0.00,1.0
398,T07319,2.0,2021-07-05 17:14:54,2021-07-05 17:25:24,0,1.0,95,95,1.0,9.50,0.0,0.5,0.00,0.00,0.3,10.30,1.0,0.00,2.0


In [300]:
# Parse the raw timestamp strings of the Kaggle frame into datetime columns
processed_df_kaggle['pickup_datetime'] = processed_df_kaggle['lpep_pickup_datetime'].pipe(pd.to_datetime)
processed_df_kaggle['dropoff_datetime'] = processed_df_kaggle['lpep_dropoff_datetime'].pipe(pd.to_datetime)

In [301]:
# Derive hour, day-of-week and month from both timestamps (same as for training)
_kaggle_pickup = processed_df_kaggle['pickup_datetime'].dt
_kaggle_dropoff = processed_df_kaggle['dropoff_datetime'].dt

processed_df_kaggle['pickup_hour'] = _kaggle_pickup.hour
processed_df_kaggle['pickup_day'] = _kaggle_pickup.dayofweek
processed_df_kaggle['pickup_month'] = _kaggle_pickup.month
processed_df_kaggle['dropoff_hour'] = _kaggle_dropoff.hour
processed_df_kaggle['dropoff_day'] = _kaggle_dropoff.dayofweek
processed_df_kaggle['dropoff_month'] = _kaggle_dropoff.month


In [302]:
# Trip duration in minutes: drop-off minus pick-up, seconds divided by 60
processed_df_kaggle['trip_duration'] = (
    processed_df_kaggle['dropoff_datetime']
    .sub(processed_df_kaggle['pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

In [303]:
def encode_test_cyclical(feature, max_val):
    """Add sine/cosine encodings of ``feature`` to ``processed_df_kaggle`` in place.

    Delegates to ``encode_cyclical`` so the training and Kaggle frames share
    one implementation of the transform and cannot drift apart.

    Parameters
    ----------
    feature : name of the column in ``processed_df_kaggle`` to encode.
    max_val : period used to map the values onto the unit circle.

    Returns
    -------
    None. ``<feature>_sin`` and ``<feature>_cos`` are written onto the
    module-level ``processed_df_kaggle`` frame.
    """
    encode_cyclical(processed_df_kaggle, feature, max_val)

In [304]:
# Cyclical encoding of the hour (period 24) and month (period 12) columns
for _feature, _period in (
    ('pickup_hour', 24),
    ('dropoff_hour', 24),
    ('pickup_month', 12),
    ('dropoff_month', 12),
):
    encode_test_cyclical(_feature, _period)

In [305]:
# Remove the raw hour/month columns (their sin/cos encodings remain) together
# with the original and parsed timestamp columns
processed_df_kaggle = processed_df_kaggle.drop(columns=[
    'pickup_hour',
    'pickup_month',
    'dropoff_hour',
    'dropoff_month',
    'lpep_pickup_datetime',
    'lpep_dropoff_datetime',
    'pickup_datetime',
    'dropoff_datetime',
])

In [306]:
# One-hot encode the weekday columns of the Kaggle frame using exactly the
# indicator columns produced for the training frame.
# An independent get_dummies(drop_first=True) here drops whichever weekday
# happens to be first in the Kaggle data and omits weekdays that do not occur,
# so the columns could differ from (or mean something different than) the
# training ones. Reindexing to the training columns keeps both frames aligned;
# weekdays absent from the Kaggle data become all-zero columns.
test_pickup_day_one_hot = (
    pd.get_dummies(processed_df_kaggle['pickup_day'], prefix='pickup_day')
    .reindex(columns=pickup_day_one_hot.columns, fill_value=0)
    .astype(int)
)
test_dropoff_day_one_hot = (
    pd.get_dummies(processed_df_kaggle['dropoff_day'], prefix='dropoff_day')
    .reindex(columns=dropoff_day_one_hot.columns, fill_value=0)
    .astype(int)
)

In [307]:
# Append the weekday indicator columns to the right of the Kaggle frame
processed_df_kaggle = pd.concat(
    (processed_df_kaggle, test_pickup_day_one_hot, test_dropoff_day_one_hot),
    axis="columns",
)

In [308]:
# The integer weekday columns are replaced by their one-hot indicators
processed_df_kaggle = processed_df_kaggle.drop(columns=['pickup_day', 'dropoff_day'])

In [309]:
# Preview the fully processed Kaggle frame
processed_df_kaggle.head()

Unnamed: 0,ID,VendorID,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,fare_amount,extra,mta_tax,...,pickup_day_3,pickup_day_4,pickup_day_5,pickup_day_6,dropoff_day_1,dropoff_day_2,dropoff_day_3,dropoff_day_4,dropoff_day_5,dropoff_day_6
0,T00876,1.0,0,2.0,74,132,1.0,52.0,0.0,0.5,...,1,0,0,0,0,0,1,0,0,0
1,T01498,2.0,0,5.0,82,143,1.0,22.02,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
2,T08153,2.0,0,1.0,74,75,1.0,7.5,0.0,0.5,...,0,0,0,0,1,0,0,0,0,0
3,T03394,2.0,0,1.0,244,50,1.0,20.0,1.0,0.5,...,0,1,0,0,0,0,0,1,0,0
4,T09784,2.0,0,1.0,7,129,1.0,10.0,0.0,0.5,...,0,0,0,0,0,1,0,0,0,0


In [310]:
# ID is an identifier, not a feature; df_kaggle still holds it
processed_df_kaggle = processed_df_kaggle.drop(columns=["ID"])

# 🛠️ Klasifikasi

## 📍 Training

In [311]:
def evaluate_classifier_performance(prediction, y_test):
    """Print an evaluation summary for a set of class predictions.

    Outputs, in order: the scikit-learn classification report, a confusion
    matrix (actual labels as rows, predicted labels as columns), accuracy,
    and macro/micro averaged F1, precision and recall.

    Parameters
    ----------
    prediction : predicted class labels.
    y_test : true class labels, aligned position by position with ``prediction``.

    Returns
    -------
    None. Everything is printed or displayed.
    """
    # Compact overview: per-class precision / recall / F1
    report = classification_report(y_test, prediction, zero_division=0)
    print("Hasil Evaluasi berdasarkan classification report \n\n%s\n" % report)
    print()
    print("Confusion Matrix")
    print()
    # Rebuild both label vectors as positionally indexed Series so crosstab
    # pairs them by position instead of by the original index of y_test
    actual = pd.Series(np.array(y_test), name="actual")
    predicted = pd.Series(np.array(prediction), name="prediction")
    # `display` is not imported in this file; presumably the notebook-provided helper
    display(pd.crosstab(actual, predicted))
    print()
    print()

    print("Butuh informasi lebih lengkap? silakan simak di bawah ini : ")
    print('Accuracy Average:', accuracy_score(y_test, prediction))

    # (label, scorer, averaging mode, extra keyword arguments), in print order
    averaged_metrics = [
        ('F1 Macro Average:', f1_score, 'macro', {}),
        ('F1 Micro Average:', f1_score, 'micro', {}),
        ('Precision Macro Average:', precision_score, 'macro', {'zero_division': 0}),
        ('Precision Micro Average:', precision_score, 'micro', {'zero_division': 0}),
        ('Recall Macro Average:', recall_score, 'macro', {'zero_division': 0}),
        ('Recall Micro Average:', recall_score, 'micro', {'zero_division': 0}),
    ]
    for label, scorer, average, extra in averaged_metrics:
        print(label, scorer(y_test, prediction, average=average, **extra))
    print()

In [312]:
# Independent modelling copy; the engineered frame `df` stays as it is
df_classif = df.copy(deep=True)

In [313]:
# Drop the ID column: it identifies a row and is not used as a feature
df_classif = df_classif.drop(columns=["ID"])

In [314]:
# Identify categorical features
# The wider candidate list below is kept commented out; only
# store_and_fwd_flag is currently passed to the model as categorical.
# categorical_features = ['VendorID', 'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID', 'trip_type']
categorical_features = ['store_and_fwd_flag']

In [315]:
# Convert categorical columns to 'category' dtype.
# The conversion targets df_classif, the modelling copy that X is built from.
# Converting `df` at this point would have no effect on the training
# features, because df_classif was copied from it in an earlier cell.
for col in categorical_features:
    df_classif[col] = df_classif[col].astype('category')

In [316]:
# Separate the target (payment_type) from the feature matrix
# NOTE(review): the Kaggle frame printed earlier shows no trip_distance column,
# while X keeps it; confirm both feature sets match before predicting on Kaggle data
y = df_classif['payment_type']
X = df_classif.drop(columns='payment_type')

In [317]:
# Positional indices of the categorical columns within X
cat_features_indices = list(map(X.columns.get_loc, categorical_features))

In [318]:
# Hold out 20% of the rows for evaluation; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [319]:
# Define hyperparameter grid for RandomizedSearchCV
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.03, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5],
    'random_strength': [0, 10, 100],
    'bagging_temperature': [0, 1, 5],
    'border_count': [32, 64, 128],
    'min_data_in_leaf': [1, 5, 10]
}

# Initialize CatBoost model
model = CatBoostClassifier(cat_features=cat_features_indices,
                           random_state=42, verbose=0)

# Initialize RandomizedSearchCV.
# random_state fixes which 50 parameter combinations are sampled from the
# grid; without it every kernel restart evaluates a different set of
# candidates, so best_params_ and the reported scores are not reproducible.
random_search = RandomizedSearchCV(model, param_distributions=param_grid,
                                   n_iter=50, scoring='f1', cv=3, verbose=1,
                                   n_jobs=-1, random_state=42)

In [320]:
# Check for NaN values in X_train (total count over all cells)
print("Missing values in X_train: ", X_train.isna().to_numpy().sum())

Missing values in X_train:  0


In [321]:
# Preview the training features that go into the search
X_train.head()

Unnamed: 0,VendorID,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,...,pickup_day_3,pickup_day_4,pickup_day_5,pickup_day_6,dropoff_day_1,dropoff_day_2,dropoff_day_3,dropoff_day_4,dropoff_day_5,dropoff_day_6
5587,2.0,0,1.0,41,151,1.0,1.96,10.0,0.0,0.5,...,0,0,0,0,0,0,0,0,0,0
7911,2.0,0,1.0,247,42,6.0,1.17,8.0,1.0,0.5,...,0,1,0,0,0,0,0,1,0,0
3959,2.0,0,1.0,74,193,1.0,4.68,16.0,0.5,0.5,...,1,0,0,0,0,0,1,0,0,0
1908,2.0,0,1.0,41,41,1.0,0.63,4.5,0.0,0.5,...,0,0,0,0,1,0,0,0,0,0
6951,2.0,0,1.0,41,41,1.0,0.97,6.5,0.0,0.5,...,1,0,0,0,0,0,1,0,0,0


In [None]:
# Run the randomized hyperparameter search on the training split
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [None]:
# Collect the winning parameter set and the estimator exposed by the search
best_params = random_search.best_params_
best_model = random_search.best_estimator_

print(best_params)
print(best_model)

In [None]:
# Predict on the hold-out split, then print the evaluation summary
y_pred = best_model.predict(X_test)

evaluate_classifier_performance(prediction=y_pred, y_test=y_test)

## 📍 Kaggle

In [None]:
# df_kaggle_classification = processed_df_kaggle.copy()

In [None]:
# y_result = best_model.predict(df_kaggle_classification)

In [None]:
# csv_result = pd.DataFrame({
#     "ID": df_kaggle["ID"],
#     "payment_type": y_result
# })

In [None]:
# csv_result

In [None]:
# from datetime import datetime
# from zoneinfo import ZoneInfo

# utc_plus_7 = ZoneInfo("Asia/Bangkok")
# current_iso_timestamp = datetime.now(utc_plus_7).isoformat()

# filename = f"submission_kasdead_{current_iso_timestamp}.csv"
# csv_result.to_csv(filename, index=False)