In [2]:
!pip3 install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

In [4]:
# Mount Google Drive (Colab only) so files under /content/drive can be read by later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Absolute location of the dataset inside the mounted Drive; adjust when running elsewhere.
file_path = '/content/drive/MyDrive/Colab Notebooks/ML/Lab 10/lateness_data.json'
# Load the raw orders into a DataFrame; `data` itself is not modified by the cells below.
data = pd.read_json(file_path)
# Preview the first rows to check the parsed columns.
data.head()

Unnamed: 0,direct_delivery,batched_pickup,transport_type,order_time,delivery_distance,order_preparation_time,responsible_id,store_latitude,store_longitude,client_latitude,client_longitude,status,status_time
0,yes,yes,automobile,2023-10-09 19:23:55,7798,10,4444,55.795518,37.631224,55.780525,37.700847,early,18
1,yes,yes,automobile,2023-07-31 11:43:13,553,10,3798,55.783786,37.624401,55.781943,37.628641,early,6
2,yes,yes,bicycle,2023-08-21 19:35:37,711,20,7595,55.729464,37.692976,55.732003,37.689528,early,9
3,yes,yes,automobile,2023-09-06 00:19:29,3538,10,3797,55.731702,37.581492,55.726069,37.604986,early,16
4,yes,yes,automobile,2023-09-06 19:23:28,4169,10,9509,55.78136,37.677339,55.787238,37.700311,early,5


## Self practice task

Using dataset from assignment 2 (Task 1)
* Train and evaluate the following models using default parameters:
    1. Decision Tree
    1. Random Forest
    1. Adaptive boosting model
    1. [Catboost from Yandex](https://catboost.ai/en/docs/concepts/python-quickstart)
    1. [LightGBM](https://lightgbm.readthedocs.io/en/v3.3.2/)
* Apply hyperparameter tuning to the models listed above and compare the models' performance and training time.


In [None]:
# Work on a reproducible 10% random sample of the rows (fixed seed) —
# presumably to keep model training and grid search fast.
data_sample = data.sample(frac=0.1, random_state=42)

In [None]:
# Integer codes for the three delivery outcomes used as the classification target.
status_mapping = {
    'early': 2,
    'late': 0,
    'on time': 1
}

# Encode from the untouched `data` frame (aligned on the sampled index) instead of from
# `data_sample` itself, so re-running this cell does not map already-encoded integers to NaN.
data_sample['status'] = data.loc[data_sample.index, 'status'].map(status_mapping)

# Series.map yields NaN for any label missing from the mapping; fail here with a clear
# message rather than later inside model fitting.
unmapped_status = data_sample['status'].isna()
assert not unmapped_status.any(), (
    f"{int(unmapped_status.sum())} rows have a status label missing from status_mapping"
)

In [None]:
# One-hot encode both yes/no flag columns in a single pass; drop_first keeps one
# indicator column per flag. The resulting column order matches encoding them one by one.
data_sample = pd.get_dummies(
    data_sample,
    columns=['direct_delivery', 'batched_pickup'],
    drop_first=True,
)

In [None]:
# Integer codes for the courier's transport type.
transport_type_mapping = {
    'foot': 0,
    'scooter': 1,
    'bicycle': 2,
    'automobile': 3
}
# Encode from the untouched `data` frame (aligned on the sampled index) so re-running
# this cell does not map already-encoded integers to NaN.
data_sample['transport_type'] = data.loc[data_sample.index, 'transport_type'].map(transport_type_mapping)

# Series.map yields NaN for any value missing from the mapping; surface that immediately
# instead of passing NaN features to the models.
unmapped_transport = data_sample['transport_type'].isna()
assert not unmapped_transport.any(), (
    f"{int(unmapped_transport.sum())} rows have a transport_type missing from transport_type_mapping"
)

In [None]:
# Put rows in chronological order, then discard the timestamp so it is not used as a feature.
data_sample = (
    data_sample
    .sort_values(by='order_time')
    .drop(columns='order_time')
)

In [None]:
# Features: every remaining column except the target.
# NOTE(review): `status_time` stays in X; if it is derived from the delivery outcome it
# leaks the target (class 1 is predicted perfectly by every model) — confirm.
X = data_sample.drop(columns=['status'])
# Target kept as a one-column DataFrame; the training cells flatten it with .values.ravel().
y = data_sample.loc[:, ['status']]

In [None]:
# Hold out the last 20% of rows as the test set. shuffle=False keeps the row order produced by
# the earlier sort on order_time, so the models are evaluated on later orders than they train on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Step 1: Train and Evaluate Models with Default Parameters

In [None]:
# Model name -> elapsed time (timedelta) recorded by the default-parameter training cells below.
time_before_tuning = {}

In [None]:
# Train and evaluate Decision Tree (default parameters)
dt_model = DecisionTreeClassifier(random_state=42)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
dt_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_before_tuning['Decision Tree'] = end_time - start_time
dt_predictions = dt_model.predict(X_test)

In [None]:
# Train and evaluate Random Forest (default parameters)
rf_model = RandomForestClassifier(random_state=42)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
rf_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_before_tuning['Random Forest'] = end_time - start_time
rf_predictions = rf_model.predict(X_test)

In [None]:
# Train and evaluate Adaptive Boosting (default parameters)
adaboost_model = AdaBoostClassifier(random_state=42)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
adaboost_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_before_tuning['Adaptive Boosting'] = end_time - start_time
adaboost_predictions = adaboost_model.predict(X_test)

In [None]:
# Train and evaluate CatBoost (default parameters, training log silenced)
catboost_model = CatBoostClassifier(random_state=42, verbose=False)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
catboost_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_before_tuning['CatBoost'] = end_time - start_time
catboost_predictions = catboost_model.predict(X_test)

In [None]:
# Train and evaluate LightGBM (default parameters, training log silenced)
lgb_model = lgb.LGBMClassifier(random_state=42, verbose=-1)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
lgb_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_before_tuning['LightGBM'] = end_time - start_time
lgb_predictions = lgb_model.predict(X_test)

In [None]:
# Display names and test-set predictions of the default models, kept in the same order
# so they can be paired position by position.
models = [
    'Decision Tree',
    'Random Forest',
    'Adaptive Boosting',
    'CatBoost',
    'LightGBM',
]
predictions = [
    dt_predictions,
    rf_predictions,
    adaboost_predictions,
    catboost_predictions,
    lgb_predictions,
]

In [None]:
# Model name -> test accuracy of the default-parameter model.
acc_before_tuning = {}

In [None]:
# Score every default model on the held-out split and print a per-model summary.
for position, model in enumerate(models):
    preds = predictions[position]
    acc = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds)
    acc_before_tuning[model] = acc  # kept for the before/after comparison at the end
    print(f"Model: {model}\nAccuracy: {acc}\nClassification Report:\n{report}\nTraining Time before tuning: {time_before_tuning[model]}\n{'='*50}\n")



Model: Decision Tree
Accuracy: 0.773014440433213
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62       666
           1       1.00      1.00      1.00       683
           2       0.71      0.72      0.71       867

    accuracy                           0.77      2216
   macro avg       0.78      0.78      0.78      2216
weighted avg       0.77      0.77      0.77      2216

Training Time before tuning: 0:00:00.097115

Model: Random Forest
Accuracy: 0.8203971119133574
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.72      0.71       666
           1       1.00      1.00      1.00       683
           2       0.78      0.76      0.77       867

    accuracy                           0.82      2216
   macro avg       0.82      0.83      0.82      2216
weighted avg       0.82      0.82      0.82      2216

Training Time before tuning: 0:00:02.192104

Mod

# Step 2: Hyperparameter Tuning and Model Comparison

In [None]:
# Search spaces explored by the grid search, one dict per model
# (parameter name -> list of candidate values).
dt_param_grid = dict(max_depth=[3, 5, 7, None])

rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
)

adaboost_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1.0],
)

catboost_param_grid = dict(
    iterations=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1.0],
)

lgb_param_grid = dict(
    n_estimators=[200, 300, 400],
    max_depth=[5, 10, 20],
    num_leaves=[31, 50, 100],
    learning_rate=[0.01, 0.1, 1.0],
)

In [None]:
# Time-series cross-validation for the grid search: 3 splits, each validating on rows that
# come after the rows used for training, so tuning respects the chronological row order.
tscv = TimeSeriesSplit(n_splits=3)

In [None]:
# (estimator, parameter grid, display name) triples consumed by the grid-search loop.
# NOTE: this rebinds `models`, which earlier held plain name strings; re-running the
# default-model evaluation cell after this one will no longer work as written.
models = [
    (dt_model, dt_param_grid, 'Decision Tree'),
    (rf_model, rf_param_grid, 'Random Forest'),
    (adaboost_model, adaboost_param_grid, 'Adaptive Boosting'),
    (catboost_model, catboost_param_grid, 'CatBoost'),
    (lgb_model, lgb_param_grid, 'LightGBM'),
]

In [None]:
# Model name -> test accuracy of the best estimator found by the grid search.
acc_after_tuning  = {}

In [None]:
# Tune each estimator with an exhaustive grid search scored by accuracy on the time-series
# folds, then evaluate the winning configuration on the held-out test split.
for model, param_grid, name_model in models:
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=tscv,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train.values.ravel())

    # With GridSearchCV's default refit, best_estimator_ is retrained on all of X_train.
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    best_predictions = best_model.predict(X_test)

    acc = accuracy_score(y_test, best_predictions)
    report = classification_report(y_test, best_predictions)
    acc_after_tuning[name_model] = acc

    print(
        f"Model: {name_model}",
        f"Best Parameters: {best_params}",
        f"Accuracy after tuning: {acc}",
        f"Classification Report:\n{report}\n{'='*50}\n",
        sep="\n",
    )

Model: Decision Tree
Best Parameters: {'max_depth': 5}
Accuracy after tuning: 0.8136281588447654
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.73      0.70       666
           1       1.00      1.00      1.00       683
           2       0.78      0.73      0.75       867

    accuracy                           0.81      2216
   macro avg       0.82      0.82      0.82      2216
weighted avg       0.82      0.81      0.81      2216


Model: Random Forest
Best Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}
Accuracy after tuning: 0.8307761732851986
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.75      0.73       666
           1       1.00      1.00      1.00       683
           2       0.80      0.76      0.78       867

    accuracy                           0.83      2216
   macro avg       0.83      0.84      0.84      2216
weigh

# Step 3: Measure the training time of the tuned models

In [None]:
# Model name -> elapsed time (timedelta) recorded when retraining with the tuned hyperparameters.
time_after_tuning = {}

In [None]:
# Display the test accuracy recorded for each tuned model.
acc_after_tuning

{'Decision Tree': 0.8136281588447654,
 'Random Forest': 0.8307761732851986,
 'Adaptive Boosting': 0.7906137184115524,
 'CatBoost': 0.8357400722021661,
 'LightGBM': 0.8303249097472925}

In [None]:
# Retrain Decision Tree with the tuned hyperparameters
# (values copied by hand from the grid-search output above).
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
dt_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_after_tuning['Decision Tree'] = end_time - start_time
dt_predictions = dt_model.predict(X_test)

In [None]:
# Retrain Random Forest with the tuned hyperparameters
# (values copied by hand from the grid-search output above).
rf_model = RandomForestClassifier(random_state=42, max_depth=20, min_samples_split=10, n_estimators=200)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
rf_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_after_tuning['Random Forest'] = end_time - start_time
rf_predictions = rf_model.predict(X_test)

In [None]:
# Retrain Adaptive Boosting with the tuned hyperparameters
# (values entered by hand; keep them in sync with the grid-search output).
adaboost_model = AdaBoostClassifier(random_state=42, learning_rate=0.01, n_estimators=50)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
adaboost_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_after_tuning['Adaptive Boosting'] = end_time - start_time
adaboost_predictions = adaboost_model.predict(X_test)

In [None]:
# Retrain CatBoost with the tuned hyperparameters
# (values entered by hand; keep them in sync with the grid-search output).
catboost_model = CatBoostClassifier(random_state=42, iterations=100, learning_rate=0.1, verbose=False)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
catboost_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_after_tuning['CatBoost'] = end_time - start_time
catboost_predictions = catboost_model.predict(X_test)

In [None]:
# Retrain LightGBM with the tuned hyperparameters
# (values entered by hand; keep them in sync with the grid-search output).
lgb_model = lgb.LGBMClassifier(random_state=42, learning_rate=0.01, max_depth=10, n_estimators=200, num_leaves=31, verbose=-1)
# Time only the fit so the stored value is a true training time;
# prediction on the test split happens after the timer has stopped.
start_time = datetime.now()
lgb_model.fit(X_train, y_train.values.ravel())
end_time = datetime.now()
time_after_tuning['LightGBM'] = end_time - start_time
lgb_predictions = lgb_model.predict(X_test)

In [None]:
# Side-by-side summary of accuracy and training time before and after tuning.
# Only the display name of each (estimator, grid, name) triple is needed here.
for _estimator, _grid, name_model in models:
    print(
        f"Model: {name_model}",
        f"Accuracy before tuning: {acc_before_tuning[name_model]}",
        f"Accuracy after tuning: {acc_after_tuning[name_model]}",
        f"Training Time before tuning: {time_before_tuning[name_model]}",
        f"Training Time after tuning: {time_after_tuning[name_model]}",
        "="*50,
        "",
        sep="\n",
    )

Model: Decision Tree
Accuracy before tuning: 0.773014440433213
Accuracy after tuning: 0.8136281588447654
Training Time before tuning: 0:00:00.097115
Training Time after tuning: 0:00:00.053987

Model: Random Forest
Accuracy before tuning: 0.8203971119133574
Accuracy after tuning: 0.8307761732851986
Training Time before tuning: 0:00:02.192104
Training Time after tuning: 0:00:03.965894

Model: Adaptive Boosting
Accuracy before tuning: 0.7833935018050542
Accuracy after tuning: 0.7906137184115524
Training Time before tuning: 0:00:00.642757
Training Time after tuning: 0:00:00.596875

Model: CatBoost
Accuracy before tuning: 0.819043321299639
Accuracy after tuning: 0.8357400722021661
Training Time before tuning: 0:00:08.404562
Training Time after tuning: 0:00:00.918418

Model: LightGBM
Accuracy before tuning: 0.8212996389891697
Accuracy after tuning: 0.8303249097472925
Training Time before tuning: 0:00:00.510552
Training Time after tuning: 0:00:01.143383



# Conclusions on Hyperparameter Tuning Results:

## **Decision Tree:**
- **Accuracy:**
  - Before tuning: 77.3%
  - After tuning: 81.4%
- **Training Time:**
  - Before tuning: 0:00:00.097115
  - After tuning: 0:00:00.053987
- **Comments:**
  - Accuracy improved by 4.1 percentage points after tuning.
  - Training time dropped by roughly 45%, consistent with the tuned tree being capped at `max_depth=5`.

## **Random Forest:**
- **Accuracy:**
  - Before tuning: 82.0%
  - After tuning: 83.1%
- **Training Time:**
  - Before tuning: 0:00:02.192104
  - After tuning: 0:00:03.965894
- **Comments:**
  - Accuracy improved by about 1.0 percentage point after tuning.
  - Training time rose about 1.8×, consistent with the tuned forest using 200 trees (the scikit-learn default is 100).

## **Adaptive Boosting:**
- **Accuracy:**
  - Before tuning: 78.3%
  - After tuning: 79.1%
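- **Training Time:**
  - Before tuning: 0:00:00.642757
  - After tuning: 0:00:00.596875
- **Comments:**
  - Accuracy improved by about 0.7 percentage points after tuning.
  - Training time is essentially unchanged (about 7% lower); the tuned model keeps the default 50 estimators, so no large change is expected.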

## **CatBoost:**
- **Accuracy:**
  - Before tuning: 81.9%
  - After tuning: 83.6%
- **Training Time:**
  - Before tuning: 0:00:08.404562
  - After tuning: 0:00:00.918418
- **Comments:**
  - Accuracy improved by 1.7 percentage points after tuning.
  - Training became about 9× faster, because the tuned model uses 100 iterations instead of CatBoost's default of 1000.

## **LightGBM:**
- **Accuracy:**
  - Before tuning: 82.1%
  - After tuning: 83.0%
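- **Training Time:**
  - Before tuning: 0:00:00.510552
  - After tuning: 0:00:01.143383
- **Comments:**
  - Accuracy improved by 0.9 percentage points after tuning.
  - Training time more than doubled, consistent with the tuned model using 200 estimators (the LightGBM default is 100).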

## General Findings:
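- Hyperparameter tuning improved test accuracy for all models (by roughly 0.7 to 4.1 percentage points).
- Decision Tree and CatBoost trained markedly faster after tuning; Adaptive Boosting's training time was essentially unchanged.
- Random Forest and LightGBM took longer to train after tuning, consistent with the tuned models using more trees (200 instead of the default 100).
- CatBoost achieved the highest accuracy after tuning (83.6%) and the largest reduction in training time, although the Decision Tree remains the fastest model to train.
- All timings are single-run wall-clock measurements, so small differences should be interpreted with caution.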