In [2]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'df-classify-2022:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4658471%2F7926488%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240324%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240324T110621Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1883af4f477541c327a42ea06385481ecef395a63a5fa639c2bf80114699f84776595a9950239bc7e0666f10c3d34875e92ed71d9d3badb0bf5b773effa425814475b7c21a273df9e926ba5032fbfcf98c8c4f60dd87f54aab77da758b51f1a6365a4244ebcdf03bdf2b834f45ed83bb27074e7ebe026ae907001f7675aa70260b562bf88559fc1485bf4d416bb57be7ab9107088fda2a7f8a7daba5cdfbbb655ca2699f716b3347bed432258048f2716a6cc479c75fcdd8feccb1ff5fc01ceab293855041d523c16aa059f21066aeb3fdbe3f89461a9ae01156b7803c14183a324d631bc3aec5bd9c700f7e36e693f14c1a6bf0713f72b1e90324238c55034d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading df-classify-2022, 3662688 bytes compressed
Downloaded and uncompressed: df-classify-2022
Data source import complete.


In [3]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from xgboost import XGBRFClassifier
from sklearn.linear_model import LogisticRegression

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import Pipeline, make_pipeline

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score, recall_score, precision_score, roc_auc_score, classification_report
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler



import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv("/kaggle/input/df-classify-2022/df2.csv")
df


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DEP_DELAY,CRS_ARR_TIME,CRS_ELAPSED_TIME,DISTANCE,Status,ORIGIN_AIRPORT_ID_Encoded,DEST_AIRPORT_ID_Encoded
0,9,30,5,2,-6.0,4,164.0,1022.0,0,370,98
1,3,26,6,6,-4.0,7,154.0,912.0,0,170,75
2,5,10,2,7,-4.0,7,91.0,391.0,0,96,80
3,4,19,2,8,99.0,8,64.0,175.0,1,97,252
4,6,26,7,8,82.0,8,84.0,246.0,1,120,303
...,...,...,...,...,...,...,...,...,...,...,...
299995,9,11,7,4,-8.0,5,118.0,694.0,0,183,277
299996,1,17,1,3,-2.0,4,74.0,199.0,0,40,242
299997,6,3,5,8,17.0,8,80.0,345.0,1,194,301
299998,6,25,6,5,-8.0,6,128.0,701.0,0,22,354


In [6]:
# Chọn mẫu stratify
df = df.groupby('Status', group_keys=False).apply(lambda x: x.sample(int(np.rint(50000*len(x)/len(df))))).sample(frac=1).reset_index(drop=True)
df

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DEP_DELAY,CRS_ARR_TIME,CRS_ELAPSED_TIME,DISTANCE,Status,ORIGIN_AIRPORT_ID_Encoded,DEST_AIRPORT_ID_Encoded
0,11,4,5,4,-8.0,5,123.0,723.0,0,255,14
1,8,6,6,7,-13.0,8,89.0,475.0,0,245,277
2,9,16,5,3,-6.0,4,278.0,1797.0,0,45,197
3,9,23,5,3,0.0,4,208.0,1448.0,1,19,318
4,7,1,5,7,0.0,8,114.0,702.0,0,44,365
...,...,...,...,...,...,...,...,...,...,...,...
49995,3,9,3,5,-4.0,6,183.0,1236.0,1,114,256
49996,3,27,7,3,-6.0,4,159.0,1107.0,0,229,342
49997,7,1,5,5,73.0,6,337.0,2556.0,1,196,160
49998,2,14,1,5,0.0,6,70.0,197.0,0,128,352


In [7]:
df['Status'].value_counts(normalize=True)

0    0.63392
1    0.36608
Name: Status, dtype: float64

In [8]:
X = df.drop('Status', axis=1)
y = df['Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# **1. Supper Vector Machine**

## **1.1. No sampling**

In [None]:
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

[[6238   94]
 [1685 1983]]
              precision    recall  f1-score   support

           0       0.79      0.99      0.88      6332
           1       0.95      0.54      0.69      3668

    accuracy                           0.82     10000
   macro avg       0.87      0.76      0.78     10000
weighted avg       0.85      0.82      0.81     10000



## **1.2. SMOTETomek**

In [None]:
SMOTETomek_pipeline_svm = make_pipeline(SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')),
                                    SVC())

SMOTETomek_svm = SMOTETomek_pipeline_svm.fit(X_train, y_train)
y_pred_svm_tm = SMOTETomek_svm.predict(X_test)

print(confusion_matrix(y_test, y_pred_svm_tm))
print(classification_report(y_test, y_pred_svm_tm))

[[6039  293]
 [1319 2349]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      6332
           1       0.89      0.64      0.74      3668

    accuracy                           0.84     10000
   macro avg       0.85      0.80      0.81     10000
weighted avg       0.85      0.84      0.83     10000



## **1.3. SMOTE-RUS**

In [None]:
SMOTERUS_pipeline_svm = make_pipeline(SMOTE(sampling_strategy='minority'),
                                      RandomUnderSampler(sampling_strategy='majority'),
                                      SVC())

SMOTERUS_svm = SMOTERUS_pipeline_svm.fit(X_train, y_train)
y_pred_svm_rus = SMOTERUS_svm.predict(X_test)

print(confusion_matrix(y_test, y_pred_svm_rus))
print(classification_report(y_test, y_pred_svm_rus))

[[6076  256]
 [1351 2317]]
              precision    recall  f1-score   support

           0       0.82      0.96      0.88      6332
           1       0.90      0.63      0.74      3668

    accuracy                           0.84     10000
   macro avg       0.86      0.80      0.81     10000
weighted avg       0.85      0.84      0.83     10000



In [None]:
print("ROC-AUC for No sampling:",  roc_auc_score(y_test, y_pred_svm))
print("ROC-AUC for SMOTETomek:",  roc_auc_score(y_test, y_pred_svm_tm))
print("ROC-AUC for SMOTERUS:",  roc_auc_score(y_test, y_pred_svm_rus))


ROC-AUC for No sampling: 0.7628881807867259
ROC-AUC for SMOTETomek: 0.7970652950411645
ROC-AUC for SMOTERUS: 0.7956249125971077


# **2. Random Forest**

## **2.1. No sampling**

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

[[6016  316]
 [1321 2347]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      6332
           1       0.88      0.64      0.74      3668

    accuracy                           0.84     10000
   macro avg       0.85      0.79      0.81     10000
weighted avg       0.84      0.84      0.83     10000



## **2.2. SMOTETomek**

In [None]:
SMOTETomek_pipeline_rf = make_pipeline(SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')),
                                    RandomForestClassifier())

SMOTETomek_rf = SMOTETomek_pipeline_rf.fit(X_train, y_train)
y_pred_rf_rf = SMOTETomek_rf.predict(X_test)

print(confusion_matrix(y_test, y_pred_rf_rf))
print(classification_report(y_test, y_pred_rf_rf))

[[5725  607]
 [1145 2523]]
              precision    recall  f1-score   support

           0       0.83      0.90      0.87      6332
           1       0.81      0.69      0.74      3668

    accuracy                           0.82     10000
   macro avg       0.82      0.80      0.80     10000
weighted avg       0.82      0.82      0.82     10000



## **2.3. SMOTE-RUS**

In [None]:
SMOTERUS_pipeline_rf = make_pipeline(SMOTE(sampling_strategy='minority'),
                                      RandomUnderSampler(sampling_strategy='majority'),
                                      RandomForestClassifier())

SMOTERUS_rf = SMOTERUS_pipeline_rf.fit(X_train, y_train)
y_pred_rf_rus = SMOTERUS_rf.predict(X_test)

print(confusion_matrix(y_test, y_pred_rf_rus))
print(classification_report(y_test, y_pred_rf_rus))

[[5800  532]
 [1195 2473]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      6332
           1       0.82      0.67      0.74      3668

    accuracy                           0.83     10000
   macro avg       0.83      0.80      0.81     10000
weighted avg       0.83      0.83      0.82     10000



In [None]:
print("ROC-AUC for No sampling:",  roc_auc_score(y_test, y_pred_rf))
print("ROC-AUC for SMOTETomek:",  roc_auc_score(y_test, y_pred_rf_rf))
print("ROC-AUC for SMOTERUS:",  roc_auc_score(y_test, y_pred_rf_rus))


ROC-AUC for No sampling: 0.7949764950802937
ROC-AUC for SMOTETomek: 0.7959892491859044
ROC-AUC for SMOTERUS: 0.7950958452367748


# **3. Decision Tree**

## **3.1. No sampling**

In [None]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

print(confusion_matrix(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

[[4992 1340]
 [1157 2511]]
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      6332
           1       0.65      0.68      0.67      3668

    accuracy                           0.75     10000
   macro avg       0.73      0.74      0.73     10000
weighted avg       0.75      0.75      0.75     10000



## **3.2. SMOTETomek**

In [None]:
SMOTETomek_pipeline_dt = make_pipeline(SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')),
                                    DecisionTreeClassifier())

SMOTETomek_dt = SMOTETomek_pipeline_dt.fit(X_train, y_train)
y_pred_dt_tm = SMOTETomek_dt.predict(X_test)

print(confusion_matrix(y_test, y_pred_dt_tm))
print(classification_report(y_test, y_pred_dt_tm))

[[4944 1388]
 [1182 2486]]
              precision    recall  f1-score   support

           0       0.81      0.78      0.79      6332
           1       0.64      0.68      0.66      3668

    accuracy                           0.74     10000
   macro avg       0.72      0.73      0.73     10000
weighted avg       0.75      0.74      0.74     10000



## **3.3. SMOTE-RUS**

In [None]:
SMOTERUS_pipeline_dt = make_pipeline(SMOTE(sampling_strategy='minority'),
                                      RandomUnderSampler(sampling_strategy='majority'),
                                      DecisionTreeClassifier())

SMOTERUS_dt = SMOTERUS_pipeline_dt.fit(X_train, y_train)
y_pred_dt_rus = SMOTERUS_dt.predict(X_test)

print(confusion_matrix(y_test, y_pred_dt_rus))
print(classification_report(y_test, y_pred_dt_rus))

[[4963 1369]
 [1198 2470]]
              precision    recall  f1-score   support

           0       0.81      0.78      0.79      6332
           1       0.64      0.67      0.66      3668

    accuracy                           0.74     10000
   macro avg       0.72      0.73      0.73     10000
weighted avg       0.75      0.74      0.74     10000



In [None]:
print("ROC-AUC for No sampling:",  roc_auc_score(y_test, y_pred_dt))
print("ROC-AUC for SMOTETomek:",  roc_auc_score(y_test, y_pred_dt_tm))
print("ROC-AUC for SMOTERUS:",  roc_auc_score(y_test, y_pred_dt_rus))


ROC-AUC for No sampling: 0.7364728739311013
ROC-AUC for SMOTETomek: 0.729274750604673
ROC-AUC for SMOTERUS: 0.7285940413788542


# **4. XGBoost**

## **4.1. No sampling**

In [None]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

print(confusion_matrix(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

[[5970  362]
 [1292 2376]]
              precision    recall  f1-score   support

           0       0.82      0.94      0.88      6332
           1       0.87      0.65      0.74      3668

    accuracy                           0.83     10000
   macro avg       0.84      0.80      0.81     10000
weighted avg       0.84      0.83      0.83     10000



## **4.2. SMOTETomek**

In [None]:
SMOTETomek_pipeline_xgb = make_pipeline(SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')),
                                    XGBClassifier())

SMOTETomek_xgb = SMOTETomek_pipeline_xgb.fit(X_train, y_train)
y_pred_xgb_tm = SMOTETomek_xgb.predict(X_test)

print(confusion_matrix(y_test, y_pred_xgb_tm))
print(classification_report(y_test, y_pred_xgb_tm))

[[5679  653]
 [1111 2557]]
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      6332
           1       0.80      0.70      0.74      3668

    accuracy                           0.82     10000
   macro avg       0.82      0.80      0.80     10000
weighted avg       0.82      0.82      0.82     10000



## **4.3. SMOTE-RUS**

In [None]:
SMOTERUS_pipeline_xgb = make_pipeline(SMOTE(sampling_strategy='minority'),
                                      RandomUnderSampler(sampling_strategy='majority'),
                                      XGBClassifier())

SMOTERUS_xgb = SMOTERUS_pipeline_xgb.fit(X_train, y_train)
y_pred_xgb_rus = SMOTERUS_xgb.predict(X_test)

print(confusion_matrix(y_test, y_pred_xgb_rus))
print(classification_report(y_test, y_pred_xgb_rus))

[[5752  580]
 [1172 2496]]
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      6332
           1       0.81      0.68      0.74      3668

    accuracy                           0.82     10000
   macro avg       0.82      0.79      0.80     10000
weighted avg       0.82      0.82      0.82     10000



In [None]:
print("ROC-AUC for No sampling:",  roc_auc_score(y_test, y_pred_xgb))
print("ROC-AUC for SMOTETomek:",  roc_auc_score(y_test, y_pred_xgb_tm))
print("ROC-AUC for SMOTERUS:",  roc_auc_score(y_test, y_pred_xgb_rus))


ROC-AUC for No sampling: 0.7952972593897402
ROC-AUC for SMOTETomek: 0.79699158383341
ROC-AUC for SMOTERUS: 0.7944407971557119


# **5. XGBoost Random Forest**

## **5.1. No sampling**

In [None]:
xgbrf = XGBRFClassifier()
xgbrf.fit(X_train, y_train)
y_pred_xgbrf = xgbrf.predict(X_test)

print(confusion_matrix(y_test, y_pred_xgbrf))
print(classification_report(y_test, y_pred_xgbrf))

[[6025  307]
 [1341 2327]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      6332
           1       0.88      0.63      0.74      3668

    accuracy                           0.84     10000
   macro avg       0.85      0.79      0.81     10000
weighted avg       0.84      0.84      0.83     10000



## **5.2. SMOTETomek**

In [None]:
SMOTETomek_pipeline_xgbrf = make_pipeline(SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')),
                                    XGBRFClassifier())

SMOTETomek_xgbrf = SMOTETomek_pipeline_xgbrf.fit(X_train, y_train)
y_pred_xgbrf_tm = SMOTETomek_xgbrf.predict(X_test)

print(confusion_matrix(y_test, y_pred_xgbrf_tm))
print(classification_report(y_test, y_pred_xgbrf_tm))

[[5679  653]
 [1078 2590]]
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      6332
           1       0.80      0.71      0.75      3668

    accuracy                           0.83     10000
   macro avg       0.82      0.80      0.81     10000
weighted avg       0.83      0.83      0.82     10000



## **5.3. SMOTE-RUS**

In [None]:
SMOTERUS_pipeline_xgbrf = make_pipeline(SMOTE(sampling_strategy='minority'),
                                      RandomUnderSampler(sampling_strategy='majority'),
                                      XGBRFClassifier())

SMOTERUS_xgbrf = SMOTERUS_pipeline_xgbrf.fit(X_train, y_train)
y_pred_xgbrf_rus = SMOTERUS_xgbrf.predict(X_test)

print(confusion_matrix(y_test, y_pred_xgbrf_rus))
print(classification_report(y_test, y_pred_xgbrf_rus))

[[5715  617]
 [1103 2565]]
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      6332
           1       0.81      0.70      0.75      3668

    accuracy                           0.83     10000
   macro avg       0.82      0.80      0.81     10000
weighted avg       0.83      0.83      0.83     10000



In [None]:
print("ROC-AUC for No sampling:",  roc_auc_score(y_test, y_pred_xgbrf))
print("ROC-AUC for SMOTETomek:",  roc_auc_score(y_test, y_pred_xgbrf_tm))
print("ROC-AUC for SMOTERUS:",  roc_auc_score(y_test, y_pred_xgbrf_rus))


ROC-AUC for No sampling: 0.7929608896598332
ROC-AUC for SMOTETomek: 0.8014899480645985
ROC-AUC for SMOTERUS: 0.8009248001014045


# **6. Logistic Regression**

## **6.1. No sampling**

In [None]:
lgt = LogisticRegression()
lgt.fit(X_train, y_train)
y_pred_lgt = lgt.predict(X_test)

print(confusion_matrix(y_test, y_pred_lgt))
print(classification_report(y_test, y_pred_lgt))

[[5998  334]
 [1285 2383]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      6332
           1       0.88      0.65      0.75      3668

    accuracy                           0.84     10000
   macro avg       0.85      0.80      0.81     10000
weighted avg       0.84      0.84      0.83     10000



## **6.2. SMOTETomek**

In [9]:
SMOTETomek_pipeline_lgt = make_pipeline(SMOTETomek(tomek=TomekLinks(sampling_strategy='majority')),
                                    LogisticRegression())

SMOTETomek_lgt = SMOTETomek_pipeline_lgt.fit(X_train, y_train)
y_pred_lgt_tm = SMOTETomek_lgt.predict(X_test)

print(confusion_matrix(y_test, y_pred_lgt_tm))
print(classification_report(y_test, y_pred_lgt_tm))

[[5703  687]
 [1078 2532]]
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      6390
           1       0.79      0.70      0.74      3610

    accuracy                           0.82     10000
   macro avg       0.81      0.80      0.80     10000
weighted avg       0.82      0.82      0.82     10000



## **6.3. SMOTE-RUS**

In [None]:
SMOTERUS_pipeline_lgt = make_pipeline(SMOTE(sampling_strategy='minority'),
                                      RandomUnderSampler(sampling_strategy='majority'),
                                      LogisticRegression())

SMOTERUS_lgt = SMOTERUS_pipeline_lgt.fit(X_train, y_train)
y_pred_lgt_rus = SMOTERUS_lgt.predict(X_test)

print(confusion_matrix(y_test, y_pred_lgt_rus))
print(classification_report(y_test, y_pred_lgt_rus))

[[5697  635]
 [1096 2572]]
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      6332
           1       0.80      0.70      0.75      3668

    accuracy                           0.83     10000
   macro avg       0.82      0.80      0.81     10000
weighted avg       0.83      0.83      0.82     10000



In [None]:
print("ROC-AUC for No sampling:",  roc_auc_score(y_test, y_pred_lgt))
print("ROC-AUC for SMOTETomek:",  roc_auc_score(y_test, y_pred_lgt_tm))
print("ROC-AUC for SMOTERUS:",  roc_auc_score(y_test, y_pred_lgt_rus))


ROC-AUC for No sampling: 0.7984624496507673
ROC-AUC for SMOTETomek: 0.799687037367449
ROC-AUC for SMOTERUS: 0.8004576467111367


In [11]:
import pickle

with open('SMOTETomek_lgt.pkl', 'wb') as f:
    pickle.dump(SMOTETomek_lgt, f)