In [1]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    confusion_matrix
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    KFold,
    GridSearchCV,
)
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE, SMOTENC, ADASYN
from imblearn.combine import SMOTEENN

In [2]:
def read_file(file_path):
    return pd.read_csv(file_path)

In [3]:
def data_split(df, test_size = None):
    X = df.drop('target', axis = 1)
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state=42)

    return X_train, X_test, y_train, y_test

In [4]:
def data_augmentation(X_train, y_train):
    # aug = ADASYN(random_state=42)
    aug = SMOTEENN(random_state=42)
    X_train_resampled, y_train_resampled = aug.fit_resample(X_train, y_train)

    return X_train_resampled, y_train_resampled

In [5]:
def best_features(X_train, X_val, y_train):
    k_best_features = int(0.2 * X_train.shape[1])
    selector = SelectKBest(score_func=f_classif, k=k_best_features)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_val_selected = selector.transform(X_val)

    return X_train_selected, X_val_selected

In [6]:
def random_forest_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    if param_grid:
        model = RandomForestClassifier(
            n_estimators=param_grid['n_estimators'], 
            max_depth=param_grid['max_depth'],
            min_samples_leaf=param_grid['min_samples_leaf'],
            min_samples_split=param_grid['min_samples_split'],
            random_state = 42
        )
    else:
        model = RandomForestClassifier(random_state=42)
        
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [7]:
def parameter_tuning(x_train, y_train):
    param_grid = {
        'n_estimators': [10, 100],
        'max_depth': [6, 8, 10, 12],
        'min_samples_leaf': [8, 12, 18],
        'min_samples_split': [8, 16, 20],
    }

    grid_search = GridSearchCV(RandomForestClassifier(random_state = 42), param_grid, cv = 3, scoring = 'roc_auc')
    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_
    print('Best Hyperparameters:', best_params)
    
    return best_params

In [8]:
def xgboost_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    if param_grid:
        model = XGBClassifier(
            n_estimators=param_grid['n_estimators'], 
            max_depth=param_grid['max_depth'],
            learning_rate=param_grid['learning_rate'],
            random_state = 42,
        )
    else:
        model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss',)
        
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [9]:
def xgboost_parameter_tuning(x_train, y_train):
    param_grid = {
        'n_estimators': [100, 200, 300, 400, 500, 600],
        'max_depth': [2, 3, 4, 5],
        'learning_rate': [0.01, 0.05, 0.1],
    }

    grid_search = GridSearchCV(
        XGBClassifier(random_state=42),
        param_grid, 
        cv=5, 
        scoring='f1_macro'
    )

    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_
    print('Best Hyperparameters:', best_params)
    
    return best_params

In [10]:
def lightgbm_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    if param_grid:
        model = LGBMClassifier(
            n_estimators=param_grid['n_estimators'], 
            max_depth=param_grid['max_depth'],
            learning_rate=param_grid['learning_rate'],
            subsample=param_grid['subsample'],
            colsample_bytree=param_grid['colsample_bytree'],
            random_state = 42,
            use_label_encoder=False,
            eval_metric='logloss',
        )
    else:
        model = LGBMClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss',)
        
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [11]:
def lightgbm_parameter_tuning(x_train, y_train):
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }

    grid_search = GridSearchCV(
        LGBMClassifier(random_state=42),
        param_grid, 
        cv=3, 
        scoring='roc_auc'
    )

    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_
    print('Best Hyperparameters:', best_params)
    
    return best_params

In [12]:
def adaboost_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    if param_grid:
        model = AdaBoostClassifier(
            n_estimators=param_grid['n_estimators'], 
            learning_rate=param_grid['learning_rate'],
            base_estimator__max_depth=param_grid['base_estimator__max_depth'],
            random_state = 42,
        )
    else:
        model = AdaBoostClassifier(random_state=42)
        
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [13]:
def adaboost_parameter_tuning(x_train, y_train):
    param_grid_ada = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 1.0],
        'base_estimator__max_depth': [1, 2, 3]
    }

    base_estimator = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(
        AdaBoostClassifier(base_estimator=base_estimator, random_state=42),
        param_grid_ada, 
        cv=3, 
        scoring='roc_auc'
    )

    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_
    print('Best Hyperparameters:', best_params)
    
    return best_params

In [14]:
# PATH = r'D:\LGAimers\Hackerton\data\preprocessed_train_data_StandardScaler.csv'
# PATH = r'D:\LGAimers\Hackerton\data\preprocessed_train_data_RobustScaler.csv'
# PATH = r'D:\LGAimers\Hackerton\data\preprocessed_train_data_Normalizer.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\scaled_train_data_Robust.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\scaled_train_data_Normalizer.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\outlier_scaled_train_data_Robust.csv'
PATH = r'D:\LGAimers\Hackerton\Inheon\data\outlier_scaled_train_data_Standard.csv'
df = read_file(PATH)

In [15]:
X_train, X_val, y_train, y_val = data_split(df, test_size = 0.3)

In [16]:
y_train = y_train.replace('Normal', 0).replace('AbNormal', 1)
y_val = y_val.replace('Normal', 0).replace('AbNormal', 1)

  y_train = y_train.replace('Normal', 0).replace('AbNormal', 1)
  y_val = y_val.replace('Normal', 0).replace('AbNormal', 1)


In [17]:
print(f'X_train.shape = {X_train.shape}')
print(f'X_val.shape = {X_val.shape}')
print(f'y_train.shape = {y_train.shape}')
print(f'y_val.shape = {y_val.shape}')

X_train.shape = (28354, 138)
X_val.shape = (12152, 138)
y_train.shape = (28354,)
y_val.shape = (12152,)


In [18]:
y_train.describe()

count    28354.000000
mean         0.059357
std          0.236295
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: target, dtype: float64

In [19]:
X_train_selected, X_val_selected = best_features(X_train=X_train, X_val=X_val, y_train=y_train)


  f = msb / msw


In [20]:
X_train_resampled, y_train_resampled = data_augmentation(X_train=X_train_selected, y_train=y_train)

In [21]:
print(f'X_train_resampled.shape = {X_train_resampled.shape}')
print(f'y_train_resampled.shape = {y_train_resampled.shape}')

X_train_resampled.shape = (25666, 27)
y_train_resampled.shape = (25666,)


In [22]:
# best_params_adaboost = adaboost_parameter_tuning(X_train_resampled, y_train_resampled)

In [23]:
# grid_search_adaboost_model = adaboost_learning_and_evaluation(
#     x_train=X_train_resampled,
#     y_train=y_train_resampled,
#     x_val=X_val_selected,
#     y_val=y_val,
#     param_grid=best_params_adaboost,
# )

In [24]:
best_params_lgbm = lightgbm_parameter_tuning(X_train_resampled, y_train_resampled)

[LightGBM] [Info] Number of positive: 1218, number of negative: 15892
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000851 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1074
[LightGBM] [Info] Number of data points in the train set: 17110, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.071186 -> initscore=-2.568606
[LightGBM] [Info] Start training from score -2.568606
[LightGBM] [Info] Number of positive: 1218, number of negative: 15893
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1070
[LightGBM] [Info] Number of data points in the train set: 17111, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.071182 -> initscore=-2.568669
[Lig

In [25]:
grid_search_lgbm_model = lightgbm_learning_and_evaluation(
    x_train=X_train_resampled,
    y_train=y_train_resampled,
    x_val=X_val_selected,
    y_val=y_val,
    param_grid=best_params_lgbm,
)

[LightGBM] [Info] Number of positive: 1827, number of negative: 23839
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000815 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1555
[LightGBM] [Info] Number of data points in the train set: 25666, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.071184 -> initscore=-2.568648
[LightGBM] [Info] Start training from score -2.568648
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     11485
           1       0.14      0.02      0.04       667

    accuracy                           0.94     12152
   macro avg       0.54      0.51      0.50     12152
weighted avg       0.90      0.94      0.92     12152

ROC-AUC Score: 0.5073262236970326
Confusion Matrix:
 [[11395    90]
 [  652    15]]


In [26]:
best_params_xgb = xgboost_parameter_tuning(X_train_resampled, y_train_resampled)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 600}


In [27]:
grid_search_xgboost_model = xgboost_learning_and_evaluation(
    x_train=X_train_resampled,
    y_train=y_train_resampled,
    x_val=X_val_selected,
    y_val=y_val,
    param_grid=best_params_xgb,
)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96     11485
           1       0.09      0.05      0.07       667

    accuracy                           0.92     12152
   macro avg       0.52      0.51      0.51     12152
weighted avg       0.90      0.92      0.91     12152

ROC-AUC Score: 0.5111737557429383
Confusion Matrix:
 [[11139   346]
 [  632    35]]


In [28]:
best_params_rf = parameter_tuning(X_train_resampled, y_train_resampled)

Best Hyperparameters: {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 100}


In [29]:
grid_search_model = random_forest_learning_and_evaluation(
    x_train=X_train_resampled, 
    y_train=y_train_resampled, 
    x_val=X_val_selected, 
    y_val=y_val, 
    param_grid=best_params_rf,
)

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97     11485
           1       0.09      0.02      0.04       667

    accuracy                           0.93     12152
   macro avg       0.52      0.50      0.50     12152
weighted avg       0.90      0.93      0.91     12152

ROC-AUC Score: 0.5045835158171894
Confusion Matrix:
 [[11332   153]
 [  652    15]]
