In [35]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    confusion_matrix
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    KFold,
    GridSearchCV,
)
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE, SMOTENC, ADASYN

In [36]:
def read_file(file_path):
    return pd.read_csv(file_path)

In [37]:
def data_split(df, test_size = None):
    X = df.drop('target', axis = 1)
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state=42)

    return X_train, X_test, y_train, y_test

In [38]:
def data_augmentation(X_train, y_train):
    adasyn = ADASYN(random_state=42)
    X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

    return X_train_resampled, y_train_resampled

In [39]:
def best_features(X_train, X_val, y_train):
    k_best_features = 20
    selector = SelectKBest(score_func=f_classif, k=k_best_features)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_val_selected = selector.transform(X_val)

    return X_train_selected, X_val_selected

In [40]:
def random_forest_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    if param_grid:
        model = RandomForestClassifier(
            n_estimators=param_grid['n_estimators'], 
            max_depth=param_grid['max_depth'],
            min_samples_leaf=param_grid['min_samples_leaf'],
            min_samples_split=param_grid['min_samples_split'],
            random_state = 42
        )
    else:
        model = RandomForestClassifier(random_state=42)
        
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [41]:
def parameter_tuning(x_train, y_train):
    param_grid = {
        'n_estimators': [10, 100],
        'max_depth': [6, 8, 10, 12],
        'min_samples_leaf': [8, 12, 18],
        'min_samples_split': [8, 16, 20],
    }

    grid_search = GridSearchCV(RandomForestClassifier(random_state = 42), param_grid, cv = 3, scoring = 'roc_auc')
    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_
    print('Best Hyperparameters:', best_params)
    
    return best_params

In [42]:
PATH = r'D:\LGAimers\Hackerton\data\preprocessed_train_data_StandardScaler.csv'
# PATH = r'D:\LGAimers\Hackerton\data\preprocessed_train_data_RobustScaler.csv'
# PATH = r'D:\LGAimers\Hackerton\data\preprocessed_train_data_Normalizer.csv'
df = read_file(PATH)

In [None]:
def xgboost_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    if param_grid:
        model = XGBClassifier(
            n_estimators=param_grid['n_estimators'], 
            max_depth=param_grid['max_depth'],
            learning_rate=param_grid['learning_rate'],
            subsample=param_grid['subsample'],
            colsample_bytree=param_grid['colsample_bytree'],
            random_state = 42,
            use_label_encoder=False,
            eval_metric='logloss',
        )
    else:
        model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss',)
        
    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_pred))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [None]:
def xgboost_parameter_tuning(x_train, y_train):
    param_grid = {
        'n_estimators': [50, 100, 150],
        'max_depth': [3, 6, 9],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }

    grid_search = GridSearchCV(XGBClassifier(random_state = 42), param_grid, cv = 3, scoring = 'roc_auc')
    grid_search.fit(x_train, y_train)

    best_params = grid_search.best_params_
    print('Best Hyperparameters:', best_params)
    
    return best_params

In [43]:
X_train, X_val, y_train, y_val = data_split(df, test_size = 0.3)

In [44]:
print(f'X_train.shape = {X_train.shape}')
print(f'X_val.shape = {X_val.shape}')
print(f'y_train.shape = {y_train.shape}')
print(f'y_val.shape = {y_val.shape}')

X_train.shape = (28354, 126)
X_val.shape = (12152, 126)
y_train.shape = (28354,)
y_val.shape = (12152,)


In [45]:
y_train.describe()

count    28354.000000
mean         0.940643
std          0.236295
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: target, dtype: float64

In [46]:
X_train_selected, X_val_selected = best_features(X_train=X_train, X_val=X_val, y_train=y_train)


In [47]:
X_train_resampled, y_train_resampled = data_augmentation(X_train=X_train_selected, y_train=y_train)

In [48]:
print(f'X_train_resampled.shape = {X_train_resampled.shape}')
print(f'y_train_resampled.shape = {y_train_resampled.shape}')

X_train_resampled.shape = (53469, 20)
y_train_resampled.shape = (53469,)


In [49]:
non_grid_search_model = random_forest_learning_and_evaluation(
    x_train=X_train_resampled, 
    y_train=y_train_resampled, 
    x_val=X_val_selected, 
    y_val=y_val, 
    param_grid=None
)

Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.33      0.15       667
           1       0.96      0.83      0.89     11485

    accuracy                           0.80     12152
   macro avg       0.53      0.58      0.52     12152
weighted avg       0.91      0.80      0.85     12152

ROC-AUC Score: 0.5786841450846192
Confusion Matrix:
 [[ 221  446]
 [1998 9487]]


In [50]:
best_params = parameter_tuning(X_train_resampled, y_train_resampled)

Best Hyperparameters: {'max_depth': 12, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 100}


In [51]:
grid_search_model = random_forest_learning_and_evaluation(
    x_train=X_train_resampled, 
    y_train=y_train_resampled, 
    x_val=X_val_selected, 
    y_val=y_val, 
    param_grid=best_params
)

Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.42      0.16       667
           1       0.96      0.78      0.86     11485

    accuracy                           0.76     12152
   macro avg       0.53      0.60      0.51     12152
weighted avg       0.91      0.76      0.82     12152

ROC-AUC Score: 0.5987745569966432
Confusion Matrix:
 [[ 278  389]
 [2518 8967]]
