In [1]:
import os
import numpy as np
import pandas as pd
import joblib

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    confusion_matrix
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    KFold,
    GridSearchCV,
)
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE, SMOTENC, ADASYN
from imblearn.combine import SMOTEENN

In [2]:
def read_file(file_path):
    """Load a CSV file into a pandas DataFrame.

    Args:
        file_path: Location of the CSV file; forwarded unchanged to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(file_path)
    return frame

In [3]:
def data_split(df, test_size = None):
    """Separate the ``target`` column from the features and split both into two parts.

    Args:
        df: DataFrame holding a ``target`` column alongside the feature columns.
        test_size: Forwarded to ``train_test_split`` as its ``test_size``.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    labels = df['target']
    features = df.drop('target', axis = 1)

    # The seed is fixed so every run produces the same partition.
    parts = train_test_split(features, labels, test_size = test_size, random_state=42)
    X_train, X_test, y_train, y_test = parts

    return X_train, X_test, y_train, y_test

In [4]:
def data_augmentation(X_train, y_train):
    """Oversample the training data with SMOTE.

    Args:
        X_train: Training features.
        y_train: Training labels.

    Returns:
        The tuple ``(X_resampled, y_resampled)`` from ``SMOTE.fit_resample``.
    """
    # Samplers tried as alternatives: ADASYN(random_state=42), SMOTEENN(random_state=42).
    oversampler = SMOTE(random_state=42)
    resampled = oversampler.fit_resample(X_train, y_train)
    X_balanced, y_balanced = resampled

    return X_balanced, y_balanced

In [5]:
def best_features(X_train, X_val, y_train, k_best_features=15):
    """Keep the highest-scoring features according to the ANOVA F-test.

    The selector is fitted on the training data only and then applied to
    ``X_val``, so no information from ``X_val`` influences the selection.

    Args:
        X_train: Training features used to fit the selector.
        X_val: Second feature set, reduced to the same selected columns.
        y_train: Training labels used to score the features.
        k_best_features: Number of features to keep. Defaults to 15 (the
            value previously hard-coded) and is capped at the number of
            columns in ``X_train``.

    Returns:
        The tuple ``(X_train_selected, X_val_selected)`` as returned by the
        selector's ``fit_transform`` / ``transform``.
    """
    n_features = np.shape(X_train)[1]
    # Cap k so an input with fewer columns than requested keeps every column
    # instead of tripping SelectKBest's range check on k.
    k = min(k_best_features, n_features)

    selector = SelectKBest(score_func=f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_val_selected = selector.transform(X_val)

    return X_train_selected, X_val_selected

In [6]:
def random_forest_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    """Fit a random forest and print its validation metrics.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels (binary).
        param_grid: Mapping with the keys ``n_estimators``, ``max_depth``,
            ``min_samples_leaf`` and ``min_samples_split``. When falsy, the
            classifier's default hyperparameters are used.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    if param_grid:
        model = RandomForestClassifier(
            n_estimators=param_grid['n_estimators'],
            max_depth=param_grid['max_depth'],
            min_samples_leaf=param_grid['min_samples_leaf'],
            min_samples_split=param_grid['min_samples_split'],
            random_state = 42
        )
    else:
        model = RandomForestClassifier(random_state=42)

    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    # ROC-AUC ranks samples by a continuous score. Feeding it hard 0/1 labels
    # collapses the curve to a single threshold, so use the positive-class
    # probability (second column of predict_proba) instead.
    y_score = model.predict_proba(x_val)[:, 1]

    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_score))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [7]:
def parameter_tuning(x_train, y_train):
    """Grid-search random forest hyperparameters with 3-fold CV scored by ROC-AUC.

    Args:
        x_train: Training features.
        y_train: Training labels.

    Returns:
        ``best_params_`` of the fitted ``GridSearchCV``.
    """
    search_space = dict(
        n_estimators=[10, 100],
        max_depth=[6, 8, 10, 12],
        min_samples_leaf=[8, 12, 18],
        min_samples_split=[8, 16, 20],
    )

    forest = RandomForestClassifier(random_state = 42)
    searcher = GridSearchCV(forest, search_space, cv = 3, scoring = 'roc_auc')
    searcher.fit(x_train, y_train)

    winner = searcher.best_params_
    print('Best Hyperparameters:', winner)

    return winner

In [8]:
def xgboost_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    """Fit an XGBoost classifier and print its validation metrics.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels (binary).
        param_grid: Mapping with the keys ``n_estimators``, ``max_depth`` and
            ``learning_rate``. When falsy, a default-configured classifier
            is used.

    Returns:
        The fitted ``XGBClassifier``.
    """
    if param_grid:
        # The obsolete ``silent=0`` argument is dropped: XGBoost replaced it
        # with ``verbosity``, whose default already prints warnings.
        model = XGBClassifier(
            booster = 'gbtree',
            objective = 'binary:logistic',
            n_estimators=param_grid['n_estimators'],
            max_depth=param_grid['max_depth'],
            learning_rate=param_grid['learning_rate'],
            random_state = 42,
        )
    else:
        model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss',)

    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    # ROC-AUC must be computed from a continuous score, not thresholded labels;
    # use the positive-class probability.
    y_score = model.predict_proba(x_val)[:, 1]

    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_score))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [9]:
def xgboost_parameter_tuning(x_train, y_train):
    """Grid-search XGBoost hyperparameters with 3-fold CV scored by F1.

    Args:
        x_train: Training features.
        y_train: Training labels.

    Returns:
        ``best_params_`` of the fitted ``GridSearchCV``.
    """
    # NOTE(review): this search optimises 'f1' while the other tuning helpers
    # in this notebook optimise 'roc_auc' — confirm the difference is intended.
    search_space = dict(
        n_estimators=[100, 200, 300, 400, 500, 600],
        max_depth=[3, 5, 10, 15],
        learning_rate=[0.01, 0.05, 0.1, 0.2],
    )

    booster = XGBClassifier(random_state=42)
    searcher = GridSearchCV(booster, search_space, cv=3, scoring='f1')
    searcher.fit(x_train, y_train)

    winner = searcher.best_params_
    print('Best Hyperparameters:', winner)

    return winner

In [10]:
def lightgbm_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    """Fit a LightGBM classifier and print its validation metrics.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels (binary).
        param_grid: Mapping with the keys ``n_estimators``, ``max_depth``,
            ``learning_rate``, ``subsample`` and ``colsample_bytree``. When
            falsy, a default-configured classifier is used.

    Returns:
        The fitted ``LGBMClassifier``.
    """
    # ``use_label_encoder`` and ``eval_metric`` were copied from the XGBoost
    # helper. They are not LGBMClassifier constructor parameters, so they are
    # no longer forwarded.
    if param_grid:
        model = LGBMClassifier(
            n_estimators=param_grid['n_estimators'],
            max_depth=param_grid['max_depth'],
            learning_rate=param_grid['learning_rate'],
            subsample=param_grid['subsample'],
            colsample_bytree=param_grid['colsample_bytree'],
            random_state = 42,
        )
    else:
        model = LGBMClassifier(random_state=42)

    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    # ROC-AUC must be computed from a continuous score, not thresholded labels;
    # use the positive-class probability.
    y_score = model.predict_proba(x_val)[:, 1]

    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_score))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [11]:
def lightgbm_parameter_tuning(x_train, y_train):
    """Grid-search LightGBM hyperparameters with 3-fold CV scored by ROC-AUC.

    Args:
        x_train: Training features.
        y_train: Training labels.

    Returns:
        ``best_params_`` of the fitted ``GridSearchCV``.
    """
    # NOTE(review): ``subsample`` is searched without setting ``subsample_freq``;
    # verify against the LightGBM docs that row subsampling is actually active.
    search_space = dict(
        n_estimators=[50, 100, 150],
        max_depth=[3, 6, 9],
        learning_rate=[0.01, 0.1, 0.2],
        subsample=[0.6, 0.8, 1.0],
        colsample_bytree=[0.6, 0.8, 1.0],
    )

    booster = LGBMClassifier(random_state=42)
    searcher = GridSearchCV(booster, search_space, cv=3, scoring='roc_auc')
    searcher.fit(x_train, y_train)

    winner = searcher.best_params_
    print('Best Hyperparameters:', winner)

    return winner

In [12]:
def adaboost_learning_and_evaluation(x_train, y_train, x_val, y_val, param_grid):
    """Fit an AdaBoost classifier and print its validation metrics.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_val: Validation features.
        y_val: Validation labels (binary).
        param_grid: Mapping with the keys ``n_estimators``, ``learning_rate``
            and ``base_estimator__max_depth`` (``estimator__max_depth`` is
            accepted as well). When falsy, a default-configured classifier
            is used.

    Returns:
        The fitted ``AdaBoostClassifier``.

    Raises:
        KeyError: If ``param_grid`` is truthy but lacks a required key.
    """
    if param_grid:
        # ``<prefix>__max_depth`` is GridSearchCV's nested-parameter syntax and
        # is not a constructor argument; passing it to AdaBoostClassifier(...)
        # raises TypeError. The depth belongs to the weak learner, so build the
        # tree explicitly.
        if 'base_estimator__max_depth' in param_grid:
            depth_key = 'base_estimator__max_depth'
        else:
            depth_key = 'estimator__max_depth'
        weak_learner = DecisionTreeClassifier(
            max_depth=param_grid[depth_key],
            random_state=42,
        )
        # The weak learner is passed positionally because its keyword name
        # differs between scikit-learn releases (base_estimator vs estimator).
        model = AdaBoostClassifier(
            weak_learner,
            n_estimators=param_grid['n_estimators'],
            learning_rate=param_grid['learning_rate'],
            random_state = 42,
        )
    else:
        model = AdaBoostClassifier(random_state=42)

    model.fit(x_train, y_train)

    y_pred = model.predict(x_val)
    # ROC-AUC must be computed from a continuous score, not thresholded labels;
    # use the positive-class probability.
    y_score = model.predict_proba(x_val)[:, 1]

    print('Classification Report:\n', classification_report(y_val, y_pred))
    print('ROC-AUC Score:', roc_auc_score(y_val, y_score))
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

    return model

In [13]:
def adaboost_parameter_tuning(x_train, y_train):
    """Grid-search AdaBoost hyperparameters with 3-fold CV scored by ROC-AUC.

    The weak learner is a ``DecisionTreeClassifier`` whose ``max_depth`` is
    searched together with AdaBoost's ``n_estimators`` and ``learning_rate``.

    Args:
        x_train: Training features.
        y_train: Training labels.

    Returns:
        A dict with the best ``n_estimators``, ``learning_rate`` and
        ``base_estimator__max_depth`` values.
    """
    weak_learner = DecisionTreeClassifier(random_state=42)
    # Pass the weak learner positionally: scikit-learn renamed the keyword from
    # ``base_estimator`` to ``estimator``, and the old name is no longer
    # accepted by recent releases.
    booster = AdaBoostClassifier(weak_learner, random_state=42)

    # The nested-parameter prefix must match whichever attribute name the
    # installed scikit-learn exposes.
    if 'estimator' in booster.get_params(deep=False):
        depth_key = 'estimator__max_depth'
    else:
        depth_key = 'base_estimator__max_depth'

    param_grid_ada = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 1.0],
        depth_key: [1, 2, 3],
    }

    grid_search = GridSearchCV(
        booster,
        param_grid_ada,
        cv=3,
        scoring='roc_auc'
    )

    grid_search.fit(x_train, y_train)

    best_params = dict(grid_search.best_params_)
    # Always report the depth under the historical key so existing callers that
    # read 'base_estimator__max_depth' keep working.
    best_params['base_estimator__max_depth'] = best_params.pop(depth_key)
    print('Best Hyperparameters:', best_params)

    return best_params

In [14]:
def dataframe_to_csv(df, file_name):
    """Write ``df`` to ``file_name`` as CSV without the index, then print a confirmation.

    Args:
        df: DataFrame to save.
        file_name: Destination passed to ``DataFrame.to_csv``.
    """
    df.to_csv(file_name, index = False)
    confirmation = f'Data Saved as {file_name}'
    print(confirmation)

In [15]:
# Training-data location. The commented-out entries are alternative input files
# kept for reference; only the uncommented PATH is loaded.
# NOTE(review): this is an absolute, machine-specific path — the cell only runs
# where that file exists.
# PATH = r'D:\LGAimers\Hackerton\data\preprocessed_train_data_StandardScaler.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\preprocessed_train_data_RobustScaler.csv'
# PATH = r'D:\LGAimers\Hackerton\data\preprocessed_train_data_Normalizer.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\scaled_train_data_Robust.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\scaled_train_data_Normalizer.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\outlier_scaled_train_data_Robust.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\outlier_scaled_train_data_Standard.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\use_this_train_data_robust.csv'
# PATH = r'D:\LGAimers\Hackerton\Inheon\data\use_this_train_data_standard.csv'
PATH = r'D:\LGAimers\Hackerton\Inheon\data\use_this_train_data_normalizer.csv'
df = read_file(PATH)

In [None]:
# Hold out 20% of the rows as a validation set (the seed is fixed inside data_split).
X_train, X_val, y_train, y_val = data_split(df, test_size = 0.2)

In [None]:
# Encode the string labels as integers: 'Normal' -> 0, 'AbNormal' -> 1.
# The explicit astype(int) pins a numeric dtype instead of relying on pandas'
# replace() to downcast the object column, and fails loudly if any label other
# than the two expected ones is present. Re-running the cell is a no-op.
label_codes = {'Normal': 0, 'AbNormal': 1}
y_train = y_train.replace(label_codes).astype(int)
y_val = y_val.replace(label_codes).astype(int)

In [None]:
# Sanity check: report the dimensions of each train/validation part.
print(f'X_train.shape = {X_train.shape}')
print(f'X_val.shape = {X_val.shape}')
print(f'y_train.shape = {y_train.shape}')
print(f'y_val.shape = {y_val.shape}')

In [None]:
# Summary statistics of the encoded training labels.
y_train.describe()

In [None]:
# Fit the feature selector on the training split and apply it to the validation split.
X_train_selected, X_val_selected = best_features(X_train=X_train, X_val=X_val, y_train=y_train)


In [None]:
# Oversample only the (feature-selected) training split; the validation split is left untouched.
X_train_resampled, y_train_resampled = data_augmentation(X_train=X_train_selected, y_train=y_train)

In [None]:
# Report the dimensions of the training data after oversampling.
print(f'X_train_resampled.shape = {X_train_resampled.shape}')
print(f'y_train_resampled.shape = {y_train_resampled.shape}')

In [None]:
# best_params_adaboost = adaboost_parameter_tuning(X_train_resampled, y_train_resampled)

In [None]:
# grid_search_adaboost_model = adaboost_learning_and_evaluation(
#     x_train=X_train_resampled,
#     y_train=y_train_resampled,
#     x_val=X_val_selected,
#     y_val=y_val,
#     param_grid=best_params_adaboost,
# )

In [None]:
# best_params_lgbm = lightgbm_parameter_tuning(X_train_resampled, y_train_resampled)

In [None]:
# grid_search_lgbm_model = lightgbm_learning_and_evaluation(
#     x_train=X_train_resampled,
#     y_train=y_train_resampled,
#     x_val=X_val_selected,
#     y_val=y_val,
#     param_grid=best_params_lgbm,
# )

In [None]:
# Train a hand-configured LightGBM model on the oversampled data and report
# validation metrics.
# NOTE(review): is_unbalance=True is applied on top of SMOTE-resampled data —
# confirm that combining both is intended.
model = LGBMClassifier(
    n_estimators=1000,
    num_leaves=64,
    n_jobs=-1,
    is_unbalance = True,
    boost_from_average=False,
    random_state=42,
)

model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_val_selected)
# ROC-AUC must be computed from a continuous score, not thresholded labels;
# use the positive-class probability.
y_score = model.predict_proba(X_val_selected)[:, 1]

print('Classification Report:\n', classification_report(y_val, y_pred))
print('ROC-AUC Score:', roc_auc_score(y_val, y_score))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

In [None]:
# Grid-search XGBoost hyperparameters on the oversampled training data.
best_params_xgb = xgboost_parameter_tuning(X_train_resampled, y_train_resampled)

In [None]:
# Refit XGBoost with the tuned hyperparameters and print validation metrics.
grid_search_xgboost_model = xgboost_learning_and_evaluation(
    x_train=X_train_resampled,
    y_train=y_train_resampled,
    x_val=X_val_selected,
    y_val=y_val,
    param_grid=best_params_xgb,
)

In [None]:
# Grid-search random forest hyperparameters on the oversampled training data.
best_params_rf = parameter_tuning(X_train_resampled, y_train_resampled)

In [None]:
# Refit the random forest with the tuned hyperparameters and print validation metrics.
grid_search_model = random_forest_learning_and_evaluation(
    x_train=X_train_resampled, 
    y_train=y_train_resampled, 
    x_val=X_val_selected, 
    y_val=y_val, 
    param_grid=best_params_rf,
)

In [None]:
# Load the test-set feature file.
# NOTE(review): absolute, machine-specific path — presumably preprocessed the same way as the training file; confirm.
test_data = pd.read_csv(r'D:\LGAimers\Hackerton\Inheon\data\use_this_test_data_normalizer.csv')

In [None]:
# Column names, dtypes and non-null counts of the test features.
test_data.info()

In [None]:
# Summary statistics of the test features.
test_data.describe()

In [None]:
# Re-fit the selector on the training split and apply it to the test features.
# Only the transformed test matrix is kept: rebinding X_train here would replace
# the full training frame with its already-reduced array, so running this cell a
# second time would fit the selector on a different column set than test_data has.
X_test = best_features(X_train, test_data, y_train)[1]

In [None]:
# Dimensions of the feature-selected test matrix.
X_test.shape

In [None]:
# Final XGBoost model configured with the tuned hyperparameters.
# The obsolete ``silent=0`` argument is dropped: XGBoost replaced it with
# ``verbosity``, whose default already prints warnings.
model = XGBClassifier(
    booster = 'gbtree',
    objective = 'binary:logistic',
    n_estimators=best_params_xgb['n_estimators'],
    max_depth=best_params_xgb['max_depth'],
    learning_rate=best_params_xgb['learning_rate'],
    random_state = 42,
)

In [None]:
# Train the final model on the oversampled, feature-selected training data.
model.fit(X_train_resampled, y_train_resampled)


In [None]:
# Validation metrics for the final model.
y_pred = model.predict(X_val_selected)
# ROC-AUC must be computed from a continuous score, not thresholded labels;
# use the positive-class probability.
y_score = model.predict_proba(X_val_selected)[:, 1]

print('Classification Report:\n', classification_report(y_val, y_pred))
print('ROC-AUC Score:', roc_auc_score(y_val, y_score))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))

In [None]:
# Predict labels for the feature-selected test matrix.
y_test_pred = model.predict(X_test)

In [None]:
# Load the test file whose 'Set ID' column is used to build the submission below.
# NOTE(review): absolute, machine-specific path.
real_X_test = pd.read_csv(r'D:\LGAimers\Hackerton\Inheon\data\test.csv')

In [None]:
# Wrap the predictions in a Series so they can be concatenated column-wise,
# then print both shapes to compare row counts.
y_test_pred = pd.Series(y_test_pred)
print(y_test_pred.shape)
print(real_X_test.shape)

In [None]:
# Append the predictions as an extra column; concat aligns the two objects on their index.
X_plus_y = pd.concat([real_X_test, y_test_pred], axis = 1)

In [None]:
# Replace the test file's own 'target' column with the model predictions.
# The frame is rebuilt from its inputs rather than edited in place: dropping
# 'target' from X_plus_y and renaming column 0 works only once, and a second run
# of the cell would drop the predictions themselves.
X_plus_y = pd.concat(
    [real_X_test.drop('target', axis = 1), pd.Series(y_test_pred, name='target')],
    axis = 1,
)
print(X_plus_y)

In [None]:
# Keep only the identifier and the predicted label for the submission file.
submission = X_plus_y[['Set ID', 'target']]

In [None]:
# Display the submission frame for a visual check.
submission

In [None]:
# Write the submission to 'submission.csv' in the current working directory.
dataframe_to_csv(submission, 'submission.csv')