# Imports

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, make_scorer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV
import  optuna
from os import truncate
from optuna.visualization import plot_optimization_history, plot_slice
import pickle

# Data import

Since the test data provided cannot be used as test data for the models as the labels for it are missing, we will additionally split the training data into training and test data. Provided test data will from now on be referred to as validation data.

In [3]:
train_data = pd.read_csv('artificial_train.data',
                         sep=' ', header=None).iloc[:, :-1]

In [5]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,485,477,537,479,452,471,491,476,475,473,...,477,481,477,485,511,485,481,479,475,496
1,483,458,460,487,587,475,526,479,485,469,...,463,478,487,338,513,486,483,492,510,517
2,487,542,499,468,448,471,442,478,480,477,...,487,481,492,650,506,501,480,489,499,498
3,480,491,510,485,495,472,417,474,502,476,...,491,480,474,572,454,469,475,482,494,461
4,484,502,528,489,466,481,402,478,487,468,...,488,479,452,435,486,508,481,504,495,511


In [7]:
train_labels = pd.read_csv('artificial_train.labels', sep=' ', header=None).replace(-1, 0, inplace=False)

In [8]:
train_labels.head()

Unnamed: 0,0
0,0
1,0
2,0
3,1
4,1


In [9]:
valid_data = pd.read_csv('artificial_test.data', sep=' ', header=None).iloc[:, :-1]

In [10]:
valid_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,483,454,513,495,523,469,453,477,506,479,...,455,480,543,259,413,520,485,498,523,510
1,485,508,493,487,478,472,504,476,479,475,...,486,480,535,534,514,452,484,495,548,477
2,483,521,507,475,493,486,421,475,496,483,...,491,476,498,495,508,528,486,465,508,503
3,474,504,576,480,553,483,524,478,483,483,...,521,475,470,463,509,525,479,467,552,517
4,495,474,523,479,495,488,485,476,497,478,...,510,471,522,343,509,520,475,493,506,491


In [11]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.25, random_state=42)

# Data Exploration

In [None]:
train_data.shape

(2000, 500)

In [None]:
train_data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,...,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,481.7225,483.4525,510.166,483.3845,501.6125,479.259,480.1095,476.565,486.7935,478.789,...,482.4585,478.8115,486.3565,496.5655,493.4995,510.893,478.2195,483.309,507.977,490.266
std,6.421769,30.186294,38.899165,9.059895,41.389418,6.795956,40.575925,1.384461,15.043836,7.19092,...,20.890033,4.011735,23.967366,127.635442,34.81902,37.459353,5.880613,13.559847,37.224297,25.825273
min,462.0,381.0,370.0,453.0,371.0,459.0,334.0,471.0,430.0,455.0,...,407.0,463.0,391.0,130.0,368.0,398.0,457.0,435.0,363.0,403.0
25%,477.0,464.0,485.0,477.0,475.0,475.0,452.75,476.0,477.0,474.0,...,468.0,476.0,471.0,404.0,470.0,486.0,474.0,474.0,482.0,473.0
50%,482.0,483.0,510.5,483.0,500.0,479.0,480.0,477.0,487.0,479.0,...,482.0,479.0,486.0,504.0,492.0,511.0,478.0,483.0,508.0,490.0
75%,486.0,503.0,536.0,490.0,528.0,484.0,506.25,477.0,496.25,484.0,...,496.0,481.0,502.0,586.0,517.0,535.0,482.0,492.0,533.0,507.25
max,503.0,600.0,654.0,519.0,688.0,505.0,611.0,481.0,536.0,503.0,...,549.0,497.0,566.0,920.0,615.0,661.0,500.0,535.0,644.0,583.0


In [None]:
nan_mask = train_data.isna().any()
nan_columns = nan_mask[nan_mask].index
print("Column indexes with NaN values:", nan_columns)

Column indexes with NaN values: Int64Index([], dtype='int64')


# Data Preparation

First, we need to prepare data for classic models by selecting the features we want to use, transforming and processing the data.
To do so, we will apply different approaches used for feature selection.

In [None]:
processed_train = X_train.copy(deep=True)

In [None]:
processed_test = X_test.copy(deep=True)

In [None]:
processed_valid = valid_data.copy(deep=True)

## Boruta

During the data exploration phase, we have already seen that there are a lot of features that are not relevant for the prediction of the target variable. To select the relevant features, we will use the Boruta algorithm. This algorithm is a wrapper built around the Random Forest algorithm. It works by creating shadow features that are random permutations of the original features. Then, it trains a Random Forest classifier on the dataset and compares the importance of each feature with the importance of the shadow features. If the feature importance is higher than the maximum shadow feature importance, the feature is marked as important. The algorithm iterates until all features are marked as important or unimportant. The features marked as important are the ones that are relevant for the prediction of the target variable.

In [None]:
boruta = BorutaPy(RandomForestClassifier(random_state=42, n_jobs=-1, class_weight='balanced', max_depth=5), n_estimators='auto', verbose=2, random_state=42, max_iter=150)

In [None]:
boruta.fit(processed_train.values, y_train.values.ravel())

In [None]:
boruta.ranking_

In [None]:
selected_rf_features = pd.DataFrame({'Feature':list(processed_train.columns),
                                       'Ranking':boruta.ranking_}).sort_values(by='Ranking')
selected_rf_features

Unnamed: 0,Feature,Ranking
378,378,1
241,241,1
204,204,1
105,105,1
433,433,1
...,...,...
423,423,476
228,228,477
7,7,478
276,276,479


In [None]:
column_set=selected_rf_features[selected_rf_features["Ranking"]==1].index
column_set

Int64Index([378, 241, 204, 105, 433, 338, 336, 424, 475, 442, 318,  48, 451,
            453,  64, 472, 493,  28, 128, 153, 281],
           dtype='int64')

In [None]:
processed_train=processed_train[column_set]
processed_train.head()

Unnamed: 0,378,241,204,105,433,338,336,424,475,442,...,48,451,453,64,472,493,28,128,153,281
1738,487,464,527,695,489,449,429,472,440,481,...,479,493,585,455,474,612,505,495,487,471
548,542,444,403,398,485,614,590,483,404,598,...,509,468,509,607,513,532,464,469,481,468
936,517,499,476,589,522,476,425,484,485,613,...,503,468,622,449,533,656,468,486,533,499
1389,474,434,461,444,445,667,490,486,376,691,...,470,466,597,511,563,634,463,474,424,440
1607,443,473,478,298,561,549,577,495,449,490,...,441,470,464,615,481,477,469,458,598,523


## Side note

Other feature selection methods were also tested, but they did not yield better results than the Boruta algorithm. The other methods that were tested are:
- SelectKBest
- Mutual Information
- Recursive Feature Elimination
- Variance Threshold
- VIF (Variance Inflation Factor)
- Outlier Detection

## Test & validation data transformation

In [None]:
processed_test = processed_test[column_set]

In [None]:
processed_valid = processed_valid[column_set]

## Data Export

In [None]:
processed_train.to_pickle('results/processed_train.pkl')

In [None]:
processed_test.to_pickle('results/processed_test.pkl')

In [None]:
processed_valid.to_pickle('results/processed_valid.pkl')

In [None]:
y_train.to_pickle('results/y_train.pkl')

In [None]:
y_test.to_pickle('results/y_test.pkl')

# Models

Among classical machine learning models three were selected:

- XGBoost

- LightGBM

- Catboost

As they are known for their good overall performance.

Firstly, each of the models was fitted without any hyperparameter tuning to see the base score. Then, for each of the models a separate Optuna study was executed in order to tune the model. Balanced accuracy was used as an evaluation metric, and each trial used both train and test datasets and crossvalidation to avoid overfitting. 

In [None]:
scorer = make_scorer(balanced_accuracy_score)

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
def convert_pred(pred_arr):
  return [1 if pred > 0.5 else 0 for pred in pred_arr]

## XGB Classifier

In [None]:
xgb.set_config(verbosity=2)

### Base model

In [None]:
xgb_base = xgb.XGBClassifier(random_state=42, n_jobs=-1)

In [None]:
xgb_base.fit(processed_train.values, y_train, verbose=True)

In [None]:
y_pred_train = xgb_base.predict(processed_train.values)
balanced_accuracy_score(y_pred_train, y_train)

1.0

In [None]:
y_pred_test = xgb_base.predict(processed_test.values)
balanced_accuracy_score(y_pred_test, y_test)

0.8602264175113289

### Hyperparameter tuning

#### Optuna

In [None]:
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.1, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
        'gamma': trial.suggest_float('gamma', 1e-9, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, 1.0, log=True),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
    }

    model = xgb.XGBClassifier(random_state=42, early_stopping_rounds=10, **params, verbose=False, verbose_eval=False)

    model.fit(processed_train, y_train, eval_set=[(processed_test, y_test)])

    y_pred = model.predict(processed_test)

    balanced_acc = balanced_accuracy_score(y_test, y_pred)

    return balanced_acc

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500, gc_after_trial=True)

In [None]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  500
Best trial:
  Value:  0.885894174306789
  Params: 
    booster: gbtree
    learning_rate: 0.03851163184324964
    max_depth: 10
    min_child_weight: 1
    subsample: 0.9269347750394223
    colsample_bytree: 0.6674138872477267
    gamma: 0.1468850924028178
    reg_alpha: 1.963084042726269e-05
    reg_lambda: 1.24534061187496e-05
    scale_pos_weight: 1.2699378507687176


In [None]:
plot_optimization_history(study)

In [None]:
plot_slice(study)

In [None]:
xgb_tuned = xgb.XGBClassifier(random_state=42, early_stopping_rounds=10, n_jobs=-1, objective='binary:logistic',
                            eval_metric='logloss', **trial.params)

In [None]:
xgb_tuned.fit(processed_train, y_train, eval_set=[(processed_test, y_test)])

In [None]:
y_pred_train = xgb_tuned.predict(processed_train)
balanced_accuracy_score(y_pred_train, y_train)

0.9993333333333334

In [None]:
y_pred_test = xgb_tuned.predict(processed_test)
balanced_accuracy_score(y_pred_test, y_test)

0.8871019517205958

In [None]:
pickle.dump(xgb_tuned, open('results/xgb_tuned.pkl', 'wb'))

## LightGBM

### Base model

In [None]:
lgbm_base = lgb.LGBMClassifier(random_state=42, n_jobs=-1)

In [None]:
lgbm_base.fit(processed_train, y_train)

In [None]:
y_pred_train = lgbm_base.predict(processed_train.values)
y_pred_train_binary = convert_pred(y_pred_train)
balanced_accuracy_score(y_pred_train_binary, y_train)

1.0

In [None]:
y_pred_test = lgbm_base.predict(processed_test.values)
y_pred_test_binary = convert_pred(y_pred_test)
balanced_accuracy_score(y_pred_test_binary, y_test)

0.8522201405902228

### Hyperparameter tuning

#### Optuna

In [None]:
def objective(trial):
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'binary_logloss',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 1, 15),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-5, 1e2),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-9, np.e, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-9, np.e, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 200),
        'random_state': 42,
        'verbose': -1,
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(processed_train, y_train, eval_set=[(processed_test, y_test)])

    pred = model.predict(processed_test, num_iteration=model.best_iteration_)
    accuracy = balanced_accuracy_score(y_test, convert_pred(pred))

    return accuracy

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000)

In [None]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  1000
Best trial:
  Value:  0.8879662074593193
  Params: 
    learning_rate: 0.12576008660695415
    n_estimators: 541
    max_depth: 12
    min_child_samples: 2
    min_child_weight: 0.03180667989977826
    subsample: 0.5652597584887905
    colsample_bytree: 0.3942795795048955
    reg_alpha: 0.016038828480566168
    reg_lambda: 4.292291576611074e-05
    num_leaves: 194


In [None]:
plot_optimization_history(study)

In [None]:
plot_slice(study)

In [None]:
lgbm_tuned = lgb.LGBMClassifier(objective='binary', random_state=42, boosting_type='gbdt', metric='binary_logloss', **trial.params)

In [None]:
lgbm_tuned.fit(processed_train, y_train, eval_set=[(processed_test, y_test)])

In [None]:
pred = lgbm_tuned.predict(processed_train, num_iteration=model.best_iteration_)
balanced_accuracy_score(y_train, convert_pred(pred))

1.0

In [None]:
pred = lgbm_tuned.predict(processed_test, num_iteration=model.best_iteration_)
balanced_accuracy_score(y_test, convert_pred(pred))

0.8879662074593193

In [None]:
pickle.dump(lgbm_tuned, open('results/models/lgbm_tuned.pkl', 'wb'))

## CatBoost

### Base model

In [None]:
ctb_base = ctb.CatBoostClassifier(random_state=42, logging_level='Silent')

In [None]:
ctb_base.fit(processed_train, y_train)

<catboost.core.CatBoostClassifier at 0x7f9b05e03910>

In [None]:
y_pred_train = ctb_base.predict(processed_train)
balanced_accuracy_score(y_pred_train, y_train)

0.9773350708179036

In [None]:
y_pred_test = ctb_base.predict(processed_test)
balanced_accuracy_score(y_pred_test, y_test)

0.8620071684587813

### Hyperparameter tuning

#### Optuna

In [None]:
def objective(trial):
    # Define hyperparameters to tune
    params = {
        'iterations': trial.suggest_int('iterations', 100, 2000),
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 100, log=True),
        'bootstrap_type': trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 10.0),
        'od_type': trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        'od_wait': trial.suggest_int("od_wait", 10, 50),
        'auto_class_weights': trial.suggest_categorical('auto_class_weights', [None, 'Balanced']),
        'eval_metric': trial.suggest_categorical('eval_metric', ['BalancedAccuracy']),
        'verbose': trial.suggest_categorical('verbose', [False]),
    }

    # Create CatBoost classifier with specified hyperparameters
    model = ctb.CatBoostClassifier(**params)

    # Train the model
    model.fit(processed_train, y_train, eval_set=(processed_test, y_test), early_stopping_rounds=20, verbose=False)

    # Make predictions on the validation set
    y_pred = model.predict(processed_test)

    # Calculate balanced accuracy as the evaluation metric
    balanced_acc = balanced_accuracy_score(y_test, y_pred)

    return balanced_acc

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

In [None]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials:  500
Best trial:
  Value:  0.9060544968719499
  Params: 
    iterations: 590
    learning_rate: 0.07543722631797796
    depth: 10
    l2_leaf_reg: 0.3649992694634968
    bootstrap_type: Bayesian
    border_count: 255
    random_strength: 2.6538127002527956
    bagging_temperature: 1.020367816819722
    od_type: Iter
    od_wait: 43
    auto_class_weights: None
    eval_metric: BalancedAccuracy
    verbose: False


In [None]:
plot_optimization_history(study)

In [None]:
plot_slice(study)

In [None]:
ctb_tuned = ctb.CatBoostClassifier(random_state=42, **trial.params)

In [None]:
 ctb_tuned.fit(processed_train, y_train, eval_set=[(processed_test, y_test)], early_stopping_rounds=50, verbose=False)

<catboost.core.CatBoostClassifier at 0x7f9b0574a760>

In [None]:
pred = ctb_tuned.predict(processed_train)
balanced_accuracy_score(y_train, convert_pred(pred))

0.9806657434057661

In [None]:
pred = ctb_tuned.predict(processed_test)
balanced_accuracy_score(y_test, convert_pred(pred))

0.8700219203507256

In [None]:
pickle.dump(ctb_tuned, open('results/models/ctb_tuned.pkl', 'wb'))

## Comparison & result prediction

In [49]:
models = {
    "XGBoost": {"train_bac": balanced_accuracy_score(y_train, xgb_tuned.predict(processed_train)),
                "test_bac": balanced_accuracy_score(y_test, xgb_tuned.predict(processed_test))},
    "LightGBM": {"train_bac": balanced_accuracy_score(y_train, convert_pred(lgbm_tuned.predict(processed_train))),
                "test_bac": balanced_accuracy_score(y_test, convert_pred(lgbm_tuned.predict(processed_test)))},
    "Catboost": {"train_bac": balanced_accuracy_score(y_train, ctb_tuned.predict(processed_train)),
                "test_bac": balanced_accuracy_score(y_test, ctb_tuned.predict(processed_test))},
}

scores_df = pd.DataFrame(columns=["model", "train_bac", "test_bac"])

for model, model_data in models.items():
    train_bac = model_data["train_bac"]
    test_bac = model_data["test_bac"]
    scores_df = scores_df.append({"model": model, "train_bac": train_bac, "test_bac": test_bac}, ignore_index=True)
scores_df = scores_df.sort_values(by="model")

scores_df

Unnamed: 0,model,train_bac,test_bac
2,Catboost,0.980666,0.870022
1,LightGBM,1.0,0.887966
0,XGBoost,0.999334,0.885894


Since LightGBM had the best scores, it will was used for result prediction on artificial_test.data.

In [47]:
artificial_model_prediction = pd.DataFrame(lgbm_tuned.predict_proba(processed_valid)[:, 1], columns=["313343_317028"])
artificial_model_prediction

Unnamed: 0,313343_317028
0,0.002621
1,0.000527
2,0.662176
3,0.999411
4,0.041828
...,...
595,0.986006
596,0.017186
597,0.999427
598,0.999570


In [50]:
artificial_model_prediction.to_csv('results/313343_317028_artificial_model_prediction.txt', sep='\t', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=99cf932f-7077-4164-8bfd-a72e4818ff9c' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>