In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier 
import pickle


In [26]:
# Import needed only by this cell, kept at the top of the cell.
from sklearn.model_selection import train_test_split

# Load the space-separated feature matrices and the training labels (files have no header row).
train_no_label = pd.read_csv('../../data/artificial_train.data', sep=' ', header=None)
train_label = pd.read_csv('../../data/artificial_train.labels', sep=' ', header=None)
test_no_label = pd.read_csv('../../data/artificial_test.data', sep=' ', header=None)

# Recode label -1 as 0 so the labels are 0 and 1.
train_label = train_label.replace(-1, 0)

# Drop every column that contains a NaN.
# NOTE(review): presumably this removes an empty column produced by a trailing
# separator on each line -- confirm against the raw files.
train_no_label = train_no_label.dropna(axis=1)
test_no_label = test_no_label.dropna(axis=1)

# The columns are renamed by position below, so train and test must have kept
# exactly the same original columns; otherwise "c<i>" would silently refer to
# different features in the two frames.
if not train_no_label.columns.equals(test_no_label.columns):
    raise ValueError(
        "train and test kept different columns after dropna(axis=1); "
        "positional renaming to c1..cN would misalign the features"
    )

# Rename the feature columns to c1..cN (same names for train and test).
train_no_label_columns = ["c" + str(i) for i in range(1, len(train_no_label.columns) + 1)]
test_no_label_columns = list(train_no_label_columns)
train_no_label.columns = train_no_label_columns
test_no_label.columns = test_no_label_columns

# Attach the labels: the label frame's single column is named 0, rename it to 'label'.
train = pd.concat([train_no_label, train_label], axis=1).rename(columns={0: 'label'})

# Independent copy of the test features (DataFrame.copy() is deep by default).
# The test set has no label column, so there is nothing to rename on it.
test = test_no_label.copy()

# Stratified 80/20 train/validation split with a fixed seed.
train, val = train_test_split(train, test_size=0.2, random_state=42, stratify=train['label'])

In [22]:
# Read the pickled list of selected feature names, then show the value and its type.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('selected_features/columns_to_keep_boruta.pickle', 'rb') as features_file:
    selected_features = pickle.load(features_file)

print(selected_features)
print(type(selected_features))

['c29', 'c49', 'c65', 'c106', 'c129', 'c154', 'c242', 'c282', 'c283', 'c319', 'c337', 'c339', 'c379', 'c434', 'c443', 'c452', 'c454', 'c473', 'c476', 'c494']
<class 'list'>


In [23]:
# Read the pickled hyper-parameter dictionary and keep only the entry stored
# under ['catboost']['bor'].
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('optuna/best_params.pickle', 'rb') as params_file:
    best_params = pickle.load(params_file)['catboost']['bor']

print(best_params)

{'iterations': 322, 'learning_rate': 0.15542656215841885, 'depth': 8, 'l2_leaf_reg': 9}


In [27]:
# Build the classifier from the loaded hyper-parameters and fit it on the
# selected features, passing the validation split as eval_set.
# NOTE(review): the same validation split is scored in a later cell; if CatBoost
# uses eval_set to pick the best iteration, that score is optimistic -- confirm.
predictor = CatBoostClassifier(**best_params)
predictor.fit(
    X=train[selected_features],
    y=train['label'],
    eval_set=(val[selected_features], val['label']),
    verbose=False,
)

<catboost.core.CatBoostClassifier at 0x16850dd50>

In [32]:
# Predict class labels for the validation split; the result keeps the name
# `test_results` because the scoring cell reads it.
test_results = predictor.predict(val.loc[:, selected_features])
test_results

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,

In [33]:
# Balanced accuracy of the validation predictions against the validation labels.
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_true=val['label'], y_pred=test_results)

0.92

In [44]:
# Predict class labels for the unlabeled test set and display them with their count.
test_results = predictor.predict(test.loc[:, selected_features])
(test_results, test_results.shape[0])

(array([0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
        0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1,
        1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
        0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
        1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0,
        1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
        0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 

In [42]:
# Second predict_proba column (class 1, given the 0/1 labels) for every test row.
# The frame is restricted to the features the model was fitted on, as in the other
# prediction cells, rather than relying on CatBoost to resolve the extra columns.
predictor.predict_proba(test_no_label[selected_features])[:, 1]

array([5.75518809e-02, 5.45495752e-03, 5.18781952e-01, 9.90222581e-01,
       1.07195533e-01, 9.92194170e-01, 3.58015940e-02, 7.37735755e-03,
       5.99259953e-01, 9.97965933e-01, 6.56561667e-01, 9.67046798e-01,
       2.53741425e-02, 9.74545675e-01, 9.47111553e-01, 1.07699334e-02,
       2.00536607e-01, 9.96672189e-01, 2.18440329e-01, 8.77513878e-03,
       1.08387280e-01, 2.23836169e-02, 1.25339706e-02, 8.13866855e-01,
       9.88545962e-01, 9.97402367e-01, 6.95286850e-01, 1.45765704e-01,
       4.77311501e-01, 7.97523366e-03, 3.10725927e-02, 9.72998562e-01,
       9.91485371e-01, 2.11807661e-01, 9.61173286e-01, 4.41350931e-02,
       7.10342266e-01, 9.89052586e-01, 8.83456462e-01, 9.78070954e-01,
       9.01110693e-01, 9.87031677e-01, 7.15581021e-01, 9.62268326e-01,
       3.73126767e-01, 9.95612004e-01, 7.00356338e-02, 9.88421075e-01,
       1.03965514e-02, 3.38789201e-01, 9.97196134e-01, 6.41869904e-01,
       8.48414273e-02, 8.31761376e-01, 9.70206874e-01, 1.37187098e-02,
      

In [43]:
# Predict class-1 probabilities for the unlabeled test data, using exactly the
# features the model was fitted on (consistent with the other prediction cells).
test_proba = predictor.predict_proba(test_no_label[selected_features])[:, 1]

# Single-column frame whose header is '313420_313435'; built directly with its
# final column name instead of renaming column 0 in place.
test_results = pd.DataFrame({'313420_313435': test_proba})

# Save the results as comma-separated text without the row index.
test_results.to_csv('313420_313435_artifical_model_prediction.txt', index=False)