In [18]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier


## Wczytanie danych

In [19]:
X = pd.read_csv('data/artificial_train.data', header=None, sep=' ').drop(500,axis=1)
X.shape

(2000, 500)

In [20]:
y = pd.read_csv('data/artificial_train.labels', header=None, sep=' ')
y.shape

(2000, 1)

In [21]:
X_test = pd.read_csv('data/artificial_test.data', header=None, sep=' ').drop(500,axis=1)
X_test.shape

(600, 500)

## Preprocessing

In [22]:
def correlated_columns(X, index=0.98):

    correlation_matrix = X.corr()
    correlation_matrix = correlation_matrix.abs()

    upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > index)]

    return to_drop
    

In [23]:
to_drop = correlated_columns(X)
X = X.drop(to_drop, axis=1)
X.shape

(2000, 490)

In [24]:
X_test = X_test.drop(to_drop, axis=1)
X_test.shape

(600, 490)

In [25]:
y.replace(-1,0,inplace=True)

In [26]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

## Trenowanie modeli

In [27]:
def balanced_accuracy(preds,dtrain):
    labels = dtrain.get_label()
    preds = np.round(preds)
    return 'balanced_accuracy',balanced_accuracy_score(labels, preds)

### Trenowanie modeli - Bayes Search

#### XGBoost

In [11]:
def xgb_cv(max_depth, gamma, min_child_weight, subsample, colsample_bytree, learning_rate):
    params = {
        'objective': 'binary:logistic',
        'max_depth': int(max_depth),
        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'learning_rate': learning_rate,
        'n_jobs': -1,
        'random_state': 42
    }
    dtrain = xgb.DMatrix(X_train, label=y_train)
    cv_result = xgb.cv(params, dtrain, num_boost_round=100, nfold=5, custom_metric=balanced_accuracy, maximize=True, seed=42)
    return cv_result['test-balanced_accuracy-mean'].iloc[-1]

In [13]:
params = []
score = []

for i in range(1,10):
    print(i)
    pbounds = {
        'max_depth': (3, 15),
        'gamma': (0, 1),
        'min_child_weight': (0, 5),
        'subsample': (0.5, 1),
        'colsample_bytree': (0.5, 1),
        'learning_rate': (0.001, 0.3)
        }

    optimizer = BayesianOptimization(f=xgb_cv, pbounds=pbounds, random_state=42)
    optimizer.maximize(init_points=i, n_iter=50)

    xgb_params = {
        'objective': 'binary:logistic',
        'max_depth': int(optimizer.max['params']['max_depth']),
        'gamma': optimizer.max['params']['gamma'],
        'min_child_weight': optimizer.max['params']['min_child_weight'],
        'subsample': optimizer.max['params']['subsample'],
        'colsample_bytree': optimizer.max['params']['colsample_bytree'],
        'learning_rate': optimizer.max['params']['learning_rate'],
        'n_jobs': -1,
        'random_state': 42
        }

    model_xgb = xgb.XGBClassifier(**xgb_params)
    model_xgb.fit(X_train, y_train)
    y_pred_xgb = model_xgb.predict(X_val)
    bas_xgb = balanced_accuracy_score(y_val, y_pred_xgb)
    params.append(optimizer.max['params'])
    score.append(bas_xgb)

results = {'params': params, 'score': score}
results = pd.DataFrame(results)
results.to_csv('manual_predictions\\xgb_results.csv', index=False)

1
|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6557   [0m | [0m0.6873   [0m | [0m0.9507   [0m | [0m0.2199   [0m | [0m10.18    [0m | [0m0.7801   [0m | [0m0.578    [0m |
| [95m2        [0m | [95m0.7157   [0m | [95m0.836    [0m | [95m0.05202  [0m | [95m0.03498  [0m | [95m3.047    [0m | [95m4.997    [0m | [95m0.9011   [0m |
| [0m3        [0m | [0m0.7082   [0m | [0m0.5621   [0m | [0m0.7127   [0m | [0m0.2225   [0m | [0m4.877    [0m | [0m3.169    [0m | [0m0.9424   [0m |
| [95m4        [0m | [95m0.7193   [0m | [95m1.0      [0m | [95m0.0      [0m | [95m0.001    [0m | [95m3.0      [0m | [95m0.0      [0m | [95m0.5      [0m |
| [95m5        [0m | [95m0.7339   [0m | [95m1.0      [0m | [95m0.0      [0m | [95m0.3      [0m | [95m15.0     [0m | [95m5.0  

In [13]:
xgb_result = pd.read_csv('results\\xgb_results.csv')
xgb_result.sort_values(by='score', ascending=False, inplace=True)
best_params = xgb_result.iloc[0,0]
best_params = eval(best_params)
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_jobs'] = -1
best_params['random_state'] = 42
model_xgb = xgb.XGBClassifier(**best_params)
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_val)
balanced_accuracy_score(y_val, y_pred_xgb)

0.83

In [14]:
np.savetxt('manual_predictions\\xgb_pred2.txt',model_xgb.predict_proba(X_test)[:,1])

#### Usuwanie nieznaczących zmiennych 

In [14]:
X_train.drop(np.where(model_xgb.feature_importances_ == 0)[0], axis=1, inplace=True)
print(X_train.shape)
X_val.drop(np.where(model_xgb.feature_importances_ == 0)[0], axis=1, inplace=True)
print(X_val.shape)
X_test.drop(np.where(model_xgb.feature_importances_ == 0)[0], axis=1, inplace=True)
print(X_test.shape)

(1600, 474)
(400, 474)
(600, 474)


#### XGBoost z usuniętymi zmiennymi

In [15]:
params = []
score = []

for i in range(1,10):
    print(i)
    pbounds = {
        'max_depth': (3, 15),
        'gamma': (0, 1),
        'min_child_weight': (0, 5),
        'subsample': (0.5, 1),
        'colsample_bytree': (0.5, 1),
        'learning_rate': (0.001, 0.3)
        }

    optimizer = BayesianOptimization(f=xgb_cv, pbounds=pbounds, random_state=42)
    optimizer.maximize(init_points=i, n_iter=50)

    xgb_params = {
        'objective': 'binary:logistic',
        'max_depth': int(optimizer.max['params']['max_depth']),
        'gamma': optimizer.max['params']['gamma'],
        'min_child_weight': optimizer.max['params']['min_child_weight'],
        'subsample': optimizer.max['params']['subsample'],
        'colsample_bytree': optimizer.max['params']['colsample_bytree'],
        'learning_rate': optimizer.max['params']['learning_rate'],
        'n_jobs': -1,
        'random_state': 42
        }

    model_xgb = xgb.XGBClassifier(**xgb_params)
    model_xgb.fit(X_train, y_train)
    y_pred_xgb = model_xgb.predict(X_val)
    bas_xgb = balanced_accuracy_score(y_val, y_pred_xgb)
    params.append(optimizer.max['params'])
    score.append(bas_xgb)

results = {'params': params, 'score': score}
results = pd.DataFrame(results)
results.to_csv('results\\xgb2_results.csv', index=False)

1
|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.6378   [0m | [0m0.6873   [0m | [0m0.9507   [0m | [0m0.2199   [0m | [0m10.18    [0m | [0m0.7801   [0m | [0m0.578    [0m |
| [95m2        [0m | [95m0.709    [0m | [95m0.836    [0m | [95m0.05202  [0m | [95m0.03498  [0m | [95m3.047    [0m | [95m4.997    [0m | [95m0.9011   [0m |
| [0m3        [0m | [0m0.6698   [0m | [0m0.5621   [0m | [0m0.7127   [0m | [0m0.2225   [0m | [0m4.877    [0m | [0m3.169    [0m | [0m0.9424   [0m |
| [0m4        [0m | [0m0.6823   [0m | [0m0.6052   [0m | [0m0.9925   [0m | [0m0.06496  [0m | [0m11.41    [0m | [0m2.478    [0m | [0m0.6438   [0m |
| [0m5        [0m | [0m0.7065   [0m | [0m0.8948   [0m | [0m0.5355   [0m | [0m0.2685   [0m | [0m8.488    [0m | [0m2.112    [0m | [0m

Ponieważ usunięcie zmiennych nie przyspieszyło uczenia oraz pogorszyło wyniki, pozostawiamy wszystkie zmienne.

#### CatBoost


In [11]:
def catboost_cv(depth, l2_leaf_reg, learning_rate, subsample):
    params = {
        'iterations': 100,
        'depth': int(depth),
        'l2_leaf_reg': l2_leaf_reg,
        'learning_rate': learning_rate,
        'subsample': subsample,
        'random_seed': 42,
        'verbose': False,
    }
    model = CatBoostClassifier(**params)
    cv_score = cross_val_score(model, X_train, y_train, scoring='balanced_accuracy', cv=3, error_score=0)

    return cv_score.mean()

In [11]:
params = []
score = []

for i in [1,3,5,7,9]:
    print(i)
    pbounds = {
        'depth': (3, 10),
        'l2_leaf_reg': (1, 10),
        'learning_rate': (0.001, 0.3),
        'subsample': (0.5, 1)
        }
    
    optimizer = BayesianOptimization(f=catboost_cv, pbounds=pbounds, random_state=42)
    optimizer.maximize(init_points=i, n_iter=50)

    catboost_params = {
        'iterations': 100,
        'depth': int(optimizer.max['params']['depth']),
        'l2_leaf_reg': optimizer.max['params']['l2_leaf_reg'],
        'learning_rate': optimizer.max['params']['learning_rate'],
        'subsample': optimizer.max['params']['subsample'],
        'random_seed': 42,
        'verbose': False,
        }

    model_catboost = CatBoostClassifier(**catboost_params)
    model_catboost.fit(X_train, y_train)
    y_pred_catboost = model_catboost.predict(X_val)
    bas_catboost = balanced_accuracy_score(y_val, y_pred_catboost)
    params.append(optimizer.max['params'])
    score.append(bas_catboost)

results = {'params': params, 'score': score}
results = pd.DataFrame(results)
results.to_csv('results\\catboost_results.csv', index=False)
    

1
|   iter    |  target   |   depth   | l2_lea... | learni... | subsample |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.7462   [0m | [0m5.622    [0m | [0m9.556    [0m | [0m0.2199   [0m | [0m0.7993   [0m |
| [95m2        [0m | [95m0.7937   [0m | [95m9.973    [0m | [95m1.041    [0m | [95m0.1923   [0m | [95m0.9348   [0m |
| [0m3        [0m | [0m0.6863   [0m | [0m3.345    [0m | [0m1.166    [0m | [0m0.1806   [0m | [0m0.8619   [0m |
| [0m4        [0m | [0m0.6925   [0m | [0m3.031    [0m | [0m1.363    [0m | [0m0.1405   [0m | [0m0.6761   [0m |
| [0m5        [0m | [0m0.7419   [0m | [0m10.0     [0m | [0m3.636    [0m | [0m0.1121   [0m | [0m0.5173   [0m |
| [0m6        [0m | [0m0.7394   [0m | [0m8.248    [0m | [0m1.0      [0m | [0m0.001    [0m | [0m0.5      [0m |
| [95m7        [0m | [95m0.8019   [0m | [95m9.188    [0m | [95m10.0     [0m | [95m0.3      [0m | [95m

In [13]:
catboost_result = pd.read_csv('results\\catboost_results.csv')
catboost_result.sort_values(by='score', ascending=False, inplace=True)
best_params = catboost_result.iloc[0,0]
best_params = eval(best_params)
best_params['iterations'] = 100
best_params['depth'] = int(best_params['depth'])
best_params['verbose'] = False
best_params['random_seed'] = 42
model_catboost = CatBoostClassifier(**best_params)
model_catboost.fit(X_train, y_train)
y_pred_catboost = model_catboost.predict(X_val)
balanced_accuracy_score(y_val, y_pred_catboost)

0.8574999999999999

In [14]:
np.savetxt('manual_predictions\\catboost_pred.txt',model_catboost.predict_proba(X_test)[:,1])

#### RandomForest


In [15]:
y_train_rf = y_train.values.ravel()

In [19]:
def rf_cv(n_estimators, max_depth, min_samples_split, max_features):
    val = cross_val_score(
        RandomForestClassifier(
            n_estimators=int(n_estimators),
            max_depth=int(max_depth),
            min_samples_split=int(min_samples_split),
            max_features=min(max_features, 0.999),
            random_state=42,
            n_jobs=-1
        ),
        X_train,
        y_train_rf,
        scoring='balanced_accuracy',
        cv=3,
        error_score=0
    ).mean()
    return val

In [21]:
params = []
score = []

for i in [1,3,5,7,9]:
    print(i)
    pbounds = {
        'n_estimators': (10, 1000),
        'max_depth': (1, 100),
        'min_samples_split': (2, 10),
        'max_features': (0.1, 0.999)
    }

    optimizer = BayesianOptimization(f=rf_cv, pbounds=pbounds, random_state=42)
    optimizer.maximize(init_points=i, n_iter=50)

    rf_params = {
        'n_estimators': int(optimizer.max['params']['n_estimators']),
        'max_depth': int(optimizer.max['params']['max_depth']),
        'min_samples_split': int(optimizer.max['params']['min_samples_split']),
        'max_features': min(optimizer.max['params']['max_features'], 0.999),
        'n_jobs': -1,
        'random_state': 42
        }

    model_rf = RandomForestClassifier(**rf_params)
    model_rf.fit(X_train, y_train_rf)
    y_pred_rf = model_rf.predict(X_val)
    bas_rf = balanced_accuracy_score(y_val, y_pred_rf)
    params.append(optimizer.max['params'])
    score.append(bas_rf)

results = {'params': params, 'score': score}
results = pd.DataFrame(results)
results.to_csv('results\\rf_results.csv', index=False)

1
|   iter    |  target   | max_depth | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.7607   [0m | [0m38.08    [0m | [0m0.9547   [0m | [0m7.856    [0m | [0m602.7    [0m |
| [0m2        [0m | [0m0.7263   [0m | [0m72.59    [0m | [0m0.7006   [0m | [0m3.783    [0m | [0m73.26    [0m |
| [0m3        [0m | [0m0.7269   [0m | [0m40.33    [0m | [0m0.4106   [0m | [0m7.401    [0m | [0m601.9    [0m |
| [0m4        [0m | [0m0.6106   [0m | [0m1.443    [0m | [0m0.1363   [0m | [0m5.732    [0m | [0m358.6    [0m |
| [95m5        [0m | [95m0.7613   [0m | [95m32.33    [0m | [95m0.999    [0m | [95m9.016    [0m | [95m604.5    [0m |
| [0m6        [0m | [0m0.735    [0m | [0m31.92    [0m | [0m0.4721   [0m | [0m8.986    [0m | [0m595.0    [0m |
| [0m7        [0m | [0m0.7488   [0m | [0m34.37    [0m | [0m0.8702   [0m | [0m2.442    [0m | [0m608.0 

In [22]:
rf_results = pd.read_csv('results\\rf_results.csv')
rf_results.sort_values(by='score', ascending=False, inplace=True)
best_params = rf_results.iloc[0,0]
best_params = eval(best_params)
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['max_depth'] = int(best_params['max_depth'])
best_params['min_samples_split'] = int(best_params['min_samples_split'])
best_params['n_jobs'] = -1
best_params['random_state'] = 42
model_rf = RandomForestClassifier(**best_params)
model_rf.fit(X_train, y_train_rf)
y_pred_rf = model_rf.predict(X_val)
balanced_accuracy_score(y_val, y_pred_rf)

In [26]:
np.savetxt('manual_predictions\\rf_pred.txt',model_rf.predict_proba(X_test)[:,1])

### Trenowanie modeli - ręczne strojenie hiperparametrów

#### Catboost 

In [50]:
catboost_params = {
    'iterations': 1000,
    'depth': 8,
    'l2_leaf_reg': 8.17326686810523,
    'learning_rate': 0.03,
    'random_seed': 42,
    'subsample': 1,
    'verbose': True,
    }

model_catboost = CatBoostClassifier(**catboost_params)
model_catboost.fit(X_train, y_train)
y_pred_catboost = model_catboost.predict(X_val)
balanced_accuracy_score(y_val, y_pred_catboost)

0:	learn: 0.6745840	total: 84.3ms	remaining: 1m 24s
1:	learn: 0.6669312	total: 145ms	remaining: 1m 12s
2:	learn: 0.6476679	total: 204ms	remaining: 1m 7s
3:	learn: 0.6298051	total: 261ms	remaining: 1m 5s
4:	learn: 0.6174825	total: 319ms	remaining: 1m 3s
5:	learn: 0.6132733	total: 379ms	remaining: 1m 2s
6:	learn: 0.6016497	total: 436ms	remaining: 1m 1s
7:	learn: 0.5932198	total: 495ms	remaining: 1m 1s
8:	learn: 0.5889814	total: 553ms	remaining: 1m
9:	learn: 0.5788455	total: 611ms	remaining: 1m
10:	learn: 0.5722345	total: 669ms	remaining: 1m
11:	learn: 0.5671348	total: 730ms	remaining: 1m
12:	learn: 0.5566142	total: 789ms	remaining: 59.9s
13:	learn: 0.5457912	total: 847ms	remaining: 59.7s
14:	learn: 0.5433978	total: 907ms	remaining: 59.5s
15:	learn: 0.5391510	total: 966ms	remaining: 59.4s
16:	learn: 0.5310907	total: 1.02s	remaining: 59.2s
17:	learn: 0.5220474	total: 1.08s	remaining: 59.1s
18:	learn: 0.5149752	total: 1.14s	remaining: 59s
19:	learn: 0.5061085	total: 1.2s	remaining: 58.8s
20

0.87

In [46]:
np.savetxt('manual_predictions\\catboost_pred2.txt',model_catboost.predict_proba(X_test)[:,1])

#### MLP

In [174]:
y_train_mlp = y_train.values.ravel()
standard_scaler = StandardScaler()
X_train_mlp = standard_scaler.fit_transform(X_train)
X_val_mlp = standard_scaler.transform(X_val)
model_mlp = MLPClassifier(hidden_layer_sizes=(256,256,256), activation='relu', solver='adam', alpha=0.1, batch_size='auto', learning_rate='adaptive', learning_rate_init=0.01, max_iter=1000, shuffle=True, random_state=42, verbose=False, early_stopping=True, validation_fraction=0.2)
model_mlp.fit(X_train_mlp, y_train_mlp)
y_pred_mlp = model_mlp.predict(X_val_mlp)
print(balanced_accuracy_score(y_train, model_mlp.predict(X_train_mlp)))
balanced_accuracy_score(y_val, y_pred_mlp)

0.865


0.5800000000000001

#### SVC

In [216]:
y_train_svc = y_train.values.ravel()
standard_scaler = StandardScaler()
X_train_svc = standard_scaler.fit_transform(X_train)
X_val_svc = standard_scaler.transform(X_val)
model_svc = SVC(kernel='poly', degree=3, gamma='scale', C=4, random_state=42)
model_svc.fit(X_train_svc, y_train_svc)
y_pred_svc = model_svc.predict(X_val_svc)
print(balanced_accuracy_score(y_train, model_svc.predict(X_train_svc)))
balanced_accuracy_score(y_val, y_pred_svc)


1.0


0.59

### Trenowanie modeli - BayesSearchCV

In [11]:
from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [12]:
space = {
        'iterations': Integer(100, 1000),
        'depth': Integer(6, 12),
        'l2_leaf_reg': Real(6, 12),
        'learning_rate': Real(0.01, 0.5),
        'subsample': Real(0.8, 1)
        }
model = CatBoostClassifier(random_seed=42, verbose=False)

In [13]:
opt = BayesSearchCV(estimator=model, search_spaces=space, scoring='balanced_accuracy', cv=3, n_iter=50, random_state=42, verbose=10, error_score=0)

In [14]:
opt.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START depth=8, iterations=755, l2_leaf_reg=11.597207993087004, learning_rate=0.1647418008086539, subsample=0.9340295896537869
[CV 1/3; 1/1] END depth=8, iterations=755, l2_leaf_reg=11.597207993087004, learning_rate=0.1647418008086539, subsample=0.9340295896537869;, score=0.803 total time=  42.5s
[CV 2/3; 1/1] START depth=8, iterations=755, l2_leaf_reg=11.597207993087004, learning_rate=0.1647418008086539, subsample=0.9340295896537869
[CV 2/3; 1/1] END depth=8, iterations=755, l2_leaf_reg=11.597207993087004, learning_rate=0.1647418008086539, subsample=0.9340295896537869;, score=0.827 total time=  41.1s
[CV 3/3; 1/1] START depth=8, iterations=755, l2_leaf_reg=11.597207993087004, learning_rate=0.1647418008086539, subsample=0.9340295896537869
[CV 3/3; 1/1] END depth=8, iterations=755, l2_leaf_reg=11.597207993087004, learning_rate=0.1647418008086539, subsample=0.9340295896537869;, score=0.827 total time=  41.0s
Fitting 

In [15]:
pd.DataFrame(opt.cv_results_).to_csv('results/catboost2_results.csv')


In [28]:
y_pred_catboost = opt.predict(X_val)
balanced_accuracy_score(y_val, y_pred_catboost)

0.8625