In [16]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import MDS

from metrics import default_competition_metric

In [17]:
np.random.seed(3666)

In [18]:
X_train = np.load("../data/x_train.npy")
y_train = np.load("../data/y_train.npy")
X_val = np.load("../data/x_val.npy")
y_val = np.load("../data/y_val.npy")

# Actual model with blending ensemble
testing on features [100, 102, 105]

In [19]:
rf_params = {
    "n_estimators": 1600,
    "min_samples_split": 2,
    "min_samples_leaf": 4,
    "max_features": "log2",
    "max_depth": 10,
    "bootstrap": False,
}
svm_params = {
    'kernel': 'rbf',
    'gamma': 0.01,
    'C': 1,
}
xgboost_params = {
    'n_estimators': 1000,
    'max_depth': 3,
    'learning_rate': 0.01,
    'colsample_bytree': 0.75,
}
features = [100, 102, 105]

In [20]:
X_train = X_train[:, features]
X_val = X_val[:, features]
number_of_features = len(features)

In [21]:
train_set_X = X_train
train_set_y = y_train
val_set_X = X_val
val_set_y = y_val

In [None]:
mds = MDS(
    n_components=1,
    random_state=21,
    n_init=6,
    normalized_stress=False,
    eps=0.0001,
    max_iter=600,
)
# this step should be calculated in every cross-validation fold so 5 * 10 = 50 times
# but it takes a lot of time so we will do it only once and our results will be biased
# but in validation it is done by the book
train_mds = mds.fit_transform(train_set_X)
val_mds = mds.fit_transform(val_set_X)

In [24]:
train_set_X_with_mds = np.concatenate([train_set_X, train_mds], axis=1)
val_set_X_with_mds = np.concatenate([val_set_X, val_mds], axis=1)

### testing on cv

In [25]:
seeds = [21, 22, 23, 24, 25, 5111, 23525, 34934, 343243]
rf_results = []
svm_results = []
xgboost_results = []
ensemble_results = []

for num, seed in enumerate(seeds):
    cv_folds = 5
    np.random.seed(seed)
    idx = np.array(range(X_train.shape[0]))
    np.random.shuffle(idx)
    rf = []
    svm = []
    xgboost = []
    ensemble = []

    idx = idx % cv_folds
    for j in range(cv_folds):
        X_train_cv = train_set_X_with_mds[idx != j]
        X_val_cv = train_set_X_with_mds[idx == j]
        X_train_cv_with_mds = train_set_X_with_mds[idx != j]
        X_val_cv_with_mds = train_set_X_with_mds[idx == j]
        y_train_cv = y_train[idx != j]
        y_val_cv = y_train[idx == j]

        model1 = xgb.XGBClassifier(**xgboost_params)
        model1.fit(X_train_cv, y_train_cv)

        model2 = SVC(probability=True, **svm_params)
        model2.fit(X_train_cv, y_train_cv)

        model3 = RandomForestClassifier(**rf_params)
        model3.fit(X_train_cv_with_mds, y_train_cv)
        
        predictions1_train = model1.predict_proba(X_train_cv_with_mds)[:, 1]
        predicions2_train = model2.predict_proba(X_train_cv_with_mds)[:, 1]
        predictions3_train = model3.predict_proba(X_train_cv_with_mds)[:, 1]

        predictions1 = model1.predict_proba(X_val_cv_with_mds)[:, 1]
        predictions2 = model2.predict_proba(X_val_cv_with_mds)[:, 1]
        predictions3 = model3.predict_proba(X_val_cv_with_mds)[:, 1]
        
        X_big = np.concatenate([X_train_cv_with_mds, predictions1_train.reshape(-1, 1), predictions1_train.reshape(-1, 1), predictions1_train.reshape(-1, 1)], axis=1)

        ensemble_model = RandomForestClassifier(n_estimators=1000, max_depth=3, min_samples_split=2, min_samples_leaf=1, bootstrap=False)
        
        ensemble_model.fit(X_big, y_train_cv)
        final_predictions_proba = ensemble_model.predict_proba(np.concatenate([X_val_cv_with_mds, predictions1.reshape(-1, 1), predictions2.reshape(-1, 1), predictions3.reshape(-1, 1)], axis=1))[:, 1]

        xgboost.append(
            default_competition_metric(
                y_val_cv, y_pred_proba=predictions1, k=number_of_features
            )
        )
        svm.append(
            default_competition_metric(
                y_val_cv, y_pred_proba=predictions2, k=number_of_features
            )
        )
        rf.append(
            default_competition_metric(
                y_val_cv, y_pred_proba=predictions3, k=number_of_features
            )
        )
        ensemble.append(
            default_competition_metric(
                y_val_cv, y_pred_proba=final_predictions_proba, k=number_of_features
            )
        )

    xgboost_results.append(np.mean(xgboost))
    svm_results.append(np.mean(svm))
    rf_results.append(np.mean(rf))
    ensemble_results.append(np.mean(ensemble))


print("XGBoost: ", np.mean(xgboost_results), np.std(xgboost_results))
print("SVM: ", np.mean(svm_results), np.std(svm_results))
print("RF: ", np.mean(rf_results), np.std(rf_results))
print("Ensemble: ", np.mean(ensemble_results), np.std(ensemble_results))

XGBoost:  6975.0 78.39536550927825
SVM:  7136.111111111111 30.869598292906762
RF:  6672.222222222223 67.64330923269955
Ensemble:  7062.5 41.247895569215274


### testing on validation set

In [26]:
model1 = xgb.XGBClassifier(**xgboost_params)
model1.fit(train_set_X_with_mds, train_set_y)

model2 = SVC(probability=True, **svm_params)
model2.fit(train_set_X_with_mds, train_set_y)

model3 = RandomForestClassifier(**rf_params)
model3.fit(train_set_X_with_mds, train_set_y)

# Make predictions on the validation set
predictions1 = model1.predict_proba(val_set_X_with_mds)[:, 1]
predictions2 = model2.predict_proba(val_set_X_with_mds)[:, 1]
predictions3 = model3.predict_proba(val_set_X_with_mds)[:, 1]

predictions1_train = model1.predict_proba(train_set_X_with_mds)[:, 1]
predicions2_train = model2.predict_proba(train_set_X_with_mds)[:, 1]
predictions3_train = model3.predict_proba(train_set_X_with_mds)[:, 1]

X_big = np.concatenate([train_set_X_with_mds, predictions1_train.reshape(-1, 1), predictions1_train.reshape(-1, 1), predictions1_train.reshape(-1, 1)], axis=1)

ensemble_model = RandomForestClassifier(n_estimators=1000, max_depth=3, min_samples_split=2, min_samples_leaf=1, bootstrap=False)

ensemble_model.fit(X_big, train_set_y)
final_predictions_proba = ensemble_model.predict_proba(np.concatenate([val_set_X_with_mds, predictions1.reshape(-1, 1), predictions2.reshape(-1, 1), predictions3.reshape(-1, 1)], axis=1))[:, 1]


# Combine the predictions
final_score = default_competition_metric(
    y_val, y_pred_proba=final_predictions_proba, k=number_of_features
)
print(
    f"xgb score on validation set: {default_competition_metric(y_val, y_pred_proba=predictions1, k=number_of_features)}"
)
print(
    f"svm score on validation set: {default_competition_metric(y_val, y_pred_proba=predictions2, k=number_of_features)}"
)
print(
    f"rf score on validation set: {default_competition_metric(y_val, y_pred_proba=predictions3, k=number_of_features)}"
)
print(f"Ensemble score on validation set: {final_score}")

xgb score on validation set: 6650.0
svm score on validation set: 6950.0
rf score on validation set: 6700.0
Ensemble score on validation set: 6650.0
