In [1]:
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import MDS

from metrics import default_competition_metric

In [2]:
# Fix NumPy's global random state once up front; the cross-validation cells
# below re-seed it for every repetition.
SEED = 3666
np.random.seed(SEED)

In [3]:
# Load the pre-split train/validation arrays, in the order they are unpacked.
X_train, y_train, X_val, y_val = map(
    np.load,
    (
        "../data/x_train.npy",
        "../data/y_train.npy",
        "../data/x_val.npy",
        "../data/y_val.npy",
    ),
)

# Actual model
Testing the three-model ensemble on the feature subset [100, 102, 103, 105] (0-based column indices).

In [4]:
# Hyperparameters for the three ensemble members, followed by the column
# indices of the feature matrix that every model is trained on.
rf_params = dict(
    n_estimators=1600,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features="log2",
    max_depth=10,
    bootstrap=False,
)
svm_params = dict(kernel="rbf", gamma=0.01, C=1)
xgboost_params = dict(
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.01,
    colsample_bytree=0.75,
)
features = [100, 102, 103, 105]

In [5]:
# Keep only the selected columns in both splits. This overwrites X_train and
# X_val, so the cell must not be re-run without reloading the data first.
X_train, X_val = (split[:, features] for split in (X_train, X_val))
number_of_features = len(features)

In [6]:
# Aliases used by the modelling cells; they point at the same arrays.
train_set_X, train_set_y = X_train, y_train
val_set_X, val_set_y = X_val, y_val

### testing on cv

In [9]:
# Repeated 5-fold cross-validation. For every seed the training rows are
# randomly assigned to folds, the three models are refit on each fold's
# training part, and every model plus their averaged ensemble is scored with
# default_competition_metric on the held-out part.
seeds = [21, 22, 23, 24, 25, 5111, 23525, 34934, 343243]
cv_folds = 5
rf_results, svm_results, xgboost_results, ensemble_results = [], [], [], []

for seed in seeds:
    np.random.seed(seed)
    # Shuffle the row indices, then reduce them modulo cv_folds so each row
    # ends up with a fold id in [0, cv_folds).
    idx = np.arange(X_train.shape[0])
    np.random.shuffle(idx)
    idx = idx % cv_folds
    rf, svm, xgboost, ensemble = [], [], [], []

    for j in range(cv_folds):
        in_fold = idx == j
        X_train_cv, y_train_cv = train_set_X[~in_fold], y_train[~in_fold]
        X_val_cv, y_val_cv = train_set_X[in_fold], y_train[in_fold]

        # Fit order (XGBoost, SVM, random forest) is kept fixed so the global
        # random state is consumed in the same sequence on every run.
        model1 = xgb.XGBClassifier(**xgboost_params)
        model2 = SVC(probability=True, **svm_params)
        model3 = RandomForestClassifier(**rf_params)
        for model in (model1, model2, model3):
            model.fit(X_train_cv, y_train_cv)

        # Probability column at index 1 of each model's predict_proba output.
        predictions1, predictions2, predictions3 = (
            member.predict_proba(X_val_cv)[:, 1]
            for member in (model1, model2, model3)
        )
        final_predictions_proba = (predictions1 + predictions2 + predictions3) / 3

        for fold_scores, proba in (
            (xgboost, predictions1),
            (svm, predictions2),
            (rf, predictions3),
            (ensemble, final_predictions_proba),
        ):
            fold_scores.append(
                default_competition_metric(
                    y_val_cv, y_pred_proba=proba, k=number_of_features
                )
            )

    # One number per seed: the mean score over the folds.
    for fold_scores, summary in (
        (xgboost, xgboost_results),
        (svm, svm_results),
        (rf, rf_results),
        (ensemble, ensemble_results),
    ):
        summary.append(np.mean(fold_scores))


# Mean and standard deviation of the per-seed averages.
for label, summary in (
    ("XGBoost: ", xgboost_results),
    ("SVM: ", svm_results),
    ("RF: ", rf_results),
    ("Ensemble: ", ensemble_results),
):
    print(label, np.mean(summary), np.std(summary))

XGBoost:  6772.222222222223 81.38936291102006
SVM:  6927.777777777777 46.68815642769892
RF:  6341.666666666667 115.16895800904378
Ensemble:  6766.666666666667 62.36095644623236


### testing on validation set

In [10]:
# Refit the three models on the whole training split and score each of them,
# and their averaged ensemble, on the validation split.
model1 = xgb.XGBClassifier(**xgboost_params)
model2 = SVC(probability=True, **svm_params)
model3 = RandomForestClassifier(**rf_params)
for model in (model1, model2, model3):
    model.fit(train_set_X, train_set_y)

# Probability column at index 1 of each model's output on the validation set.
predictions1, predictions2, predictions3 = (
    member.predict_proba(val_set_X)[:, 1] for member in (model1, model2, model3)
)

# Unweighted average of the three probability vectors.
final_predictions_proba = (predictions1 + predictions2 + predictions3) / 3
final_score = default_competition_metric(
    y_val, y_pred_proba=final_predictions_proba, k=number_of_features
)

for label, proba in (
    ("xgb", predictions1),
    ("svm", predictions2),
    ("rf", predictions3),
):
    score = default_competition_metric(
        y_val, y_pred_proba=proba, k=number_of_features
    )
    print(f"{label} score on validation set: {score}")
print(f"Ensemble score on validation set: {final_score}")

xgb score on validation set: 6400.0
svm score on validation set: 6750.0
rf score on validation set: 6300.0
Ensemble score on validation set: 6500.0


## Actual models
Testing the three-model ensemble on the feature subset [100, 102, 105] (0-based column indices).

In [11]:
# Reload the untouched arrays so a different column subset can be selected.
X_train, y_train, X_val, y_val = map(
    np.load,
    (
        "../data/x_train.npy",
        "../data/y_train.npy",
        "../data/x_val.npy",
        "../data/y_val.npy",
    ),
)

In [12]:
# Same hyperparameters as the previous experiment; only the feature subset
# changes (column 103 is dropped).
rf_params = dict(
    n_estimators=1600,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features="log2",
    max_depth=10,
    bootstrap=False,
)
svm_params = dict(kernel="rbf", gamma=0.01, C=1)
xgboost_params = dict(
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.01,
    colsample_bytree=0.75,
)
features = [100, 102, 105]

In [13]:
# Keep only the selected columns in both splits. This overwrites X_train and
# X_val, so the cell must not be re-run without reloading the data first.
X_train, X_val = (split[:, features] for split in (X_train, X_val))
number_of_features = len(features)

In [14]:
# Aliases used by the modelling cells; they point at the same arrays.
train_set_X, train_set_y = X_train, y_train
val_set_X, val_set_y = X_val, y_val

In [15]:
# Slot for stacking extra engineered columns next to the selected features.
# With a single array in the list this only produces a copy of each matrix.
train_set_X, val_set_X = (
    np.concatenate([split], axis=1) for split in (train_set_X, val_set_X)
)

### testing on cv

In [16]:
# Repeated 5-fold cross-validation. For every seed the training rows are
# randomly assigned to folds, the three models are refit on each fold's
# training part, and every model plus their averaged ensemble is scored with
# default_competition_metric on the held-out part.
seeds = [21, 22, 23, 24, 25, 5111, 23525, 34934, 343243]
cv_folds = 5
rf_results, svm_results, xgboost_results, ensemble_results = [], [], [], []

for seed in seeds:
    np.random.seed(seed)
    # Shuffle the row indices, then reduce them modulo cv_folds so each row
    # ends up with a fold id in [0, cv_folds).
    idx = np.arange(X_train.shape[0])
    np.random.shuffle(idx)
    idx = idx % cv_folds
    rf, svm, xgboost, ensemble = [], [], [], []

    for j in range(cv_folds):
        in_fold = idx == j
        X_train_cv, y_train_cv = train_set_X[~in_fold], y_train[~in_fold]
        X_val_cv, y_val_cv = train_set_X[in_fold], y_train[in_fold]

        # Fit order (XGBoost, SVM, random forest) is kept fixed so the global
        # random state is consumed in the same sequence on every run.
        model1 = xgb.XGBClassifier(**xgboost_params)
        model2 = SVC(probability=True, **svm_params)
        model3 = RandomForestClassifier(**rf_params)
        for model in (model1, model2, model3):
            model.fit(X_train_cv, y_train_cv)

        # Probability column at index 1 of each model's predict_proba output.
        predictions1, predictions2, predictions3 = (
            member.predict_proba(X_val_cv)[:, 1]
            for member in (model1, model2, model3)
        )
        final_predictions_proba = (predictions1 + predictions2 + predictions3) / 3

        for fold_scores, proba in (
            (xgboost, predictions1),
            (svm, predictions2),
            (rf, predictions3),
            (ensemble, final_predictions_proba),
        ):
            fold_scores.append(
                default_competition_metric(
                    y_val_cv, y_pred_proba=proba, k=number_of_features
                )
            )

    # One number per seed: the mean score over the folds.
    for fold_scores, summary in (
        (xgboost, xgboost_results),
        (svm, svm_results),
        (rf, rf_results),
        (ensemble, ensemble_results),
    ):
        summary.append(np.mean(fold_scores))


# Mean and standard deviation of the per-seed averages.
for label, summary in (
    ("XGBoost: ", xgboost_results),
    ("SVM: ", svm_results),
    ("RF: ", rf_results),
    ("Ensemble: ", ensemble_results),
):
    print(label, np.mean(summary), np.std(summary))

XGBoost:  6684.722222222223 57.66816525562971
SVM:  6797.222222222223 60.88975060217493
RF:  6572.222222222223 76.09793073223736
Ensemble:  6736.111111111111 50.49905267416067


### testing on validation set

In [17]:
# Refit the three models on the whole training split and score each of them,
# and their averaged ensemble, on the validation split.
model1 = xgb.XGBClassifier(**xgboost_params)
model2 = SVC(probability=True, **svm_params)
model3 = RandomForestClassifier(**rf_params)
for model in (model1, model2, model3):
    model.fit(train_set_X, train_set_y)

# Probability column at index 1 of each model's output on the validation set.
predictions1, predictions2, predictions3 = (
    member.predict_proba(val_set_X)[:, 1] for member in (model1, model2, model3)
)

# Unweighted average of the three probability vectors.
final_predictions_proba = (predictions1 + predictions2 + predictions3) / 3
final_score = default_competition_metric(
    y_val, y_pred_proba=final_predictions_proba, k=number_of_features
)

for label, proba in (
    ("xgb", predictions1),
    ("svm", predictions2),
    ("rf", predictions3),
):
    score = default_competition_metric(
        y_val, y_pred_proba=proba, k=number_of_features
    )
    print(f"{label} score on validation set: {score}")
print(f"Ensemble score on validation set: {final_score}")

xgb score on validation set: 6850.0
svm score on validation set: 7000.0
rf score on validation set: 6800.0
Ensemble score on validation set: 7100.0


## Final prediction

In [None]:
# Read the original text-format data: full training set plus unlabeled test set.
x_train_original, y_train_original, x_test_original = map(
    np.loadtxt,
    ("../data/x_train.txt", "../data/y_train.txt", "../data/x_test.txt"),
)

In [None]:
# Restrict both matrices to the selected columns. This overwrites the loaded
# arrays, so the cell must not be re-run without reloading them first.
x_train_original, x_test_original = (
    matrix[:, features] for matrix in (x_train_original, x_test_original)
)

In [None]:
# One-dimensional MDS coordinate used as an extra feature.
#
# scikit-learn's MDS has no out-of-sample `transform`: every `fit_transform`
# call builds a fresh embedding whose origin, orientation and sign are
# arbitrary. Embedding train and test in two separate calls therefore gives
# coordinates that are not comparable, and a model trained on the train
# coordinate cannot interpret the test coordinate. Train and test rows are
# embedded together in a single call instead and split afterwards, so both
# parts live in the same coordinate system. Only the unlabeled features are
# used, no test labels are involved.
# NOTE: the joint fit works on all train + test rows at once, so expect it to
# need noticeably more time and memory than the two separate fits did.
mds = MDS(
    n_components=1,
    random_state=21,
    n_init=6,
    normalized_stress=False,
    eps=0.0001,
    max_iter=600,
)
n_train_rows = x_train_original.shape[0]
joint_original_mds = mds.fit_transform(
    np.concatenate([x_train_original, x_test_original], axis=0)
)
train_original_mds = joint_original_mds[:n_train_rows]
test_original_mds = joint_original_mds[n_train_rows:]

In [None]:
# Append the single MDS coordinate as an extra trailing column of each matrix.
x_train_original_with_mds, x_test_original_with_mds = (
    np.concatenate([feature_matrix, mds_column], axis=1)
    for feature_matrix, mds_column in (
        (x_train_original, train_original_mds),
        (x_test_original, test_original_mds),
    )
)

In [None]:
# Fit the three ensemble members on the full original training data. XGBoost
# and the SVM see only the selected features; the random forest additionally
# receives the MDS coordinate column.
# NOTE(review): in the cross-validation and validation experiments above the
# random forest was trained on the selected features only, so the score of this
# forest-plus-MDS configuration was not measured in this notebook. Confirm
# that this is intended.
model1 = xgb.XGBClassifier(**xgboost_params)
model2 = SVC(probability=True, **svm_params)
model3 = RandomForestClassifier(**rf_params)

members = (
    (model1, x_train_original, x_test_original),
    (model2, x_train_original, x_test_original),
    (model3, x_train_original_with_mds, x_test_original_with_mds),
)
for member, train_matrix, test_matrix in members:
    member.fit(train_matrix, y_train_original)

# Predict on the test set: probability column at index 1 of each model.
predictions1, predictions2, predictions3 = (
    member.predict_proba(test_matrix)[:, 1]
    for member, train_matrix, test_matrix in members
)

# Unweighted average of the three probability vectors.
final_predictions_proba = (predictions1 + predictions2 + predictions3) / 3

In [None]:
# Label the fifth of the test rows with the highest ensemble probability as 1
# and every other row as 0.
n = len(final_predictions_proba)
positive_count = n // 5
descending_order = np.argsort(final_predictions_proba)[::-1]
top_02 = descending_order[:positive_count]
y_pred = np.zeros(n)
y_pred[top_02] = 1

In [None]:
# Display the 0/1 label vector (1 marks a row selected as positive).
y_pred

In [None]:
# Sanity check: the number of rows labelled positive should equal 1000.
y_pred.sum()

In [None]:
def create_submission_file(final_predictions, columns_used, output_prefix="320637"):
    """Write the submission files for the test set.

    Two single-column text files (no header, no index) are written to the
    current working directory:

    * ``<output_prefix>_obs.txt``  - 1-based positions of the rows predicted
      as positive,
    * ``<output_prefix>_vars.txt`` - 1-based indexes of the columns used.

    Parameters
    ----------
    final_predictions : array-like
        Binary predictions for the test set; entries equal to 1 mark the
        positive rows. Plain lists are accepted as well as NumPy arrays.
    columns_used : iterable of int
        0-based column indexes used in the models.
    output_prefix : str, optional
        Prefix of both output file names. Defaults to ``"320637"``.
    """
    # Coerce to an array first: comparing a plain list with `== 1` yields a
    # single False, which would silently produce an empty submission.
    labels = np.asarray(final_predictions)
    # Shift the 0-based positions/indexes to the 1-based numbering used in
    # the output files.
    positive_rows = np.where(labels == 1)[0] + 1
    columns_one_based = [val + 1 for val in columns_used]
    print(f"You predicted as positive {len(positive_rows)} samples")
    print(f"You used: {len(columns_one_based)} columns")
    pd.DataFrame(positive_rows).to_csv(
        f"{output_prefix}_obs.txt", header=False, index=False
    )
    pd.DataFrame(columns_one_based).to_csv(
        f"{output_prefix}_vars.txt", header=False, index=False
    )

In [None]:
# Write the submission files for the labels and feature subset chosen above.
create_submission_file(final_predictions=y_pred, columns_used=features)