In [25]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import numpy as np

In [26]:
X_train = pd.read_csv("../data/x_train.txt", sep=" ", header=None)
y_train = pd.read_csv("../data/y_train.txt", header=None)
X_test = pd.read_csv("../data/x_test.txt", sep=" ", header=None)
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
y_train = y_train.values.ravel()

In [27]:
features = [105, 100, 101, 102]
X_train = X_train[:, features]
X_test = X_test[:, features]

In [28]:
x1, x2, y1, y2 = train_test_split(X_train, y_train, test_size=0.2)

In [29]:
svm = SVC(kernel="poly", degree=4, gamma="scale", coef0=0.01, probability=True)
xgb = XGBClassifier(
    n_estimators=75,
    learning_rate=0.01,
    min_child_weight=0,
    subsample=0.5,
    reg_lambda=0.1,
    max_depth=2,
    tree_method="approx",
)

In [30]:
svm.fit(X_train, y_train)
xgb.fit(X_train, y_train)
svm_pred = svm.predict_proba(X_test)
xgb_pred = xgb.predict_proba(X_test)
proba = np.mean([svm_pred, xgb_pred], axis=0)

proba

array([[0.53437499, 0.46562501],
       [0.45653929, 0.54346071],
       [0.47613438, 0.52386562],
       ...,
       [0.22709504, 0.77290496],
       [0.46681607, 0.53318393],
       [0.57097896, 0.42902104]])

In [31]:
df = pd.DataFrame(proba)
df.index = df.index + 1
df.columns = ["proba0", "proba1"]
indices = df.sort_values("proba1", ascending=False).head(1000).index

df

Unnamed: 0,proba0,proba1
1,0.534375,0.465625
2,0.456539,0.543461
3,0.476134,0.523866
4,0.425781,0.574219
5,0.473207,0.526793
...,...,...
4996,0.484465,0.515535
4997,0.482886,0.517114
4998,0.227095,0.772905
4999,0.466816,0.533184


In [32]:
list(indices)

[2902,
 4096,
 3250,
 4002,
 1745,
 3181,
 2814,
 150,
 558,
 2801,
 4016,
 4627,
 3041,
 3881,
 1642,
 2257,
 2714,
 2320,
 2974,
 4622,
 1031,
 3864,
 2592,
 4210,
 1110,
 1214,
 617,
 4312,
 2643,
 2603,
 885,
 1860,
 3681,
 1872,
 4122,
 4998,
 4393,
 369,
 4699,
 1960,
 273,
 2938,
 3977,
 3880,
 2748,
 2555,
 781,
 3609,
 927,
 4895,
 4221,
 4285,
 4729,
 2922,
 4596,
 1439,
 4138,
 742,
 2787,
 4439,
 1956,
 4940,
 1330,
 3343,
 3411,
 1680,
 2672,
 1139,
 3198,
 189,
 4451,
 2261,
 4581,
 26,
 3885,
 3813,
 3856,
 524,
 603,
 449,
 4381,
 263,
 4926,
 3715,
 3334,
 1360,
 1834,
 1097,
 4786,
 2076,
 424,
 3019,
 1381,
 2205,
 1429,
 3321,
 2191,
 1516,
 1892,
 116,
 3165,
 4385,
 4139,
 1514,
 232,
 2297,
 219,
 1879,
 4738,
 3860,
 492,
 4913,
 1400,
 792,
 2036,
 4198,
 862,
 1111,
 2275,
 1689,
 906,
 352,
 1054,
 705,
 1775,
 2215,
 3148,
 3962,
 1292,
 640,
 1285,
 4253,
 4811,
 4709,
 622,
 4658,
 555,
 3800,
 4218,
 285,
 3895,
 1870,
 1458,
 809,
 1567,
 303,
 1191,
 48

In [33]:
with open("../results/solution.txt", "w") as file:
    for number in list(indices):
        file.write(f"{number}\n")

In [34]:
with open("../results/used_features.txt", "w") as file:
    for feature in features:
        file.write(f"{feature+1}\n")