In [15]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

from utils import load_train_data, load_train_labels, load_test_data

In [2]:
data = load_train_data()
labels = load_train_labels()

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)
y_train[y_train == -1] = 0
y_test[y_test == -1] = 0

## Feature selection

In [3]:
rfe = RFE(
    estimator=RandomForestClassifier(n_estimators=50, max_depth=4, random_state=42),
    n_features_to_select=100,
    step=10,
    verbose=1,
)

In [4]:
rfe.fit(X_train, y_train.reshape(-1))
X_train = rfe.transform(X_train)
X_test = rfe.transform(X_test)

Fitting estimator with 500 features.
Fitting estimator with 490 features.
Fitting estimator with 480 features.
Fitting estimator with 470 features.
Fitting estimator with 460 features.
Fitting estimator with 450 features.
Fitting estimator with 440 features.
Fitting estimator with 430 features.
Fitting estimator with 420 features.
Fitting estimator with 410 features.
Fitting estimator with 400 features.
Fitting estimator with 390 features.
Fitting estimator with 380 features.
Fitting estimator with 370 features.
Fitting estimator with 360 features.
Fitting estimator with 350 features.
Fitting estimator with 340 features.
Fitting estimator with 330 features.
Fitting estimator with 320 features.
Fitting estimator with 310 features.
Fitting estimator with 300 features.
Fitting estimator with 290 features.
Fitting estimator with 280 features.
Fitting estimator with 270 features.
Fitting estimator with 260 features.
Fitting estimator with 250 features.
Fitting estimator with 240 features.
F

## Model

### XGboost

In [5]:
xgb = XGBClassifier(n_estimators=50, max_depth=5)
xgb.fit(X_train, y_train)

In [6]:
pred_xgb = xgb.predict(X_test)

In [7]:
balanced_accuracy_score(y_test, pred_xgb)

0.7908830443726125

### SVC

In [8]:
svc = SVC(C=100)
svc.fit(X_train, y_train.reshape(-1))
pred_svc = svc.predict(X_test)
balanced_accuracy_score(y_test, pred_svc)

0.7502203937702028

### LightGBM

In [9]:
lgbm = LGBMClassifier(n_estimators=100, verbose=0)
lgbm.fit(X_train, y_train.reshape(-1))
pred_lgbm = lgbm.predict(X_test)
balanced_accuracy_score(y_test, pred_lgbm)

0.8061269468116368

### Logistic Regression

In [10]:
logistic = LogisticRegression(penalty="l2", C=1, verbose=0, max_iter=100)
logistic.fit(X_train, y_train.reshape(-1))
pred_logistic = logistic.predict(X_test)
balanced_accuracy_score(y_test, pred_logistic)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5590471642668234

### Stack

In [11]:
stack = StackingClassifier(
    estimators=[
        ("xgboost", XGBClassifier(n_estimators=50, max_depth=5)),
        ("svc", SVC(C=100)),
        ("lgbm", LGBMClassifier(n_estimators=100, verbose=0)),
        ("logistic", LogisticRegression(penalty="l2", C=1, verbose=0)),
    ],
    cv=10,
)
stack.fit(X_train, y_train.reshape(-1))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [12]:
pred_stack = stack.predict(X_test)
balanced_accuracy_score(y_test, pred_stack)

0.8227115780193947

In [19]:
test_data = load_test_data()
test_data = rfe.transform(test_data)

In [24]:
pd.DataFrame(
    data=stack.predict_proba(stack.predict_proba(test_data)[:, 1])[:, 1], columns=["proba"]
).to_csv("../../results/test_pred.txt", index=False)