In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt

from sklearn.multioutput import MultiOutputClassifier

import os
import time
import warnings

warnings.filterwarnings("ignore")

In [2]:
SEED = 22
NFOLDS = 5
np.random.seed(SEED)

In [3]:
train = pd.read_csv("../input/lish-moa/train_features.csv")
targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
test = pd.read_csv("../input/lish-moa/test_features.csv")
sub = pd.read_csv("../input/lish-moa/sample_submission.csv")

In [4]:
X = train.iloc[:, 1:].to_numpy()
X_test = test.iloc[:, 1:].to_numpy()
y = targets.iloc[:, 1:].to_numpy()

In [5]:
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist'))

clf = Pipeline([(
    "encode", CountEncoder(cols=[0, 2])),
    ("classify", classifier)
])

In [6]:
params = {'classify__estimator__colsample_bytree': 0.6522,
          'classify__estimator__gamma': 3.6975,
          'classify__estimator__learning_rate': 0.0503,
          'classify__estimator__max_delta_step': 2.0706,
          'classify__estimator__max_depth': 8,
          'classify__estimator__min_child_weight': 31.5800,
          'classify__estimator__n_estimators': 183,
          'classify__estimator__subsample': 0.8639
         }

_ = clf.set_params(**params)

In [7]:
oof_preds = np.zeros(y.shape)
test_preds = np.zeros((test.shape[0], y.shape[1]))
oof_loses = []
kf = KFold(n_splits=NFOLDS)

start = time.time()

for fn, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print("Starting fold: ", fn)
    X_train, X_val = X[trn_idx], X[val_idx]
    y_train, y_val = y[trn_idx], y[val_idx]
    
    ctl_mask = X_train[:,0]=='ctl_vehicle'
    X_train = X_train[~ctl_mask,:]
    y_train = y_train[~ctl_mask]
    
    clf.fit(X_train, y_train)
    
    val_preds = clf.predict_proba(X_val)
    val_preds = np.array(val_preds)[:,:,1].T
    oof_preds[val_idx] = val_preds
    
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
    oof_loses.append(loss)
    preds = clf.predict_proba(X_test)
    preds = np.array(preds)[:,:,1].T
    test_preds += preds / NFOLDS

training_time = (time.time() - start) / 60    

print(f"Training time: {training_time} minutes")
print(oof_loses)
print("Mean OOF loss: ", np.mean(oof_loses))

Starting fold:  0
Starting fold:  1
Starting fold:  2
Starting fold:  3
Starting fold:  4
Training time: 25.21780607700348 minutes
[0.016939544630693568, 0.017023601611208604, 0.01683115541901799, 0.016979776385080775, 0.017148126425451143]
Mean OOF loss:  0.016984440894290416


In [8]:
control_mask = X_test["cp_type"]=='ctl_vehicle'

test_preds[control_mask] = 0

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [9]:
sub.iloc[:, 1:] = test_preds
sub.to_csv("submission.csv", index=False)