In [1]:
import numpy as np
import pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA


# Seed NumPy's global random number generator so that code drawing from it
# is repeatable from one fresh run of the notebook to the next.
np.random.seed(0)

In [2]:
# Load the labelled competition data; it is partitioned into a training and
# a validation split further down in this cell.
train_val = pd.read_csv(
    "/kaggle/input/tabular-playground-series-may-2022/train.csv"
)

# Model inputs: every column except the row id, the label, and f_27.
# NOTE(review): f_27 is presumably dropped because it is not numeric and
# would not pass through StandardScaler -- confirm against the data.
feature_cols = [
    col for col in train_val.columns
    if col not in ("id", "target", "f_27")
]

# Pin the seed on the split itself. Without random_state the partition is
# drawn from NumPy's global RNG, so re-running this cell on its own (or after
# anything else has consumed random numbers) produces a different split and
# can move rows between train and validation under an already-fitted model.
# The default split size is kept (the recorded reports show 675000 training
# rows and 225000 validation rows).
train, val = train_test_split(train_val, random_state=0)

X_train, X_val = train[feature_cols], val[feature_cols]
y_train, y_val = train["target"], val["target"]

In [3]:
# Three-stage estimator: standardise -> PCA -> gradient-boosted trees.
# verbose=1 makes the pipeline print the time spent in each step during fit.
pipeline = Pipeline(
    steps=[
        # Standardise the features before the PCA projection.
        ("scaler", StandardScaler()),
        # A fractional n_components asks PCA to keep as many components as
        # are needed to reach that share of explained variance (here 99%).
        ("pca", PCA(n_components=0.99)),
        # Binary classifier producing logistic outputs, 100 boosting rounds.
        (
            "model",
            XGBClassifier(
                n_estimators=100,
                objective="binary:logistic",
                verbosity=1,
            ),
        ),
    ],
    verbose=1,
)

In [4]:
# Fit every pipeline step on the training split. This is the expensive cell:
# in the recorded run the model step alone took about 4.6 minutes.
pipeline.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.4s
[Pipeline] ............... (step 2 of 3) Processing pca, total=   1.2s
[Pipeline] ............. (step 3 of 3) Processing model, total= 4.6min


Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=0.99)),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, early_stopping_rounds=None,
                               enable_categorical=False, eval_metric=None,
                               gamma=0, gpu_id=-1, grow_policy='depthwise',
                               importance_type=None, interaction_constraints='',
                               learning_rate=0.300000012, max_bin=256,
                               max_cat_to_onehot=4, max_delta_step=0,
                               max_depth=6, max_leaves=0, min_child_weight=1,
                               missing=nan, monotone_constraints='()',
                               n_estimators=100, n_jobs=0, num_parallel_tree=1,
                               predictor='auto', random_state

In [5]:
# Print one classification report per split, in this order:
# first the training split, then the validation split.
for X, y in zip((X_train, X_val), (y_train, y_val)):
    # Hard class predictions from the fitted pipeline for this split.
    y_pred = pipeline.predict(X)
    print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82    346720
           1       0.81      0.79      0.80    328280

    accuracy                           0.81    675000
   macro avg       0.81      0.81      0.81    675000
weighted avg       0.81      0.81      0.81    675000

              precision    recall  f1-score   support

           0       0.80      0.81      0.80    115441
           1       0.80      0.78      0.79    109559

    accuracy                           0.80    225000
   macro avg       0.80      0.80      0.80    225000
weighted avg       0.80      0.80      0.80    225000



In [6]:
# Read the competition's test.csv; it is scored by the fitted pipeline in the
# next cell to build the submission file.
test = pd.read_csv("/kaggle/input/tabular-playground-series-may-2022/test.csv")

In [7]:
# Build the submission file.
#
# The competition is scored on area under the ROC curve, which ranks rows by
# predicted score. Hard 0/1 labels from predict() collapse every row onto two
# values and throw that ranking away, so submit the predicted probability of
# the positive class instead. Column 1 of predict_proba corresponds to class 1
# (the labels are 0 and 1, as shown in the classification reports).
y_pred = pipeline.predict_proba(test[feature_cols])[:, 1]

# One row per test id, with the positive-class probability as "target".
output = pd.DataFrame(
    {
        "id": test["id"],
        "target": y_pred,
    }
)

output.to_csv("/kaggle/working/submission.csv", index=False)