# Tabular Playground Series - Sep 2021

[https://www.kaggle.com/c/tabular-playground-series-sep-2021/overview](https://www.kaggle.com/c/tabular-playground-series-sep-2021/overview)

In [31]:
import gc
import os
import random
import time
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from tqdm import tqdm
from xgboost import XGBClassifier

warnings.simplefilter('ignore')

## Parameters

In [32]:
# Notebook-wide configuration constants.
SEED = 42                      # seed handed to seed_everything()
N_SPLITS = 5                   # presumably the number of CV folds - confirm where it is used
N_ESTIMATORS = 25_001
EARLY_STOPPING_ROUNDS = 3_048  # author's note: very important, guards well against overfitting
VERBOSE = 1_000                # author's note: faster and clearer output

In [33]:
def seed_everything(seed=42):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Note: Python reads ``PYTHONHASHSEED`` at interpreter start-up, so setting
    it here does not change string hashing in the running kernel; it is only
    inherited by child processes started afterwards.

    Parameters
    ----------
    seed : int, default 42
        Value passed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

# Seed the RNGs with the notebook-wide SEED constant before any data is loaded.
seed_everything(SEED)

# Preprocessing

In [34]:
# Read the training and test tables from the local data/ directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [41]:
# Use the `id` column as the row index of both frames.
# The guard keeps the cell safe to re-run: once `id` has become the index it is
# no longer a column, and a second set_index('id') would raise a KeyError.
# Plain reassignment replaces inplace=True, which offers no benefit here.
if train.index.name != 'id':
    train = train.set_index('id')
if test.index.name != 'id':
    test = test.set_index('id')

In [43]:
# Name of the label column, looked up in `train` below.
TARGET = 'claim'

# The model inputs are the columns of the test frame.
features = list(test.columns)

# Independent copy of the label series.
target = train.loc[:, TARGET].copy()

## Feature Engineering

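Idea taken from [https://www.kaggle.com/realtimshady/single-simple-lightgbm](https://www.kaggle.com/realtimshady/single-simple-lightgbm). Row-wise summary statistics (number of missing values, minimum, standard error of the mean, and median) are added as extra features. Missing feature values are then imputed; note that the pipeline below uses the column mean for every feature rather than a strategy that depends on the type of distribution.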

In [45]:
# Names of the engineered row-wise statistics, in the order they are appended.
ROW_STAT_FEATURES = ['n_missing', 'min', 'sem', 'quantile']


def add_row_stats(df, cols):
    """Add row-wise summary statistics of ``df[cols]`` to ``df`` as new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    cols : list of str
        Columns the statistics are computed over.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``n_missing`` (count of NaNs),
        ``min``, ``sem`` (standard error of the mean) and ``quantile``
        (``DataFrame.quantile`` with its default ``q``, i.e. the median).
    """
    # Select once so the new columns can never feed into later statistics.
    values = df[cols]
    df['n_missing'] = values.isna().sum(axis=1)
    df['min'] = values.min(axis=1)
    df['sem'] = values.sem(axis=1)
    df['quantile'] = values.quantile(axis=1)
    return df


# Strip any previously appended statistic names first, so that re-running this
# cell neither computes statistics over the engineered columns nor appends
# their names to `features` a second time.
base_features = [col for col in features if col not in ROW_STAT_FEATURES]
for frame in (train, test):
    add_row_stats(frame, base_features)

features = base_features + ROW_STAT_FEATURES
n_missing = train['n_missing'].copy()

In [46]:
# Mean-impute missing values, then standardise every feature column.
preprocessing_steps = [
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),  # author's note listed RobustScaler as an alternative
]
pipeline = Pipeline(steps=preprocessing_steps)

# Statistics are learned from the training frame; the test frame reuses them.
train[features] = pipeline.fit_transform(train[features])
test[features] = pipeline.transform(test[features])

(train.shape, test.shape)

((957919, 123), (493474, 122))

## Random Forest

In [68]:
# Hold out 20% of the training rows for validation.
# The split is seeded with the notebook-wide SEED and the label column is
# addressed through TARGET, so this cell stays in sync with the configuration
# instead of repeating the literals 42 and 'claim'.
# NOTE(review): the imputer/scaler was fitted on all of `train` before this
# split, so validation rows contributed to the imputation/scaling statistics.
train_df, valid_df = train_test_split(train, test_size=0.2, random_state=SEED)

X_train, y_train = train_df[features], train_df[TARGET]
X_valid, y_valid = valid_df[features], valid_df[TARGET]

In [86]:
# Random-forest baseline trained on the hold-out split's training part.
# random_state pins the forest's bootstrap and feature sampling to SEED; without
# it the fit depends on how much of NumPy's global RNG state earlier cells
# consumed, so re-running the cell gives a different model.
rf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,               # use all available cores
    min_samples_split=1000,  # only split nodes holding at least 1000 samples
    random_state=SEED,
)
rf.fit(X_train, y_train)

In [None]:
def report_split(model, X, y_true, label):
    """Print a classification report and the ROC AUC of ``model`` on one split.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict`` and ``predict_proba``.
    X : feature matrix to score.
    y_true : true labels aligned with ``X``.
    label : str
        Prefix of the printed AUC line, e.g. ``"Train"``.

    Returns
    -------
    tuple
        ``(predicted_labels, predicted_probabilities)`` where the second item
        is column 1 of ``predict_proba``.
    """
    predicted = model.predict(X)
    predicted_prob = model.predict_proba(X)[:, 1]
    print(classification_report(y_true, predicted))
    print(f"{label} AUC: {roc_auc_score(y_true, predicted_prob)}")
    return predicted, predicted_prob


report_split(rf, X_train, y_train, "Train")

# The original cell left y / y_pred / y_pred_prob bound to the validation
# results; they are kept in case later cells rely on them.
y = y_valid
y_pred, y_pred_prob = report_split(rf, X_valid, y_valid, "Valid")

              precision    recall  f1-score   support

           0       0.99      1.00      1.00    384251
           1       1.00      0.99      1.00    382084

    accuracy                           1.00    766335
   macro avg       1.00      1.00      1.00    766335
weighted avg       1.00      1.00      1.00    766335

Train AUC: 0.9999774968571538
              precision    recall  f1-score   support

           0       0.86      0.65      0.74     96153
           1       0.72      0.89      0.79     95431

    accuracy                           0.77    191584
   macro avg       0.79      0.77      0.77    191584
weighted avg       0.79      0.77      0.77    191584

Valid AUC: 0.7975208448603008


              precision    recall  f1-score   support

           0       0.87      0.69      0.77    384251
           1       0.74      0.90      0.81    382084

    accuracy                           0.79    766335
   macro avg       0.81      0.79      0.79    766335
weighted avg       0.81      0.79      0.79    766335

Train AUC: 0.9608403844858073
              precision    recall  f1-score   support

           0       0.86      0.65      0.74     96153
           1       0.72      0.90      0.80     95431

    accuracy                           0.77    191584
   macro avg       0.79      0.77      0.77    191584
weighted avg       0.79      0.77      0.77    191584

Valid AUC: 0.8013681618511015