In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cat-in-the-dat-ii/sample_submission.csv
/kaggle/input/cat-in-the-dat-ii/train.csv
/kaggle/input/cat-in-the-dat-ii/test.csv


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Load Data
df = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/train.csv')
test_df = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/test.csv')

# Data Overview
print(df.info())
print(df.head())

# Feature Engineering
# 1. Handling Missing Values
imputer = SimpleImputer(strategy='most_frequent')
df_features = df.drop(['id', 'target'], axis=1)
test_features = test_df.drop(['id'], axis=1)

df[df_features.columns] = imputer.fit_transform(df_features)
test_df[test_features.columns] = imputer.transform(test_features)

# Ensure column names are strings
df.columns = df.columns.astype(str)
test_df.columns = test_df.columns.astype(str)

# 2. Encoding Techniques
# Binary Encoding
for col in ['bin_3', 'bin_4']:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    test_df[col] = LabelEncoder().fit_transform(test_df[col].astype(str))

# One-Hot Encoding for Nominal Categorical Features
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
nominal_cols = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
df_nom = pd.DataFrame(ohe.fit_transform(df[nominal_cols]))
test_nom = pd.DataFrame(ohe.transform(test_df[nominal_cols]))

# Combine Encoded Features
df = pd.concat([df, df_nom], axis=1)
test_df = pd.concat([test_df, test_nom], axis=1)
df.drop(nominal_cols, axis=1, inplace=True)
test_df.drop(nominal_cols, axis=1, inplace=True)

# Convert all column names to strings after one-hot encoding
df.columns = df.columns.astype(str)
test_df.columns = test_df.columns.astype(str)

# 3. Ordinal Encoding
ordinal_cols = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
for col in ordinal_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    test_df[col] = LabelEncoder().fit_transform(test_df[col].astype(str))

# 4. Force Numeric Conversion and Imputation
for col in df.columns.drop(['id', 'target']):
    df[col] = pd.to_numeric(df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

imputer = SimpleImputer(strategy='mean')
df[df.columns.drop(['id', 'target'])] = imputer.fit_transform(df.drop(['id', 'target'], axis=1))
test_df[test_df.columns.drop(['id'])] = imputer.transform(test_df.drop(['id'], axis=1))

# 5. Feature Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop(['id', 'target'], axis=1))
df_scaled = pd.DataFrame(scaled_features, columns=df.columns.drop(['id', 'target']))

test_scaled_features = scaler.transform(test_df.drop('id', axis=1))
test_scaled = pd.DataFrame(test_scaled_features, columns=test_df.columns.drop('id'))

# Combine ID and Target Back
df = pd.concat([df[['id', 'target']], df_scaled], axis=1)
test_df = pd.concat([test_df[['id']], test_scaled], axis=1)

# Splitting the data
X = df.drop(['id', 'target'], axis=1)
y = df['target']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify no NaNs exist
print(f"NaN values in X_train: {np.isnan(X_train).sum().sum()}")

# Model Building and Tuning
def build_model(model, params, search_type='grid'):
    if search_type == 'grid':
        search = GridSearchCV(model, params, cv=5, scoring='roc_auc')
    else:
        search = RandomizedSearchCV(model, params, n_iter=20, cv=5, scoring='roc_auc', random_state=42)
        
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
        
    y_pred = best_model.predict(X_valid)
    print(f"{model.__class__.__name__} Accuracy: {accuracy_score(y_valid, y_pred):.4f}")
    print(f"{model.__class__.__name__} AUC-ROC: {roc_auc_score(y_valid, y_pred):.4f}")
        
    return best_model

# Random Forest with GridSearchCV (using HistGradientBoostingClassifier)
rf_params = {
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [2, 5, 10]
}
rf_model = build_model(HistGradientBoostingClassifier(), rf_params, 'grid')

# XGBoost with RandomizedSearchCV
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
xgb_model = build_model(xgb.XGBClassifier(), xgb_params, 'random')

# LightGBM with GridSearchCV
lgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1]
}
lgb_model = build_model(lgb.LGBMClassifier(), lgb_params, 'grid')

# CatBoost with RandomizedSearchCV
cb_params = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.2]
}
cb_model = build_model(cb.CatBoostClassifier(verbose=0), cb_params, 'random')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600000 entries, 0 to 599999
Data columns (total 25 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      600000 non-null  int64  
 1   bin_0   582106 non-null  float64
 2   bin_1   581997 non-null  float64
 3   bin_2   582070 non-null  float64
 4   bin_3   581986 non-null  object 
 5   bin_4   581953 non-null  object 
 6   nom_0   581748 non-null  object 
 7   nom_1   581844 non-null  object 
 8   nom_2   581965 non-null  object 
 9   nom_3   581879 non-null  object 
 10  nom_4   581965 non-null  object 
 11  nom_5   582222 non-null  object 
 12  nom_6   581869 non-null  object 
 13  nom_7   581997 non-null  object 
 14  nom_8   582245 non-null  object 
 15  nom_9   581927 non-null  object 
 16  ord_0   581712 non-null  float64
 17  ord_1   581959 non-null  object 
 18  ord_2   581925 non-null  object 
 19  ord_3   582084 non-null  object 
 20  ord_4   582070 non-null  object 
 21  ord_5   58

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  new_unnormalized_variance -= correction**2 / new_sample_count
  return var <= upper_bound


NaN values in X_train: 480000
HistGradientBoostingClassifier Accuracy: 0.8196
HistGradientBoostingClassifier AUC-ROC: 0.5446
XGBClassifier Accuracy: 0.8206
XGBClassifier AUC-ROC: 0.5495
[LightGBM] [Info] Number of positive: 71971, number of negative: 312029
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.081847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 380
[LightGBM] [Info] Number of data points in the train set: 384000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.187424 -> initscore=-1.466833
[LightGBM] [Info] Start training from score -1.466833
[LightGBM] [Info] Number of positive: 71971, number of negative: 312029
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.077253 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not 