In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss

from xgboost import XGBClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

train_df = pd.read_csv('~/rep/kaggle/playground-series-s5e8/train.csv')
test_df = pd.read_csv('~/rep/kaggle/playground-series-s5e8/test.csv')

In [2]:
train_df = train_df.drop(columns=['id'])
train_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,29,services,single,secondary,no,1282,no,yes,unknown,4,jul,1006,2,-1,0,unknown,1
749996,69,retired,divorced,tertiary,no,631,no,no,cellular,19,aug,87,1,-1,0,unknown,0
749997,50,blue-collar,married,secondary,no,217,yes,no,cellular,17,apr,113,1,-1,0,unknown,0
749998,32,technician,married,secondary,no,-274,no,no,cellular,26,aug,108,6,-1,0,unknown,0


In [3]:
target = 'y'
feature_cols = [col for col in train_df.columns if target not in col]
categorical_columns = ['job','marital','education','default','housing','loan','contact','month','poutcome']
print(feature_cols)

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'month', 'duration', 'campaign', 'previous', 'poutcome']


In [4]:
# Create a copy of the dataframes to avoid modifying originals
train_scaled = train_df.copy()
test_scaled = test_df.copy()

# Select columns to scale (exclude target column)
# - Only consider numeric columns
numeric_cols = train_scaled.select_dtypes(include=["number"]).columns
numeric_cols = [col for col in numeric_cols if col != target]
scaler = StandardScaler()

# Fit the scaler on the training data (numeric columns)
scaler.fit(train_scaled[numeric_cols])

# Transform both train and test data
train_scaled[numeric_cols] = scaler.transform(train_scaled[numeric_cols])
test_scaled[numeric_cols] = scaler.transform(test_scaled[numeric_cols])

In [5]:
# Create a copy of train and test dataframes to avoid modifying original dataframes
train_encoded = train_scaled.copy()
test_encoded = test_scaled.copy()

# Initialize label encoder
le = LabelEncoder()

# Apply label encoding to each categorical column
for column in categorical_columns:
    print(f'Encoding: {column} ...')
    # Fit the label encoder on the train data
    le.fit(train_encoded[column])
    
    # Transform both train and test data using the same encoder
    train_encoded[column] = le.transform(train_encoded[column])
    if column in test_encoded.columns:
        # Handle cases where test set may have unseen labels by using fillna
        test_encoded[column] = test_encoded[column].map(lambda s: le.transform([s])[0] if s in le.classes_ else None)
        test_encoded[column].fillna(-1, inplace=True)
        test_encoded[column] = test_encoded[column].astype(int)

Encoding: job ...
Encoding: marital ...
Encoding: education ...
Encoding: default ...
Encoding: housing ...
Encoding: loan ...
Encoding: contact ...
Encoding: month ...
Encoding: poutcome ...


In [6]:
# Determine feature set
X = train_encoded[feature_cols]
X_test = test_encoded[feature_cols]

# Extract target values
y = train_encoded[target]

# Initialize the model with the specified parameters
model = XGBClassifier()

In [None]:
# run a grid search
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 6, 9],
    "learning_rate": [0.01, 0.1, 0.2]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring="roc_auc", cv=5, verbose=1)

grid_search.fit(X, y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
