In [9]:
import os
import json

import pandas as pd
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

import category_encoders as ce

import warnings

warnings.filterwarnings('ignore')

In [10]:
# Load the feature table and the target labels (labels are read as int8).
X = pd.read_csv('../../data/start_dataset.csv')
y = pd.read_csv('../../data/y.csv', dtype='int8')

# NOTE: loading of interaction constraints (interaction_constraints.json) is
# disabled in this notebook; nothing below uses them.

# Cast every column whose name contains '_binned' to the pandas category dtype
# in a single astype call.
binned_cols = X.filter(like='_binned').columns
X = X.astype({name: 'category' for name in binned_cols})

# Summarise the binned columns as a sanity check on the cast.
X.filter(like='_binned').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Empty DataFrame


In [14]:
def train_and_evaluate(df, target, preprocessing=None, random_state=42, enable_categorical=True):
    """
    Train an XGBClassifier on a stratified 80/20 split and return the validation F1 score.

    Numeric columns are optionally rescaled, object/category columns are one-hot
    encoded, and the model is fitted with early stopping (25 rounds) monitored on
    the evaluation sets.

    Parameters:
    df (DataFrame): Feature data
    target (Series or single-column DataFrame): Target labels aligned row-for-row with df
    preprocessing (str): Scaling of numeric columns: 'Normalization' (MinMaxScaler),
        'Standardization' (StandardScaler). Any other value, including None,
        passes numeric columns through unchanged.
    random_state (int): Random state for reproducibility; used for both the
        train/validation split and the model
    enable_categorical (bool): Enable categorical feature support in XGBClassifier

    Returns:
    float: The F1 score (positive label 1) of the model on the validation data

    Notes:
    The validation split is also passed as an eval_set for early stopping, so the
    returned score is an optimistic estimate rather than a clean hold-out metric.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        df, target, test_size=0.2, random_state=random_state, stratify=target
    )

    # Select by dtype family instead of exact width. With ['int64', 'float64'],
    # columns such as int8/int32/float32 matched neither list and were silently
    # dropped by the ColumnTransformer (its default is remainder='drop').
    num_cols = X_train.select_dtypes(include='number').columns
    cat_cols = X_train.select_dtypes(include=['object', 'category']).columns

    if preprocessing == 'Normalization':
        num_transformer = MinMaxScaler()
    elif preprocessing == 'Standardization':
        num_transformer = StandardScaler()
    else:
        num_transformer = 'passthrough'

    # Hand the encoder a plain list of column names rather than a pandas Index.
    cat_transformer = ce.OneHotEncoder(
        cols=list(cat_cols), use_cat_names=True, drop_invariant=True, return_df=True
    )

    preprocessor = ColumnTransformer([
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

    # Fit the preprocessing on the training split only, then apply it to validation.
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val)

    model = XGBClassifier(random_state=random_state, enable_categorical=enable_categorical)
    # NOTE(review): newer xgboost releases expect early_stopping_rounds on the
    # estimator constructor rather than on fit() -- confirm against the installed version.
    model.fit(
        X_train_preprocessed,
        y_train,
        eval_set=[(X_train_preprocessed, y_train), (X_val_preprocessed, y_val)],
        verbose=0,
        early_stopping_rounds=25,
    )

    y_pred = model.predict(X_val_preprocessed)
    f1_score_val = f1_score(y_val, y_pred, pos_label=1)

    return f1_score_val

In [15]:
# Validation F1 with min-max scaling (MinMaxScaler) applied to the numeric columns.
train_and_evaluate(df=X, target=y, preprocessing='Normalization')

0.5805168986083499

In [16]:
# Validation F1 with StandardScaler applied to the numeric columns.
# NOTE(review): the score recorded below equals the 'Normalization' run --
# presumably because tree splits are unaffected by monotonic rescaling of
# features; confirm before drawing conclusions from this comparison.
train_and_evaluate(df=X, target=y, preprocessing='Standardization')

0.5805168986083499