In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

In [None]:
# Location of the cleaned dataset; pandas is handed this path as-is, so point
# it at whichever Parquet source should be modelled.
dataPath = "data_clean1.parquet"
# Load the whole dataset into memory as the modelling frame.
df_model = pd.read_parquet(path=dataPath)

In [None]:
# Target vector: the 'label' column.
y = df_model['label']
# Feature matrix: every remaining column (df_model itself is left unmodified).
X = df_model.drop(columns=['label'])

In [9]:
# Partition the feature columns by dtype so each group gets its own treatment.
# NOTE(review): a column whose dtype is in neither list (e.g. int32, bool,
# category) is selected by neither transformer below — confirm that is intended.
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(include=['int64', 'float64', 'float32']).columns

# Numeric branch: fill missing values with 0, then standardise.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its matching branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:
# Logistic-regression pipeline: shared preprocessing followed by the classifier.
# `use_label_encoder` and `eval_metric` are XGBoost-only keyword arguments;
# LogisticRegression does not accept them and raises TypeError when they are
# passed, so the estimator is built with its default settings.
pipeline_lg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [None]:
# XGBoost pipeline: shared preprocessing followed by a gradient-boosted
# classifier evaluated with log-loss.
# NOTE(review): `use_label_encoder` is reported as deprecated in recent XGBoost
# releases — confirm against the installed version.
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(eval_metric='logloss', use_label_encoder=False)),
])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model.
# The bare name `pipeline` is never defined in the visible cells, so calling
# `pipeline.fit` fails with NameError on a fresh kernel. Fit the XGBoost
# pipeline explicitly; substitute another pipeline object here to train a
# different model.
pipeline_xgb.fit(X_train, y_train)