In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from imblearn.over_sampling import SMOTE
import optuna

# Load data
train_data = pd.read_csv('/kaggle/input/ml-fundamentals-and-applications-2024-08/final_proj_data.csv')
test_data = pd.read_csv('/kaggle/input/ml-fundamentals-and-applications-2024-08/final_proj_test.csv')
sample_submission = pd.read_csv('/kaggle/input/ml-fundamentals-and-applications-2024-08/final_proj_sample_submission.csv')

# Identify numerical and categorical features.
# The target column 'y' is dropped from both lists (whatever its dtype) so the
# preprocessor never asks for a column that is absent from the feature frame.
numeric_features = train_data.select_dtypes(include=['number']).columns.drop('y', errors='ignore')
categorical_features = train_data.select_dtypes(include=['object']).columns.drop('y', errors='ignore')

# Remove numeric features with all missing values
numeric_features = [col for col in numeric_features if train_data[col].notna().any()]

# Define pipelines for numerical and categorical preprocessing
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' encodes categories unseen during fit as all zeros
    # instead of raising, which matters for validation and test rows.
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Separate features and target variable
X = train_data.drop(columns=['y'])
y = train_data['y']

# Split the raw rows into training and validation sets BEFORE any fitting, so
# that validation rows cannot influence the imputation values, scaling
# statistics or one-hot categories learned by the preprocessor.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the preprocessor on the training split only, then apply it to validation
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)

# Full preprocessed matrix, retained for any code that still refers to it.
# It is produced with the training-split statistics fitted above.
X_preprocessed = preprocessor.transform(X)

# Apply SMOTE for class balancing (training split only; the validation split
# keeps its original class distribution)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Hyperparameter tuning function
def objective(trial):
    """Optuna objective: fit one candidate model and return its validation F1.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        The binary F1 score of the candidate's predictions on ``X_val``
        compared against ``y_val``.

    The candidate is trained on the module-level resampled training data
    (``X_train_res`` / ``y_train_res``).
    """
    # Keyword order matters: it fixes both the sampling order and the layout
    # of the dictionary printed below.
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 500),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 20),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 20),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        random_state=42,
    )
    print(f"Trying parameters: {hyperparams}")

    candidate = GradientBoostingClassifier(**hyperparams)
    candidate.fit(X_train_res, y_train_res)

    val_predictions = candidate.predict(X_val)
    val_f1 = f1_score(y_val, val_predictions, average='binary')
    print(f"Trial completed with F1 Score: {val_f1:.4f}")
    return val_f1

# Run Optuna optimization.
# The sampler is seeded explicitly: every other stochastic step in this script
# is pinned with random_state=42, and without a seed the hyperparameter search
# (and therefore the final model) would differ from run to run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Best parameters
best_params = study.best_params
print("Best hyperparameters:", best_params)

# Train final model with best parameters.
# best_params holds only the values suggested through the trial, so the fixed
# random_state used inside the objective has to be passed again here.
model = GradientBoostingClassifier(**best_params, random_state=42)
model.fit(X_train_res, y_train_res)

# Function to evaluate model performance
def evaluate_model(y_true, y_pred):
    """Print a set of classification metrics for the given predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels to compare against ``y_true``.

    Prints balanced accuracy, the confusion matrix, and the binary-averaged
    F1, precision and recall. Nothing is returned.
    """
    # Each metric is computed immediately before it is printed, one at a time.
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    print(f"Balanced Accuracy Score: {bal_acc:.4f}")

    conf_mat = confusion_matrix(y_true, y_pred)
    print(f"Confusion Matrix:\n{conf_mat}")

    f1 = f1_score(y_true, y_pred, average='binary')
    print(f"F1 Score: {f1:.4f}")

    precision = precision_score(y_true, y_pred, average='binary')
    print(f"Precision: {precision:.4f}")

    recall = recall_score(y_true, y_pred, average='binary')
    print(f"Recall: {recall:.4f}")

# Report the tuned model's metrics on the held-out validation split
y_pred = model.predict(X_val)
evaluate_model(y_true=y_val, y_pred=y_pred)

# Run the test set through the already-fitted preprocessor and predict
X_test_preprocessed = preprocessor.transform(test_data)
test_predictions = model.predict(X_test_preprocessed)

# Build the submission: the 'index' column from the sample file, paired with
# the predicted labels in a 'y' column, written without the DataFrame index
submission = sample_submission[['index']].assign(y=test_predictions)
submission.to_csv('submission.csv', index=False)