In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, roc_curve
)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Preprocessing utilities used by this cell
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Read the competition tables from disk
train_df = pd.read_csv('../data/playground-series-s5e3/train.csv')
test_df = pd.read_csv('../data/playground-series-s5e3/test.csv')

# Target column, and feature frames with the identifier (and target) removed
y = train_df['rainfall']
X = train_df.drop(columns=['id', 'rainfall'])
X_test_full = test_df.drop(columns=['id'])

# Median imputation (with missing-value indicator columns) followed by
# standardisation, chained into a single transformer.
preprocessor = make_pipeline(
    SimpleImputer(strategy='median', add_indicator=True),
    StandardScaler(),
)

# Fit on the training features only, then reuse the fitted statistics for the
# competition test features. Both results are stored as float32 arrays.
# NOTE(review): the fit uses every training row, i.e. it happens before the
# hold-out split performed in a later cell, so validation rows contribute to
# the imputation/scaling statistics -- confirm this is acceptable.
X = np.asarray(preprocessor.fit_transform(X), dtype=np.float32)
X_test_full = np.asarray(preprocessor.transform(X_test_full), dtype=np.float32)

# Detailed data validation with logging
def validate_data(data, name):
    """Print a summary of ``data`` and fail fast on NaN or infinite entries.

    Parameters
    ----------
    data : array-like
        Numeric values to check. Multi-dimensional input is reported per
        column (axis 1); 1-D or scalar input is treated as a single column.
    name : str
        Label used in the printed report and in the error messages.

    Raises
    ------
    ValueError
        If ``data`` contains at least one NaN or infinite value.
    """
    values = np.atleast_1d(np.asarray(data))

    print(f'\nValidating {name}...')
    print(f'Shape: {values.shape}')
    print(f'Memory usage: {values.nbytes / 1024 / 1024:.2f} MB')

    # The per-column diagnostics below index axis 1, which a 1-D vector does
    # not have; view such input as a single column so a bad value still ends
    # in the intended ValueError rather than an IndexError.
    table = values[:, np.newaxis] if values.ndim == 1 else values

    # Check for NaN values (mask computed once and reused)
    nan_mask = np.isnan(table)
    nan_count = nan_mask.sum()
    if nan_count > 0:
        nan_indices = np.where(nan_mask)
        print(f'NaN values found in columns: {np.unique(nan_indices[1])}')
        print(f'NaN counts per column: {nan_mask.sum(axis=0)}')
        raise ValueError(
            f'Found {nan_count} NaN values in {name} at indices: {nan_indices}'
        )

    # Check for infinite values
    inf_mask = np.isinf(table)
    inf_count = inf_mask.sum()
    if inf_count > 0:
        inf_indices = np.where(inf_mask)
        print(f'Infinite values found in columns: {np.unique(inf_indices[1])}')
        raise ValueError(
            f'Found {inf_count} infinite values in {name} at indices: {inf_indices}'
        )

    # Check value ranges (reduced along axis 0 of the data as given)
    print(f'Min values: {np.min(values, axis=0)}')
    print(f'Max values: {np.max(values, axis=0)}')
    print(f'Mean values: {np.mean(values, axis=0)}')
    print(f'Std values: {np.std(values, axis=0)}')
    print(f'Validation of {name} completed successfully\n')

# Sanity-check both feature matrices, then the labels reshaped to one column
validate_data(data=X, name='training features')
validate_data(data=X_test_full, name='test features')
validate_data(data=y.to_numpy().reshape(-1, 1), name='training labels')

In [None]:
# Hold out a stratified 20% of the training rows for validation
# (seeded so the partition is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Forward stepwise feature selection
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.pipeline import Pipeline

# The two classifiers compared in this notebook, both seeded for repeatability.
# NOTE(review): `use_label_encoder` is flagged as deprecated by recent XGBoost
# releases -- confirm whether the installed version still needs it.
xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='auc',
    use_label_encoder=False,
    random_state=42,
)
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Greedy forward search driven by the XGBoost model, scored by 5-fold ROC AUC
# and fitted on the training split only.
selector = SequentialFeatureSelector(
    estimator=xgb,
    direction='forward',
    n_features_to_select='auto',
    scoring='roc_auc',
    cv=5,
).fit(X_train, y_train)

# Keep only the selected columns in every feature matrix.
# NOTE(review): the original names are rebound here, so re-running this cell
# without re-running the split cell selects from the already reduced matrices.
X_train, X_test, X_test_full = (
    selector.transform(split) for split in (X_train, X_test, X_test_full)
)

print(f'Selected {X_train.shape[1]} features out of {selector.n_features_in_}')

In [None]:
# Accuracy of each model under 10-fold cross-validation on the training split
print('Logistic Regression Cross-validation:')
log_reg_cv = cross_val_score(
    estimator=log_reg, X=X_train, y=y_train, scoring='accuracy', cv=10
)
print(f'CV scores: {log_reg_cv}')
print(f'Mean CV accuracy: {log_reg_cv.mean():.4f}')

print('\nXGBoost Cross-validation:')
xgb_cv = cross_val_score(
    estimator=xgb, X=X_train, y=y_train, scoring='accuracy', cv=10
)
print(f'CV scores: {xgb_cv}')
print(f'Mean CV accuracy: {xgb_cv.mean():.4f}')

In [None]:
# Logistic regression: fit on the training split, then produce hard labels and
# positive-class probabilities for the validation split, plus hard labels for
# the competition test set.
log_reg.fit(X_train, y_train)
log_reg_pred = log_reg.predict(X_test)
log_reg_pred_proba = log_reg.predict_proba(X_test)[:, 1]
test_pred_log_reg = log_reg.predict(X_test_full)

# XGBoost: same sequence of steps
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
xgb_pred_proba = xgb.predict_proba(X_test)[:, 1]
test_pred_xgb = xgb.predict(X_test_full)

In [None]:
# Evaluate models
def evaluate_model(name, y_true, y_pred, y_pred_proba):
    """Print classification metrics for one model and return its ROC curve.

    Parameters
    ----------
    name : str
        Model label used in the printed header.
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted class labels, used for accuracy, precision, recall and F1.
    y_pred_proba : array-like
        Predicted scores for the positive class, used for ROC AUC and the
        ROC curve.

    Returns
    -------
    tuple
        ``(fpr, tpr, auc)``: the false/true positive rates from ``roc_curve``
        and the area under that curve obtained by trapezoidal integration.
    """
    print(f'\n{name} Validation Set Metrics:')
    print(f'Accuracy: {accuracy_score(y_true, y_pred):.4f}')
    print(f'Precision: {precision_score(y_true, y_pred):.4f}')
    print(f'Recall: {recall_score(y_true, y_pred):.4f}')
    print(f'F1 Score: {f1_score(y_true, y_pred):.4f}')
    print(f'ROC AUC: {roc_auc_score(y_true, y_pred_proba):.4f}')

    # Calculate AUC using integration.
    # NumPy 2.0 renamed `trapz` to `trapezoid` and deprecated the old name;
    # prefer the new function when it exists, fall back on older NumPy.
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz
    auc = trapezoid(tpr, fpr)
    print(f'Calculated AUC: {auc:.4f}')
    return fpr, tpr, auc

# Score the logistic regression on the held-out validation split
log_reg_fpr, log_reg_tpr, log_reg_auc = evaluate_model(
    name='Logistic Regression',
    y_true=y_test,
    y_pred=log_reg_pred,
    y_pred_proba=log_reg_pred_proba,
)

# Score XGBoost on the same split
xgb_fpr, xgb_tpr, xgb_auc = evaluate_model(
    name='XGBoost',
    y_true=y_test,
    y_pred=xgb_pred,
    y_pred_proba=xgb_pred_proba,
)

In [None]:
# Side-by-side confusion matrices for the two models on the validation split
cm_log_reg = confusion_matrix(y_test, log_reg_pred)
cm_xgb = confusion_matrix(y_test, xgb_pred)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: logistic regression
sns.heatmap(cm_log_reg, annot=True, fmt='d', cmap='Blues', ax=ax1)
ax1.set(
    title='Logistic Regression Confusion Matrix',
    xlabel='Predicted',
    ylabel='Actual',
)

# Right panel: XGBoost
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Blues', ax=ax2)
ax2.set(
    title='XGBoost Confusion Matrix',
    xlabel='Predicted',
    ylabel='Actual',
)

fig.tight_layout()
plt.show()

In [None]:
# Overlay both ROC curves on one set of axes, with the chance diagonal for reference
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(
    log_reg_fpr, log_reg_tpr,
    label=f'Logistic Regression (AUC = {log_reg_auc:.4f})',
)
roc_ax.plot(xgb_fpr, xgb_tpr, label=f'XGBoost (AUC = {xgb_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set(
    title='ROC Curves Comparison',
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
)
roc_ax.legend(loc='lower right')
plt.show()