# In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Read the training rows (which carry the 'Y' column) and the test rows
train_data = pd.read_csv('train_set.csv')
test_data = pd.read_csv('test_set.csv')

# Split into target and model inputs. 'RecordId' is excluded from the inputs
# of both frames; the original frames are left untouched because later steps
# read their columns again.
y_train = train_data['Y']
X_train = train_data.drop(columns=['Y', 'RecordId'])
X_test = test_data.drop(columns='RecordId')

# Step 1: Fill missing values with per-column means
print("Handling missing values...")
# The means are learned from the training features only and then applied to
# both matrices, so the test set never influences the fill values.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# Step 2: Apply correlation filter to X_train and match for X_test
print("Applying correlation filter...")

# Recover the column names that actually survived imputation.
# SimpleImputer discards any column whose fitted statistic is NaN (i.e. a
# column with no observed values in the training data), so labelling the
# imputed matrix with the raw column list would fail with a shape mismatch
# whenever such a column exists.
raw_feature_names = train_data.drop(['Y', 'RecordId'], axis=1).columns
feature_names = raw_feature_names[~np.isnan(imputer.statistics_)]

X_train_df = pd.DataFrame(X_train, columns=feature_names)

# Absolute pairwise correlations; keep only the strict upper triangle so each
# feature pair is inspected once and the diagonal (self-correlation) is ignored.
correlation_matrix = X_train_df.corr().abs()
upper_triangle = correlation_matrix.where(
    np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
)

# A feature is dropped when it is correlated above the threshold with any
# feature that appears before it in column order.
correlation_threshold = 0.88
to_drop = upper_triangle.columns[
    (upper_triangle > correlation_threshold).any()
].tolist()
X_train_df = X_train_df.drop(columns=to_drop)

# The imputer emits the test matrix with the same columns as the training
# matrix, so it is labelled with the same names and the identical drop list is
# applied strictly (no errors='ignore'): any mismatch now raises instead of
# silently leaving train and test with different feature sets.
X_test_df = pd.DataFrame(X_test, columns=feature_names).drop(columns=to_drop)

# Convert back to numpy arrays
X_train = X_train_df.values
X_test = X_test_df.values

# Step 3: Rescale features with MinMaxScaler
print("Normalizing data...")
# Per-feature min/max are learned from the training matrix only; the same
# fitted scaler is then applied to the test matrix.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 4: Initialize XGBoost classifier
# Hyper-parameters are collected in one mapping so they can be inspected or
# logged alongside the model; the values are passed through unchanged.
xgb_params = {
    'n_estimators': 11000,
    'learning_rate': 0.004,
    'max_depth': 3,
    'subsample': 0.73,
    'colsample_bytree': 0.73,
    'gamma': 0.1,
    'reg_alpha': 0.01,
    'reg_lambda': 1,
    'eval_metric': 'logloss',
}
xgb = XGBClassifier(**xgb_params)

# Step 5: Perform Stratified K-Fold Cross-Validation
print("Performing Stratified K-Fold Cross-Validation...")
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []

# The splitter yields positional indices. Indexing a pandas Series with `[]`
# is label-based, which only coincides with positions while the index is the
# default RangeIndex, so the target is indexed through a NumPy array instead.
y_train_values = np.asarray(y_train)

# Out-of-fold probabilities: entry i is filled by the fold model that did NOT
# see row i during fitting. Every row lands in exactly one validation fold,
# so the array is fully populated once the loop finishes.
y_val_prob = np.zeros(len(y_train_values), dtype=float)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train_values)):
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train_values[train_idx], y_train_values[val_idx]

    # fit() retrains from scratch on each call, so reusing the estimator
    # object across folds does not carry state over.
    xgb.fit(X_train_fold, y_train_fold)
    val_accuracy = xgb.score(X_val_fold, y_val_fold)
    fold_accuracies.append(val_accuracy)

    # Store held-out probabilities for the positive class (column 1)
    y_val_prob[val_idx] = xgb.predict_proba(X_val_fold)[:, 1]
    print(f"Fold {fold + 1} Accuracy: {val_accuracy:.2f}")

avg_accuracy = np.mean(fold_accuracies)
print(f"\nAverage Stratified K-Fold Accuracy: {avg_accuracy:.2f}")

# Train the model on the entire training dataset
xgb.fit(X_train, y_train)

# Step 7: Predict on the test set
print("Predicting on the test set...")
y_test_pred_prob = xgb.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class

# Step 8: Prepare the submission file
submission = pd.DataFrame({
    'RecordId': test_data['RecordId'],
    'Y': y_test_pred_prob
})
submission.to_csv('XGBoosttest1.csv', index=False)
print("Submission file 'XGBoosttest1.csv' created successfully.")

# Step 9: Generate ROC curve from the out-of-fold predictions
# The final model above was fit on every training row, so scoring any training
# fold with it would leak labels and inflate the curve. The out-of-fold
# probabilities collected during cross-validation are used instead, which also
# covers all folds rather than only the last one.
print("Generating ROC curve...")
fpr, tpr, thresholds = roc_curve(y_train_values, y_val_prob)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve on a fresh figure using the Axes API
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'XGBoost ROC curve (area = {roc_auc:.2f})')
# Dashed diagonal: reference line from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve for XGBoost Classifier')
ax.legend(loc='best')
plt.show()
