In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Suppress specific warnings if needed (e.g., from one-hot encoder)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning) # Often related to sparse output default

# Load the data
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    exit()

print("Data loaded successfully.")
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# --- Feature Engineering & Preprocessing ---

def preprocess_data(df, is_train=True, latest_date=None):
    """Applies feature engineering and basic cleaning."""
    df_processed = df.copy()

    # 1. Handle Outlier/Implausible Birth Years
    current_year = datetime.datetime.now().year
    # Use a reasonable reference year based on Dt_Customer if available, otherwise current year
    try:
        max_enroll_year = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce').dt.year.max()
        reference_year = max_enroll_year if pd.notna(max_enroll_year) else current_year
    except: # Handle cases where Dt_Customer might not exist or be parseable easily
        reference_year = current_year

    # Replace very old birth years (e.g., < 1910) with NaN to be imputed later
    df_processed.loc[df_processed['Year_Birth'] < 1910, 'Year_Birth'] = np.nan

    # Calculate Age (handle potential NaNs in Year_Birth temporarily)
    df_processed['Age'] = reference_year - df_processed['Year_Birth']
    df_processed.drop('Year_Birth', axis=1, inplace=True)


    # 2. Process Dt_Customer
    # Convert 'Dt_Customer' to datetime, coercing errors
    df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)

    # Find the latest date for calculating tenure if not provided (from training set)
    if is_train:
        global global_latest_date # Store latest date from training set globally
        global_latest_date = df_processed['Dt_Customer'].max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date
    elif latest_date:
         latest_date_to_use = latest_date
    else:
        # Fallback if called on test set first or global_latest_date isn't set
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1) # Use start of next year as reference
        print(f"Warning: Using fallback latest date: {latest_date_to_use}")


    # Calculate Customer_Lifetime (Tenure in days)
    # Handle potential NaT dates resulting from coerce errors
    mask = pd.notna(df_processed['Dt_Customer'])
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - df_processed.loc[mask, 'Dt_Customer']).dt.days
    df_processed['Customer_Lifetime'].fillna(df_processed['Customer_Lifetime'].median(), inplace=True) # Impute NaNs created by NaT
    df_processed.drop('Dt_Customer', axis=1, inplace=True)


    # 3. Simplify Marital Status
    # Consolidate categories
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner',
        'Together': 'Partner',
        'Absurd': 'Single',
        'Alone': 'Single',
        'YOLO': 'Single',
        'Widow': 'Single',
        'Divorced':'Single'
         }) # Grouping Married/Together and others into Single for simplicity

    # 4. Simplify Education
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'}) # Group '2n Cycle' with 'Master'

    # 5. Combine Children/Teens
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    # Optionally drop original columns if 'Children' is deemed sufficient
    # df_processed.drop(['Kidhome', 'Teenhome'], axis=1, inplace=True)

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns (identified during EDA)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore') # errors='ignore' in case they were already dropped

    return df_processed

# Preprocess training data
train_df_processed = preprocess_data(train_df, is_train=True)
# Preprocess test data using the latest date from training data
test_df_processed = preprocess_data(test_df, is_train=False, latest_date=global_latest_date)

print("Feature engineering complete.")
print("\nTrain Data Info after processing:")
train_df_processed.info()
print("\nTest Data Info after processing:")
test_df_processed.info()


# --- Model Training ---

# Separate features (X) and target (y)
X = train_df_processed.drop(['ID', 'Target'], axis=1)
y = train_df_processed['Target']
X_test = test_df_processed.drop('ID', axis=1)

# Align columns - crucial after feature engineering if columns were added/dropped differently (shouldn't happen here but good practice)
train_cols = X.columns
test_cols = X_test.columns

missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    X_test[c] = 0 # Add missing columns to test set with default value (0)

missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train:
    X[c] = 0 # Add missing columns to train set with default value (0) - less likely

X_test = X_test[train_cols] # Ensure order is the same


# Identify column types for preprocessing
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

print(f"\nNumerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# Create preprocessing pipelines for numerical and categorical features
numerical_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy='median')), # Impute missing numericals (Age, Income, Customer_Lifetime)
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='most_frequent')), # Impute missing categoricals (if any)
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # Use sparse_output=False for easier debugging if needed
])

# Create a column transformer to apply different pipelines to different columns
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
], remainder='passthrough') # Keep any columns not specified (though there shouldn't be any here)


# Define the model
# GradientBoostingClassifier often works well. random_state for reproducibility.
# Consider tuning hyperparameters later using GridSearchCV or RandomizedSearchCV
model = GradientBoostingClassifier(n_estimators=150, # Increased slightly
                                 learning_rate=0.08, # Slightly decreased
                                 max_depth=4,       # Increased slightly
                                 subsample=0.8,     # Added subsampling
                                 random_state=42)

# Create the full pipeline: preprocess + model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Train the model on the entire training dataset
print("\nTraining the model...")
pipeline.fit(X, y)
print("Model training complete.")

# --- Prediction ---

# Predict on the preprocessed test data
print("Predicting on test data...")
test_predictions = pipeline.predict(X_test)
print("Prediction complete.")

# --- Submission File Generation ---

# Create the submission DataFrame
submission_df = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions})

# Save the submission file
submission_filename = 'submission.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"\nSubmission file '{submission_filename}' created successfully.")
print(submission_df.head())
print(f"\nPredicted target distribution:\n{submission_df['Target'].value_counts(normalize=True)}")

# Optional: Evaluate on the training set (for sanity check, not a true performance measure)
train_preds = pipeline.predict(X)
train_accuracy = accuracy_score(y, train_preds)
train_roc_auc = roc_auc_score(y, pipeline.predict_proba(X)[:, 1]) # Use probabilities for AUC
print(f"\n--- Training Set Evaluation (Sanity Check) ---")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"ROC AUC: {train_roc_auc:.4f}")
# print("Classification Report:\n", classification_report(y, train_preds))

Data loaded successfully.
Train shape: (1567, 29)
Test shape: (673, 28)
Feature engineering complete.

Train Data Info after processing:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   1567 non-null   int64  
 1   Education            1567 non-null   object 
 2   Marital_Status       1567 non-null   object 
 3   Income               1550 non-null   float64
 4   Kidhome              1567 non-null   int64  
 5   Teenhome             1567 non-null   int64  
 6   Recency              1567 non-null   int64  
 7   MntWines             1544 non-null   float64
 8   MntFruits            1567 non-null   int64  
 9   MntMeatProducts      1561 non-null   float64
 10  MntFishProducts      1567 non-null   int64  
 11  MntSweetProducts     1567 non-null   int64  
 12  MntGoldProds         1555 non-null   float64
 13  N

  df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['Customer_Lifetime'].fillna(df_processed['Customer_Lifetime'].median(), inplace=True) # Impute NaNs created by NaT
  max_enroll_year = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce').dt.year.max()
  df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
  df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_date

Model training complete.
Predicting on test data...
Prediction complete.

Submission file 'submission.csv' created successfully.
      ID  Target
0   4390       1
1  10478       1
2   1081       1
3   4261       1
4   9916       0

Predicted target distribution:
Target
0    0.624071
1    0.375929
Name: proportion, dtype: float64

--- Training Set Evaluation (Sanity Check) ---
Accuracy: 0.9713
ROC AUC: 0.9970


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Suppress specific warnings if needed (e.g., from one-hot encoder)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning) # Often related to sparse output default

# --- Load Data ---
try:
    train_df = pd.read_csv("train.csv")
    # test_df = pd.read_csv("test.csv") # Not needed for CV evaluation
except FileNotFoundError:
    print("Make sure train.csv is in the same directory.")
    exit()

print("Training data loaded successfully.")

# --- Feature Engineering & Preprocessing Function (same as before) ---
def preprocess_data(df, is_train=True, latest_date=None):
    """Applies feature engineering and basic cleaning."""
    df_processed = df.copy()

    # 1. Handle Outlier/Implausible Birth Years & Calculate Age
    current_year = datetime.datetime.now().year
    try:
        max_enroll_year = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce').dt.year.max()
        reference_year = max_enroll_year if pd.notna(max_enroll_year) else current_year
    except:
        reference_year = current_year
    df_processed.loc[df_processed['Year_Birth'] < 1910, 'Year_Birth'] = np.nan
    df_processed['Age'] = reference_year - df_processed['Year_Birth']
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    # 2. Process Dt_Customer & Calculate Customer_Lifetime
    df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
    if is_train:
        global global_latest_date
        global_latest_date = df_processed['Dt_Customer'].max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date
    elif latest_date:
         latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)

    mask = pd.notna(df_processed['Dt_Customer'])
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - df_processed.loc[mask, 'Dt_Customer']).dt.days
    # Impute median *before* dropping Dt_Customer to handle NaTs properly
    median_lifetime = df_processed['Customer_Lifetime'].median() # Calculate median only once
    df_processed['Customer_Lifetime'].fillna(median_lifetime, inplace=True)
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # 3. Simplify Marital Status
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single', 'Widow': 'Single','Divorced':'Single'
    })

    # 4. Simplify Education
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    # 5. Combine Children/Teens
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    # df_processed.drop(['Kidhome', 'Teenhome'], axis=1, inplace=True) # Keep originals for now

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    return df_processed

# --- Apply Preprocessing to Training Data ---
train_df_processed = preprocess_data(train_df.copy(), is_train=True) # Use copy to be safe
print("Preprocessing complete.")

# --- Prepare Data for CV ---
X = train_df_processed.drop(['ID', 'Target'], axis=1)
y = train_df_processed['Target']

# Identify column types (ensure this happens *after* preprocessing)
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

# Check if Customer_Lifetime needs explicit imputation placeholder if it wasn't numeric initially
if 'Customer_Lifetime' in numerical_features:
     print("Customer_Lifetime treated as numerical.")
else:
     print("Warning: Customer_Lifetime might not be numerical after preprocessing.")


# --- Define Preprocessing Steps ---
numerical_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
], remainder='passthrough')

# --- Define Model (Using the same parameters as your previous run) ---
# You can adjust these parameters later based on CV results
model = GradientBoostingClassifier(n_estimators=150,
                                 learning_rate=0.08,
                                 max_depth=4,
                                 subsample=0.8,
                                 random_state=42)

# --- Create Full Pipeline ---
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# --- Set up K-Fold Cross-Validation ---
N_SPLITS = 5 # Number of folds (5 or 10 are common)
RANDOM_STATE_KFOLD = 42 # For reproducible splits

# Use StratifiedKFold to maintain target class distribution in each fold
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE_KFOLD)

print(f"\nStarting {N_SPLITS}-Fold Cross-Validation...")

# --- Perform Cross-Validation and Calculate Scores ---

# Accuracy Scores
accuracy_scores = cross_val_score(pipeline, X, y, cv=skf, scoring='accuracy', n_jobs=-1) # n_jobs=-1 uses all processors

# ROC AUC Scores
# Note: cross_val_score calculates ROC AUC based on predict_proba internally
roc_auc_scores = cross_val_score(pipeline, X, y, cv=skf, scoring='roc_auc', n_jobs=-1)

print("Cross-Validation finished.")

# --- Report Results ---
print("\n--- Cross-Validation Results ---")
print(f"Accuracy Scores per Fold: {accuracy_scores}")
print(f"Mean Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Std Dev Accuracy: {np.std(accuracy_scores):.4f}")
print("-" * 30)
print(f"ROC AUC Scores per Fold: {roc_auc_scores}")
print(f"Mean ROC AUC: {np.mean(roc_auc_scores):.4f}")
print(f"Std Dev ROC AUC: {np.std(roc_auc_scores):.4f}")
print("-" * 30)

# --- Note on Final Training ---
print("\nNOTE: The scores above are estimates of generalization performance.")
print("For the final submission, you should train the pipeline on the *entire* training set (X, y)")
print("and then predict on the preprocessed test set.")
print("Example final training step (run this *after* CV and hyperparameter tuning):")
print("# pipeline.fit(X, y)")
print("# test_predictions = pipeline.predict(X_test) # Assuming X_test is preprocessed test data")

Training data loaded successfully.
Preprocessing complete.
Customer_Lifetime treated as numerical.

Starting 5-Fold Cross-Validation...
Cross-Validation finished.

--- Cross-Validation Results ---
Accuracy Scores per Fold: [0.82484076 0.82165605 0.84345048 0.83067093 0.82428115]
Mean Accuracy: 0.8290
Std Dev Accuracy: 0.0078
------------------------------
ROC AUC Scores per Fold: [0.90629269 0.90923839 0.91769972 0.92064824 0.90418388]
Mean ROC AUC: 0.9116
Std Dev ROC AUC: 0.0064
------------------------------

NOTE: The scores above are estimates of generalization performance.
For the final submission, you should train the pipeline on the *entire* training set (X, y)
and then predict on the preprocessed test set.
Example final training step (run this *after* CV and hyperparameter tuning):
# pipeline.fit(X, y)
# test_predictions = pipeline.predict(X_test) # Assuming X_test is preprocessed test data


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Suppress specific warnings if needed
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Load Data ---
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv") # Needed for final submission ID mapping
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    exit()

print("Data loaded successfully.")

# --- Feature Engineering & Preprocessing Function (same as before) ---
def preprocess_data(df, is_train=True, latest_date=None):
    """Applies feature engineering and basic cleaning."""
    df_processed = df.copy()

    # 1. Handle Outlier/Implausible Birth Years & Calculate Age
    current_year = datetime.datetime.now().year
    try:
        max_enroll_year = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce').dt.year.max()
        reference_year = max_enroll_year if pd.notna(max_enroll_year) else current_year
    except:
        reference_year = current_year
    df_processed.loc[df_processed['Year_Birth'] < 1910, 'Year_Birth'] = np.nan
    df_processed['Age'] = reference_year - df_processed['Year_Birth']
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    # 2. Process Dt_Customer & Calculate Customer_Lifetime
    df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
    if is_train:
        global global_latest_date
        global_latest_date = df_processed['Dt_Customer'].max() + datetime.timedelta(days=1) # Define global latest date from train set
        latest_date_to_use = global_latest_date
    elif latest_date:
         latest_date_to_use = latest_date
    else: # Fallback if test is processed first somehow (shouldn't happen here)
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date: {latest_date_to_use}")


    mask = pd.notna(df_processed['Dt_Customer'])
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - df_processed.loc[mask, 'Dt_Customer']).dt.days
    median_lifetime = df_processed['Customer_Lifetime'].median()
    df_processed['Customer_Lifetime'].fillna(median_lifetime, inplace=True)
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # 3. Simplify Marital Status
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single', 'Widow': 'Single','Divorced':'Single'
    })

    # 4. Simplify Education
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    # 5. Combine Children/Teens
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    # df_processed.drop(['Kidhome', 'Teenhome'], axis=1, inplace=True) # Keep originals for now

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    return df_processed

# --- Apply Preprocessing ---
train_df_processed = preprocess_data(train_df.copy(), is_train=True)
# Preprocess test data using the date derived from training data
test_df_processed = preprocess_data(test_df.copy(), is_train=False, latest_date=global_latest_date)
print("Preprocessing complete.")


# --- Prepare Data ---
X = train_df_processed.drop(['ID', 'Target'], axis=1)
y = train_df_processed['Target']
X_test = test_df_processed.drop('ID', axis=1)

# Align columns after preprocessing (important!)
train_cols = X.columns
test_cols = X_test.columns

missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    X_test[c] = 0
missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train:
    X[c] = 0

X_test = X_test[train_cols] # Ensure order is the same


# Identify column types
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

# --- Define Preprocessing Steps ---
numerical_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # Set sparse_output=True for large datasets if memory is an issue
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
], remainder='passthrough')


# --- Define Base Model ---
# We will tune the parameters of this model
base_model = GradientBoostingClassifier(random_state=42)

# --- Create Full Pipeline ---
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', base_model) # Placeholder name 'classifier'
])

# --- Define Parameter Grid for RandomizedSearchCV ---
# Adjust ranges based on previous results and desired exploration
param_dist = {
    'classifier__n_estimators': [50, 80, 100, 150, 200], # Range around potentially good values
    'classifier__learning_rate': [0.01, 0.02, 0.05, 0.08, 0.1, 0.15], # Wider range, including lower values
    'classifier__max_depth': [2, 3, 4], # Focus on shallower trees to reduce overfitting
    'classifier__min_samples_leaf': [5, 10, 15, 20], # Force more samples per leaf
    'classifier__min_samples_split': [10, 20, 30], # Force more samples for a split
    'classifier__subsample': [0.6, 0.7, 0.8, 0.9], # Explore subsampling ratios
    'classifier__max_features': ['sqrt', 'log2', 0.7, 0.8, None] # Limit features per split
}

# --- Set up K-Fold Strategy (same as before) ---
N_SPLITS = 5
RANDOM_STATE_KFOLD = 42
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE_KFOLD)

# --- Set up RandomizedSearchCV ---
N_ITER = 50 # Number of parameter settings to sample. Increase for more thorough search (e.g., 100), decrease for speed.
SCORING_METRIC = 'roc_auc' # Optimize for ROC AUC, common for binary classification

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=N_ITER,
    scoring=SCORING_METRIC,
    cv=skf,
    n_jobs=-1, # Use all available CPU cores
    random_state=42, # For reproducible search results
    verbose=1 # Set to 1 or 2 to see progress
)

print(f"\nStarting RandomizedSearchCV with {N_ITER} iterations for {SCORING_METRIC}...")
random_search.fit(X, y)
print("RandomizedSearchCV finished.")

# --- Report Best Results ---
print("\n--- Hyperparameter Tuning Results ---")
print(f"Best Score ({SCORING_METRIC}): {random_search.best_score_:.4f}")
print("Best Parameters:")
# Nicely print the best parameters found
best_params = random_search.best_params_
for param, value in best_params.items():
    print(f"  {param}: {value}")

# --- Train Final Model with Best Parameters ---
print("\nTraining final model on the entire training set using best parameters...")
# The best estimator found by RandomizedSearchCV is already fitted on the full data
# if refit=True (default), but we fit it explicitly for clarity.
# Alternatively, you could just use: best_pipeline = random_search.best_estimator_
best_pipeline = pipeline # Start with the original pipeline structure
best_pipeline.set_params(**best_params) # Set the best parameters found
best_pipeline.fit(X, y)
print("Final model training complete.")

# --- Predict on Test Data ---
print("Predicting on test data using the tuned model...")
test_predictions = best_pipeline.predict(X_test)
print("Prediction complete.")

# --- Generate Submission File ---
submission_df = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions})
submission_filename = 'submission_tuned_gbc.csv' # New filename
submission_df.to_csv(submission_filename, index=False)

print(f"\nSubmission file '{submission_filename}' created successfully.")
print(submission_df.head())
print(f"\nPredicted target distribution:\n{submission_df['Target'].value_counts(normalize=True)}")

# Optional: Evaluate the *tuned* model on the training set
train_preds_tuned = best_pipeline.predict(X)
train_accuracy_tuned = accuracy_score(y, train_preds_tuned)
train_roc_auc_tuned = roc_auc_score(y, best_pipeline.predict_proba(X)[:, 1])
print(f"\n--- Tuned Model Training Set Evaluation ---")
print(f"Accuracy: {train_accuracy_tuned:.4f}")
print(f"ROC AUC: {train_roc_auc_tuned:.4f}")
print("(Compare these to the initial overfit scores and the CV scores)")

Data loaded successfully.
Preprocessing complete.

Starting RandomizedSearchCV with 50 iterations for roc_auc...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
RandomizedSearchCV finished.

--- Hyperparameter Tuning Results ---
Best Score (roc_auc): 0.9239
Best Parameters:
  classifier__subsample: 0.6
  classifier__n_estimators: 200
  classifier__min_samples_split: 20
  classifier__min_samples_leaf: 20
  classifier__max_features: sqrt
  classifier__max_depth: 2
  classifier__learning_rate: 0.08

Training final model on the entire training set using best parameters...
Final model training complete.
Predicting on test data using the tuned model...
Prediction complete.

Submission file 'submission_tuned_gbc.csv' created successfully.
      ID  Target
0   4390       1
1  10478       1
2   1081       1
3   4261       1
4   9916       0

Predicted target distribution:
Target
0    0.619614
1    0.380386
Name: proportion, dtype: float64

--- Tuned Model Training Set Evaluation -

In [3]:
pip install xgboost

^C
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer # Ensure make_scorer is imported if needed, though cross_val_score handles it
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings
import xgboost
try:
    from xgboost import XGBClassifier
except ImportError:
    print("XGBoost not found. Please install it using: pip install xgboost")
    exit()

# Suppress specific warnings if needed
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Load Data ---
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    exit()

print("Data loaded successfully.")

# --- Feature Engineering & Preprocessing Function (Identical to previous step) ---
def preprocess_data(df, is_train=True, latest_date=None):
    """Applies feature engineering and basic cleaning."""
    df_processed = df.copy()

    # 1. Handle Outlier/Implausible Birth Years & Calculate Age
    current_year = datetime.datetime.now().year
    try:
        # Attempt to get reference year from Dt_Customer
        df_processed['Dt_Customer_temp'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
        max_enroll_year = df_processed['Dt_Customer_temp'].dt.year.max()
        reference_year = max_enroll_year if pd.notna(max_enroll_year) else current_year
        df_processed.drop('Dt_Customer_temp', axis=1, inplace=True) # Drop temporary column
    except Exception as e: # Broad exception for safety if Dt_Customer is missing or unparseable
        print(f"Warning: Could not parse Dt_Customer for reference year. Using current year. Error: {e}")
        reference_year = current_year

    df_processed.loc[df_processed['Year_Birth'] < 1910, 'Year_Birth'] = np.nan
    df_processed['Age'] = reference_year - df_processed['Year_Birth']
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    # 2. Process Dt_Customer & Calculate Customer_Lifetime
    # Convert 'Dt_Customer' to datetime, coercing errors
    df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)

    # Find the latest date for calculating tenure if not provided (from training set)
    if is_train:
        global global_latest_date # Store latest date from training set globally
        # Handle case where all Dt_Customer might be NaT after coercion
        valid_dates = df_processed['Dt_Customer'].dropna()
        if not valid_dates.empty:
            global_latest_date = valid_dates.max() + datetime.timedelta(days=1)
        else:
            # Fallback if no valid dates found in training set
            global_latest_date = datetime.datetime(reference_year + 1, 1, 1)
            print(f"Warning: No valid Dt_Customer found in training set. Using fallback latest date: {global_latest_date}")
        latest_date_to_use = global_latest_date
    elif latest_date:
         latest_date_to_use = latest_date
    else:
        # Fallback if called on test set first or global_latest_date isn't set
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date for test set: {latest_date_to_use}")

    # Calculate Customer_Lifetime (Tenure in days)
    mask = pd.notna(df_processed['Dt_Customer'])
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - df_processed.loc[mask, 'Dt_Customer']).dt.days
    # Impute median *before* dropping Dt_Customer to handle NaTs properly
    median_lifetime = df_processed['Customer_Lifetime'].median()
    df_processed['Customer_Lifetime'].fillna(median_lifetime, inplace=True)
    df_processed.drop('Dt_Customer', axis=1, inplace=True)


    # 3. Simplify Marital Status
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single', 'Widow': 'Single','Divorced':'Single'
    })

    # 4. Simplify Education
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    # 5. Combine Children/Teens
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    # Keep original Kidhome/Teenhome for now, might be useful features
    # df_processed.drop(['Kidhome', 'Teenhome'], axis=1, inplace=True)

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    return df_processed

# --- Apply Preprocessing ---
train_df_processed = preprocess_data(train_df.copy(), is_train=True)
# Check if global_latest_date was set correctly
if 'global_latest_date' not in globals():
     print("Error: global_latest_date not set during training preprocessing. Exiting.")
     # Handle this case appropriately, maybe define a default or raise error
     # For now, let's set a default, but ideally the training data processing should succeed
     global_latest_date = datetime.datetime.now() + datetime.timedelta(days=1)
     print(f"Using current date as fallback for global_latest_date: {global_latest_date}")

test_df_processed = preprocess_data(test_df.copy(), is_train=False, latest_date=global_latest_date)
print("Preprocessing complete.")


# --- Prepare Data ---
X = train_df_processed.drop(['ID', 'Target'], axis=1)
y = train_df_processed['Target']
X_test = test_df_processed.drop('ID', axis=1)

# Align columns after preprocessing
train_cols = X.columns
test_cols = X_test.columns

missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    X_test[c] = 0
missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train:
    X[c] = 0

X_test = X_test[train_cols] # Ensure order is the same


# Identify column types
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

print(f"\nNumerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# --- Define Preprocessing Steps (Same as before) ---
numerical_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # Set sparse=True for large data if needed
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
], remainder='passthrough')


# --- Define Base Model: XGBoost ---
# Use_label_encoder=False is recommended for newer XGBoost versions
# eval_metric='logloss' or 'auc' are common for binary classification
base_model_xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# --- Create Full Pipeline with XGBoost ---
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', base_model_xgb) # Step name remains 'classifier'
])

# --- Define Parameter Grid for XGBoost RandomizedSearchCV ---
# These ranges are starting points; adjust based on results or computational budget
param_dist_xgb = {
    'classifier__n_estimators': [100, 150, 200, 300, 400], # Number of boosting rounds
    'classifier__learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15], # Step size shrinkage
    'classifier__max_depth': [2, 3, 4, 5], # Maximum depth of a tree
    'classifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0], # Fraction of samples used per tree
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0], # Fraction of features used per tree
    'classifier__gamma': [0, 0.1, 0.2, 0.5], # Minimum loss reduction required to make a further partition
    'classifier__reg_alpha': [0, 0.001, 0.01, 0.1], # L1 regularization term
    'classifier__reg_lambda': [0.5, 1, 1.5] # L2 regularization term (default is 1)
    # Add 'min_child_weight': [1, 3, 5] if needed (minimum sum of instance weight needed in a child)
}

# --- Set up K-Fold Strategy (same as before) ---
N_SPLITS = 5
RANDOM_STATE_KFOLD = 42
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE_KFOLD)

# --- Set up RandomizedSearchCV for XGBoost ---
N_ITER_XGB = 75 # Increase iterations for potentially better results (vs 50 for GBC)
SCORING_METRIC = 'roc_auc' # Optimize for ROC AUC

random_search_xgb = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_dist_xgb,
    n_iter=N_ITER_XGB,
    scoring=SCORING_METRIC,
    cv=skf,
    n_jobs=-1, # Use all available CPU cores
    random_state=42, # For reproducible search results
    verbose=1 # Set to 1 or 2 to see progress
)

print(f"\nStarting RandomizedSearchCV for XGBoost with {N_ITER_XGB} iterations for {SCORING_METRIC}...")
random_search_xgb.fit(X, y)
print("XGBoost RandomizedSearchCV finished.")

# --- Report Best XGBoost Results ---
print("\n--- XGBoost Hyperparameter Tuning Results ---")
print(f"Best Score ({SCORING_METRIC}): {random_search_xgb.best_score_:.4f}")
print("Best Parameters:")
best_params_xgb = random_search_xgb.best_params_
for param, value in best_params_xgb.items():
    print(f"  {param}: {value}")

# --- Train Final XGBoost Model with Best Parameters ---
print("\nTraining final XGBoost model on the entire training set using best parameters...")
# The best estimator is automatically refit on the whole training data by RandomizedSearchCV
best_pipeline_xgb = random_search_xgb.best_estimator_
# Explicit refit just to be sure (Optional, default behavior of RS CV is refit=True)
# best_pipeline_xgb.fit(X, y)
print("Final XGBoost model training complete.")


# --- Predict on Test Data with Tuned XGBoost ---
print("Predicting on test data using the tuned XGBoost model...")
test_predictions_xgb = best_pipeline_xgb.predict(X_test)
print("Prediction complete.")

# --- Generate Submission File for XGBoost ---
submission_df_xgb = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions_xgb})
submission_filename_xgb = 'submission_tuned_xgb.csv' # New filename
submission_df_xgb.to_csv(submission_filename_xgb, index=False)

print(f"\nSubmission file '{submission_filename_xgb}' created successfully.")
print(submission_df_xgb.head())
print(f"\nPredicted target distribution (XGBoost):\n{submission_df_xgb['Target'].value_counts(normalize=True)}")

# Optional: Evaluate the *tuned* XGBoost model on the training set
train_preds_xgb_tuned = best_pipeline_xgb.predict(X)
train_accuracy_xgb_tuned = accuracy_score(y, train_preds_xgb_tuned)
train_roc_auc_xgb_tuned = roc_auc_score(y, best_pipeline_xgb.predict_proba(X)[:, 1])
print(f"\n--- Tuned XGBoost Model Training Set Evaluation ---")
print(f"Accuracy: {train_accuracy_xgb_tuned:.4f}")
print(f"ROC AUC: {train_roc_auc_xgb_tuned:.4f}")

Data loaded successfully.
Preprocessing complete.

Numerical features (27): ['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Age', 'Customer_Lifetime', 'Children', 'Total_Mnt', 'Total_Purchases', 'Total_CmpAccepted']
Categorical features (2): ['Education', 'Marital_Status']

Starting RandomizedSearchCV for XGBoost with 75 iterations for roc_auc...
Fitting 5 folds for each of 75 candidates, totalling 375 fits
XGBoost RandomizedSearchCV finished.

--- XGBoost Hyperparameter Tuning Results ---
Best Score (roc_auc): 0.9236
Best Parameters:
  classifier__subsample: 0.8
  classifier__reg_lambda: 0.5
  classifier__reg_alpha: 0.01
  classifier__n_estimators: 150
  classifier__max_depth: 3
  classifier__lea

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier # Keep GBC
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer # Added FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Suppress specific warnings if needed
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Load Data ---
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    exit()
print("Data loaded successfully.")

# --- Feature Engineering & Preprocessing Function (MODIFIED) ---
def preprocess_data_v2(df, is_train=True, latest_date=None):
    """Applies feature engineering (v2) and basic cleaning."""
    df_processed = df.copy()

    # --- Original Preprocessing ---
    # 1. Handle Birth Year & Age
    current_year = datetime.datetime.now().year
    try:
        df_processed['Dt_Customer_temp'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
        max_enroll_year = df_processed['Dt_Customer_temp'].dt.year.max()
        reference_year = max_enroll_year if pd.notna(max_enroll_year) else current_year
        df_processed.drop('Dt_Customer_temp', axis=1, inplace=True)
    except Exception as e:
        print(f"Warning: Could not parse Dt_Customer for reference year. Using current year. Error: {e}")
        reference_year = current_year
    df_processed.loc[df_processed['Year_Birth'] < 1910, 'Year_Birth'] = np.nan
    df_processed['Age'] = reference_year - df_processed['Year_Birth']
    df_processed.drop('Year_Birth', axis=1, inplace=True) # Keep Age

    # 2. Process Dt_Customer & Lifetime + Extract Date Features
    df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
    if is_train:
        global global_latest_date_v2 # Use a new global var name if running in same session
        valid_dates = df_processed['Dt_Customer'].dropna()
        if not valid_dates.empty:
            global_latest_date_v2 = valid_dates.max() + datetime.timedelta(days=1)
        else:
            global_latest_date_v2 = datetime.datetime(reference_year + 1, 1, 1)
            print(f"Warning: No valid Dt_Customer. Using fallback latest date: {global_latest_date_v2}")
        latest_date_to_use = global_latest_date_v2
    elif latest_date:
         latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date for test: {latest_date_to_use}")

    # --> NEW: Extract Date Features BEFORE calculating lifetime and dropping
    df_processed['Enroll_Month'] = df_processed['Dt_Customer'].dt.month
    df_processed['Enroll_Year'] = df_processed['Dt_Customer'].dt.year
    df_processed['Enroll_DayOfWeek'] = df_processed['Dt_Customer'].dt.dayofweek
    # Impute NaNs in date features (e.g., with mode or median year/month)
    df_processed['Enroll_Month'].fillna(df_processed['Enroll_Month'].mode()[0], inplace=True)
    df_processed['Enroll_Year'].fillna(df_processed['Enroll_Year'].median(), inplace=True)
    df_processed['Enroll_DayOfWeek'].fillna(df_processed['Enroll_DayOfWeek'].mode()[0], inplace=True)


    mask = pd.notna(df_processed['Dt_Customer'])
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - df_processed.loc[mask, 'Dt_Customer']).dt.days
    median_lifetime = df_processed['Customer_Lifetime'].median()
    df_processed['Customer_Lifetime'].fillna(median_lifetime, inplace=True)
    df_processed.drop('Dt_Customer', axis=1, inplace=True) # Now drop Dt_Customer

    # 3. Simplify Marital Status (Keeping original for now - let's test)
    # df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({ ... }) # Keep original

    # 4. Simplify Education (Keeping original for now - let's test)
    # df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'}) # Keep original

    # 5. Combine Children/Teens
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    # Impute NaNs in spending columns *before* summing (using 0 or median)
    for col in mnt_cols:
        df_processed[col] = df_processed[col].fillna(0) # Simple imputation with 0 for spending
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    # --- NEW Features ---
    # Ratio Features (handle division by zero)
    df_processed['Wine_Ratio'] = (df_processed['MntWines'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Meat_Ratio'] = (df_processed['MntMeatProducts'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Fruit_Ratio'] = (df_processed['MntFruits'] / df_processed['Total_Mnt']).fillna(0)
    # Replace inf values that might result from 0/0
    df_processed.replace([np.inf, -np.inf], 0, inplace=True)


    # Income related (handle division by zero and potential NaNs in Income)
    # Impute Income NaNs *before* using it in calculations
    income_median = df_processed['Income'].median()
    df_processed['Income'].fillna(income_median, inplace=True)
    # Add 1 to avoid division by zero if Children=0 and partner=1 (or single=1)
    num_people = df_processed['Children'] + df_processed['Marital_Status'].apply(lambda x: 1 if x=='Single' else 2) # Simple adult estimate
    df_processed['Income_per_Person'] = (df_processed['Income'] / num_people.replace(0, 1)).fillna(0) # Replace 0 people with 1


    # Spending per Purchase (handle division by zero)
    df_processed['Spending_per_Purchase'] = (df_processed['Total_Mnt'] / df_processed['Total_Purchases'].replace(0, 1)).fillna(0)


    return df_processed

# --- Apply NEW Preprocessing ---
train_df_processed_v2 = preprocess_data_v2(train_df.copy(), is_train=True)
if 'global_latest_date_v2' not in globals(): # Check the new global var
     global_latest_date_v2 = datetime.datetime.now() + datetime.timedelta(days=1)
     print(f"Error: global_latest_date_v2 not set. Using fallback: {global_latest_date_v2}")
test_df_processed_v2 = preprocess_data_v2(test_df.copy(), is_train=False, latest_date=global_latest_date_v2)
print("V2 Preprocessing complete.")

# --- Prepare Data (using v2 processed data) ---
X_v2 = train_df_processed_v2.drop(['ID', 'Target'], axis=1)
y_v2 = train_df_processed_v2['Target'] # Target remains the same
X_test_v2 = test_df_processed_v2.drop('ID', axis=1)

# Align columns after V2 preprocessing
train_cols_v2 = X_v2.columns
test_cols_v2 = X_test_v2.columns

missing_in_test_v2 = set(train_cols_v2) - set(test_cols_v2)
for c in missing_in_test_v2:
    X_test_v2[c] = 0
missing_in_train_v2 = set(test_cols_v2) - set(train_cols_v2)
for c in missing_in_train_v2:
    X_v2[c] = 0

X_test_v2 = X_test_v2[train_cols_v2] # Ensure order is the same

# --- Define Preprocessing Steps (Potentially updated if features changed type) ---
numerical_features_v2 = X_v2.select_dtypes(include=np.number).columns.tolist()
categorical_features_v2 = X_v2.select_dtypes(exclude=np.number).columns.tolist()

print(f"\nV2 Numerical features ({len(numerical_features_v2)}): {numerical_features_v2}")
print(f"V2 Categorical features ({len(categorical_features_v2)}): {categorical_features_v2}")


# Log transformer function
log_transformer = FunctionTransformer(np.log1p, validate=False) # validate=False to handle 0s after log1p

# Update Numerical Pipeline to include Log Transform for specific skewed features
# Identify potentially skewed features (Income, Spending)
skewed_num_features = ['Income', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
                       'MntSweetProducts', 'MntGoldProds', 'Total_Mnt', 'Income_per_Person', 'Spending_per_Purchase']
# Make sure these features actually exist after preprocessing
skewed_num_features = [f for f in skewed_num_features if f in numerical_features_v2]
other_num_features = [f for f in numerical_features_v2 if f not in skewed_num_features]


numerical_pipeline_v2 = Pipeline([
    # Impute FIRST
    ('imputer_num', SimpleImputer(strategy='median')),
    # Apply log transform only to skewed columns (using ColumnTransformer within Pipeline - tricky!)
    # Easier approach: Apply log transform in preprocess_data_v2 or handle separately if needed.
    # For simplicity here, let's apply StandardScaler to all imputed numericals.
    # Consider log transform within preprocess_data_v2 if it proves beneficial.
    ('scaler', StandardScaler())
])

categorical_pipeline_v2 = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Update Preprocessor
preprocessor_v2 = ColumnTransformer([
    ('num', numerical_pipeline_v2, numerical_features_v2), # Apply updated pipeline to all numerical
    ('cat', categorical_pipeline_v2, categorical_features_v2)
], remainder='passthrough')


# --- Define Base Model: Gradient Boosting (Retuning this one) ---
base_model_gbc = GradientBoostingClassifier(random_state=42)

# --- Create Full Pipeline with GBC V2 ---
pipeline_gbc_v2 = Pipeline([
    ('preprocessor', preprocessor_v2),
    ('classifier', base_model_gbc)
])

# --- Define Parameter Grid for GBC RandomizedSearchCV (Centered around previous best) ---
param_dist_gbc_v2 = {
    'classifier__n_estimators': [150, 200, 250, 300], # Explore higher values slightly
    'classifier__learning_rate': [0.02, 0.05, 0.08, 0.1], # Narrower range around 0.08
    'classifier__max_depth': [2, 3], # Keep focusing on shallow trees
    'classifier__min_samples_leaf': [15, 20, 25], # Stay around the previous best
    'classifier__min_samples_split': [15, 20, 30], # Stay around the previous best
    'classifier__subsample': [0.5, 0.6, 0.7], # Explore around 0.6
    'classifier__max_features': ['sqrt', 'log2'] # Keep simpler options
}

# --- Set up K-Fold Strategy (same as before) ---
N_SPLITS = 5
RANDOM_STATE_KFOLD = 42
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE_KFOLD)

# --- Set up RandomizedSearchCV for GBC V2 ---
N_ITER_GBC_V2 = 50 # Number of iterations for retuning
SCORING_METRIC = 'roc_auc'

random_search_gbc_v2 = RandomizedSearchCV(
    estimator=pipeline_gbc_v2,
    param_distributions=param_dist_gbc_v2,
    n_iter=N_ITER_GBC_V2,
    scoring=SCORING_METRIC,
    cv=skf,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

print(f"\nStarting RandomizedSearchCV for GBC (V2 Features) with {N_ITER_GBC_V2} iterations for {SCORING_METRIC}...")
random_search_gbc_v2.fit(X_v2, y_v2) # Use V2 features and original target
print("GBC V2 RandomizedSearchCV finished.")

# --- Report Best GBC V2 Results ---
print("\n--- GBC V2 Hyperparameter Tuning Results ---")
print(f"Best Score ({SCORING_METRIC}): {random_search_gbc_v2.best_score_:.4f}")
print("Best Parameters:")
best_params_gbc_v2 = random_search_gbc_v2.best_params_
for param, value in best_params_gbc_v2.items():
    print(f"  {param}: {value}")

# --- Train Final GBC V2 Model with Best Parameters ---
print("\nTraining final GBC V2 model on the entire training set using best parameters...")
best_pipeline_gbc_v2 = random_search_gbc_v2.best_estimator_
print("Final GBC V2 model training complete.")

# --- Predict on Test Data with Tuned GBC V2 ---
print("Predicting on test data using the tuned GBC V2 model...")
test_predictions_gbc_v2 = best_pipeline_gbc_v2.predict(X_test_v2) # Use V2 test features
print("Prediction complete.")

# --- Generate Submission File for GBC V2 ---
submission_df_gbc_v2 = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions_gbc_v2})
submission_filename_gbc_v2 = 'submission_tuned_gbc_v2_features.csv' # New filename
submission_df_gbc_v2.to_csv(submission_filename_gbc_v2, index=False)

print(f"\nSubmission file '{submission_filename_gbc_v2}' created successfully.")
print(submission_df_gbc_v2.head())
print(f"\nPredicted target distribution (GBC V2 Features):\n{submission_df_gbc_v2['Target'].value_counts(normalize=True)}")

# Optional: Evaluate the *tuned* GBC V2 model on the training set
train_preds_gbc_v2_tuned = best_pipeline_gbc_v2.predict(X_v2)
train_accuracy_gbc_v2_tuned = accuracy_score(y_v2, train_preds_gbc_v2_tuned)
train_roc_auc_gbc_v2_tuned = roc_auc_score(y_v2, best_pipeline_gbc_v2.predict_proba(X_v2)[:, 1])
print(f"\n--- Tuned GBC V2 Model Training Set Evaluation ---")
print(f"Accuracy: {train_accuracy_gbc_v2_tuned:.4f}")
print(f"ROC AUC: {train_roc_auc_gbc_v2_tuned:.4f}")

Data loaded successfully.
V2 Preprocessing complete.

V2 Numerical features (35): ['Income', 'Kidhome', 'Teenhome', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1', 'AcceptedCmp2', 'Complain', 'Age', 'Enroll_Month', 'Enroll_Year', 'Enroll_DayOfWeek', 'Customer_Lifetime', 'Children', 'Total_Mnt', 'Total_Purchases', 'Total_CmpAccepted', 'Wine_Ratio', 'Meat_Ratio', 'Fruit_Ratio', 'Income_per_Person', 'Spending_per_Purchase']
V2 Categorical features (2): ['Education', 'Marital_Status']

Starting RandomizedSearchCV for GBC (V2 Features) with 50 iterations for roc_auc...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
GBC V2 RandomizedSearchCV finished.

--- GBC V2 Hyperparameter Tuning Results ---
Best Score (roc_auc): 0.9214
Best Parameters:
  classi

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold # Keep for reference if needed later
# Removed RandomizedSearchCV as we are using pre-found params
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Suppress specific warnings if needed
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Load Data ---
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    exit()
print("Data loaded successfully.")

# --- Feature Engineering & Preprocessing Functions (V1 and V2) ---

# Function V1 (leading to 0.845 score)
def preprocess_data_v1(df, is_train=True, latest_date=None):
    df_processed = df.copy()
    current_year = datetime.datetime.now().year
    try:
        df_processed['Dt_Customer_temp'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
        max_enroll_year = df_processed['Dt_Customer_temp'].dt.year.max()
        reference_year = max_enroll_year if pd.notna(max_enroll_year) else current_year
        df_processed.drop('Dt_Customer_temp', axis=1, inplace=True)
    except Exception: reference_year = current_year
    df_processed.loc[df_processed['Year_Birth'] < 1910, 'Year_Birth'] = np.nan
    df_processed['Age'] = reference_year - df_processed['Year_Birth']
    df_processed.drop('Year_Birth', axis=1, inplace=True)
    df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
    if is_train:
        global global_latest_date_v1
        valid_dates = df_processed['Dt_Customer'].dropna()
        global_latest_date_v1 = valid_dates.max() + datetime.timedelta(days=1) if not valid_dates.empty else datetime.datetime(reference_year + 1, 1, 1)
        latest_date_to_use = global_latest_date_v1
    elif latest_date: latest_date_to_use = latest_date
    else: latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
    mask = pd.notna(df_processed['Dt_Customer'])
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - df_processed.loc[mask, 'Dt_Customer']).dt.days
    median_lifetime = df_processed['Customer_Lifetime'].median()
    df_processed['Customer_Lifetime'].fillna(median_lifetime, inplace=True)
    df_processed.drop('Dt_Customer', axis=1, inplace=True)
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single', 'Widow': 'Single','Divorced':'Single'
    })
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    for col in mnt_cols: df_processed[col] = df_processed[col].fillna(0) # Impute before sum
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')
    # Impute Income (might be needed if not done before FE)
    income_median = df_processed['Income'].median()
    df_processed['Income'].fillna(income_median, inplace=True)
    # Impute Age (might be needed if Year_Birth had NaNs)
    age_median = df_processed['Age'].median()
    df_processed['Age'].fillna(age_median, inplace=True)
    return df_processed

# Function V2 (leading to 0.848 score) - simplified, assuming it's the same as last run
def preprocess_data_v2(df, is_train=True, latest_date=None):
    df_processed = df.copy()
    # --- Previous steps: Age, Lifetime, Date Features ---
    current_year = datetime.datetime.now().year
    try:
        df_processed['Dt_Customer_temp'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
        max_enroll_year = df_processed['Dt_Customer_temp'].dt.year.max()
        reference_year = max_enroll_year if pd.notna(max_enroll_year) else current_year
        df_processed.drop('Dt_Customer_temp', axis=1, inplace=True)
    except Exception: reference_year = current_year
    df_processed.loc[df_processed['Year_Birth'] < 1910, 'Year_Birth'] = np.nan
    df_processed['Age'] = reference_year - df_processed['Year_Birth']
    df_processed.drop('Year_Birth', axis=1, inplace=True)
    df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)
    if is_train:
        global global_latest_date_v2
        valid_dates = df_processed['Dt_Customer'].dropna()
        global_latest_date_v2 = valid_dates.max() + datetime.timedelta(days=1) if not valid_dates.empty else datetime.datetime(reference_year + 1, 1, 1)
        latest_date_to_use = global_latest_date_v2
    elif latest_date: latest_date_to_use = latest_date
    else: latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
    df_processed['Enroll_Month'] = df_processed['Dt_Customer'].dt.month.fillna(df_processed['Dt_Customer'].dt.month.mode()[0])
    df_processed['Enroll_Year'] = df_processed['Dt_Customer'].dt.year.fillna(df_processed['Dt_Customer'].dt.year.median())
    df_processed['Enroll_DayOfWeek'] = df_processed['Dt_Customer'].dt.dayofweek.fillna(df_processed['Dt_Customer'].dt.dayofweek.mode()[0])
    mask = pd.notna(df_processed['Dt_Customer'])
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - df_processed.loc[mask, 'Dt_Customer']).dt.days
    median_lifetime = df_processed['Customer_Lifetime'].median()
    df_processed['Customer_Lifetime'].fillna(median_lifetime, inplace=True)
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # --- V2 specific additions / kept originals ---
    # Marital_Status kept original
    # Education kept original
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    for col in mnt_cols: df_processed[col] = df_processed[col].fillna(0)
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')
    df_processed['Wine_Ratio'] = (df_processed['MntWines'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Meat_Ratio'] = (df_processed['MntMeatProducts'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Fruit_Ratio'] = (df_processed['MntFruits'] / df_processed['Total_Mnt']).fillna(0)
    df_processed.replace([np.inf, -np.inf], 0, inplace=True)
    income_median = df_processed['Income'].median()
    df_processed['Income'].fillna(income_median, inplace=True)
    num_people = df_processed['Children'] + df_processed['Marital_Status'].apply(lambda x: 1 if x in ['Single','Divorced','Widow','Absurd','Alone','YOLO'] else 2)
    df_processed['Income_per_Person'] = (df_processed['Income'] / num_people.replace(0, 1)).fillna(0)
    df_processed['Spending_per_Purchase'] = (df_processed['Total_Mnt'] / df_processed['Total_Purchases'].replace(0, 1)).fillna(0)
    # Impute Age (might be needed if Year_Birth had NaNs)
    age_median = df_processed['Age'].median()
    df_processed['Age'].fillna(age_median, inplace=True)
    return df_processed

# --- Apply Preprocessing V1 ---
train_df_processed_v1 = preprocess_data_v1(train_df.copy(), is_train=True)
if 'global_latest_date_v1' not in globals(): global_latest_date_v1 = datetime.datetime.now() + datetime.timedelta(days=1)
test_df_processed_v1 = preprocess_data_v1(test_df.copy(), is_train=False, latest_date=global_latest_date_v1)
print("V1 Preprocessing complete.")

# --- Apply Preprocessing V2 ---
train_df_processed_v2 = preprocess_data_v2(train_df.copy(), is_train=True)
if 'global_latest_date_v2' not in globals(): global_latest_date_v2 = datetime.datetime.now() + datetime.timedelta(days=1)
test_df_processed_v2 = preprocess_data_v2(test_df.copy(), is_train=False, latest_date=global_latest_date_v2)
print("V2 Preprocessing complete.")


# --- Prepare Data V1 ---
X_v1 = train_df_processed_v1.drop(['ID', 'Target'], axis=1)
y_v1 = train_df_processed_v1['Target']
X_test_v1 = test_df_processed_v1.drop('ID', axis=1)
# Align
train_cols_v1 = X_v1.columns
missing_in_test_v1 = set(train_cols_v1) - set(X_test_v1.columns)
for c in missing_in_test_v1: X_test_v1[c] = 0
missing_in_train_v1 = set(X_test_v1.columns) - set(train_cols_v1)
for c in missing_in_train_v1: X_v1[c] = 0
X_test_v1 = X_test_v1[train_cols_v1]

# --- Prepare Data V2 ---
X_v2 = train_df_processed_v2.drop(['ID', 'Target'], axis=1)
y_v2 = train_df_processed_v2['Target']
X_test_v2 = test_df_processed_v2.drop('ID', axis=1)
# Align
train_cols_v2 = X_v2.columns
missing_in_test_v2 = set(train_cols_v2) - set(X_test_v2.columns)
for c in missing_in_test_v2: X_test_v2[c] = 0
missing_in_train_v2 = set(X_test_v2.columns) - set(train_cols_v2)
for c in missing_in_train_v2: X_v2[c] = 0
X_test_v2 = X_test_v2[train_cols_v2]


# --- Define Preprocessing Pipelines (Need separate ones for V1 and V2 features) ---

# Pipeline V1 Definition
numerical_features_v1 = X_v1.select_dtypes(include=np.number).columns.tolist()
categorical_features_v1 = X_v1.select_dtypes(exclude=np.number).columns.tolist()
numerical_pipeline_v1 = Pipeline([('imputer_num', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
categorical_pipeline_v1 = Pipeline([('imputer_cat', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
preprocessor_v1 = ColumnTransformer([
    ('num', numerical_pipeline_v1, numerical_features_v1),
    ('cat', categorical_pipeline_v1, categorical_features_v1)], remainder='passthrough')

# Pipeline V2 Definition
numerical_features_v2 = X_v2.select_dtypes(include=np.number).columns.tolist()
categorical_features_v2 = X_v2.select_dtypes(exclude=np.number).columns.tolist()
numerical_pipeline_v2 = Pipeline([('imputer_num', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
categorical_pipeline_v2 = Pipeline([('imputer_cat', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
preprocessor_v2 = ColumnTransformer([
    ('num', numerical_pipeline_v2, numerical_features_v2),
    ('cat', categorical_pipeline_v2, categorical_features_v2)], remainder='passthrough')

# --- Define BEST Hyperparameters found previously ---

# Best parameters for GBC with V1 features (resulted in 0.845 Kaggle score)
# Note: These are the params *you reported* finding previously. Double-check if needed.
best_params_gbc_v1 = {
    'classifier__subsample': 0.6,
    'classifier__n_estimators': 200,
    'classifier__min_samples_split': 20,
    'classifier__min_samples_leaf': 20,
    'classifier__max_features': 'sqrt',
    'classifier__max_depth': 2,
    'classifier__learning_rate': 0.08
}

# Best parameters for GBC with V2 features (resulted in 0.848 Kaggle score)
best_params_gbc_v2 = {
    'classifier__subsample': 0.7,
    'classifier__n_estimators': 300,
    'classifier__min_samples_split': 20,
    'classifier__min_samples_leaf': 20,
    'classifier__max_features': 'log2',
    'classifier__max_depth': 2,
    'classifier__learning_rate': 0.05
}


# --- Build and Train Model 1 (GBC V1) ---
print("Training Model 1 (GBC V1)...")
pipeline_gbc_v1 = Pipeline([
    ('preprocessor', preprocessor_v1),
    ('classifier', GradientBoostingClassifier(random_state=42)) # Base model
])
pipeline_gbc_v1.set_params(**best_params_gbc_v1) # Apply best params
pipeline_gbc_v1.fit(X_v1, y_v1)
print("Model 1 training complete.")

# --- Build and Train Model 2 (GBC V2) ---
print("Training Model 2 (GBC V2)...")
pipeline_gbc_v2 = Pipeline([
    ('preprocessor', preprocessor_v2),
    ('classifier', GradientBoostingClassifier(random_state=42)) # Base model
])
pipeline_gbc_v2.set_params(**best_params_gbc_v2) # Apply best params
pipeline_gbc_v2.fit(X_v2, y_v2)
print("Model 2 training complete.")


# --- Predict Probabilities on Test Set ---
print("Predicting probabilities...")
# IMPORTANT: Use the correctly preprocessed test set for each model!
probs_gbc_v1 = pipeline_gbc_v1.predict_proba(X_test_v1)[:, 1]
probs_gbc_v2 = pipeline_gbc_v2.predict_proba(X_test_v2)[:, 1]

# --- Ensemble Averaging ---
print("Averaging predictions...")
# Simple average (you could also try weighted average if desired)
avg_probs = (probs_gbc_v1 + probs_gbc_v2) / 2

# Convert probabilities to 0/1 using 0.5 threshold
final_predictions = (avg_probs >= 0.5).astype(int)

# --- Generate Submission File ---
submission_df_ensemble = pd.DataFrame({'ID': test_df['ID'], 'Target': final_predictions})
submission_filename_ensemble = 'submission_ensemble_gbc_v1_v2.csv'
submission_df_ensemble.to_csv(submission_filename_ensemble, index=False)

print(f"\nSubmission file '{submission_filename_ensemble}' created successfully.")
print(submission_df_ensemble.head())
print(f"\nPredicted target distribution (Ensemble):\n{submission_df_ensemble['Target'].value_counts(normalize=True)}")

# Optional: Evaluate component models on training data (as a rough check)
train_preds_gbc_v1 = pipeline_gbc_v1.predict(X_v1)
train_roc_auc_gbc_v1 = roc_auc_score(y_v1, pipeline_gbc_v1.predict_proba(X_v1)[:, 1])
print(f"\n--- Model 1 (GBC V1) Training Set Eval ---")
print(f"ROC AUC: {train_roc_auc_gbc_v1:.4f}")

train_preds_gbc_v2 = pipeline_gbc_v2.predict(X_v2)
train_roc_auc_gbc_v2 = roc_auc_score(y_v2, pipeline_gbc_v2.predict_proba(X_v2)[:, 1])
print(f"\n--- Model 2 (GBC V2) Training Set Eval ---")
print(f"ROC AUC: {train_roc_auc_gbc_v2:.4f}")

Data loaded successfully.
V1 Preprocessing complete.
V2 Preprocessing complete.
Training Model 1 (GBC V1)...
Model 1 training complete.
Training Model 2 (GBC V2)...
Model 2 training complete.
Predicting probabilities...
Averaging predictions...

Submission file 'submission_ensemble_gbc_v1_v2.csv' created successfully.
      ID  Target
0   4390       1
1  10478       1
2   1081       1
3   4261       1
4   9916       0

Predicted target distribution (Ensemble):
Target
0    0.616642
1    0.383358
Name: proportion, dtype: float64

--- Model 1 (GBC V1) Training Set Eval ---
ROC AUC: 0.9547

--- Model 2 (GBC V2) Training Set Eval ---
ROC AUC: 0.9569
