In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Silence warning noise. NOTE: these two filters ignore *every* FutureWarning
# and UserWarning raised anywhere in the process, not only the one-hot encoder ones.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning) # Often related to sparse output default

# Load the data
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    # exit() is an interactive-shell helper injected by the site module and
    # reports success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)

print("Data loaded successfully.")
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# --- Feature Engineering & Preprocessing ---

def preprocess_data(df, is_train=True, latest_date=None):
    """Apply feature engineering and basic cleaning to a customer DataFrame.

    The input frame is copied, never modified. The returned frame gains
    ``Age``, ``Customer_Lifetime``, ``Children``, ``Total_Mnt``,
    ``Total_Purchases`` and ``Total_CmpAccepted``; loses ``Year_Birth``,
    ``Dt_Customer`` and (when present) ``Z_CostContact`` / ``Z_Revenue``; and
    has ``Marital_Status`` / ``Education`` collapsed into fewer categories.

    Args:
        df: Raw data containing the columns referenced below.
        is_train: When True, the tenure reference date is derived from this
            frame and stored in the module-level ``global_latest_date``.
        latest_date: Tenure reference date used when ``is_train`` is False
            (normally ``global_latest_date`` from the training call). When it
            is ``None`` or NaT, 1 January of the year after the latest
            enrollment year in ``df`` is used instead.

    Returns:
        A new, processed DataFrame.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    global global_latest_date  # written by the training call, reused for the test set

    df_processed = df.copy()

    # Parse the enrollment dates once; unparseable entries become NaT.
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)

    # 1. Handle Outlier/Implausible Birth Years and derive Age
    # Age is measured against the latest enrollment year (current year if no date parsed).
    max_enroll_year = enroll_dates.dt.year.max()
    if pd.notna(max_enroll_year):
        # .dt.year is float when NaT is present; datetime.datetime() needs an int.
        reference_year = int(max_enroll_year)
    else:
        reference_year = datetime.datetime.now().year

    # Very old birth years (< 1910) become NaN and are imputed later in the pipeline.
    # Series.mask upcasts to float as needed instead of writing NaN into an int column.
    birth_year = df_processed['Year_Birth'].mask(df_processed['Year_Birth'] < 1910)
    df_processed['Age'] = reference_year - birth_year
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    # 2. Process Dt_Customer: pick the reference date for tenure
    if is_train:
        latest_enrollment = enroll_dates.max()
        if pd.notna(latest_enrollment):
            global_latest_date = latest_enrollment + datetime.timedelta(days=1)
        else:
            # No valid date in the training set: fall back to start of next year.
            global_latest_date = datetime.datetime(reference_year + 1, 1, 1)
        latest_date_to_use = global_latest_date
    elif latest_date is not None and pd.notna(latest_date):
        latest_date_to_use = latest_date
    else:
        # Fallback if called on test set first or no usable date was supplied
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1) # Use start of next year as reference
        print(f"Warning: Using fallback latest date: {latest_date_to_use}")

    # Customer_Lifetime (tenure in days); rows with NaT dates get the median tenure.
    lifetime_days = (latest_date_to_use - enroll_dates).dt.days
    df_processed['Customer_Lifetime'] = lifetime_days.fillna(lifetime_days.median())
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # 3. Simplify Marital Status: Married/Together -> Partner, everything else -> Single
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner',
        'Together': 'Partner',
        'Absurd': 'Single',
        'Alone': 'Single',
        'YOLO': 'Single',
        'Widow': 'Single',
        'Divorced': 'Single',
    })

    # 4. Simplify Education: group '2n Cycle' with 'Master'
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    # 5. Combine Children/Teens (original columns are kept)
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns; errors='ignore' tolerates frames that lack them
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    return df_processed

# Preprocess the training data first: that call sets global_latest_date,
# which the test-set call below passes in as its tenure reference date.
train_df_processed = preprocess_data(train_df, is_train=True)
test_df_processed = preprocess_data(test_df, is_train=False, latest_date=global_latest_date)

print("Feature engineering complete.")
print("\nTrain Data Info after processing:")
train_df_processed.info()
print("\nTest Data Info after processing:")
test_df_processed.info()


# --- Model Training ---

# Separate features (X) and target (y)
X = train_df_processed.drop(['ID', 'Target'], axis=1)
y = train_df_processed['Target']
X_test = test_df_processed.drop('ID', axis=1)

# Align the test features to the training features. The training columns are
# the single source of truth: columns absent from the test set are added as 0,
# columns that exist only in the test set are dropped, and the order is matched.
# (Adding test-only columns to X instead would leave X with columns X_test
# lacks, so the fitted pipeline could not transform X_test.)
train_cols = X.columns
test_cols = X_test.columns

missing_in_test = set(train_cols) - set(test_cols)
missing_in_train = set(test_cols) - set(train_cols)

X_test = X_test.reindex(columns=train_cols, fill_value=0)


# Split the feature columns by dtype: numeric columns go through the numeric
# branch of the preprocessor, everything else through the categorical branch.
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

print(f"\nNumerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# Numeric branch: median-impute missing values, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored; output is a dense array.
categorical_pipeline = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its branch; columns in neither list pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)

# Gradient boosting classifier with a fixed seed for reproducibility.
# Hyperparameters are hand-picked here; tune them with a CV search later.
gbc_settings = {
    'n_estimators': 150,
    'learning_rate': 0.08,
    'max_depth': 4,
    'subsample': 0.8,
    'random_state': 42,
}
model = GradientBoostingClassifier(**gbc_settings)

# Full pipeline: preprocessing followed by the classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Fit preprocessing + classifier on the full training set.
print("\nTraining the model...")
pipeline.fit(X, y)
print("Model training complete.")

# --- Prediction ---

# The pipeline applies the fitted preprocessing to X_test before classifying.
print("Predicting on test data...")
test_predictions = pipeline.predict(X_test)
print("Prediction complete.")

# --- Submission File Generation ---

# IDs come from the raw test frame; predictions are in the same row order.
submission_filename = 'submission.csv'
submission_df = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions})
submission_df.to_csv(submission_filename, index=False)

print(f"\nSubmission file '{submission_filename}' created successfully.")
print(submission_df.head())
predicted_distribution = submission_df['Target'].value_counts(normalize=True)
print(f"\nPredicted target distribution:\n{predicted_distribution}")

# Sanity check only: scoring on the data the model was fit on is optimistic
# and is not an estimate of generalization performance.
train_preds = pipeline.predict(X)
train_accuracy = accuracy_score(y, train_preds)
train_positive_proba = pipeline.predict_proba(X)[:, 1]  # AUC needs scores, not labels
train_roc_auc = roc_auc_score(y, train_positive_proba)
print("\n--- Training Set Evaluation (Sanity Check) ---")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"ROC AUC: {train_roc_auc:.4f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Silence warning noise. NOTE: these two filters ignore *every* FutureWarning
# and UserWarning raised anywhere in the process, not only the one-hot encoder ones.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning) # Often related to sparse output default

# --- Load Data ---
# Only the training set is needed for cross-validated evaluation.
try:
    train_df = pd.read_csv("train.csv")
except FileNotFoundError:
    print("Make sure train.csv is in the same directory.")
    # exit() is an interactive-shell helper injected by the site module and
    # reports success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)

print("Training data loaded successfully.")

# --- Feature Engineering & Preprocessing Function (same as before) ---
def preprocess_data(df, is_train=True, latest_date=None):
    """Apply feature engineering and basic cleaning to a customer DataFrame.

    The input frame is copied, never modified. The returned frame gains
    ``Age``, ``Customer_Lifetime``, ``Children``, ``Total_Mnt``,
    ``Total_Purchases`` and ``Total_CmpAccepted``; loses ``Year_Birth``,
    ``Dt_Customer`` and (when present) ``Z_CostContact`` / ``Z_Revenue``; and
    has ``Marital_Status`` / ``Education`` collapsed into fewer categories.

    Args:
        df: Raw data containing the columns referenced below.
        is_train: When True, the tenure reference date is derived from this
            frame and stored in the module-level ``global_latest_date``.
        latest_date: Tenure reference date used when ``is_train`` is False.
            When it is ``None`` or NaT, 1 January of the year after the latest
            enrollment year in ``df`` is used instead.

    Returns:
        A new, processed DataFrame.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    global global_latest_date  # written by the training call, reused for the test set

    df_processed = df.copy()

    # Parse the enrollment dates once; unparseable entries become NaT.
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)

    # 1. Handle Outlier/Implausible Birth Years & Calculate Age
    max_enroll_year = enroll_dates.dt.year.max()
    if pd.notna(max_enroll_year):
        # .dt.year is float when NaT is present; datetime.datetime() needs an int.
        reference_year = int(max_enroll_year)
    else:
        reference_year = datetime.datetime.now().year

    # Birth years before 1910 become NaN (imputed later by the model pipeline).
    # Series.mask upcasts to float as needed instead of writing NaN into an int column.
    birth_year = df_processed['Year_Birth'].mask(df_processed['Year_Birth'] < 1910)
    df_processed['Age'] = reference_year - birth_year
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    # 2. Process Dt_Customer & Calculate Customer_Lifetime
    if is_train:
        latest_enrollment = enroll_dates.max()
        if pd.notna(latest_enrollment):
            global_latest_date = latest_enrollment + datetime.timedelta(days=1)
        else:
            # No valid date in the training set: fall back to start of next year.
            global_latest_date = datetime.datetime(reference_year + 1, 1, 1)
        latest_date_to_use = global_latest_date
    elif latest_date is not None and pd.notna(latest_date):
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)

    # Tenure in days; rows whose date could not be parsed get the median tenure.
    lifetime_days = (latest_date_to_use - enroll_dates).dt.days
    df_processed['Customer_Lifetime'] = lifetime_days.fillna(lifetime_days.median())
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # 3. Simplify Marital Status
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single', 'Widow': 'Single', 'Divorced': 'Single',
    })

    # 4. Simplify Education
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    # 5. Combine Children/Teens (original columns are kept)
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns; errors='ignore' tolerates frames that lack them
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    return df_processed

# --- Apply Preprocessing to Training Data ---
# A copy is handed over so the raw train_df is left untouched.
train_df_processed = preprocess_data(train_df.copy(), is_train=True)
print("Preprocessing complete.")

# --- Prepare Data for CV ---
# Target column on one side, every feature except the ID on the other.
y = train_df_processed['Target']
X = train_df_processed.drop(columns=['ID', 'Target'])

# Column typing must run on the *processed* frame so engineered features are included.
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

# Report whether the engineered tenure feature ended up in the numeric group.
if 'Customer_Lifetime' not in numerical_features:
    print("Warning: Customer_Lifetime might not be numerical after preprocessing.")
else:
    print("Customer_Lifetime treated as numerical.")


# --- Define Preprocessing Steps ---
# Numeric branch: median imputation followed by standardization.
numerical_pipeline = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding that ignores categories unseen during fit.
categorical_pipeline = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Each column group gets its own branch; unlisted columns pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)

# --- Define Model ---
# Fixed hyperparameters and seed; adjust them once the CV results are in.
gbc_settings = {
    'n_estimators': 150,
    'learning_rate': 0.08,
    'max_depth': 4,
    'subsample': 0.8,
    'random_state': 42,
}
model = GradientBoostingClassifier(**gbc_settings)

# --- Create Full Pipeline ---
# Preprocessing lives inside the pipeline, so it is re-fit on each CV training fold.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# --- Set up K-Fold Cross-Validation ---
# Local import: cross_validate scores several metrics from a single set of fits.
from sklearn.model_selection import cross_validate

N_SPLITS = 5 # Number of folds (5 or 10 are common)
RANDOM_STATE_KFOLD = 42 # For reproducible splits

# Use StratifiedKFold to maintain target class distribution in each fold
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE_KFOLD)

print(f"\nStarting {N_SPLITS}-Fold Cross-Validation...")

# --- Perform Cross-Validation and Calculate Scores ---

# Both metrics are computed from the same fitted model in each fold, instead of
# fitting every fold once per metric. n_jobs=-1 uses all processors.
cv_results = cross_validate(
    pipeline, X, y,
    cv=skf,
    scoring=['accuracy', 'roc_auc'],
    n_jobs=-1,
)
accuracy_scores = cv_results['test_accuracy']
roc_auc_scores = cv_results['test_roc_auc']

print("Cross-Validation finished.")

# --- Report Results ---
print("\n--- Cross-Validation Results ---")
print(f"Accuracy Scores per Fold: {accuracy_scores}")
print(f"Mean Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Std Dev Accuracy: {np.std(accuracy_scores):.4f}")
print("-" * 30)
print(f"ROC AUC Scores per Fold: {roc_auc_scores}")
print(f"Mean ROC AUC: {np.mean(roc_auc_scores):.4f}")
print(f"Std Dev ROC AUC: {np.std(roc_auc_scores):.4f}")
print("-" * 30)

# --- Note on Final Training ---
print("\nNOTE: The scores above are estimates of generalization performance.")
print("For the final submission, you should train the pipeline on the *entire* training set (X, y)")
print("and then predict on the preprocessed test set.")
print("Example final training step (run this *after* CV and hyperparameter tuning):")
print("# pipeline.fit(X, y)")
print("# test_predictions = pipeline.predict(X_test) # Assuming X_test is preprocessed test data")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Silence warning noise. NOTE: these two filters ignore *every* FutureWarning
# and UserWarning raised anywhere in the process.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Load Data ---
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv") # Needed for final submission ID mapping
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    # exit() is an interactive-shell helper injected by the site module and
    # reports success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)

print("Data loaded successfully.")

# --- Feature Engineering & Preprocessing Function (same as before) ---
def preprocess_data(df, is_train=True, latest_date=None):
    """Apply feature engineering and basic cleaning to a customer DataFrame.

    The input frame is copied, never modified. The returned frame gains
    ``Age``, ``Customer_Lifetime``, ``Children``, ``Total_Mnt``,
    ``Total_Purchases`` and ``Total_CmpAccepted``; loses ``Year_Birth``,
    ``Dt_Customer`` and (when present) ``Z_CostContact`` / ``Z_Revenue``; and
    has ``Marital_Status`` / ``Education`` collapsed into fewer categories.

    Args:
        df: Raw data containing the columns referenced below.
        is_train: When True, the tenure reference date is derived from this
            frame and stored in the module-level ``global_latest_date``.
        latest_date: Tenure reference date used when ``is_train`` is False
            (normally ``global_latest_date`` from the training call). When it
            is ``None`` or NaT, 1 January of the year after the latest
            enrollment year in ``df`` is used and a warning is printed.

    Returns:
        A new, processed DataFrame.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    global global_latest_date  # written by the training call, reused for the test set

    df_processed = df.copy()

    # Parse the enrollment dates once; unparseable entries become NaT.
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)

    # 1. Handle Outlier/Implausible Birth Years & Calculate Age
    max_enroll_year = enroll_dates.dt.year.max()
    if pd.notna(max_enroll_year):
        # .dt.year is float when NaT is present; datetime.datetime() needs an int.
        reference_year = int(max_enroll_year)
    else:
        reference_year = datetime.datetime.now().year

    # Birth years before 1910 become NaN (imputed later by the model pipeline).
    # Series.mask upcasts to float as needed instead of writing NaN into an int column.
    birth_year = df_processed['Year_Birth'].mask(df_processed['Year_Birth'] < 1910)
    df_processed['Age'] = reference_year - birth_year
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    # 2. Process Dt_Customer & Calculate Customer_Lifetime
    if is_train:
        latest_enrollment = enroll_dates.max()
        if pd.notna(latest_enrollment):
            # Define global latest date from train set
            global_latest_date = latest_enrollment + datetime.timedelta(days=1)
        else:
            # No valid date in the training set: fall back to start of next year.
            global_latest_date = datetime.datetime(reference_year + 1, 1, 1)
        latest_date_to_use = global_latest_date
    elif latest_date is not None and pd.notna(latest_date):
        latest_date_to_use = latest_date
    else: # Fallback if test is processed first somehow (shouldn't happen here)
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date: {latest_date_to_use}")

    # Tenure in days; rows whose date could not be parsed get the median tenure.
    lifetime_days = (latest_date_to_use - enroll_dates).dt.days
    df_processed['Customer_Lifetime'] = lifetime_days.fillna(lifetime_days.median())
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # 3. Simplify Marital Status
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single', 'Widow': 'Single', 'Divorced': 'Single',
    })

    # 4. Simplify Education
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    # 5. Combine Children/Teens (original columns are kept)
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns; errors='ignore' tolerates frames that lack them
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    return df_processed

# --- Apply Preprocessing ---
# The training call must run first: it sets global_latest_date, which the
# test-set call receives as its tenure reference date.
train_df_processed = preprocess_data(train_df.copy(), is_train=True)
test_df_processed = preprocess_data(test_df.copy(), is_train=False, latest_date=global_latest_date)
print("Preprocessing complete.")


# --- Prepare Data ---
X = train_df_processed.drop(['ID', 'Target'], axis=1)
y = train_df_processed['Target']
X_test = test_df_processed.drop('ID', axis=1)

# Align the test features to the training features. The training columns are
# the single source of truth: columns absent from the test set are added as 0,
# columns that exist only in the test set are dropped, and the order is matched.
# (Adding test-only columns to X instead would leave X with columns X_test
# lacks, so the fitted pipeline could not transform X_test.)
train_cols = X.columns
test_cols = X_test.columns

missing_in_test = set(train_cols) - set(test_cols)
missing_in_train = set(test_cols) - set(train_cols)

X_test = X_test.reindex(columns=train_cols, fill_value=0)


# Split the feature columns by dtype so each group gets its own preprocessing.
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(exclude=np.number).columns)

# --- Define Preprocessing Steps ---
# Numeric branch: median imputation followed by standardization.
numerical_pipeline = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding that ignores categories unseen during fit.
categorical_pipeline = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Each column group gets its own branch; unlisted columns pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)


# --- Define Base Model ---
# Only the seed is fixed here; the remaining hyperparameters come from the search.
base_model = GradientBoostingClassifier(random_state=42)

# --- Create Full Pipeline ---
# The step name 'classifier' is the prefix used by the search-space keys below.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', base_model),
])

# --- Define Parameter Grid for RandomizedSearchCV ---
# Keys follow the '<step>__<parameter>' convention so they reach the classifier step.
param_dist = {
    # Number of boosting stages
    'classifier__n_estimators': [50, 80, 100, 150, 200],
    # Shrinkage applied to each stage, including small values
    'classifier__learning_rate': [0.01, 0.02, 0.05, 0.08, 0.1, 0.15],
    # Shallow trees only, to limit overfitting
    'classifier__max_depth': [2, 3, 4],
    # Minimum samples required in a leaf
    'classifier__min_samples_leaf': [5, 10, 15, 20],
    # Minimum samples required to split a node
    'classifier__min_samples_split': [10, 20, 30],
    # Fraction of rows used to fit each stage
    'classifier__subsample': [0.6, 0.7, 0.8, 0.9],
    # Features considered per split (None means all)
    'classifier__max_features': ['sqrt', 'log2', 0.7, 0.8, None],
}

# --- Set up K-Fold Strategy (same as before) ---
# Stratified, shuffled folds with a fixed seed so every candidate sees the same splits.
N_SPLITS = 5
RANDOM_STATE_KFOLD = 42
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE_KFOLD)

# --- Set up RandomizedSearchCV ---
N_ITER = 50  # Number of sampled parameter settings: raise for a more thorough search, lower for speed
SCORING_METRIC = 'roc_auc'  # Metric the search optimizes

search_settings = {
    'estimator': pipeline,
    'param_distributions': param_dist,
    'n_iter': N_ITER,
    'scoring': SCORING_METRIC,
    'cv': skf,
    'n_jobs': -1,        # use all available CPU cores
    'random_state': 42,  # reproducible sampling of candidates
    'verbose': 1,        # progress output
}
random_search = RandomizedSearchCV(**search_settings)

print(f"\nStarting RandomizedSearchCV with {N_ITER} iterations for {SCORING_METRIC}...")
random_search.fit(X, y)
print("RandomizedSearchCV finished.")

# --- Report Best Results ---
print("\n--- Hyperparameter Tuning Results ---")
print(f"Best Score ({SCORING_METRIC}): {random_search.best_score_:.4f}")
print("Best Parameters:")
# One line per tuned parameter
best_params = random_search.best_params_
for param_name in best_params:
    print(f"  {param_name}: {best_params[param_name]}")

# --- Train Final Model with Best Parameters ---
print("\nTraining final model on the entire training set using best parameters...")
# With the default refit=True, random_search.best_estimator_ is already fitted
# on the full data; the original pipeline is configured and fitted again here
# to make the final training step explicit. set_params returns the pipeline
# itself, so best_pipeline and pipeline are the same object.
best_pipeline = pipeline.set_params(**best_params)
best_pipeline.fit(X, y)
print("Final model training complete.")

# --- Predict on Test Data ---
print("Predicting on test data using the tuned model...")
test_predictions = best_pipeline.predict(X_test)
print("Prediction complete.")

# --- Generate Submission File ---
# IDs come from the raw test frame; predictions are in the same row order.
submission_filename = 'submission_tuned_gbc.csv'
submission_df = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions})
submission_df.to_csv(submission_filename, index=False)

print(f"\nSubmission file '{submission_filename}' created successfully.")
print(submission_df.head())
predicted_distribution = submission_df['Target'].value_counts(normalize=True)
print(f"\nPredicted target distribution:\n{predicted_distribution}")

# Optional: score the tuned model on its own training data (optimistic by construction).
train_preds_tuned = best_pipeline.predict(X)
train_accuracy_tuned = accuracy_score(y, train_preds_tuned)
train_proba_tuned = best_pipeline.predict_proba(X)[:, 1]  # AUC needs scores, not labels
train_roc_auc_tuned = roc_auc_score(y, train_proba_tuned)
print("\n--- Tuned Model Training Set Evaluation ---")
print(f"Accuracy: {train_accuracy_tuned:.4f}")
print(f"ROC AUC: {train_roc_auc_tuned:.4f}")
print("(Compare these to the initial overfit scores and the CV scores)")

In [None]:
pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer # Ensure make_scorer is imported if needed, though cross_val_score handles it
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings
import xgboost
try:
    from xgboost import XGBClassifier
except ImportError:
    print("XGBoost not found. Please install it using: pip install xgboost")
    exit()

# Silence warning noise. NOTE: these two filters ignore *every* FutureWarning
# and UserWarning raised anywhere in the process.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Load Data ---
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    # exit() is an interactive-shell helper injected by the site module and
    # reports success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)

print("Data loaded successfully.")

# --- Feature Engineering & Preprocessing Function (Identical to previous step) ---
def preprocess_data(df, is_train=True, latest_date=None):
    """Apply feature engineering and basic cleaning to a raw customer frame.

    Args:
        df: Raw DataFrame containing ``Year_Birth``, ``Dt_Customer``,
            ``Marital_Status``, ``Education``, ``Kidhome``, ``Teenhome``, the
            six ``Mnt*`` columns, the four ``Num*Purchases`` columns and
            ``AcceptedCmp1``..``AcceptedCmp5``. The input is not modified.
        is_train: When True, the tenure reference date is derived from this
            frame (latest valid enrollment date + 1 day) and stored in the
            module-level ``global_latest_date``.
        latest_date: Tenure reference date used when ``is_train`` is False.
            When omitted, 1 January of the year after the reference year is
            used and a warning is printed.

    Returns:
        A new DataFrame without ``Year_Birth``, ``Dt_Customer``,
        ``Z_CostContact`` and ``Z_Revenue`` and with ``Age``,
        ``Customer_Lifetime``, ``Children``, ``Total_Mnt``,
        ``Total_Purchases`` and ``Total_CmpAccepted`` added.

    Note:
        The reference year for ``Age`` is taken from the frame being
        processed, so train and test frames can use different years.
    """
    global global_latest_date  # Published for the caller when is_train=True

    df_processed = df.copy()

    # Parse the enrollment dates once; unparseable values become NaT.
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)

    # 1. Handle Outlier/Implausible Birth Years & Calculate Age
    # Cast to int: the year maximum is a float when any date is NaT, and
    # datetime.datetime() below only accepts integers.
    max_enroll_year = enroll_dates.dt.year.max()
    if pd.notna(max_enroll_year):
        reference_year = int(max_enroll_year)
    else:
        reference_year = datetime.datetime.now().year

    # mask() returns a new (upcast if needed) series instead of writing NaN
    # into an integer column in place.
    year_birth = df_processed['Year_Birth'].mask(df_processed['Year_Birth'] < 1910)
    df_processed['Age'] = reference_year - year_birth
    df_processed = df_processed.drop(columns=['Year_Birth', 'Dt_Customer'])

    # 2. Pick the date that tenure is measured against
    if is_train:
        valid_dates = enroll_dates.dropna()
        if not valid_dates.empty:
            global_latest_date = valid_dates.max() + datetime.timedelta(days=1)
        else:
            # Fallback if no valid dates found in training set
            global_latest_date = datetime.datetime(reference_year + 1, 1, 1)
            print(f"Warning: No valid Dt_Customer found in training set. Using fallback latest date: {global_latest_date}")
        latest_date_to_use = global_latest_date
    elif latest_date is not None:
        latest_date_to_use = latest_date
    else:
        # Fallback if called on test set first or global_latest_date isn't set
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date for test set: {latest_date_to_use}")

    # Customer_Lifetime (tenure in days); rows without a valid date get the
    # median. Assigning the filled series back avoids chained in-place
    # fillna, which does not reliably update the frame under Copy-on-Write.
    lifetime = (latest_date_to_use - enroll_dates).dt.days.astype('float64')
    df_processed['Customer_Lifetime'] = lifetime.fillna(lifetime.median())

    # 3. Simplify Marital Status
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single', 'Widow': 'Single', 'Divorced': 'Single'
    })

    # 4. Simplify Education
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    # 5. Combine Children/Teens (the original Kidhome/Teenhome columns are kept)
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns (ignored when absent)
    df_processed = df_processed.drop(columns=['Z_CostContact', 'Z_Revenue'], errors='ignore')

    return df_processed

# --- Apply Preprocessing ---
train_df_processed = preprocess_data(train_df.copy(), is_train=True)
# preprocess_data(is_train=True) is expected to publish `global_latest_date`.
# If it did not, continue with a fallback date instead of aborting; the
# message below therefore reports a warning rather than claiming an exit.
if 'global_latest_date' not in globals():
    print("Warning: global_latest_date not set during training preprocessing.")
    global_latest_date = datetime.datetime.now() + datetime.timedelta(days=1)
    print(f"Using current date as fallback for global_latest_date: {global_latest_date}")

# The test set reuses the training reference date so tenure is measured
# against the same day in both frames.
test_df_processed = preprocess_data(test_df.copy(), is_train=False, latest_date=global_latest_date)
print("Preprocessing complete.")


# --- Prepare Data ---
X = train_df_processed.drop(['ID', 'Target'], axis=1)
y = train_df_processed['Target']
X_test = test_df_processed.drop('ID', axis=1)

# Align the test frame to the training columns.
train_cols = X.columns
test_cols = X_test.columns
missing_in_test = set(train_cols) - set(test_cols)
missing_in_train = set(test_cols) - set(train_cols)

# reindex() does the whole alignment in one step: columns absent from the test
# frame are added as 0, columns the training frame never had are dropped, and
# the training column order is applied. X itself is left untouched, so the
# model is never fitted on a column the test frame will not provide.
X_test = X_test.reindex(columns=train_cols, fill_value=0)


# Identify column types
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

print(f"\nNumerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# --- Define Preprocessing Steps (Same as before) ---
# Numeric columns: fill gaps with the median, then standardise.
numerical_pipeline = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_pipeline = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its pipeline; any other column passes through.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)


# --- Define Base Model: XGBoost ---
# eval_metric='logloss' or 'auc' are common for binary classification.
# `use_label_encoder` is deliberately not passed: the option was deprecated
# and later removed from XGBoost, so current releases only report it as an
# unused parameter on every fit.
base_model_xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# --- Create Full Pipeline with XGBoost ---
# The step name 'classifier' is what the `classifier__*` search keys refer to.
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', base_model_xgb)
])

# --- Define Parameter Grid for XGBoost RandomizedSearchCV ---
# Candidate values per XGBoost hyperparameter. These are starting points;
# adjust them to the results or to the available compute budget.
xgb_search_space = {
    'n_estimators': [100, 150, 200, 300, 400],      # boosting rounds
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],  # step size shrinkage
    'max_depth': [2, 3, 4, 5],                       # maximum tree depth
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],          # row fraction per tree
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],   # feature fraction per tree
    'gamma': [0, 0.1, 0.2, 0.5],                     # min loss reduction to split
    'reg_alpha': [0, 0.001, 0.01, 0.1],              # L1 regularization
    'reg_lambda': [0.5, 1, 1.5],                     # L2 regularization
    # 'min_child_weight': [1, 3, 5] could be added if needed
}
# Prefix every key so it targets the 'classifier' step of the pipeline.
param_dist_xgb = {
    f'classifier__{name}': candidates
    for name, candidates in xgb_search_space.items()
}

# --- Set up K-Fold Strategy (same as before) ---
N_SPLITS = 5
RANDOM_STATE_KFOLD = 42
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE_KFOLD,
)

# --- Set up RandomizedSearchCV for XGBoost ---
N_ITER_XGB = 75              # sampled parameter settings
SCORING_METRIC = 'roc_auc'   # metric the search optimises

random_search_xgb = RandomizedSearchCV(
    pipeline_xgb,
    param_dist_xgb,
    n_iter=N_ITER_XGB,
    scoring=SCORING_METRIC,
    cv=skf,
    n_jobs=-1,        # all available CPU cores
    random_state=42,  # reproducible sampling
    verbose=1,
)

print(f"\nStarting RandomizedSearchCV for XGBoost with {N_ITER_XGB} iterations for {SCORING_METRIC}...")
random_search_xgb.fit(X, y)
print("XGBoost RandomizedSearchCV finished.")

# --- Report Best XGBoost Results ---
best_params_xgb = random_search_xgb.best_params_
print("\n--- XGBoost Hyperparameter Tuning Results ---")
print(f"Best Score ({SCORING_METRIC}): {random_search_xgb.best_score_:.4f}")
print("Best Parameters:")
for param_name, param_value in best_params_xgb.items():
    print(f"  {param_name}: {param_value}")

# --- Train Final XGBoost Model with Best Parameters ---
# No explicit refit here: the search's best_estimator_ is used as the final
# model (RandomizedSearchCV refits it on the full data with refit=True,
# which is the default).
print("\nTraining final XGBoost model on the entire training set using best parameters...")
best_pipeline_xgb = random_search_xgb.best_estimator_
print("Final XGBoost model training complete.")


# --- Predict on Test Data with Tuned XGBoost ---
print("Predicting on test data using the tuned XGBoost model...")
test_predictions_xgb = best_pipeline_xgb.predict(X_test)
print("Prediction complete.")

# --- Generate Submission File for XGBoost ---
submission_filename_xgb = 'submission_tuned_xgb.csv'
submission_df_xgb = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': test_predictions_xgb,
})
submission_df_xgb.to_csv(submission_filename_xgb, index=False)

predicted_share_xgb = submission_df_xgb['Target'].value_counts(normalize=True)
print(f"\nSubmission file '{submission_filename_xgb}' created successfully.")
print(submission_df_xgb.head())
print(f"\nPredicted target distribution (XGBoost):\n{predicted_share_xgb}")

# Optional: score the tuned XGBoost model on the data it was fitted on.
train_preds_xgb_tuned = best_pipeline_xgb.predict(X)
train_proba_xgb_tuned = best_pipeline_xgb.predict_proba(X)[:, 1]
train_accuracy_xgb_tuned = accuracy_score(y, train_preds_xgb_tuned)
train_roc_auc_xgb_tuned = roc_auc_score(y, train_proba_xgb_tuned)
print("\n--- Tuned XGBoost Model Training Set Evaluation ---")
print(f"Accuracy: {train_accuracy_xgb_tuned:.4f}")
print(f"ROC AUC: {train_roc_auc_xgb_tuned:.4f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier # Keep GBC
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer # Added FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Suppress specific warnings if needed
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Load Data ---
# The three CSV files are read from the current working directory.
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    # Stop with a non-zero status. `exit()` is the interactive helper added by
    # `site` and should not be relied on inside programs.
    raise SystemExit(1)
print("Data loaded successfully.")

# --- Feature Engineering & Preprocessing Function (MODIFIED) ---
def preprocess_data_v2(df, is_train=True, latest_date=None):
    """Apply the v2 feature engineering and basic cleaning.

    On top of age, tenure and the spending/purchase/campaign totals, v2 adds
    enrollment date parts, spending ratios, income per household member and
    spending per purchase. ``Marital_Status`` and ``Education`` are kept with
    their original categories.

    Args:
        df: Raw DataFrame containing ``Year_Birth``, ``Dt_Customer``,
            ``Marital_Status``, ``Income``, ``Kidhome``, ``Teenhome``, the six
            ``Mnt*`` columns, the four ``Num*Purchases`` columns and
            ``AcceptedCmp1``..``AcceptedCmp5``. The input is not modified.
        is_train: When True, the tenure reference date is derived from this
            frame (latest valid enrollment date + 1 day) and stored in the
            module-level ``global_latest_date_v2``.
        latest_date: Tenure reference date used when ``is_train`` is False.
            When omitted, 1 January of the year after the reference year is
            used and a warning is printed.

    Returns:
        A new DataFrame without ``Year_Birth``, ``Dt_Customer``,
        ``Z_CostContact`` and ``Z_Revenue`` and with the engineered columns
        added. Missing ``Income`` and ``Mnt*`` values are imputed (median and
        0 respectively).
    """
    global global_latest_date_v2  # Published for the caller when is_train=True

    def _fill_with_mode(series):
        """Fill gaps with the first mode; an all-missing series is returned as is."""
        modes = series.mode()
        return series if modes.empty else series.fillna(modes.iloc[0])

    df_processed = df.copy()

    # Parse the enrollment dates once; unparseable values become NaT.
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)

    # 1. Handle Birth Year & Age
    # Cast to int: the year maximum is a float when any date is NaT, and
    # datetime.datetime() below only accepts integers.
    max_enroll_year = enroll_dates.dt.year.max()
    if pd.notna(max_enroll_year):
        reference_year = int(max_enroll_year)
    else:
        reference_year = datetime.datetime.now().year

    # mask() returns a new (upcast if needed) series instead of writing NaN
    # into an integer column in place.
    year_birth = df_processed['Year_Birth'].mask(df_processed['Year_Birth'] < 1910)
    df_processed['Age'] = reference_year - year_birth
    df_processed = df_processed.drop(columns='Year_Birth')

    # 2. Pick the date that tenure is measured against
    if is_train:
        valid_dates = enroll_dates.dropna()
        if not valid_dates.empty:
            global_latest_date_v2 = valid_dates.max() + datetime.timedelta(days=1)
        else:
            global_latest_date_v2 = datetime.datetime(reference_year + 1, 1, 1)
            print(f"Warning: No valid Dt_Customer. Using fallback latest date: {global_latest_date_v2}")
        latest_date_to_use = global_latest_date_v2
    elif latest_date is not None:
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date for test: {latest_date_to_use}")

    # Enrollment date parts. Gaps are filled with the mode (month, weekday) or
    # the median (year); filled series are assigned back rather than patched
    # through chained in-place fillna, which is unreliable under Copy-on-Write.
    df_processed['Enroll_Month'] = _fill_with_mode(enroll_dates.dt.month)
    enroll_year = enroll_dates.dt.year
    df_processed['Enroll_Year'] = enroll_year.fillna(enroll_year.median())
    df_processed['Enroll_DayOfWeek'] = _fill_with_mode(enroll_dates.dt.dayofweek)

    # Customer_Lifetime (tenure in days); rows without a valid date get the median.
    lifetime = (latest_date_to_use - enroll_dates).dt.days.astype('float64')
    df_processed['Customer_Lifetime'] = lifetime.fillna(lifetime.median())
    df_processed = df_processed.drop(columns='Dt_Customer')

    # 3./4. Marital_Status and Education are intentionally left un-simplified in v2.

    # 5. Combine Children/Teens
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']

    # 6. Total Spending (missing spending is treated as 0 before summing)
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed[mnt_cols] = df_processed[mnt_cols].fillna(0)
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns (ignored when absent)
    df_processed = df_processed.drop(columns=['Z_CostContact', 'Z_Revenue'], errors='ignore')

    # --- Ratio features ---
    # 0/0 yields NaN (filled with 0); x/0 yields +/-inf (replaced with 0 below).
    df_processed['Wine_Ratio'] = (df_processed['MntWines'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Meat_Ratio'] = (df_processed['MntMeatProducts'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Fruit_Ratio'] = (df_processed['MntFruits'] / df_processed['Total_Mnt']).fillna(0)
    df_processed = df_processed.replace([np.inf, -np.inf], 0)

    # --- Income per household member ---
    # Impute Income before it is used in a calculation.
    df_processed['Income'] = df_processed['Income'].fillna(df_processed['Income'].median())
    # Marital_Status still carries its raw categories here, so every status
    # that describes a person without a partner must count as one adult, not
    # only the literal 'Single'. Anything else (including missing) counts as two.
    single_statuses = {'Single', 'Absurd', 'Alone', 'YOLO', 'Widow', 'Divorced'}
    adults = df_processed['Marital_Status'].isin(single_statuses).map({True: 1, False: 2})
    num_people = df_processed['Children'] + adults
    df_processed['Income_per_Person'] = (df_processed['Income'] / num_people.replace(0, 1)).fillna(0)

    # --- Spending per purchase (0 purchases is treated as 1 to avoid division by zero) ---
    df_processed['Spending_per_Purchase'] = (
        df_processed['Total_Mnt'] / df_processed['Total_Purchases'].replace(0, 1)
    ).fillna(0)

    return df_processed

# --- Apply NEW Preprocessing ---
# Train first: that call is what publishes `global_latest_date_v2`.
train_df_processed_v2 = preprocess_data_v2(df=train_df.copy(), is_train=True)

v2_reference_date_missing = 'global_latest_date_v2' not in globals()
if v2_reference_date_missing:
    # Defensive fallback: tomorrow's date stands in for the reference date.
    global_latest_date_v2 = datetime.datetime.now() + datetime.timedelta(days=1)
    print(f"Error: global_latest_date_v2 not set. Using fallback: {global_latest_date_v2}")

# The test frame is measured against the same reference date as the train frame.
test_df_processed_v2 = preprocess_data_v2(
    df=test_df.copy(),
    is_train=False,
    latest_date=global_latest_date_v2,
)
print("V2 Preprocessing complete.")

# --- Prepare Data (using v2 processed data) ---
X_v2 = train_df_processed_v2.drop(['ID', 'Target'], axis=1)
y_v2 = train_df_processed_v2['Target'] # Target remains the same
X_test_v2 = test_df_processed_v2.drop('ID', axis=1)

# Align the test frame to the training columns.
train_cols_v2 = X_v2.columns
test_cols_v2 = X_test_v2.columns
missing_in_test_v2 = set(train_cols_v2) - set(test_cols_v2)
missing_in_train_v2 = set(test_cols_v2) - set(train_cols_v2)

# reindex() does the whole alignment in one step: columns absent from the test
# frame are added as 0, columns the training frame never had are dropped, and
# the training column order is applied. X_v2 itself is left untouched, so the
# model is never fitted on a column the test frame will not provide.
X_test_v2 = X_test_v2.reindex(columns=train_cols_v2, fill_value=0)

# --- Define Preprocessing Steps (Potentially updated if features changed type) ---
numerical_features_v2 = X_v2.select_dtypes(include=np.number).columns.tolist()
categorical_features_v2 = X_v2.select_dtypes(exclude=np.number).columns.tolist()

print(f"\nV2 Numerical features ({len(numerical_features_v2)}): {numerical_features_v2}")
print(f"V2 Categorical features ({len(categorical_features_v2)}): {categorical_features_v2}")


# log1p transformer, kept available for experiments; it is not wired into the
# pipelines defined below.
log_transformer = FunctionTransformer(np.log1p, validate=False)

# Income and spending columns that are likely to be skewed. Only the ones that
# actually exist among the v2 numerical features are kept.
skewed_candidates = [
    'Income', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds', 'Total_Mnt', 'Income_per_Person',
    'Spending_per_Purchase',
]
skewed_num_features = [name for name in skewed_candidates if name in numerical_features_v2]
other_num_features = [name for name in numerical_features_v2 if name not in skewed_num_features]


# Numeric columns: impute first, then standardise. A log transform limited to
# the skewed columns would need a nested ColumnTransformer (or to be done in
# preprocess_data_v2); for simplicity every imputed numeric column is scaled
# the same way here.
numerical_pipeline_v2 = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then dense one-hot encoding
# that ignores categories unseen during fit.
categorical_pipeline_v2 = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its pipeline; any other column passes through.
preprocessor_v2 = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline_v2, numerical_features_v2),
        ('cat', categorical_pipeline_v2, categorical_features_v2),
    ],
    remainder='passthrough',
)


# --- Define Base Model: Gradient Boosting (Retuning this one) ---
base_model_gbc = GradientBoostingClassifier(random_state=42)

# --- Create Full Pipeline with GBC V2 ---
# The step name 'classifier' is what the `classifier__*` search keys refer to.
pipeline_gbc_v2 = Pipeline(steps=[
    ('preprocessor', preprocessor_v2),
    ('classifier', base_model_gbc),
])

# --- Define Parameter Grid for GBC RandomizedSearchCV (Centered around previous best) ---
gbc_v2_search_space = {
    'n_estimators': [150, 200, 250, 300],      # slightly higher values explored
    'learning_rate': [0.02, 0.05, 0.08, 0.1],  # narrow range around 0.08
    'max_depth': [2, 3],                       # shallow trees only
    'min_samples_leaf': [15, 20, 25],          # near the previous best
    'min_samples_split': [15, 20, 30],         # near the previous best
    'subsample': [0.5, 0.6, 0.7],              # around 0.6
    'max_features': ['sqrt', 'log2'],          # simpler options
}
# Prefix every key so it targets the 'classifier' step of the pipeline.
param_dist_gbc_v2 = {
    f'classifier__{name}': candidates
    for name, candidates in gbc_v2_search_space.items()
}

# --- Set up K-Fold Strategy (same as before) ---
N_SPLITS = 5
RANDOM_STATE_KFOLD = 42
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE_KFOLD,
)

# --- Set up RandomizedSearchCV for GBC V2 ---
N_ITER_GBC_V2 = 50           # sampled parameter settings for the retune
SCORING_METRIC = 'roc_auc'   # metric the search optimises

random_search_gbc_v2 = RandomizedSearchCV(
    pipeline_gbc_v2,
    param_dist_gbc_v2,
    n_iter=N_ITER_GBC_V2,
    scoring=SCORING_METRIC,
    cv=skf,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

print(f"\nStarting RandomizedSearchCV for GBC (V2 Features) with {N_ITER_GBC_V2} iterations for {SCORING_METRIC}...")
random_search_gbc_v2.fit(X_v2, y_v2)  # V2 features, original target
print("GBC V2 RandomizedSearchCV finished.")

# --- Report Best GBC V2 Results ---
best_params_gbc_v2 = random_search_gbc_v2.best_params_
print("\n--- GBC V2 Hyperparameter Tuning Results ---")
print(f"Best Score ({SCORING_METRIC}): {random_search_gbc_v2.best_score_:.4f}")
print("Best Parameters:")
for param_name, param_value in best_params_gbc_v2.items():
    print(f"  {param_name}: {param_value}")

# --- Train Final GBC V2 Model with Best Parameters ---
# No explicit refit here: the search's best_estimator_ is used as the final model.
print("\nTraining final GBC V2 model on the entire training set using best parameters...")
best_pipeline_gbc_v2 = random_search_gbc_v2.best_estimator_
print("Final GBC V2 model training complete.")

# --- Predict on Test Data with Tuned GBC V2 ---
print("Predicting on test data using the tuned GBC V2 model...")
test_predictions_gbc_v2 = best_pipeline_gbc_v2.predict(X_test_v2)  # V2 test features
print("Prediction complete.")

# --- Generate Submission File for GBC V2 ---
submission_filename_gbc_v2 = 'submission_tuned_gbc_v2_features.csv'
submission_df_gbc_v2 = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': test_predictions_gbc_v2,
})
submission_df_gbc_v2.to_csv(submission_filename_gbc_v2, index=False)

predicted_share_gbc_v2 = submission_df_gbc_v2['Target'].value_counts(normalize=True)
print(f"\nSubmission file '{submission_filename_gbc_v2}' created successfully.")
print(submission_df_gbc_v2.head())
print(f"\nPredicted target distribution (GBC V2 Features):\n{predicted_share_gbc_v2}")

# Optional: score the tuned GBC V2 model on the data it was fitted on.
train_preds_gbc_v2_tuned = best_pipeline_gbc_v2.predict(X_v2)
train_proba_gbc_v2_tuned = best_pipeline_gbc_v2.predict_proba(X_v2)[:, 1]
train_accuracy_gbc_v2_tuned = accuracy_score(y_v2, train_preds_gbc_v2_tuned)
train_roc_auc_gbc_v2_tuned = roc_auc_score(y_v2, train_proba_gbc_v2_tuned)
print("\n--- Tuned GBC V2 Model Training Set Evaluation ---")
print(f"Accuracy: {train_accuracy_gbc_v2_tuned:.4f}")
print(f"ROC AUC: {train_roc_auc_gbc_v2_tuned:.4f}")

In [None]:
# Requires the fitted V2 GBC pipeline (`best_pipeline_gbc_v2`).
# Rebuild the model's input column names: the numerical features come first,
# followed by the one-hot encoded categorical columns.
fitted_preprocessor_v2 = best_pipeline_gbc_v2.named_steps['preprocessor']
fitted_onehot_v2 = fitted_preprocessor_v2.named_transformers_['cat'].named_steps['onehot']
ohe_feature_names = fitted_onehot_v2.get_feature_names_out(categorical_features_v2)
all_feature_names = numerical_features_v2 + list(ohe_feature_names)

# Pair every name with the classifier's importance and rank them.
importances = best_pipeline_gbc_v2.named_steps['classifier'].feature_importances_
feature_importance_df = pd.DataFrame(
    {'feature': all_feature_names, 'importance': importances}
).sort_values(by='importance', ascending=False)
print("\nTop 20 Feature Importances (GBC V2):")
print(feature_importance_df.head(20))

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold # Keep for reference if needed later
# Removed RandomizedSearchCV as we are using pre-found params
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Suppress specific warnings if needed
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Load Data ---
# The three CSV files are read from the current working directory.
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    # Stop with a non-zero status. `exit()` is the interactive helper added by
    # `site` and should not be relied on inside programs.
    raise SystemExit(1)
print("Data loaded successfully.")

# --- Feature Engineering & Preprocessing Functions (V1 and V2) ---

# Function V1 (leading to 0.845 score)
def preprocess_data_v1(df, is_train=True, latest_date=None):
    """Apply the v1 feature engineering and cleaning, including imputation.

    Args:
        df: Raw DataFrame containing ``Year_Birth``, ``Dt_Customer``,
            ``Marital_Status``, ``Education``, ``Income``, ``Kidhome``,
            ``Teenhome``, the six ``Mnt*`` columns, the four
            ``Num*Purchases`` columns and ``AcceptedCmp1``..``AcceptedCmp5``.
            The input is not modified.
        is_train: When True, the tenure reference date is derived from this
            frame (latest valid enrollment date + 1 day, or 1 January of the
            year after the reference year when no date is valid) and stored
            in the module-level ``global_latest_date_v1``.
        latest_date: Tenure reference date used when ``is_train`` is False.
            When omitted, 1 January of the year after the reference year is
            used.

    Returns:
        A new DataFrame without ``Year_Birth``, ``Dt_Customer``,
        ``Z_CostContact`` and ``Z_Revenue`` and with ``Age``,
        ``Customer_Lifetime``, ``Children``, ``Total_Mnt``,
        ``Total_Purchases`` and ``Total_CmpAccepted`` added. ``Income``,
        ``Age`` and ``Customer_Lifetime`` are median-imputed and missing
        ``Mnt*`` values are set to 0.
    """
    global global_latest_date_v1  # Published for the caller when is_train=True

    df_processed = df.copy()

    # Parse the enrollment dates once; unparseable values become NaT.
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)

    # Reference year for Age. Cast to int: the year maximum is a float when any
    # date is NaT, and datetime.datetime() below only accepts integers.
    max_enroll_year = enroll_dates.dt.year.max()
    if pd.notna(max_enroll_year):
        reference_year = int(max_enroll_year)
    else:
        reference_year = datetime.datetime.now().year

    # Implausible birth years (< 1910) become missing and are imputed with the
    # median age. mask()/fillna() results are assigned back instead of being
    # patched in place, which is unreliable under pandas Copy-on-Write.
    year_birth = df_processed['Year_Birth'].mask(df_processed['Year_Birth'] < 1910)
    age = reference_year - year_birth
    df_processed['Age'] = age.fillna(age.median())
    df_processed = df_processed.drop(columns=['Year_Birth', 'Dt_Customer'])

    # Date that tenure is measured against.
    fallback_date = datetime.datetime(reference_year + 1, 1, 1)
    if is_train:
        valid_dates = enroll_dates.dropna()
        if not valid_dates.empty:
            global_latest_date_v1 = valid_dates.max() + datetime.timedelta(days=1)
        else:
            global_latest_date_v1 = fallback_date
        latest_date_to_use = global_latest_date_v1
    elif latest_date is not None:
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = fallback_date

    # Customer_Lifetime (tenure in days); rows without a valid date get the median.
    lifetime = (latest_date_to_use - enroll_dates).dt.days.astype('float64')
    df_processed['Customer_Lifetime'] = lifetime.fillna(lifetime.median())

    # Collapse marital status into Partner / Single and fold '2n Cycle' into Master.
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single', 'Widow': 'Single', 'Divorced': 'Single'
    })
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']

    # Missing spending is treated as 0 before summing.
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed[mnt_cols] = df_processed[mnt_cols].fillna(0)
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # Constant columns carry no signal (ignored when absent).
    df_processed = df_processed.drop(columns=['Z_CostContact', 'Z_Revenue'], errors='ignore')

    # Median-impute Income.
    df_processed['Income'] = df_processed['Income'].fillna(df_processed['Income'].median())

    return df_processed

# Function V2 (leading to 0.848 score) - simplified, assuming it's the same as last run
def preprocess_data_v2(df, is_train=True, latest_date=None):
    """Apply the V2 feature engineering and basic cleaning to a customer frame.

    Args:
        df: Raw frame with at least ``Year_Birth``, ``Dt_Customer``,
            ``Marital_Status``, ``Income``, ``Kidhome``, ``Teenhome`` and the
            ``Mnt*``, ``Num*Purchases`` and ``AcceptedCmp1..5`` columns.
            The frame is copied, never modified.
        is_train: When true, the lifetime reference date is derived from this
            frame (latest valid enrollment date plus one day) and published in
            the module-level ``global_latest_date_v2``.
        latest_date: Reference date used when ``is_train`` is false. When it
            is ``None``, 1 January of the year after the latest enrollment
            year found in ``df`` is used.

    Returns:
        A new DataFrame in which ``Year_Birth`` and ``Dt_Customer`` are
        replaced by ``Age``, ``Enroll_Month``, ``Enroll_Year``,
        ``Enroll_DayOfWeek`` and ``Customer_Lifetime``, followed by the
        aggregate and ratio features. All medians/modes used for filling are
        computed on ``df`` itself.
    """
    global global_latest_date_v2
    df_processed = df.copy()

    # --- Previous steps: Age, Lifetime, Date Features ---
    # Parse the enrollment dates once; unparseable entries become NaT.
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # .dt.year is float as soon as one NaT is present; datetime() needs an int.
    max_enroll_year = enroll_dates.dt.year.max()
    if pd.notna(max_enroll_year):
        reference_year = int(max_enroll_year)
    else:
        reference_year = datetime.datetime.now().year

    # Implausible birth years (< 1910) are treated as missing.
    birth_year = df_processed['Year_Birth']
    df_processed['Age'] = reference_year - birth_year.mask(birth_year < 1910)
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    if is_train:
        valid_dates = enroll_dates.dropna()
        if valid_dates.empty:
            global_latest_date_v2 = datetime.datetime(reference_year + 1, 1, 1)
        else:
            global_latest_date_v2 = valid_dates.max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date_v2
    elif latest_date is not None:
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)

    def _fill_with_mode(values):
        # mode() is empty when every date is NaT; keep the NaNs in that case.
        modes = values.mode()
        return values if modes.empty else values.fillna(modes.iloc[0])

    enroll_year = enroll_dates.dt.year
    df_processed['Enroll_Month'] = _fill_with_mode(enroll_dates.dt.month)
    df_processed['Enroll_Year'] = enroll_year.fillna(enroll_year.median())
    df_processed['Enroll_DayOfWeek'] = _fill_with_mode(enroll_dates.dt.dayofweek)

    # Days between enrollment and the reference date; NaT rows get the median.
    lifetime_days = (latest_date_to_use - enroll_dates).dt.days.astype(float)
    df_processed['Customer_Lifetime'] = lifetime_days.fillna(lifetime_days.median())

    # --- V2 specific additions / kept originals ---
    # Marital_Status kept original
    # Education kept original
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    for col in mnt_cols:
        df_processed[col] = df_processed[col].fillna(0)
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    # Spending shares; 0/0 (no spending at all) is reported as 0.
    df_processed['Wine_Ratio'] = (df_processed['MntWines'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Meat_Ratio'] = (df_processed['MntMeatProducts'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Fruit_Ratio'] = (df_processed['MntFruits'] / df_processed['Total_Mnt']).fillna(0)
    df_processed.replace([np.inf, -np.inf], 0, inplace=True)

    # Assign the filled column back instead of a chained inplace fillna, which
    # does not update the frame when pandas Copy-on-Write is active.
    df_processed['Income'] = df_processed['Income'].fillna(df_processed['Income'].median())

    # One adult for the single-like statuses, two for everything else.
    single_statuses = ['Single', 'Divorced', 'Widow', 'Absurd', 'Alone', 'YOLO']
    num_adults = df_processed['Marital_Status'].isin(single_statuses).map({True: 1, False: 2})
    num_people = df_processed['Children'] + num_adults
    df_processed['Income_per_Person'] = (df_processed['Income'] / num_people.replace(0, 1)).fillna(0)
    df_processed['Spending_per_Purchase'] = (df_processed['Total_Mnt'] / df_processed['Total_Purchases'].replace(0, 1)).fillna(0)

    # Impute Age (needed when Year_Birth had missing or implausible values)
    df_processed['Age'] = df_processed['Age'].fillna(df_processed['Age'].median())
    return df_processed

# --- Apply Preprocessing V1 ---
# The training frame is processed first; the test frame then reuses the
# reference date stored in global_latest_date_v1.
# NOTE(review): preprocess_data_v1 is presumably what sets that global - its
# definition is not fully visible from here.
train_df_processed_v1 = preprocess_data_v1(train_df.copy(), is_train=True)
if 'global_latest_date_v1' not in globals():
    # Fallback when the training pass did not publish a date: tomorrow.
    global_latest_date_v1 = datetime.datetime.now() + datetime.timedelta(days=1)
test_df_processed_v1 = preprocess_data_v1(
    test_df.copy(),
    is_train=False,
    latest_date=global_latest_date_v1,
)
print("V1 Preprocessing complete.")

# --- Apply Preprocessing V2 ---
# Same two-pass scheme with the V2 feature set and its own reference date.
train_df_processed_v2 = preprocess_data_v2(train_df.copy(), is_train=True)
if 'global_latest_date_v2' not in globals():
    global_latest_date_v2 = datetime.datetime.now() + datetime.timedelta(days=1)
test_df_processed_v2 = preprocess_data_v2(
    test_df.copy(),
    is_train=False,
    latest_date=global_latest_date_v2,
)
print("V2 Preprocessing complete.")


# --- Prepare Data V1 ---
# Features are everything except the row identifier and the label.
X_v1 = train_df_processed_v1.drop(['ID', 'Target'], axis=1)
y_v1 = train_df_processed_v1['Target']
X_test_v1 = test_df_processed_v1.drop('ID', axis=1)
# Align
# The training columns are the single source of truth. Test columns missing
# from train are dropped and train columns missing from test are added as 0.
# The training frame itself is never modified: adding test-only columns to it
# after train_cols_v1 was captured would leave train and test with different
# columns and make prediction fail.
train_cols_v1 = X_v1.columns
missing_in_test_v1 = set(train_cols_v1) - set(X_test_v1.columns)
missing_in_train_v1 = set(X_test_v1.columns) - set(train_cols_v1)
X_test_v1 = X_test_v1.reindex(columns=train_cols_v1, fill_value=0)

# --- Prepare Data V2 ---
X_v2 = train_df_processed_v2.drop(['ID', 'Target'], axis=1)
y_v2 = train_df_processed_v2['Target']
X_test_v2 = test_df_processed_v2.drop('ID', axis=1)
# Align (same rule as for V1)
train_cols_v2 = X_v2.columns
missing_in_test_v2 = set(train_cols_v2) - set(X_test_v2.columns)
missing_in_train_v2 = set(X_test_v2.columns) - set(train_cols_v2)
X_test_v2 = X_test_v2.reindex(columns=train_cols_v2, fill_value=0)


# --- Define Preprocessing Pipelines (Need separate ones for V1 and V2 features) ---

# Pipeline V1 Definition
# Columns are split by dtype: numeric columns are median-imputed and
# standardised, all others are mode-imputed and one-hot encoded.
numerical_features_v1 = list(X_v1.select_dtypes(include=np.number).columns)
categorical_features_v1 = list(X_v1.select_dtypes(exclude=np.number).columns)
numerical_pipeline_v1 = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_pipeline_v1 = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor_v1 = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline_v1, numerical_features_v1),
        ('cat', categorical_pipeline_v1, categorical_features_v1),
    ],
    remainder='passthrough',
)

# Pipeline V2 Definition
# Same recipe, built from the V2 feature frame.
numerical_features_v2 = list(X_v2.select_dtypes(include=np.number).columns)
categorical_features_v2 = list(X_v2.select_dtypes(exclude=np.number).columns)
numerical_pipeline_v2 = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_pipeline_v2 = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor_v2 = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline_v2, numerical_features_v2),
        ('cat', categorical_pipeline_v2, categorical_features_v2),
    ],
    remainder='passthrough',
)

# --- Define BEST Hyperparameters found previously ---
# Keys carry the 'classifier__' prefix so they can be passed straight to
# Pipeline.set_params for the step named 'classifier'.

# Best parameters for GBC with V1 features (resulted in 0.845 Kaggle score)
# Note: These are the params *you reported* finding previously. Double-check if needed.
best_params_gbc_v1 = dict(
    classifier__subsample=0.6,
    classifier__n_estimators=200,
    classifier__min_samples_split=20,
    classifier__min_samples_leaf=20,
    classifier__max_features='sqrt',
    classifier__max_depth=2,
    classifier__learning_rate=0.08,
)

# Best parameters for GBC with V2 features (resulted in 0.848 Kaggle score)
best_params_gbc_v2 = dict(
    classifier__subsample=0.7,
    classifier__n_estimators=300,
    classifier__min_samples_split=20,
    classifier__min_samples_leaf=20,
    classifier__max_features='log2',
    classifier__max_depth=2,
    classifier__learning_rate=0.05,
)


# --- Build and Train Model 1 (GBC V1) ---
print("Training Model 1 (GBC V1)...")
# Start from a seeded base classifier, then overlay the tuned V1 parameters.
pipeline_gbc_v1 = Pipeline(steps=[
    ('preprocessor', preprocessor_v1),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])
pipeline_gbc_v1.set_params(**best_params_gbc_v1)
pipeline_gbc_v1.fit(X_v1, y_v1)
print("Model 1 training complete.")

# --- Build and Train Model 2 (GBC V2) ---
print("Training Model 2 (GBC V2)...")
# Same construction with the V2 preprocessor, parameters and features.
pipeline_gbc_v2 = Pipeline(steps=[
    ('preprocessor', preprocessor_v2),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])
pipeline_gbc_v2.set_params(**best_params_gbc_v2)
pipeline_gbc_v2.fit(X_v2, y_v2)
print("Model 2 training complete.")


# --- Predict Probabilities on Test Set ---
print("Predicting probabilities...")
# Each model must see the test frame built with its own feature version.
# Column 1 of predict_proba is the probability of the positive class.
probs_gbc_v1 = pipeline_gbc_v1.predict_proba(X_test_v1)[:, 1]
probs_gbc_v2 = pipeline_gbc_v2.predict_proba(X_test_v2)[:, 1]

# --- Ensemble Averaging ---
print("Averaging predictions...")
# Unweighted mean of the two probability vectors.
avg_probs = (probs_gbc_v1 + probs_gbc_v2) / 2

# Hard labels: 1 when the averaged probability reaches 0.5, else 0.
final_predictions = (avg_probs >= 0.5).astype(int)

# --- Generate Submission File ---
submission_filename_ensemble = 'submission_ensemble_gbc_v1_v2.csv'
submission_df_ensemble = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': final_predictions,
})
submission_df_ensemble.to_csv(submission_filename_ensemble, index=False)

print(f"\nSubmission file '{submission_filename_ensemble}' created successfully.")
print(submission_df_ensemble.head())
print(f"\nPredicted target distribution (Ensemble):\n{submission_df_ensemble['Target'].value_counts(normalize=True)}")

# Optional: Evaluate component models on training data (as a rough check)
# These scores are computed on the data the models were fitted on.
train_preds_gbc_v1 = pipeline_gbc_v1.predict(X_v1)
train_roc_auc_gbc_v1 = roc_auc_score(
    y_v1,
    pipeline_gbc_v1.predict_proba(X_v1)[:, 1],
)
print("\n--- Model 1 (GBC V1) Training Set Eval ---")
print(f"ROC AUC: {train_roc_auc_gbc_v1:.4f}")

train_preds_gbc_v2 = pipeline_gbc_v2.predict(X_v2)
train_roc_auc_gbc_v2 = roc_auc_score(
    y_v2,
    pipeline_gbc_v2.predict_proba(X_v2)[:, 1],
)
print("\n--- Model 2 (GBC V2) Training Set Eval ---")
print(f"ROC AUC: {train_roc_auc_gbc_v2:.4f}")

Data loaded successfully.
V1 Preprocessing complete.
V2 Preprocessing complete.
Training Model 1 (GBC V1)...
Model 1 training complete.
Training Model 2 (GBC V2)...
Model 2 training complete.
Predicting probabilities...
Averaging predictions...

Submission file 'submission_ensemble_gbc_v1_v2.csv' created successfully.
      ID  Target
0   4390       1
1  10478       1
2   1081       1
3   4261       1
4   9916       0

Predicted target distribution (Ensemble):
Target
0    0.616642
1    0.383358
Name: proportion, dtype: float64

--- Model 1 (GBC V1) Training Set Eval ---
ROC AUC: 0.9547

--- Model 2 (GBC V2) Training Set Eval ---
ROC AUC: 0.9569


In [None]:
pip install lightgbm

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings
# Every LightGBM import sits inside the guard: an unguarded import ahead of
# the try would raise ImportError first and the install hint would never show.
try:
    import lightgbm
    from lightgbm import LGBMClassifier
except ImportError:
    print("LightGBM not found. Please install it using: pip install lightgbm")
    exit()

# Suppress specific warnings if needed
# simplefilter installs the same category-wide "ignore" entries.
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=UserWarning)

# --- Load Data ---
# All three CSVs are expected in the current working directory.
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print(
        "Make sure train.csv, test.csv, and sample_submission.csv are in the same directory."
    )
    exit()
print("Data loaded successfully.")

# --- Feature Engineering & Preprocessing Function V2 (from previous step) ---
def preprocess_data_v2(df, is_train=True, latest_date=None):
    """Apply feature engineering (v2) and basic cleaning.

    Args:
        df: Raw frame with at least ``Year_Birth``, ``Dt_Customer``,
            ``Marital_Status``, ``Income``, ``Kidhome``, ``Teenhome`` and the
            ``Mnt*``, ``Num*Purchases`` and ``AcceptedCmp1..5`` columns.
            The frame is copied, never modified.
        is_train: When true, the lifetime reference date is derived from this
            frame (latest valid enrollment date plus one day) and published in
            the module-level ``global_latest_date_v2``.
        latest_date: Reference date used when ``is_train`` is false. When it
            is ``None``, 1 January of the year after the latest enrollment
            year found in ``df`` is used and a warning is printed.

    Returns:
        A new DataFrame in which ``Year_Birth`` and ``Dt_Customer`` are
        replaced by ``Age``, ``Enroll_Month``, ``Enroll_Year``,
        ``Enroll_DayOfWeek`` and ``Customer_Lifetime``, followed by the
        aggregate and ratio features. All medians/modes used for filling are
        computed on ``df`` itself.
    """
    global global_latest_date_v2
    df_processed = df.copy()

    # --- Original Preprocessing ---
    # Parse the enrollment dates once; unparseable entries become NaT.
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # .dt.year is float as soon as one NaT is present; datetime() needs an int.
    max_enroll_year = enroll_dates.dt.year.max()
    if pd.notna(max_enroll_year):
        reference_year = int(max_enroll_year)
    else:
        reference_year = datetime.datetime.now().year

    # Implausible birth years (< 1910) are treated as missing.
    birth_year = df_processed['Year_Birth']
    df_processed['Age'] = reference_year - birth_year.mask(birth_year < 1910)
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    if is_train:
        valid_dates = enroll_dates.dropna()
        if valid_dates.empty:
            global_latest_date_v2 = datetime.datetime(reference_year + 1, 1, 1)
        else:
            global_latest_date_v2 = valid_dates.max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date_v2
    elif latest_date is not None:
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date for test: {latest_date_to_use}")

    def _fill_with_mode(values):
        # mode() is empty when every date is NaT; keep the NaNs in that case.
        modes = values.mode()
        return values if modes.empty else values.fillna(modes.iloc[0])

    # Extract Date Features BEFORE calculating lifetime
    enroll_year = enroll_dates.dt.year
    df_processed['Enroll_Month'] = _fill_with_mode(enroll_dates.dt.month)
    df_processed['Enroll_Year'] = enroll_year.fillna(enroll_year.median())
    df_processed['Enroll_DayOfWeek'] = _fill_with_mode(enroll_dates.dt.dayofweek)

    # Days between enrollment and the reference date; NaT rows get the median.
    lifetime_days = (latest_date_to_use - enroll_dates).dt.days.astype(float)
    df_processed['Customer_Lifetime'] = lifetime_days.fillna(lifetime_days.median())

    # Keep Original Marital_Status & Education for V2
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    for col in mnt_cols:
        df_processed[col] = df_processed[col].fillna(0)  # Impute spending NaNs with 0
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    # --- V2 Features ---
    # Spending shares; 0/0 (no spending at all) is reported as 0.
    df_processed['Wine_Ratio'] = (df_processed['MntWines'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Meat_Ratio'] = (df_processed['MntMeatProducts'] / df_processed['Total_Mnt']).fillna(0)
    df_processed['Fruit_Ratio'] = (df_processed['MntFruits'] / df_processed['Total_Mnt']).fillna(0)
    df_processed.replace([np.inf, -np.inf], 0, inplace=True)

    # Assign the filled column back instead of a chained inplace fillna, which
    # does not update the frame when pandas Copy-on-Write is active.
    df_processed['Income'] = df_processed['Income'].fillna(df_processed['Income'].median())

    # Estimate adults based on Marital_Status original values:
    # one for the single-like statuses, two for everything else.
    single_statuses = ['Single', 'Divorced', 'Widow', 'Absurd', 'Alone', 'YOLO']
    num_adults = df_processed['Marital_Status'].isin(single_statuses).map({True: 1, False: 2})
    num_people = df_processed['Children'] + num_adults
    df_processed['Income_per_Person'] = (df_processed['Income'] / num_people.replace(0, 1)).fillna(0)
    df_processed['Spending_per_Purchase'] = (df_processed['Total_Mnt'] / df_processed['Total_Purchases'].replace(0, 1)).fillna(0)
    df_processed['Age'] = df_processed['Age'].fillna(df_processed['Age'].median())

    # Num_Adults is only a helper and never part of the output, even if the
    # input frame already carried a column with that name.
    df_processed.drop('Num_Adults', axis=1, inplace=True, errors='ignore')

    return df_processed

# --- Apply Preprocessing V2 ---
# The training pass stores its reference date in global_latest_date_v2; the
# test pass reuses it so both frames measure lifetime against the same day.
train_df_processed_v2 = preprocess_data_v2(train_df.copy(), is_train=True)
if 'global_latest_date_v2' not in globals():
    # Fallback when the training pass did not publish a date: tomorrow.
    global_latest_date_v2 = datetime.datetime.now() + datetime.timedelta(days=1)
    print(f"Error: global_latest_date_v2 not set. Using fallback: {global_latest_date_v2}")
test_df_processed_v2 = preprocess_data_v2(
    test_df.copy(),
    is_train=False,
    latest_date=global_latest_date_v2,
)
print("V2 Preprocessing complete.")


# --- Prepare Data V2 ---
# Features are everything except the row identifier and the label.
X_v2 = train_df_processed_v2.drop(['ID', 'Target'], axis=1)
y_v2 = train_df_processed_v2['Target']
X_test_v2 = test_df_processed_v2.drop('ID', axis=1)
# Align columns
# The training columns are the single source of truth. Test columns missing
# from train are dropped and train columns missing from test are added as 0.
# The training frame itself is never modified: adding test-only columns to it
# after train_cols_v2 was captured would leave train and test with different
# columns and make prediction fail.
train_cols_v2 = X_v2.columns
test_cols_v2 = X_test_v2.columns
missing_in_test_v2 = set(train_cols_v2) - set(test_cols_v2)
missing_in_train_v2 = set(test_cols_v2) - set(train_cols_v2)
X_test_v2 = X_test_v2.reindex(columns=train_cols_v2, fill_value=0)


# --- Define Preprocessing Steps (Using V2 Features) ---
# Columns are split by dtype: numeric columns are median-imputed and
# standardised, all others are mode-imputed and one-hot encoded.
numerical_features_v2 = list(X_v2.select_dtypes(include=np.number).columns)
categorical_features_v2 = list(X_v2.select_dtypes(exclude=np.number).columns)
# Print both lists so the dtype-based split can be checked by eye.
print(f"\nV2 Numerical features ({len(numerical_features_v2)}): {numerical_features_v2}")
print(f"V2 Categorical features ({len(categorical_features_v2)}): {categorical_features_v2}")

numerical_pipeline_v2 = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_pipeline_v2 = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor_v2 = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline_v2, numerical_features_v2),
        ('cat', categorical_pipeline_v2, categorical_features_v2),
    ],
    remainder='passthrough',
)


# --- Define Base Model: LightGBM ---
# subsample_freq=1 turns row bagging on at every iteration. With LightGBM's
# default of 0 bagging is disabled, so the 'classifier__subsample' values
# searched below would have no effect at all.
base_model_lgbm = LGBMClassifier(
    random_state=42,
    objective='binary',  # objective='binary' is good practice
    subsample_freq=1,
)

# --- Create Full Pipeline with LightGBM ---
pipeline_lgbm = Pipeline([
    ('preprocessor', preprocessor_v2),
    ('classifier', base_model_lgbm) # Step name remains 'classifier'
])

# --- Define Parameter Grid for LightGBM RandomizedSearchCV ---
param_dist_lgbm = {
    'classifier__n_estimators': [100, 200, 300, 400, 500],
    'classifier__learning_rate': [0.01, 0.02, 0.05, 0.1],
    'classifier__max_depth': [3, 4, 5, 7, -1], # -1 means no limit
    # A tree of depth d has at most 2^d leaves, so num_leaves values above
    # 2^max_depth (e.g. every value here when max_depth=3) are not reachable.
    'classifier__num_leaves': [10, 15, 20, 31, 40],
    'classifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0], # Alias: bagging_fraction (needs subsample_freq > 0)
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0], # Alias: feature_fraction
    'classifier__reg_alpha': [0, 0.01, 0.1, 0.5, 1.0], # L1
    'classifier__reg_lambda': [0, 0.1, 0.5, 1.0, 2.0], # L2
    'classifier__min_child_samples': [10, 20, 30, 50] # Min data in leaf
}

# --- Set up K-Fold Strategy ---
# Shuffled, stratified folds with a fixed seed so the search is repeatable.
N_SPLITS = 5
RANDOM_STATE_KFOLD = 42
skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE_KFOLD,
)

# --- Set up RandomizedSearchCV for LightGBM ---
N_ITER_LGBM = 75  # Number of sampled parameter combinations
SCORING_METRIC = 'roc_auc'

random_search_lgbm = RandomizedSearchCV(
    pipeline_lgbm,
    param_dist_lgbm,
    n_iter=N_ITER_LGBM,
    scoring=SCORING_METRIC,
    n_jobs=-1,
    cv=skf,
    verbose=1,
    random_state=42,
)

print(f"\nStarting RandomizedSearchCV for LightGBM with {N_ITER_LGBM} iterations for {SCORING_METRIC}...")
random_search_lgbm.fit(X_v2, y_v2)  # V2 feature frame
print("LightGBM RandomizedSearchCV finished.")

# --- Report Best LightGBM Results ---
print("\n--- LightGBM Hyperparameter Tuning Results ---")
print(f"Best Score ({SCORING_METRIC}): {random_search_lgbm.best_score_:.4f}")
print("Best Parameters:")
best_params_lgbm = random_search_lgbm.best_params_
for param in best_params_lgbm:
    print(f"  {param}: {best_params_lgbm[param]}")

# --- Train Final LightGBM Model with Best Parameters ---
print("\nTraining final LightGBM model on the entire training set using best parameters...")
# No explicit fit here: the search already refits the best estimator by default.
best_pipeline_lgbm = random_search_lgbm.best_estimator_
print("Final LightGBM model training complete.")

# --- Predict on Test Data with Tuned LightGBM ---
print("Predicting on test data using the tuned LightGBM model...")
test_predictions_lgbm = best_pipeline_lgbm.predict(X_test_v2)  # V2 test frame
print("Prediction complete.")

# --- Generate Submission File for LightGBM ---
submission_filename_lgbm = 'submission_tuned_lgbm_v2_features.csv'  # New filename
submission_df_lgbm = pd.DataFrame({
    'ID': test_df['ID'],
    'Target': test_predictions_lgbm,
})
submission_df_lgbm.to_csv(submission_filename_lgbm, index=False)

print(f"\nSubmission file '{submission_filename_lgbm}' created successfully.")
print(submission_df_lgbm.head())
print(f"\nPredicted target distribution (LightGBM):\n{submission_df_lgbm['Target'].value_counts(normalize=True)}")

# Optional: Evaluate the *tuned* LightGBM model on the training set
# These scores are computed on the data the model was fitted on.
train_preds_lgbm_tuned = best_pipeline_lgbm.predict(X_v2)
train_accuracy_lgbm_tuned = accuracy_score(y_v2, train_preds_lgbm_tuned)
train_roc_auc_lgbm_tuned = roc_auc_score(
    y_v2,
    best_pipeline_lgbm.predict_proba(X_v2)[:, 1],
)
print("\n--- Tuned LightGBM Model Training Set Evaluation ---")
print(f"Accuracy: {train_accuracy_lgbm_tuned:.4f}")
print(f"ROC AUC: {train_roc_auc_lgbm_tuned:.4f}")

In [None]:
pip install catboost

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import datetime
import warnings
# CatBoost is an optional third-party dependency; print an install hint and
# stop when it is missing.
try:
    from catboost import CatBoostClassifier
except ImportError:
    print("CatBoost not found. Please install it using: pip install catboost")
    exit()

# Suppress specific warnings if needed
# simplefilter installs the same category-wide "ignore" entries.
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=UserWarning)
warnings.simplefilter('ignore', category=pd.errors.PerformanceWarning)


# --- Load Data ---
# All three CSVs are expected in the current working directory.
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print(
        "Make sure train.csv, test.csv, and sample_submission.csv are in the same directory."
    )
    exit()
print("Data loaded successfully.")


# --- Feature Engineering & Preprocessing Function V4 (Corrected Target Exclusion) ---
def preprocess_data_v4_catboost_manual(df, is_train=True, latest_date=None, fit_imputers=None, fit_scaler=None):
    """Build the V4 feature set for CatBoost with manual imputation and scaling.

    The original ``Education`` / ``Marital_Status`` categories are kept as
    strings (CatBoost consumes them natively); every numeric feature is
    median-imputed and standardized. ``ID`` and ``Target`` are carried through
    untouched and never enter the imputers or the scaler.

    Args:
        df: Raw train or test frame. It is copied, not modified.
        is_train: When True, the imputers and scaler are fitted on ``df`` and
            the module-level ``global_latest_date_v4`` is (re)assigned.
        latest_date: Reference date for ``Customer_Lifetime`` on non-training
            data. When missing, 1 January of the year after the latest
            enrollment year in ``df`` is used and a warning is printed.
        fit_imputers: Dict ``{'num': ..., 'cat': ...}`` of fitted imputers;
            required when ``is_train`` is False.
        fit_scaler: Fitted scaler; required when ``is_train`` is False.

    Returns:
        Tuple ``(processed_frame, fit_imputers, fit_scaler)``.

    Raises:
        ValueError: If ``is_train`` is False and the fitted imputers or scaler
            are not supplied.
        KeyError: If an expected raw column (e.g. ``Dt_Customer``) is missing.
    """
    df_processed = df.copy()
    target_col = 'Target' # Define target column name

    # --- Step 1: Initial Feature Creation ---
    # Parse the enrollment date once; unparseable values become NaT.
    enroll_dt = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)
    max_enroll_year = enroll_dt.dt.year.max()
    # .dt.year is float when any date is NaT; cast so the year can be used to
    # build a datetime in the fallback branches below.
    # NOTE(review): the reference year is derived from the frame being
    # processed, so train and test Age can use different years - confirm intended.
    reference_year = int(max_enroll_year) if pd.notna(max_enroll_year) else datetime.datetime.now().year

    # Implausible birth years (< 1910) become NaN and are imputed in Step 2.
    # Series.mask upcasts cleanly instead of writing NaN into an integer column.
    birth_year = df_processed['Year_Birth']
    df_processed['Age'] = reference_year - birth_year.mask(birth_year < 1910)
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    if is_train:
        global global_latest_date_v4
        valid_dates = enroll_dt.dropna()
        if valid_dates.empty:
            global_latest_date_v4 = datetime.datetime(reference_year + 1, 1, 1)
        else:
            # One day past the latest enrollment so every lifetime is >= 1 day.
            global_latest_date_v4 = valid_dates.max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date_v4
    elif latest_date is not None and pd.notna(latest_date):
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date for test: {latest_date_to_use}")

    df_processed['Enroll_Month'] = enroll_dt.dt.month
    df_processed['Enroll_Year'] = enroll_dt.dt.year
    df_processed['Enroll_DayOfWeek'] = enroll_dt.dt.dayofweek
    # Rows with an unparseable date get NaN here and are imputed in Step 2.
    df_processed['Customer_Lifetime'] = (latest_date_to_use - enroll_dt).dt.days.astype('float64')
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    # Household size: one adult for the single-type statuses, otherwise two.
    single_statuses = ['Single', 'Divorced', 'Widow', 'Absurd', 'Alone', 'YOLO']
    num_adults = df_processed['Marital_Status'].apply(lambda x: 1 if str(x) in single_statuses else 2)
    num_people = df_processed['Children'] + num_adults
    # Denominators are stored temporarily so they go through imputation too;
    # zeros are replaced by 1 to keep the later divisions finite.
    df_processed['Income_per_Person_Denom'] = num_people.replace(0, 1)
    df_processed['Spend_per_Purchase_Denom'] = df_processed['Total_Purchases'].replace(0, 1)

    # Identify initial feature types *excluding Target and ID*
    cols_to_process = [col for col in df_processed.columns if col not in ['ID', target_col]]
    initial_numerical_features = df_processed[cols_to_process].select_dtypes(include=np.number).columns.tolist()
    initial_categorical_features = df_processed[cols_to_process].select_dtypes(exclude=np.number).columns.tolist()

    # --- Step 2: Imputation ---
    if is_train:
        imputer_num = SimpleImputer(strategy='median')
        imputer_num.fit(df_processed[initial_numerical_features])
        df_processed[initial_numerical_features] = imputer_num.transform(df_processed[initial_numerical_features])

        imputer_cat = SimpleImputer(strategy='most_frequent')
        imputer_cat.fit(df_processed[initial_categorical_features])
        df_processed[initial_categorical_features] = imputer_cat.transform(df_processed[initial_categorical_features])

        fit_imputers = {'num': imputer_num, 'cat': imputer_cat}
        print("Imputers fitted on training data (excluding Target).")
    else:
        if fit_imputers is None:
            raise ValueError("Fitted imputers must be provided for test data")
        try:
            # Transform using the same feature list used in fit
            df_processed[initial_numerical_features] = fit_imputers['num'].transform(df_processed[initial_numerical_features])
            df_processed[initial_categorical_features] = fit_imputers['cat'].transform(df_processed[initial_categorical_features])
        except Exception as e:
            print(f"Error during imputation transformation: {e}")
            print("Columns being imputed (Num):", initial_numerical_features)
            print("Columns being imputed (Cat):", initial_categorical_features)
            print("Columns available in df:", df_processed.columns.tolist())
            raise
        print("Test data imputed using fitted imputers.")

    # Reorder to: numerical features, categorical features, then ID / Target
    # when present. Later steps and the model rely on this column order.
    ordered_cols = initial_numerical_features + initial_categorical_features
    ordered_cols += [col for col in ('ID', target_col) if col in df_processed.columns]
    df_processed = df_processed[ordered_cols].copy()

    # --- Step 3: Create Derived Features ---
    total_mnt_safe = df_processed['Total_Mnt'].replace(0, 1)
    df_processed['Wine_Ratio'] = (df_processed['MntWines'] / total_mnt_safe).fillna(0)
    df_processed['Meat_Ratio'] = (df_processed['MntMeatProducts'] / total_mnt_safe).fillna(0)
    df_processed['Fruit_Ratio'] = (df_processed['MntFruits'] / total_mnt_safe).fillna(0)
    df_processed['Income_per_Person'] = (df_processed['Income'] / df_processed['Income_per_Person_Denom']).fillna(0)
    df_processed['Spending_per_Purchase'] = (df_processed['Total_Mnt'] / df_processed['Spend_per_Purchase_Denom']).fillna(0)
    df_processed.drop(['Income_per_Person_Denom', 'Spend_per_Purchase_Denom'], axis=1, inplace=True, errors='ignore')
    df_processed.replace([np.inf, -np.inf], 0, inplace=True)

    # --- Step 4: Scaling ---
    final_cols_to_process = [col for col in df_processed.columns if col not in ['ID', target_col]]
    final_numerical_features = df_processed[final_cols_to_process].select_dtypes(include=np.number).columns.tolist()

    if is_train:
        scaler = StandardScaler()
        # Fit only on numerical features (excluding Target)
        scaler.fit(df_processed[final_numerical_features])
        df_processed[final_numerical_features] = scaler.transform(df_processed[final_numerical_features])
        fit_scaler = scaler
        print("Scaler fitted on training data (excluding Target).")
    else:
        if fit_scaler is None:
            raise ValueError("Fitted scaler must be provided for test data")
        try:
            # Transform using the same feature list used in fit
            df_processed[final_numerical_features] = fit_scaler.transform(df_processed[final_numerical_features])
        except Exception as e:
            print(f"Error during scaling transformation: {e}")
            print("Columns being scaled:", final_numerical_features)
            print("Columns available in df:", df_processed.columns.tolist())
            raise
        print("Test data scaled using fitted scaler.")

    # --- Step 5: Final Type Conversion for CatBoost ---
    final_categorical_features = df_processed[final_cols_to_process].select_dtypes(exclude=np.number).columns.tolist()
    for col in final_categorical_features:
        df_processed[col] = df_processed[col].astype(str)

    # Ensure all column names are strings
    df_processed.columns = df_processed.columns.astype(str)

    return df_processed, fit_imputers, fit_scaler


# --- Apply Preprocessing V4 (Manual - Corrected) ---
# Fit imputers/scaler on the training frame, then reuse them (and the training
# reference date) for the test frame. Any failure aborts the run.
try:
    train_df_processed_v4, fitted_imputers, fitted_scaler = preprocess_data_v4_catboost_manual(
        train_df.copy(),
        is_train=True,
    )
    # Defensive fallback in case the training pass did not publish the date.
    if 'global_latest_date_v4' not in globals():
        global_latest_date_v4 = datetime.datetime.now() + datetime.timedelta(days=1)
        print(f"Error: global_latest_date_v4 not set. Using fallback: {global_latest_date_v4}")
    test_df_processed_v4, _, _ = preprocess_data_v4_catboost_manual(
        test_df.copy(),
        is_train=False,
        latest_date=global_latest_date_v4,
        fit_imputers=fitted_imputers,
        fit_scaler=fitted_scaler,
    )
    print("V4 Preprocessing (Manual Impute/Scale - Corrected) complete.")
except Exception as e:
    print(f"An error occurred during preprocessing: {e}")
    exit() # Exit if preprocessing fails

# --- Prepare Data V4 ---
# Split the processed frames into features / target and force the test matrix
# onto exactly the training columns (same set, same order).
try:
    X_v4 = train_df_processed_v4.drop(['ID', 'Target'], axis=1, errors='ignore')
    y_v4 = train_df_processed_v4['Target'] # Target should exist here
    X_test_v4 = test_df_processed_v4.drop('ID', axis=1, errors='ignore')

    # Align columns. The training frame defines the feature set: columns the
    # test frame lacks are added as 0, and test-only columns are dropped.
    # (Adding test-only columns to the training frame, as was done before,
    # left the model with features the reordered test matrix did not have.)
    train_cols_v4 = X_v4.columns
    test_cols_v4 = X_test_v4.columns
    missing_in_test_v4 = set(train_cols_v4) - set(test_cols_v4)
    missing_in_train_v4 = set(test_cols_v4) - set(train_cols_v4)
    if missing_in_train_v4:
        print(f"Warning: dropping test-only columns not seen in training: {sorted(missing_in_train_v4)}")
    X_test_v4 = X_test_v4.reindex(columns=train_cols_v4, fill_value=0)

except KeyError as e:
    print(f"KeyError during data preparation after preprocessing: {e}")
    print("Columns in train_df_processed_v4:", train_df_processed_v4.columns)
    print("Columns in test_df_processed_v4:", test_df_processed_v4.columns)
    exit()
except Exception as e:
    print(f"An unexpected error occurred during data preparation: {e}")
    exit()


# Identify Categorical Feature *Names* for CatBoost AFTER all processing
# (object-dtype columns of the final training matrix).
categorical_features_v4_names = X_v4.select_dtypes(include='object').columns.tolist()
print(f"\nV4 Categorical features by name: {categorical_features_v4_names}")


# --- Define Base Model: CatBoost (No Pipeline Needed) ---
# Imputation and scaling were already applied manually, so the estimator is
# used directly; categorical columns are handed to CatBoost by name.
base_model_catboost = CatBoostClassifier(
    random_state=42,
    verbose=0,
    loss_function='Logloss',
    eval_metric='AUC',
    cat_features=categorical_features_v4_names # Pass NAMES
)

# --- Define Parameter Grid for CatBoost RandomizedSearchCV ---
# Discrete candidate values; the search samples combinations from these lists.
param_dist_catboost = {
    'iterations': [100, 200, 300, 500, 700, 900],
    'learning_rate': [0.01, 0.02, 0.03, 0.05, 0.07],
    'depth': [4, 5, 6, 7],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128],
    'subsample': [0.6, 0.7, 0.8, 0.9],
}

# --- Set up K-Fold Strategy ---
# Shuffled, stratified folds with a fixed seed so the splits are reproducible.
N_SPLITS = 5
RANDOM_STATE_KFOLD = 42
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE_KFOLD)

# --- Set up RandomizedSearchCV for CatBoost (No Pipeline) ---
N_ITER_CATBOOST = 75
SCORING_METRIC = 'roc_auc'

# NOTE(review): n_jobs=-1 runs the CV fits in parallel while each CatBoost fit
# may itself be multi-threaded - confirm this does not oversubscribe the CPU.
random_search_catboost = RandomizedSearchCV(
    estimator=base_model_catboost,
    param_distributions=param_dist_catboost,
    n_iter=N_ITER_CATBOOST,
    scoring=SCORING_METRIC,
    cv=skf,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

print(f"\nStarting RandomizedSearchCV for CatBoost (Manual Preprocessing) with {N_ITER_CATBOOST} iterations for {SCORING_METRIC}...")
# Fit directly on the preprocessed dataframes
# NOTE(review): X_v4 was imputed and scaled on the full training set before
# this cross-validation, so each validation fold has influenced those
# statistics; the CV score may be slightly optimistic.
random_search_catboost.fit(X_v4, y_v4)
print("CatBoost RandomizedSearchCV finished.")

# --- Report Best CatBoost Results ---
# Print the best mean cross-validated score and the parameter set that achieved it.
print("\n--- CatBoost Hyperparameter Tuning Results ---")
print(f"Best Score ({SCORING_METRIC}): {random_search_catboost.best_score_:.4f}")
print("Best Parameters:")
best_params_catboost = random_search_catboost.best_params_
for param, value in best_params_catboost.items():
    print(f"  {param}: {value}")

# --- Train Final CatBoost Model with Best Parameters ---
# No explicit fit happens here: best_estimator_ is the model the search
# already refit on the whole training set (refit is left at its default).
print("\nTraining final CatBoost model on the entire training set using best parameters...")
best_model_catboost = random_search_catboost.best_estimator_
print("Final CatBoost model training complete.")

# --- Predict on Test Data with Tuned CatBoost ---
# Hard class labels are predicted, not probabilities.
print("Predicting on test data using the tuned CatBoost model...")
test_predictions_catboost = best_model_catboost.predict(X_test_v4)
print("Prediction complete.")

# --- Generate Submission File for CatBoost ---
# IDs come from the raw test frame; this assumes row order was preserved
# through preprocessing (the preprocessing function does not reorder rows).
submission_df_catboost = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions_catboost})
submission_filename_catboost = 'submission_tuned_catboost_v4_manualprep.csv' # New filename
submission_df_catboost.to_csv(submission_filename_catboost, index=False)

print(f"\nSubmission file '{submission_filename_catboost}' created successfully.")
print(submission_df_catboost.head())
print(f"\nPredicted target distribution (CatBoost V4):\n{submission_df_catboost['Target'].value_counts(normalize=True)}")

# Optional: Evaluate the *tuned* CatBoost model on the training set
# NOTE: in-sample scores on the data the model was fit on - optimistic by
# construction and not comparable to the cross-validated score above.
train_preds_catboost_tuned = best_model_catboost.predict(X_v4)
train_accuracy_catboost_tuned = accuracy_score(y_v4, train_preds_catboost_tuned)
train_roc_auc_catboost_tuned = roc_auc_score(y_v4, best_model_catboost.predict_proba(X_v4)[:, 1])
print(f"\n--- Tuned CatBoost V4 Model Training Set Evaluation ---")
print(f"Accuracy: {train_accuracy_catboost_tuned:.4f}")
print(f"ROC AUC: {train_roc_auc_catboost_tuned:.4f}")

Data loaded successfully.
Imputers fitted on training data (excluding Target).
Scaler fitted on training data (excluding Target).
Test data imputed using fitted imputers.
Test data scaled using fitted scaler.
V4 Preprocessing (Manual Impute/Scale - Corrected) complete.

V4 Categorical features by name: ['Education', 'Marital_Status']

Starting RandomizedSearchCV for CatBoost (Manual Preprocessing) with 75 iterations for roc_auc...
Fitting 5 folds for each of 75 candidates, totalling 375 fits
CatBoost RandomizedSearchCV finished.

--- CatBoost Hyperparameter Tuning Results ---
Best Score (roc_auc): 0.9260
Best Parameters:
  subsample: 0.9
  learning_rate: 0.07
  l2_leaf_reg: 3
  iterations: 100
  depth: 4
  border_count: 64

Training final CatBoost model on the entire training set using best parameters...
Final CatBoost model training complete.
Predicting on test data using the tuned CatBoost model...
Prediction complete.

Submission file 'submission_tuned_catboost_v4_manualprep.csv' cr

In [3]:
import pandas as pd
import numpy as np
# No CV needed here, just loading/training final models
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression # Added for meta-model if doing stacking later
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings
try:
    from catboost import CatBoostClassifier
except ImportError:
    print("CatBoost not found. Please install it using: pip install catboost")
    exit()

# Suppress specific warnings if needed
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# --- Load Data ---
# Read the three competition CSVs from the working directory in one go;
# a missing file stops the run with a hint instead of a traceback.
try:
    train_df, test_df, sample_submission_df = [
        pd.read_csv(csv_name)
        for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
    ]
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    exit()
else:
    print("Data loaded successfully.")


# --- Feature Engineering & Preprocessing Functions (V1 and V4 needed) ---

# Function V1 (Correct definition for GBC V1 - returns only DataFrame)
def preprocess_data_v1(df, is_train=True, latest_date=None):
    """Build the V1 feature set (simplified categoricals, simple imputation).

    Scaling and one-hot encoding are not done here; only the processed
    DataFrame is returned.

    Args:
        df: Raw train or test frame. It is copied, not modified.
        is_train: When True, the module-level ``global_latest_date_v1`` is
            (re)assigned to one day after the latest valid enrollment date.
        latest_date: Reference date for ``Customer_Lifetime`` on non-training
            data. When missing, 1 January of the year after the latest
            enrollment year in ``df`` is used.

    Returns:
        The processed DataFrame (``ID`` / ``Target`` are left in place if present).

    Raises:
        KeyError: If an expected raw column (e.g. ``Dt_Customer``) is missing.
    """
    df_processed = df.copy()

    # Parse the enrollment date once; unparseable values become NaT.
    enroll_dt = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)
    max_enroll_year = enroll_dt.dt.year.max()
    # .dt.year is float when any date is NaT; cast so the year can be used to
    # build a datetime in the fallback branches below.
    # NOTE(review): the reference year is derived from the frame being
    # processed, so train and test Age can use different years - confirm intended.
    reference_year = int(max_enroll_year) if pd.notna(max_enroll_year) else datetime.datetime.now().year

    # Implausible birth years (< 1910) become NaN and are median-imputed below.
    # Series.mask upcasts cleanly instead of writing NaN into an integer column.
    birth_year = df_processed['Year_Birth']
    df_processed['Age'] = reference_year - birth_year.mask(birth_year < 1910)
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    if is_train:
        global global_latest_date_v1 # Keep distinct global date var
        valid_dates = enroll_dt.dropna()
        if valid_dates.empty:
            global_latest_date_v1 = datetime.datetime(reference_year + 1, 1, 1)
        else:
            # One day past the latest enrollment so every lifetime is >= 1 day.
            global_latest_date_v1 = valid_dates.max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date_v1
    elif latest_date is not None and pd.notna(latest_date):
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)

    # Rows with an unparseable date get NaN here and are median-imputed below.
    df_processed['Customer_Lifetime'] = (latest_date_to_use - enroll_dt).dt.days.astype('float64')
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # V1 Simplifications: collapse marital status to Partner / Single and
    # merge '2n Cycle' into 'Master'.
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single', 'Widow': 'Single', 'Divorced': 'Single'
    })
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed[mnt_cols] = df_processed[mnt_cols].fillna(0) # Missing spend counts as 0
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    # Simple median imputation for remaining numericals (before pipeline).
    # Assign the result back: Series.fillna(inplace=True) on a column selection
    # is a chained assignment and does not update the frame under Copy-on-Write.
    # NOTE(review): the median is taken from the frame being processed, so test
    # rows are filled with test statistics rather than training ones.
    for col in ('Age', 'Income', 'Customer_Lifetime', 'Total_Mnt'):
        if col in df_processed.columns and df_processed[col].isnull().any():
            df_processed[col] = df_processed[col].fillna(df_processed[col].median())

    # Simple mode imputation for remaining categoricals (before pipeline)
    for col in ('Marital_Status', 'Education'):
        if col in df_processed.columns and df_processed[col].isnull().any():
            modes = df_processed[col].mode()
            if not modes.empty: # an all-missing column has no mode; leave it as is
                df_processed[col] = df_processed[col].fillna(modes.iloc[0])

    return df_processed # Only return the DataFrame


# Function V4 (for CatBoost - Manual Impute/Scale)
def preprocess_data_v4_catboost_manual(df, is_train=True, latest_date=None, fit_imputers=None, fit_scaler=None):
    """Build the V4 feature set for CatBoost with manual imputation and scaling.

    The original ``Education`` / ``Marital_Status`` categories are kept as
    strings; every numeric feature is median-imputed and standardized.
    ``ID`` and ``Target`` are carried through untouched and never enter the
    imputers or the scaler.

    Args:
        df: Raw train or test frame. It is copied, not modified.
        is_train: When True, the imputers and scaler are fitted on ``df`` and
            the module-level ``global_latest_date_v4`` is (re)assigned.
        latest_date: Reference date for ``Customer_Lifetime`` on non-training
            data. When missing, 1 January of the year after the latest
            enrollment year in ``df`` is used and a warning is printed.
        fit_imputers: Dict ``{'num': ..., 'cat': ...}`` of fitted imputers;
            required when ``is_train`` is False.
        fit_scaler: Fitted scaler; required when ``is_train`` is False.

    Returns:
        Tuple ``(processed_frame, fit_imputers, fit_scaler)``.

    Raises:
        ValueError: If ``is_train`` is False and the fitted imputers or scaler
            are not supplied.
        KeyError: If an expected raw column (e.g. ``Dt_Customer``) is missing.
    """
    df_processed = df.copy()
    target_col = 'Target' # Define target column name

    # --- Step 1: Initial Feature Creation (before imputation/scaling) ---
    # Parse the enrollment date once; unparseable values become NaT.
    enroll_dt = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)
    max_enroll_year = enroll_dt.dt.year.max()
    # .dt.year is float when any date is NaT; cast so the year can be used to
    # build a datetime in the fallback branches below.
    # NOTE(review): the reference year is derived from the frame being
    # processed, so train and test Age can use different years - confirm intended.
    reference_year = int(max_enroll_year) if pd.notna(max_enroll_year) else datetime.datetime.now().year

    # Implausible birth years (< 1910) become NaN and are imputed in Step 2.
    # Series.mask upcasts cleanly instead of writing NaN into an integer column.
    birth_year = df_processed['Year_Birth']
    df_processed['Age'] = reference_year - birth_year.mask(birth_year < 1910)
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    if is_train:
        global global_latest_date_v4
        valid_dates = enroll_dt.dropna()
        if valid_dates.empty:
            global_latest_date_v4 = datetime.datetime(reference_year + 1, 1, 1)
        else:
            # One day past the latest enrollment so every lifetime is >= 1 day.
            global_latest_date_v4 = valid_dates.max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date_v4
    elif latest_date is not None and pd.notna(latest_date):
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date for test: {latest_date_to_use}")

    # Date parts stay numeric so they pass through imputation and scaling.
    df_processed['Enroll_Month'] = enroll_dt.dt.month
    df_processed['Enroll_Year'] = enroll_dt.dt.year
    df_processed['Enroll_DayOfWeek'] = enroll_dt.dt.dayofweek
    # Rows with an unparseable date get NaN here and are imputed in Step 2.
    df_processed['Customer_Lifetime'] = (latest_date_to_use - enroll_dt).dt.days.astype('float64')
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # Keep Original Marital_Status & Education
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1) # Row sum skips missing components
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    # Household size: one adult for the single-type statuses, otherwise two.
    single_statuses = ['Single', 'Divorced', 'Widow', 'Absurd', 'Alone', 'YOLO']
    num_adults = df_processed['Marital_Status'].apply(lambda x: 1 if str(x) in single_statuses else 2)
    num_people = df_processed['Children'] + num_adults
    # Denominators are stored temporarily so they go through imputation too;
    # zeros are replaced by 1 to keep the later divisions finite.
    df_processed['Income_per_Person_Denom'] = num_people.replace(0, 1)
    df_processed['Spend_per_Purchase_Denom'] = df_processed['Total_Purchases'].replace(0, 1)

    # Identify initial feature types *excluding Target and ID*
    cols_to_process = [col for col in df_processed.columns if col not in ['ID', target_col]]
    initial_numerical_features = df_processed[cols_to_process].select_dtypes(include=np.number).columns.tolist()
    initial_categorical_features = df_processed[cols_to_process].select_dtypes(exclude=np.number).columns.tolist()

    # --- Step 2: Imputation ---
    if is_train:
        imputer_num = SimpleImputer(strategy='median')
        imputer_num.fit(df_processed[initial_numerical_features])
        df_processed[initial_numerical_features] = imputer_num.transform(df_processed[initial_numerical_features])
        imputer_cat = SimpleImputer(strategy='most_frequent')
        imputer_cat.fit(df_processed[initial_categorical_features])
        df_processed[initial_categorical_features] = imputer_cat.transform(df_processed[initial_categorical_features])
        fit_imputers = {'num': imputer_num, 'cat': imputer_cat}
    else:
        if fit_imputers is None:
            raise ValueError("Fitted imputers must be provided for test data")
        try:
            df_processed[initial_numerical_features] = fit_imputers['num'].transform(df_processed[initial_numerical_features])
            df_processed[initial_categorical_features] = fit_imputers['cat'].transform(df_processed[initial_categorical_features])
        except Exception as e:
            print(f"Error during imputation transformation: {e}")
            print("Columns available in df:", df_processed.columns.tolist())
            raise

    # Reorder to: numerical features, categorical features, then ID / Target
    # when present. Later steps and the models rely on this column order.
    ordered_cols = initial_numerical_features + initial_categorical_features
    ordered_cols += [col for col in ('ID', target_col) if col in df_processed.columns]
    df_processed = df_processed[ordered_cols].copy()

    # --- Step 3: Create Derived Features ---
    total_mnt_safe = df_processed['Total_Mnt'].replace(0, 1)
    df_processed['Wine_Ratio'] = (df_processed['MntWines'] / total_mnt_safe).fillna(0)
    df_processed['Meat_Ratio'] = (df_processed['MntMeatProducts'] / total_mnt_safe).fillna(0)
    df_processed['Fruit_Ratio'] = (df_processed['MntFruits'] / total_mnt_safe).fillna(0)
    df_processed['Income_per_Person'] = (df_processed['Income'] / df_processed['Income_per_Person_Denom']).fillna(0)
    df_processed['Spending_per_Purchase'] = (df_processed['Total_Mnt'] / df_processed['Spend_per_Purchase_Denom']).fillna(0)
    df_processed.drop(['Income_per_Person_Denom', 'Spend_per_Purchase_Denom'], axis=1, inplace=True, errors='ignore')
    df_processed.replace([np.inf, -np.inf], 0, inplace=True)

    # --- Step 4: Scaling ---
    final_cols_to_process = [col for col in df_processed.columns if col not in ['ID', target_col]]
    final_numerical_features = df_processed[final_cols_to_process].select_dtypes(include=np.number).columns.tolist()
    if is_train:
        scaler = StandardScaler()
        scaler.fit(df_processed[final_numerical_features])
        df_processed[final_numerical_features] = scaler.transform(df_processed[final_numerical_features])
        fit_scaler = scaler
    else:
        if fit_scaler is None:
            raise ValueError("Fitted scaler must be provided for test data")
        try:
            df_processed[final_numerical_features] = fit_scaler.transform(df_processed[final_numerical_features])
        except Exception as e:
            print(f"Error during scaling transformation: {e}")
            raise

    # --- Step 5: Final Type Conversion for CatBoost ---
    final_categorical_features = df_processed[final_cols_to_process].select_dtypes(exclude=np.number).columns.tolist()
    for col in final_categorical_features:
        df_processed[col] = df_processed[col].astype(str)
    df_processed.columns = df_processed.columns.astype(str)
    return df_processed, fit_imputers, fit_scaler


# --- Apply Preprocessing V1 (Corrected Call) ---
# The V1 function returns a single DataFrame; the test pass reuses the
# reference date published by the training pass.
print("Preprocessing V1...")
train_df_processed_v1 = preprocess_data_v1(train_df.copy(), is_train=True)
if 'global_latest_date_v1' not in globals():
    global_latest_date_v1 = datetime.datetime.now() + datetime.timedelta(days=1)
test_df_processed_v1 = preprocess_data_v1(
    test_df.copy(),
    is_train=False,
    latest_date=global_latest_date_v1,
)
print("V1 Preprocessing complete.")

# --- Apply Preprocessing V4 (for CatBoost - Manual Impute/Scale) ---
# Imputers and scaler are fitted on the training frame and reused for test.
print("\nPreprocessing V4 (for CatBoost)...")
train_df_processed_v4, fitted_imputers_v4, fitted_scaler_v4 = preprocess_data_v4_catboost_manual(
    train_df.copy(),
    is_train=True,
)
if 'global_latest_date_v4' not in globals():
    global_latest_date_v4 = datetime.datetime.now() + datetime.timedelta(days=1)
test_df_processed_v4, _, _ = preprocess_data_v4_catboost_manual(
    test_df.copy(),
    is_train=False,
    latest_date=global_latest_date_v4,
    fit_imputers=fitted_imputers_v4,
    fit_scaler=fitted_scaler_v4,
)
print("V4 Preprocessing complete.")


# In all three sections below the training frame defines the feature set:
# columns the test frame lacks are added as 0 and test-only columns are
# dropped. (Adding test-only columns to the training frame, as was done
# before, left the models with features the reordered test matrix lacked.)

# --- Prepare Data V1 (for GBC V1) ---
X_v1 = train_df_processed_v1.drop(['ID', 'Target'], axis=1, errors='ignore')
y_v1 = train_df_processed_v1['Target']
X_test_v1 = test_df_processed_v1.drop('ID', axis=1, errors='ignore')
train_cols_v1 = X_v1.columns
test_cols_v1 = X_test_v1.columns
missing_in_test_v1 = set(train_cols_v1) - set(test_cols_v1)
missing_in_train_v1 = set(test_cols_v1) - set(train_cols_v1)
if missing_in_train_v1:
    print(f"Warning: dropping test-only V1 columns not seen in training: {sorted(missing_in_train_v1)}")
X_test_v1 = X_test_v1.reindex(columns=train_cols_v1, fill_value=0)

# --- Prepare Data V2 (for GBC V2 - using V4 preproc function for consistency, but applying OHE) ---
X_v2_like = train_df_processed_v4.drop(['ID', 'Target'], axis=1, errors='ignore')
y_v2_like = train_df_processed_v4['Target'] # Target comes from V4 processed train
X_test_v2_like = test_df_processed_v4.drop('ID', axis=1, errors='ignore') # Features come from V4 processed test
# Align V2_like
train_cols_v2_like = X_v2_like.columns
test_cols_v2_like = X_test_v2_like.columns
missing_in_test_v2_like = set(train_cols_v2_like) - set(test_cols_v2_like)
missing_in_train_v2_like = set(test_cols_v2_like) - set(train_cols_v2_like)
if missing_in_train_v2_like:
    print(f"Warning: dropping test-only V2-like columns not seen in training: {sorted(missing_in_train_v2_like)}")
X_test_v2_like = X_test_v2_like.reindex(columns=train_cols_v2_like, fill_value=0)


# --- Prepare Data V4 (for CatBoost) ---
X_v4 = train_df_processed_v4.drop(['ID', 'Target'], axis=1, errors='ignore')
y_v4 = train_df_processed_v4['Target']
X_test_v4 = test_df_processed_v4.drop('ID', axis=1, errors='ignore')
# Align V4 (same source frames as V2_like, kept separate so each model owns its copy)
train_cols_v4 = X_v4.columns
test_cols_v4 = X_test_v4.columns
missing_in_test_v4 = set(train_cols_v4) - set(test_cols_v4)
missing_in_train_v4 = set(test_cols_v4) - set(train_cols_v4)
if missing_in_train_v4:
    print(f"Warning: dropping test-only V4 columns not seen in training: {sorted(missing_in_train_v4)}")
X_test_v4 = X_test_v4.reindex(columns=train_cols_v4, fill_value=0)


# --- Define Preprocessing Pipelines V1 (GBC V1 requires OHE) ---
# Numeric columns: median imputation then standardisation.
# Non-numeric columns: most-frequent imputation then one-hot encoding.
numerical_features_v1 = list(X_v1.select_dtypes(include=np.number).columns)
categorical_features_v1 = list(X_v1.select_dtypes(exclude=np.number).columns)
numerical_pipeline_v1 = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
categorical_pipeline_v1 = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor_v1 = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline_v1, numerical_features_v1),
        ('cat', categorical_pipeline_v1, categorical_features_v1),
    ],
    remainder='passthrough',
)

# --- Define Preprocessing Pipelines V2 (GBC V2 requires OHE on V4 features) ---
numerical_features_v2_like = list(X_v2_like.select_dtypes(include=np.number).columns)
categorical_features_v2_like = list(X_v2_like.select_dtypes(exclude=np.number).columns)
# No imputers here: the pipelines assume imputation already happened in the
# V4 preprocessing function, so they only scale / one-hot encode.
numerical_pipeline_v2_like = Pipeline(steps=[
    ('scaler', StandardScaler()),
])
categorical_pipeline_v2_like = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor_v2_like = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline_v2_like, numerical_features_v2_like),
        ('cat', categorical_pipeline_v2_like, categorical_features_v2_like),
    ],
    remainder='passthrough',
)


# --- Define BEST Hyperparameters ---
# The GBC settings are applied through a Pipeline whose estimator step is
# named 'classifier', hence the 'classifier__' prefix on every key. The
# CatBoost settings are passed straight to the CatBoostClassifier constructor.
best_params_gbc_v1 = {
    f'classifier__{name}': value
    for name, value in (
        ('subsample', 0.6),
        ('n_estimators', 200),
        ('min_samples_split', 20),
        ('min_samples_leaf', 20),
        ('max_features', 'sqrt'),
        ('max_depth', 2),
        ('learning_rate', 0.08),
    )
}
best_params_gbc_v2 = {  # Params that gave 0.848
    f'classifier__{name}': value
    for name, value in (
        ('subsample', 0.7),
        ('n_estimators', 300),
        ('min_samples_split', 20),
        ('min_samples_leaf', 20),
        ('max_features', 'log2'),
        ('max_depth', 2),
        ('learning_rate', 0.05),
    )
}
best_params_catboost_v4 = dict(  # Params found for CatBoost V4
    subsample=0.9,
    learning_rate=0.07,
    l2_leaf_reg=3,
    iterations=100,
    depth=4,
    border_count=64,
)

# --- Build and Train Model 1 (GBC V1) ---
# The tuned settings are pushed into the pipeline via set_params (which
# returns the pipeline itself), then the whole pipeline is fit on V1 data.
print("\nTraining Model 1 (GBC V1)...")
pipeline_gbc_v1 = Pipeline(steps=[
    ('preprocessor', preprocessor_v1),
    ('classifier', GradientBoostingClassifier(random_state=42)),
]).set_params(**best_params_gbc_v1)
pipeline_gbc_v1.fit(X_v1, y_v1)
print("Model 1 training complete.")

# --- Build and Train Model 2 (GBC V2 features + OHE) ---
print("Training Model 2 (GBC V2 Features + OHE)...")
pipeline_gbc_v2 = Pipeline(steps=[
    ('preprocessor', preprocessor_v2_like),
    ('classifier', GradientBoostingClassifier(random_state=42)),
]).set_params(**best_params_gbc_v2)
pipeline_gbc_v2.fit(X_v2_like, y_v2_like)
print("Model 2 training complete.")

# --- Build and Train Model 3 (CatBoost V4 features - Manual Prep) ---
print("Training Model 3 (CatBoost V4 Features - Manual Prep)...")
# CatBoost is told which columns are categorical by name: every object-dtype
# column of the V4 feature frame.
categorical_features_v4_names = list(X_v4.select_dtypes(include='object').columns)

model_catboost_final = CatBoostClassifier(
    **best_params_catboost_v4,  # Apply best params
    cat_features=categorical_features_v4_names,
    eval_metric='AUC',
    loss_function='Logloss',
    random_state=42,
    verbose=0,
)
# No sklearn pipeline here: the model is trained directly on the V4 frame,
# which the preprocessing function is said to have imputed and scaled already.
model_catboost_final.fit(X_v4, y_v4)
print("Model 3 training complete.")

# --- Predict Probabilities on Test Set ---
# Each model scores the test frame that matches its own feature set. `[:, 1]`
# takes the second probability column (presumably the positive class, Target=1
# -- confirm against each model's `classes_`).
print("Predicting probabilities...")
probs_gbc_v1 = pipeline_gbc_v1.predict_proba(X_test_v1)[:, 1]
probs_gbc_v2 = pipeline_gbc_v2.predict_proba(X_test_v2_like)[:, 1]
probs_catboost = model_catboost_final.predict_proba(X_test_v4)[:, 1] # Predict on manually prepped test data

# --- Ensemble Averaging ---
# Unweighted mean of the three probability vectors, then a 0.5 cut-off to
# turn the averaged probability into a 0/1 label.
print("Averaging predictions...")
avg_probs_3 = (probs_gbc_v1 + probs_gbc_v2 + probs_catboost) / 3
final_predictions = (avg_probs_3 >= 0.5).astype(int)

# --- Generate Submission File ---
# IDs come from the raw test frame; this relies on the prediction arrays being
# in the same row order as `test_df`.
submission_df_ensemble3 = pd.DataFrame({'ID': test_df['ID'], 'Target': final_predictions})
submission_filename_ensemble3 = 'submission_ensemble_3model_avg.csv'
submission_df_ensemble3.to_csv(submission_filename_ensemble3, index=False)

# Quick sanity output: first rows and the share of each predicted class.
print(f"\nSubmission file '{submission_filename_ensemble3}' created successfully.")
print(submission_df_ensemble3.head())
print(f"\nPredicted target distribution (3-Model Ensemble):\n{submission_df_ensemble3['Target'].value_counts(normalize=True)}")

Data loaded successfully.
Preprocessing V1...
V1 Preprocessing complete.

Preprocessing V4 (for CatBoost)...
V4 Preprocessing complete.

Training Model 1 (GBC V1)...
Model 1 training complete.
Training Model 2 (GBC V2 Features + OHE)...
Model 2 training complete.
Training Model 3 (CatBoost V4 Features - Manual Prep)...
Model 3 training complete.
Predicting probabilities...
Averaging predictions...

Submission file 'submission_ensemble_3model_avg.csv' created successfully.
      ID  Target
0   4390       1
1  10478       1
2   1081       1
3   4261       1
4   9916       0

Predicted target distribution (3-Model Ensemble):
Target
0    0.624071
1    0.375929
Name: proportion, dtype: float64


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV # Using Lasso for feature selection
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime
import warnings

# Suppress specific warnings if needed
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# --- Load Data ---
# All three CSVs are expected in the current working directory.
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    # `exit()` is an interactive-shell helper injected by the `site` module and
    # reports success (status 0). Raise SystemExit directly so the run stops
    # with a failure status and works even where `site` is not loaded.
    raise SystemExit(1)
print("Data loaded successfully.")


# --- Feature Engineering & Preprocessing Function V5 (Adds Poly/Interaction) ---
def preprocess_data_v5_poly_interact(df, is_train=True, latest_date=None, fit_imputers=None, poly_feature_names=None):
    """Build the V5 feature frame: base features, imputation, ratios, degree-2 terms.

    The returned frame is imputed but NOT scaled; scaling / one-hot encoding is
    left to the downstream pipeline.

    Args:
        df: Raw input frame. It is copied, never modified. Must contain the
            columns referenced below (``Dt_Customer``, ``Year_Birth``,
            ``Kidhome``, ``Teenhome``, the ``Mnt*`` / ``Num*Purchases`` /
            ``AcceptedCmp*`` columns, ``Marital_Status``, ``Income``).
        is_train: When True, imputers are fitted here and the reference date
            is derived from ``df`` and stored in the module-level name
            ``global_latest_date_v5``.
        latest_date: Reference date for ``Customer_Lifetime`` when
            ``is_train`` is False. If omitted, 1 January of the year after the
            latest enrolment year in ``df`` is used and a warning is printed.
        fit_imputers: Dict with fitted ``'num'`` and ``'cat'`` imputers;
            required when ``is_train`` is False, ignored (replaced) otherwise.
        poly_feature_names: Polynomial feature names returned by the training
            call; required when ``is_train`` is False.

    Returns:
        Tuple ``(df_processed, fit_imputers, poly_feature_names)``.

    Raises:
        ValueError: If ``is_train`` is False and ``fit_imputers`` or
            ``poly_feature_names`` is missing.
    """
    global global_latest_date_v5  # Use distinct name

    df_processed = df.copy()
    target_col = 'Target'

    # --- Initial Feature Creation (Same as V4) ---
    current_year = datetime.datetime.now().year
    # Parse the enrolment date once; unparseable values become NaT.
    # (`infer_datetime_format` is dropped: it has had no effect since pandas 2.0.)
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)
    max_enroll_year = enroll_dates.dt.year.max()
    # `.dt.year` turns float when NaT is present, so the max may be e.g. 2014.0;
    # cast to int so it is always usable as a `datetime.datetime` year below.
    reference_year = int(max_enroll_year) if pd.notna(max_enroll_year) else current_year

    # Implausible birth years (< 1910) become NaN and are imputed later via Age.
    # `mask` returns a new Series instead of writing NaN into an integer column.
    birth_year = df_processed['Year_Birth'].mask(df_processed['Year_Birth'] < 1910)
    df_processed['Age'] = reference_year - birth_year
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    if is_train:
        valid_dates = enroll_dates.dropna()
        if valid_dates.empty:
            global_latest_date_v5 = datetime.datetime(reference_year + 1, 1, 1)
        else:
            # One day after the latest enrolment, so every lifetime is >= 1 day.
            global_latest_date_v5 = valid_dates.max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date_v5
    elif latest_date is not None:
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)
        print(f"Warning: Using fallback latest date for test: {latest_date_to_use}")

    df_processed['Enroll_Month'] = enroll_dates.dt.month
    df_processed['Enroll_Year'] = enroll_dates.dt.year
    df_processed['Enroll_DayOfWeek'] = enroll_dates.dt.dayofweek
    # Only rows with a valid date get a lifetime; the rest stay NaN for imputation.
    mask = enroll_dates.notna()
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - enroll_dates[mask]).dt.days
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    # Household size: one adult for the single-type statuses, two otherwise.
    single_statuses = {'Single', 'Divorced', 'Widow', 'Absurd', 'Alone', 'YOLO'}
    num_adults = df_processed['Marital_Status'].apply(lambda status: 1 if str(status) in single_statuses else 2)
    num_people = df_processed['Children'] + num_adults
    # Denominators are stored (with 0 -> 1) so they go through imputation too.
    df_processed['Income_per_Person_Denom'] = num_people.replace(0, 1)
    df_processed['Spend_per_Purchase_Denom'] = df_processed['Total_Purchases'].replace(0, 1)

    # --- Imputation (Manual) ---
    cols_to_process = [col for col in df_processed.columns if col not in ['ID', target_col]]
    initial_numerical_features = df_processed[cols_to_process].select_dtypes(include=np.number).columns.tolist()
    initial_categorical_features = df_processed[cols_to_process].select_dtypes(exclude=np.number).columns.tolist()
    if is_train:
        fit_imputers = {
            'num': SimpleImputer(strategy='median').fit(df_processed[initial_numerical_features]),
            'cat': SimpleImputer(strategy='most_frequent').fit(df_processed[initial_categorical_features]),
        }
    elif fit_imputers is None:
        raise ValueError("Fitted imputers needed for test")
    df_processed[initial_numerical_features] = fit_imputers['num'].transform(df_processed[initial_numerical_features])
    df_processed[initial_categorical_features] = fit_imputers['cat'].transform(df_processed[initial_categorical_features])

    # Reorder: numeric features, categorical features, then ID / target if present.
    ordered_cols = initial_numerical_features + initial_categorical_features
    ordered_cols += [col for col in ('ID', target_col) if col in df_processed.columns]
    df_processed = df_processed[ordered_cols].copy()

    # --- Create Derived Features (Post-Imputation) ---
    total_mnt_safe = df_processed['Total_Mnt'].replace(0, 1)  # avoid division by zero
    df_processed['Wine_Ratio'] = (df_processed['MntWines'] / total_mnt_safe).fillna(0)
    df_processed['Meat_Ratio'] = (df_processed['MntMeatProducts'] / total_mnt_safe).fillna(0)
    df_processed['Fruit_Ratio'] = (df_processed['MntFruits'] / total_mnt_safe).fillna(0)
    df_processed['Income_per_Person'] = (df_processed['Income'] / df_processed['Income_per_Person_Denom']).fillna(0)
    df_processed['Spending_per_Purchase'] = (df_processed['Total_Mnt'] / df_processed['Spend_per_Purchase_Denom']).fillna(0)
    df_processed.drop(['Income_per_Person_Denom', 'Spend_per_Purchase_Denom'], axis=1, inplace=True, errors='ignore')
    df_processed.replace([np.inf, -np.inf], 0, inplace=True)

    # --- Add Interaction & Polynomial Features ---
    # Hand-picked numerical columns; only those actually present are expanded.
    poly_cols = ['Recency', 'MntWines', 'Total_CmpAccepted', 'Total_Purchases', 'NumWebPurchases', 'Customer_Lifetime', 'Total_Mnt', 'Age']
    poly_cols = [col for col in poly_cols if col in df_processed.columns]

    if not is_train and poly_feature_names is None:
        raise ValueError("Polynomial feature names must be provided for test data transformation.")

    # The expansion is computed the same way for train and test; for test the
    # result is then aligned to the feature names produced during training.
    poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
    poly_values = poly.fit_transform(df_processed[poly_cols])
    generated_names = poly.get_feature_names_out(poly_cols)
    poly_df = pd.DataFrame(poly_values, columns=generated_names, index=df_processed.index)
    if is_train:
        poly_feature_names = generated_names
        # The degree-1 terms duplicate existing columns, so add only new terms.
        new_terms = poly_df.drop(columns=poly_cols)
        print(f"Added {len(poly_feature_names) - len(poly_cols)} polynomial/interaction features.")
    else:
        expected_terms = [name for name in poly_feature_names if name not in poly_cols]
        # Training order is kept; terms that could not be generated are filled with 0.
        new_terms = poly_df.reindex(columns=expected_terms, fill_value=0)
    df_processed = pd.concat([df_processed, new_terms], axis=1)

    # --- Manual Interaction Example ---
    if 'Recency' in df_processed.columns and 'Total_CmpAccepted' in df_processed.columns:
        df_processed['Recency_x_Cmp'] = df_processed['Recency'] * df_processed['Total_CmpAccepted']

    # Convert final categoricals to string (important BEFORE pipeline)
    final_categorical_features = df_processed.select_dtypes(exclude=np.number).columns.tolist()
    for col in final_categorical_features:
        df_processed[col] = df_processed[col].astype(str)
    df_processed.columns = df_processed.columns.astype(str)  # Ensure all column names are strings

    return df_processed, fit_imputers, poly_feature_names  # Return poly names for test set


# --- Apply Preprocessing V5 ---
# The training call fits the imputers and yields the polynomial feature names;
# both are handed to the test call so the two frames are built the same way.
print("Preprocessing V5 (Poly/Interaction Features)...")
train_df_processed_v5, fitted_imputers_v5, poly_names_v5 = preprocess_data_v5_poly_interact(
    train_df.copy(),
    is_train=True,
)
# Safety net in case the training call did not publish the reference date.
if 'global_latest_date_v5' not in globals():
    global_latest_date_v5 = datetime.datetime.now() + datetime.timedelta(days=1)
    print(f"Error: global_latest_date_v5 not set. Using fallback: {global_latest_date_v5}")
test_df_processed_v5, _, _ = preprocess_data_v5_poly_interact(
    test_df.copy(),
    is_train=False,
    latest_date=global_latest_date_v5,
    fit_imputers=fitted_imputers_v5,
    poly_feature_names=poly_names_v5,
)
print("V5 Preprocessing complete.")


# --- Prepare Data V5 ---
X_v5 = train_df_processed_v5.drop(['ID', 'Target'], axis=1, errors='ignore')
y_v5 = train_df_processed_v5['Target']
X_test_v5 = test_df_processed_v5.drop('ID', axis=1, errors='ignore')
# Column bookkeeping (kept so the schema differences can be inspected).
train_cols_v5 = X_v5.columns
test_cols_v5 = X_test_v5.columns
missing_in_test_v5 = set(train_cols_v5) - set(test_cols_v5)
missing_in_train_v5 = set(test_cols_v5) - set(train_cols_v5)
# Align columns: reshape the test frame to the training schema (missing
# columns -> 0, test-only columns dropped, training order). The training
# frame is left as is; padding it with test-only columns (as before) left
# train with columns the re-ordered test frame no longer had.
X_test_v5 = X_test_v5.reindex(columns=train_cols_v5, fill_value=0)


# --- Define Preprocessing Pipeline V5 (OHE + Scale) ---
# Columns are routed by dtype: numeric ones are scaled, the rest one-hot encoded.
numerical_features_v5 = list(X_v5.select_dtypes(include=np.number).columns)
categorical_features_v5 = list(X_v5.select_dtypes(exclude=np.number).columns)
# Only the count of numeric features is printed; the list itself is long.
print(f"\nV5 Numerical features ({len(numerical_features_v5)}): {len(numerical_features_v5)} features")
print(f"V5 Categorical features ({len(categorical_features_v5)}): {categorical_features_v5}")


# Pipeline for GBC (Scale Numerics, OHE Categoricals).
# No imputer steps: imputation is already done in preprocess_data_v5.
numerical_pipeline_v5 = Pipeline(steps=[
    ('scaler', StandardScaler()),
])
categorical_pipeline_v5 = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
preprocessor_v5 = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline_v5, numerical_features_v5),
        ('cat', categorical_pipeline_v5, categorical_features_v5),
    ],
    remainder='passthrough',
)


# --- Define Feature Selector ---
# Features are ranked by the coefficients of a cross-validated Lasso (LassoCV
# picks its own alpha). With threshold='median', features whose importance is
# at or above the median importance are kept. The threshold may also be a
# scaled string such as '1.25*mean' or a plain float.
selector = SelectFromModel(
    LassoCV(cv=3, max_iter=2000, random_state=42),
    threshold='median',
    prefit=False,  # the Lasso is fitted when the selector itself is fitted
)

# --- Define Base Model: GBC ---
base_model_gbc = GradientBoostingClassifier(random_state=42)


# --- Create Full Pipeline with Selection ---
# Order matters: preprocess -> select features -> classify.
pipeline_gbc_v5_select = Pipeline(steps=[
    ('preprocessor', preprocessor_v5),
    ('selector', selector),
    ('classifier', base_model_gbc),
])


# --- Define Parameter Grid for GBC V5 (Focus on GBC Params) ---
# Candidate values are centred on the previous best GBC V2 settings. Every key
# targets the pipeline's 'classifier' step, hence the 'classifier__' prefix.
# The selector could be tuned the same way, e.g. a
# 'selector__threshold': ['median', 'mean'] entry.
param_dist_gbc_v5 = {
    f'classifier__{name}': candidates
    for name, candidates in (
        ('n_estimators', [200, 300, 400]),
        ('learning_rate', [0.03, 0.05, 0.07, 0.1]),
        ('max_depth', [2, 3]),
        ('min_samples_leaf', [15, 20, 25]),
        ('min_samples_split', [20, 30, 40]),
        ('subsample', [0.6, 0.7, 0.8]),
        ('max_features', ['sqrt', 'log2', None]),
    )
}


# --- Set up K-Fold Strategy ---
# Shuffled, stratified folds with a fixed seed so the splits are repeatable.
N_SPLITS, RANDOM_STATE_KFOLD = 5, 42
skf = StratifiedKFold(N_SPLITS, shuffle=True, random_state=RANDOM_STATE_KFOLD)

# --- Set up RandomizedSearchCV for GBC V5 ---
# N_ITER_GBC_V5 is the number of sampled parameter settings; adjust it to the
# time available. Candidates are ranked by SCORING_METRIC.
N_ITER_GBC_V5, SCORING_METRIC = 60, 'roc_auc'

random_search_gbc_v5 = RandomizedSearchCV(
    pipeline_gbc_v5_select,
    param_dist_gbc_v5,
    n_iter=N_ITER_GBC_V5,
    scoring=SCORING_METRIC,
    cv=skf,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Run the randomized search: every sampled parameter setting is
# cross-validated on the V5 training data using the folds defined above.
print(f"\nStarting RandomizedSearchCV for GBC V5 (Poly/Interact + Selection) with {N_ITER_GBC_V5} iterations...")
random_search_gbc_v5.fit(X_v5, y_v5)
print("GBC V5 RandomizedSearchCV finished.")

# --- Report Best GBC V5 Results ---
# `best_score_` is the mean cross-validated score of the winning setting,
# measured with SCORING_METRIC.
print("\n--- GBC V5 (Poly/Interact + Selection) Hyperparameter Tuning Results ---")
print(f"Best Score ({SCORING_METRIC}): {random_search_gbc_v5.best_score_:.5f}") # More precision
print("Best Parameters:")
best_params_gbc_v5 = random_search_gbc_v5.best_params_
# One "name: value" line per tuned parameter.
for param, value in best_params_gbc_v5.items():
    print(f"  {param}: {value}")

# --- Train Final GBC V5 Model with Best Parameters ---
# NOTE: nothing is trained at this point. The search above was created without
# a `refit` argument, so (per scikit-learn's default refit=True) `best_estimator_`
# is the pipeline the search already refit on the full training data.
print("\nTraining final GBC V5 model on the entire training set...")
best_pipeline_gbc_v5 = random_search_gbc_v5.best_estimator_
print("Final GBC V5 model training complete.")

# --- Predict on Test Data with Tuned GBC V5 ---
# `predict` returns class labels directly (no manual probability threshold).
print("Predicting on test data using the tuned GBC V5 model...")
test_predictions_gbc_v5 = best_pipeline_gbc_v5.predict(X_test_v5)
print("Prediction complete.")

# --- Generate Submission File for GBC V5 ---
# IDs come from the raw test frame; this relies on the predictions being in
# the same row order as `test_df`.
submission_df_gbc_v5 = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions_gbc_v5})
submission_filename_gbc_v5 = 'submission_tuned_gbc_v5_poly_select.csv'
submission_df_gbc_v5.to_csv(submission_filename_gbc_v5, index=False)

# Quick sanity output: first rows and the share of each predicted class.
print(f"\nSubmission file '{submission_filename_gbc_v5}' created successfully.")
print(submission_df_gbc_v5.head())
print(f"\nPredicted target distribution (GBC V5):\n{submission_df_gbc_v5['Target'].value_counts(normalize=True)}")

# Optional: Evaluate the *tuned* GBC V5 model on the training set
# These are in-sample scores (the model has seen this data), so they are
# optimistic compared with the cross-validated score reported by the search.
train_preds_gbc_v5_tuned = best_pipeline_gbc_v5.predict(X_v5)
train_accuracy_gbc_v5_tuned = accuracy_score(y_v5, train_preds_gbc_v5_tuned)
train_roc_auc_gbc_v5_tuned = roc_auc_score(y_v5, best_pipeline_gbc_v5.predict_proba(X_v5)[:, 1])
print(f"\n--- Tuned GBC V5 Model Training Set Evaluation ---")
print(f"Accuracy: {train_accuracy_gbc_v5_tuned:.4f}")
print(f"ROC AUC: {train_roc_auc_gbc_v5_tuned:.5f}")

Data loaded successfully.
Preprocessing V5 (Poly/Interaction Features)...
Added 36 polynomial/interaction features.
V5 Preprocessing complete.

V5 Numerical features (72): 72 features
V5 Categorical features (2): ['Education', 'Marital_Status']

Starting RandomizedSearchCV for GBC V5 (Poly/Interact + Selection) with 60 iterations...
Fitting 5 folds for each of 60 candidates, totalling 300 fits
GBC V5 RandomizedSearchCV finished.

--- GBC V5 (Poly/Interact + Selection) Hyperparameter Tuning Results ---
Best Score (roc_auc): 0.92078
Best Parameters:
  classifier__subsample: 0.7
  classifier__n_estimators: 200
  classifier__min_samples_split: 20
  classifier__min_samples_leaf: 15
  classifier__max_features: log2
  classifier__max_depth: 2
  classifier__learning_rate: 0.07

Training final GBC V5 model on the entire training set...
Final GBC V5 model training complete.
Predicting on test data using the tuned GBC V5 model...
Prediction complete.

Submission file 'submission_tuned_gbc_v5_poly

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier # Import StackingClassifier
from sklearn.linear_model import LogisticRegression # Meta-model
import datetime
import warnings
try:
    from catboost import CatBoostClassifier # Keep for potential future use
except ImportError:
    print("CatBoost not found.")

# Suppress specific warnings if needed
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# --- Load Data ---
# All three CSVs are expected in the current working directory.
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    # `exit()` is an interactive-shell helper injected by the `site` module and
    # reports success (status 0). Raise SystemExit directly so the run stops
    # with a failure status and works even where `site` is not loaded.
    raise SystemExit(1)
print("Data loaded successfully.")


# --- Feature Engineering & Preprocessing Functions ---
# Using V1 function (for GBC V1) and V4 function (as base for GBC V2 structure)

# Function V1 (Correct definition for GBC V1 - returns only DataFrame)
def preprocess_data_v1(df, is_train=True, latest_date=None):
    """Build the V1 feature frame: simplified categoricals and basic imputation.

    Args:
        df: Raw input frame. It is copied, never modified. Must contain
            ``Dt_Customer``, ``Year_Birth``, ``Marital_Status``, ``Education``,
            ``Kidhome``, ``Teenhome`` and the ``Mnt*`` / ``Num*Purchases`` /
            ``AcceptedCmp*`` columns.
        is_train: When True, the reference date for ``Customer_Lifetime`` is
            derived from ``df`` (latest enrolment + 1 day) and stored in the
            module-level name ``global_latest_date_v1``.
        latest_date: Reference date to use when ``is_train`` is False. If
            omitted, 1 January of the year after the latest enrolment year
            found in ``df`` is used.

    Returns:
        The processed DataFrame only (no fitted objects). Missing values in
        Age, Income, Customer_Lifetime and Total_Mnt are filled with the median
        of ``df`` itself, and Marital_Status / Education with its mode.
    """
    global global_latest_date_v1

    df_processed = df.copy()
    current_year = datetime.datetime.now().year

    # Parse the enrolment date once; unparseable values become NaT.
    # (`infer_datetime_format` is dropped: it has had no effect since pandas 2.0.)
    enroll_dates = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)
    max_enroll_year = enroll_dates.dt.year.max()
    # `.dt.year` turns float when NaT is present, so the max may be e.g. 2014.0;
    # cast to int so it is always usable as a `datetime.datetime` year below.
    reference_year = int(max_enroll_year) if pd.notna(max_enroll_year) else current_year

    # Implausible birth years (< 1910) become NaN and are imputed below via Age.
    # `mask` returns a new Series instead of writing NaN into an integer column.
    birth_year = df_processed['Year_Birth'].mask(df_processed['Year_Birth'] < 1910)
    df_processed['Age'] = reference_year - birth_year
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    if is_train:
        valid_dates = enroll_dates.dropna()
        if valid_dates.empty:
            global_latest_date_v1 = datetime.datetime(reference_year + 1, 1, 1)
        else:
            # One day after the latest enrolment, so every lifetime is >= 1 day.
            global_latest_date_v1 = valid_dates.max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date_v1
    elif latest_date is not None:
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1)

    # Only rows with a valid date get a lifetime; the rest stay NaN for imputation.
    mask = enroll_dates.notna()
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - enroll_dates[mask]).dt.days
    df_processed.drop('Dt_Customer', axis=1, inplace=True)

    # Collapse rare / equivalent categories.
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner', 'Together': 'Partner',
        'Absurd': 'Single', 'Alone': 'Single', 'YOLO': 'Single',
        'Widow': 'Single', 'Divorced': 'Single',
    })
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'})

    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    # A missing spend amount is treated as no spend.
    df_processed[mnt_cols] = df_processed[mnt_cols].fillna(0)
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    # Fill by assignment: `df[col].fillna(..., inplace=True)` acts on a temporary
    # Series and does not update the frame under pandas Copy-on-Write.
    for col in ('Age', 'Income', 'Customer_Lifetime', 'Total_Mnt'):
        if col in df_processed.columns and df_processed[col].isnull().any():
            df_processed[col] = df_processed[col].fillna(df_processed[col].median())
    for col in ('Marital_Status', 'Education'):
        if col in df_processed.columns and df_processed[col].isnull().any():
            df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])
    return df_processed

# Function V4 (Manual Impute/Scale) - Used as basis for GBC V2 features
def preprocess_data_v4_catboost_manual(df, is_train=True, latest_date=None, fit_imputers=None, fit_scaler=None):
    """Build the V4 feature set, imputing and scaling manually.

    Keeps the original categorical labels (no merging of categories), adds
    date-, household- and spending-derived features, imputes missing values
    and standardises every numeric feature.

    Args:
        df: Raw input frame. Must contain 'Dt_Customer', 'Year_Birth',
            'Kidhome', 'Teenhome', 'Marital_Status', 'Income' and the
            Mnt*/Num*Purchases/AcceptedCmp* columns referenced below.
        is_train: When True, imputers and scaler are fitted on ``df`` and the
            module-level ``global_latest_date_v4`` is (re)assigned.
        latest_date: Reference date used to compute 'Customer_Lifetime' when
            ``is_train`` is False. If None, 1 January of the year after the
            latest enrolment year in ``df`` is used and a warning is printed.
        fit_imputers: Dict with fitted 'num' and 'cat' imputers; required when
            ``is_train`` is False.
        fit_scaler: Fitted scaler; required when ``is_train`` is False.

    Returns:
        Tuple ``(processed_df, fit_imputers, fit_scaler)``. The processed
        frame lists numeric features first, then categorical ones, then 'ID'
        and 'Target' when they were present in ``df``.

    Raises:
        ValueError: If ``is_train`` is False and ``fit_imputers`` or
            ``fit_scaler`` is None.
        KeyError: If a required column is missing from ``df``.
    """
    global global_latest_date_v4

    target_col = 'Target'
    # Fail fast: the fitted transformers are mandatory for non-training frames.
    if not is_train:
        if fit_imputers is None:
            raise ValueError("Fitted imputers needed")
        if fit_scaler is None:
            raise ValueError("Fitted scaler needed")

    df_processed = df.copy()

    # Parse the enrolment date once; unparseable entries become NaT.
    # (infer_datetime_format is deprecated since pandas 2.0 and is not passed.)
    enroll_dt = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False)
    max_enroll_year = enroll_dt.dt.year.max()
    # .dt.year is float when NaT is present, so cast before building datetimes.
    # NOTE(review): the reference year comes from the frame being processed, so
    # train and test may use different years for 'Age' -- confirm this is intended.
    reference_year = int(max_enroll_year) if pd.notna(max_enroll_year) else datetime.datetime.now().year
    fallback_latest_date = datetime.datetime(reference_year + 1, 1, 1)

    # Birth years before 1910 are treated as missing and imputed later.
    # Using .where avoids writing NaN into an integer column in place.
    birth_year = df_processed['Year_Birth'].where(df_processed['Year_Birth'] >= 1910)
    df_processed['Age'] = reference_year - birth_year
    df_processed.drop('Year_Birth', axis=1, inplace=True)

    # Pick the date against which customer lifetime is measured.
    if is_train:
        valid_dates = enroll_dt.dropna()
        global_latest_date_v4 = (
            valid_dates.max() + datetime.timedelta(days=1)
            if not valid_dates.empty
            else fallback_latest_date
        )
        latest_date_to_use = global_latest_date_v4
    elif latest_date is not None:
        latest_date_to_use = latest_date
    else:
        latest_date_to_use = fallback_latest_date
        print(f"Warning: Using fallback latest date for test: {latest_date_to_use}")

    # Date-derived features (NaN where the date could not be parsed).
    df_processed['Enroll_Month'] = enroll_dt.dt.month
    df_processed['Enroll_Year'] = enroll_dt.dt.year
    df_processed['Enroll_DayOfWeek'] = enroll_dt.dt.dayofweek
    df_processed['Customer_Lifetime'] = (latest_date_to_use - enroll_dt).dt.days
    df_processed.drop('Dt_Customer', axis=1, inplace=True, errors='ignore')

    # Household and spending aggregates.
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore')

    # One adult for the listed single-type statuses, two for anything else
    # (including missing values, which stringify to 'nan').
    single_statuses = ['Single', 'Divorced', 'Widow', 'Absurd', 'Alone', 'YOLO']
    num_adults = df_processed['Marital_Status'].astype(str).isin(single_statuses).map({True: 1, False: 2})
    num_people = df_processed['Children'] + num_adults
    # Denominators are stored as columns so they go through imputation too;
    # zeros are replaced by 1 to avoid division by zero.
    df_processed['Income_per_Person_Denom'] = num_people.replace(0, 1)
    df_processed['Spend_per_Purchase_Denom'] = df_processed['Total_Purchases'].replace(0, 1)

    # Identify initial features *before* imputation
    cols_to_process = [col for col in df_processed.columns if col not in ['ID', target_col]]
    initial_numerical_features = df_processed[cols_to_process].select_dtypes(include=np.number).columns.tolist()
    initial_categorical_features = df_processed[cols_to_process].select_dtypes(exclude=np.number).columns.tolist()

    # Imputation: median for numeric, most frequent value for categorical.
    if is_train:
        imputer_num = SimpleImputer(strategy='median')
        imputer_num.fit(df_processed[initial_numerical_features])
        imputer_cat = SimpleImputer(strategy='most_frequent')
        imputer_cat.fit(df_processed[initial_categorical_features])
        fit_imputers = {'num': imputer_num, 'cat': imputer_cat}
    df_processed[initial_numerical_features] = fit_imputers['num'].transform(df_processed[initial_numerical_features])
    df_processed[initial_categorical_features] = fit_imputers['cat'].transform(df_processed[initial_categorical_features])

    # Reorder: numeric features, categorical features, then ID / target if present.
    imputed_cols = initial_numerical_features + initial_categorical_features
    trailing_cols = [col for col in ('ID', target_col) if col in df_processed.columns]
    df_processed = df_processed[imputed_cols + trailing_cols].copy()

    # Derived ratio features, computed on imputed values.
    total_mnt_denom = df_processed['Total_Mnt'].replace(0, 1)
    df_processed['Wine_Ratio'] = (df_processed['MntWines'] / total_mnt_denom).fillna(0)
    df_processed['Meat_Ratio'] = (df_processed['MntMeatProducts'] / total_mnt_denom).fillna(0)
    df_processed['Fruit_Ratio'] = (df_processed['MntFruits'] / total_mnt_denom).fillna(0)
    df_processed['Income_per_Person'] = (df_processed['Income'] / df_processed['Income_per_Person_Denom']).fillna(0)
    df_processed['Spending_per_Purchase'] = (df_processed['Total_Mnt'] / df_processed['Spend_per_Purchase_Denom']).fillna(0)
    df_processed.drop(['Income_per_Person_Denom', 'Spend_per_Purchase_Denom'], axis=1, inplace=True, errors='ignore')
    df_processed.replace([np.inf, -np.inf], 0, inplace=True)

    # Scaling: standardise every numeric feature (ID and target excluded).
    final_cols_to_process = [col for col in df_processed.columns if col not in ['ID', target_col]]
    final_numerical_features = df_processed[final_cols_to_process].select_dtypes(include=np.number).columns.tolist()
    if is_train:
        fit_scaler = StandardScaler()
        fit_scaler.fit(df_processed[final_numerical_features])
    df_processed[final_numerical_features] = fit_scaler.transform(df_processed[final_numerical_features])

    # Final type conversion: categorical values and column labels as plain str.
    final_categorical_features = df_processed[final_cols_to_process].select_dtypes(exclude=np.number).columns.tolist()
    for col in final_categorical_features:
        df_processed[col] = df_processed[col].astype(str)
    df_processed.columns = df_processed.columns.astype(str)
    return df_processed, fit_imputers, fit_scaler


# --- Apply Preprocessing V1 ---
print("Preprocessing V1...")
train_df_processed_v1 = preprocess_data_v1(train_df.copy(), is_train=True)
# Presumably preprocess_data_v1 sets global_latest_date_v1 on the training pass
# (verify against its definition); fall back to "tomorrow" if it did not.
if 'global_latest_date_v1' not in globals():
    global_latest_date_v1 = datetime.datetime.now() + datetime.timedelta(days=1)
test_df_processed_v1 = preprocess_data_v1(test_df.copy(), is_train=False, latest_date=global_latest_date_v1)
print("V1 Preprocessing complete.")

# --- Apply Preprocessing V4 (for GBC V2 Base) ---
print("\nPreprocessing V4 (for GBC V2 base)...")
train_df_processed_v4, fitted_imputers_v4, fitted_scaler_v4 = preprocess_data_v4_catboost_manual(train_df.copy(), is_train=True)
# The V4 training pass assigns global_latest_date_v4; this is only a safety net.
if 'global_latest_date_v4' not in globals():
    global_latest_date_v4 = datetime.datetime.now() + datetime.timedelta(days=1)
test_df_processed_v4, _, _ = preprocess_data_v4_catboost_manual(
    test_df.copy(),
    is_train=False,
    latest_date=global_latest_date_v4,
    fit_imputers=fitted_imputers_v4,
    fit_scaler=fitted_scaler_v4,
)
print("V4 Preprocessing complete.")


# --- Prepare Data V1 (Target for GBC V1) ---
X_v1 = train_df_processed_v1.drop(['ID', 'Target'], axis=1, errors='ignore')
y_v1 = train_df_processed_v1['Target']
X_test_v1 = test_df_processed_v1.drop('ID', axis=1, errors='ignore')
train_cols_v1 = X_v1.columns
test_cols_v1 = X_test_v1.columns
# Kept for inspection: columns present on one side only.
missing_in_test_v1 = set(train_cols_v1) - set(test_cols_v1)
missing_in_train_v1 = set(test_cols_v1) - set(train_cols_v1)
# Align the test frame to the training columns: columns missing in test are
# zero-filled, test-only columns are dropped. The training frame is left
# untouched so both frames always end up with the same columns in the same order.
X_test_v1 = X_test_v1.reindex(columns=train_cols_v1, fill_value=0)

# --- Prepare Data V4 (Target for GBC V2) ---
# StackingClassifier needs the *same* X for all base estimators usually.
# So we use V4 data structure for BOTH GBC models in stacking.
X_stack = train_df_processed_v4.drop(['ID', 'Target'], axis=1, errors='ignore')
y_stack = train_df_processed_v4['Target']  # Target is the same
X_test_stack = test_df_processed_v4.drop('ID', axis=1, errors='ignore')
# Align Stacking Data (same policy as for V1: only the test frame is reshaped)
train_cols_stack = X_stack.columns
test_cols_stack = X_test_stack.columns
missing_in_test_stack = set(train_cols_stack) - set(test_cols_stack)
missing_in_train_stack = set(test_cols_stack) - set(train_cols_stack)
X_test_stack = X_test_stack.reindex(columns=train_cols_stack, fill_value=0)


# --- Define Preprocessing Pipelines for Base Models in Stacking ---

# Preprocessor for GBC V1 base model (Simplified Cats + OHE + Scale)
# Needs to operate on the common X_stack structure now
numerical_features_stack = X_stack.select_dtypes(include=np.number).columns.tolist()
categorical_features_stack_orig = ['Education', 'Marital_Status']  # Original cats in V4 structure
# Apply V1 simplification logic here before OHE
def simplify_cats_v1(df):
    """Return a copy of ``df`` with coarser marital-status and education labels.

    'Married' and 'Together' become 'Partner'; 'Absurd', 'Alone', 'YOLO',
    'Widow' and 'Divorced' become 'Single'; the education level '2n Cycle'
    becomes 'Master'. All other values and columns are returned unchanged,
    and the input frame is not modified.
    """
    marital_map = {'Married': 'Partner', 'Together': 'Partner'}
    marital_map.update(dict.fromkeys(('Absurd', 'Alone', 'YOLO', 'Widow', 'Divorced'), 'Single'))
    education_map = {'2n Cycle': 'Master'}
    # assign() works on a copy, so the caller's frame stays as it was.
    return df.assign(
        Marital_Status=df['Marital_Status'].replace(marital_map),
        Education=df['Education'].replace(education_map),
    )

# Imports needed by this section. StackingClassifier and LogisticRegression are
# imported here explicitly so the section does not depend on an earlier cell
# having brought them into scope.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer

# Wrapper around simplify_cats_v1; currently not used by any pipeline below
# (the corresponding step is commented out).
simplifier_v1 = FunctionTransformer(simplify_cats_v1, validate=False)

# Updated Preprocessor V1 for Stacking
numerical_pipeline_stack = Pipeline([('scaler', StandardScaler())])  # Assumes imputation done
categorical_pipeline_v1_stack = Pipeline([
    # ('simplifier', simplifier_v1), # Apply simplification - tricky with FunctionTransformer and column names
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
# Need to select *original* cat columns for OHE in V1 pipeline
preprocessor_v1_stack = ColumnTransformer([
    ('num', numerical_pipeline_stack, numerical_features_stack),
    ('cat', categorical_pipeline_v1_stack, categorical_features_stack_orig)],  # OHE original cats
    remainder='passthrough')  # Pass through other V4 features


# Preprocessor for GBC V2 base model (Original Cats + Date Cats + OHE + Scale)
categorical_features_v4_names = X_stack.select_dtypes(include='object').columns.tolist()  # All object columns in V4 structure
numerical_pipeline_stack_v2 = Pipeline([('scaler', StandardScaler())])  # Assumes imputation done
categorical_pipeline_v2_stack = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])
preprocessor_v2_stack = ColumnTransformer([
    ('num', numerical_pipeline_stack_v2, numerical_features_stack),
    ('cat', categorical_pipeline_v2_stack, categorical_features_v4_names)],  # OHE *all* object cols
    remainder='passthrough')


# --- Define BEST Hyperparameters (from previous runs) ---
best_params_gbc_v1_direct = {  # No 'classifier__' prefix needed when setting on model directly
    'subsample': 0.6, 'n_estimators': 200, 'min_samples_split': 20,
    'min_samples_leaf': 20, 'max_features': 'sqrt', 'max_depth': 2,
    'learning_rate': 0.08, 'random_state': 42}

best_params_gbc_v2_direct = {  # Params that gave 0.848
    'subsample': 0.7, 'n_estimators': 300, 'min_samples_split': 20,
    'min_samples_leaf': 20, 'max_features': 'log2', 'max_depth': 2,
    'learning_rate': 0.05, 'random_state': 42}

# --- Define Base Estimator Pipelines for Stacking ---
# Base Estimator 1: GBC with V1-style preprocessing.
# Both base estimators consume the same V4 feature frame; this one one-hot
# encodes only 'Education' and 'Marital_Status' and uses the V1 hyperparameters.
base_gbc_v1 = Pipeline([
    ('preprocessor', preprocessor_v1_stack),  # Scales all numerics, OHEs 'Education', 'Marital_Status'
    ('classifier', GradientBoostingClassifier(**best_params_gbc_v1_direct))
])

# Base Estimator 2: GBC with V2-style preprocessing (all cats OHE'd)
base_gbc_v2 = Pipeline([
    ('preprocessor', preprocessor_v2_stack),  # Scales all numerics, OHEs all object columns
    ('classifier', GradientBoostingClassifier(**best_params_gbc_v2_direct))
])

# List of base estimators for StackingClassifier
base_estimators = [
    ('gbc_v1_style', base_gbc_v1),
    ('gbc_v2_style', base_gbc_v2)
]

# --- Define Meta-Model ---
# Logistic Regression is a common choice, C=1 is default regularization
meta_model = LogisticRegression(solver='liblinear', random_state=42)

# --- Define Cross-Validation Strategy for Meta-Model ---
# StackingClassifier uses this CV to generate the out-of-fold predictions for training the meta-model
N_SPLITS_STACK = 5
RANDOM_STATE_STACK = 42
cv_stack = StratifiedKFold(n_splits=N_SPLITS_STACK, shuffle=True, random_state=RANDOM_STATE_STACK)

# --- Create Stacking Classifier ---
stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=meta_model,
    cv=cv_stack,
    stack_method='predict_proba',  # Use probabilities as input to meta-model
    n_jobs=-1,
    passthrough=False  # Do not pass original features to final estimator
)

# --- Train Stacking Classifier ---
print("\nTraining Stacking Classifier...")
# Fit on the V4 data structure, the pipelines within handle specifics
stacking_clf.fit(X_stack, y_stack)
print("Stacking Classifier training complete.")

# --- Predict on Test Data with Stacking Classifier ---
print("Predicting on test data using the Stacking Classifier...")
# Predict using the same V4 test data structure
test_predictions_stacking = stacking_clf.predict(X_test_stack)
print("Prediction complete.")

# --- Generate Submission File for Stacking ---
submission_df_stacking = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions_stacking})
submission_filename_stacking = 'submission_stacking_gbc1_gbc2.csv'
submission_df_stacking.to_csv(submission_filename_stacking, index=False)

print(f"\nSubmission file '{submission_filename_stacking}' created successfully.")
print(submission_df_stacking.head())
print(f"\nPredicted target distribution (Stacking):\n{submission_df_stacking['Target'].value_counts(normalize=True)}")

# Optional: evaluate the stacking model on the data it was trained on.
# These are in-sample scores and therefore optimistic; a reliable estimate
# needs out-of-fold predictions or a separate validation set.
train_preds_stacking = stacking_clf.predict(X_stack)
train_accuracy_stacking = accuracy_score(y_stack, train_preds_stacking)
try:
    train_roc_auc_stacking = roc_auc_score(y_stack, stacking_clf.predict_proba(X_stack)[:, 1])
    print(f"\n--- Stacking Model Training Set Evaluation ---")
    print(f"Accuracy: {train_accuracy_stacking:.4f}")
    print(f"ROC AUC: {train_roc_auc_stacking:.5f}")
except Exception as e:
    # Best-effort reporting: still show accuracy if the AUC cannot be computed.
    print(f"\nCould not calculate ROC AUC for stacking on training set: {e}")
    print(f"Training Accuracy: {train_accuracy_stacking:.4f}")