In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import datetime

# Load the data
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    sample_submission_df = pd.read_csv("sample_submission.csv")
except FileNotFoundError:
    print("Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.")
    exit()

print("Data loaded successfully.")
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# --- Feature Engineering & Preprocessing ---

def preprocess_data(df, is_train=True, latest_date=None):
    """Applies feature engineering and basic cleaning."""
    df_processed = df.copy()

    # 1. Handle Outlier/Implausible Birth Years
    current_year = datetime.datetime.now().year
    # Use a reasonable reference year based on Dt_Customer if available, otherwise current year
    try:
        max_enroll_year = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce').dt.year.max()
        reference_year = max_enroll_year if pd.notna(max_enroll_year) else current_year
    except: # Handle cases where Dt_Customer might not exist or be parseable easily
        reference_year = current_year

    # Replace very old birth years (e.g., < 1910) with NaN to be imputed later
    df_processed.loc[df_processed['Year_Birth'] < 1910, 'Year_Birth'] = np.nan

    # Calculate Age (handle potential NaNs in Year_Birth temporarily)
    df_processed['Age'] = reference_year - df_processed['Year_Birth']
    df_processed.drop('Year_Birth', axis=1, inplace=True)


    # 2. Process Dt_Customer
    # Convert 'Dt_Customer' to datetime, coercing errors
    df_processed['Dt_Customer'] = pd.to_datetime(df_processed['Dt_Customer'], errors='coerce', dayfirst=False, infer_datetime_format=True)

    # Find the latest date for calculating tenure if not provided (from training set)
    if is_train:
        global global_latest_date # Store latest date from training set globally
        global_latest_date = df_processed['Dt_Customer'].max() + datetime.timedelta(days=1)
        latest_date_to_use = global_latest_date
    elif latest_date:
         latest_date_to_use = latest_date
    else:
        # Fallback if called on test set first or global_latest_date isn't set
        latest_date_to_use = datetime.datetime(reference_year + 1, 1, 1) # Use start of next year as reference
        print(f"Warning: Using fallback latest date: {latest_date_to_use}")


    # Calculate Customer_Lifetime (Tenure in days)
    # Handle potential NaT dates resulting from coerce errors
    mask = pd.notna(df_processed['Dt_Customer'])
    df_processed.loc[mask, 'Customer_Lifetime'] = (latest_date_to_use - df_processed.loc[mask, 'Dt_Customer']).dt.days
    df_processed['Customer_Lifetime'].fillna(df_processed['Customer_Lifetime'].median(), inplace=True) # Impute NaNs created by NaT
    df_processed.drop('Dt_Customer', axis=1, inplace=True)


    # 3. Simplify Marital Status
    # Consolidate categories
    df_processed['Marital_Status'] = df_processed['Marital_Status'].replace({
        'Married': 'Partner',
        'Together': 'Partner',
        'Absurd': 'Single',
        'Alone': 'Single',
        'YOLO': 'Single',
        'Widow': 'Single',
        'Divorced':'Single'
         }) # Grouping Married/Together and others into Single for simplicity

    # 4. Simplify Education
    df_processed['Education'] = df_processed['Education'].replace({'2n Cycle': 'Master'}) # Group '2n Cycle' with 'Master'

    # 5. Combine Children/Teens
    df_processed['Children'] = df_processed['Kidhome'] + df_processed['Teenhome']
    # Optionally drop original columns if 'Children' is deemed sufficient
    # df_processed.drop(['Kidhome', 'Teenhome'], axis=1, inplace=True)

    # 6. Total Spending
    mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
    df_processed['Total_Mnt'] = df_processed[mnt_cols].sum(axis=1)

    # 7. Total Purchases
    purch_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases']
    df_processed['Total_Purchases'] = df_processed[purch_cols].sum(axis=1)

    # 8. Total Campaigns Accepted
    cmp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
    df_processed['Total_CmpAccepted'] = df_processed[cmp_cols].sum(axis=1)

    # 9. Drop Constant Columns (identified during EDA)
    df_processed.drop(['Z_CostContact', 'Z_Revenue'], axis=1, inplace=True, errors='ignore') # errors='ignore' in case they were already dropped

    return df_processed

# Preprocess training data
train_df_processed = preprocess_data(train_df, is_train=True)
# Preprocess test data using the latest date from training data
test_df_processed = preprocess_data(test_df, is_train=False, latest_date=global_latest_date)

print("Feature engineering complete.")
print("\nTrain Data Info after processing:")
train_df_processed.info()
print("\nTest Data Info after processing:")
test_df_processed.info()


# --- Model Training ---

# Separate features (X) and target (y)
X = train_df_processed.drop(['ID', 'Target'], axis=1)
y = train_df_processed['Target']
X_test = test_df_processed.drop('ID', axis=1)

# Align columns - crucial after feature engineering if columns were added/dropped differently (shouldn't happen here but good practice)
train_cols = X.columns
test_cols = X_test.columns

missing_in_test = set(train_cols) - set(test_cols)
for c in missing_in_test:
    X_test[c] = 0 # Add missing columns to test set with default value (0)

missing_in_train = set(test_cols) - set(train_cols)
for c in missing_in_train:
    X[c] = 0 # Add missing columns to train set with default value (0) - less likely

X_test = X_test[train_cols] # Ensure order is the same


# Identify column types for preprocessing
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

print(f"\nNumerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# Create preprocessing pipelines for numerical and categorical features
numerical_pipeline = Pipeline([
    ('imputer_num', SimpleImputer(strategy='median')), # Impute missing numericals (Age, Income, Customer_Lifetime)
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer_cat', SimpleImputer(strategy='most_frequent')), # Impute missing categoricals (if any)
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # Use sparse_output=False for easier debugging if needed
])

# Create a column transformer to apply different pipelines to different columns
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
], remainder='passthrough') # Keep any columns not specified (though there shouldn't be any here)


# Define the model
# GradientBoostingClassifier often works well. random_state for reproducibility.
# Consider tuning hyperparameters later using GridSearchCV or RandomizedSearchCV
model = GradientBoostingClassifier(n_estimators=150, # Increased slightly
                                 learning_rate=0.08, # Slightly decreased
                                 max_depth=4,       # Increased slightly
                                 subsample=0.8,     # Added subsampling
                                 random_state=42)

# Create the full pipeline: preprocess + model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
])

# Train the model on the entire training dataset
print("\nTraining the model...")
pipeline.fit(X, y)
print("Model training complete.")

# --- Prediction ---

# Predict on the preprocessed test data
print("Predicting on test data...")
test_predictions = pipeline.predict(X_test)
print("Prediction complete.")

# --- Submission File Generation ---

# Create the submission DataFrame
submission_df = pd.DataFrame({'ID': test_df['ID'], 'Target': test_predictions})

# Save the submission file
submission_filename = 'submission.csv'
submission_df.to_csv(submission_filename, index=False)

print(f"\nSubmission file '{submission_filename}' created successfully.")
print(submission_df.head())
print(f"\nPredicted target distribution:\n{submission_df['Target'].value_counts(normalize=True)}")

# Optional: Evaluate on the training set (for sanity check, not a true performance measure)
train_preds = pipeline.predict(X)
train_accuracy = accuracy_score(y, train_preds)
train_roc_auc = roc_auc_score(y, pipeline.predict_proba(X)[:, 1]) # Use probabilities for AUC
print(f"\n--- Training Set Evaluation (Sanity Check) ---")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"ROC AUC: {train_roc_auc:.4f}")
# print("Classification Report:\n", classification_report(y, train_preds))

Make sure train.csv, test.csv, and sample_submission.csv are in the same directory.
Data loaded successfully.


NameError: name 'train_df' is not defined