# II. Baseline Modeling: Customer Product Adoption Prediction

## Objectives
- Establish baseline model performance benchmarks
- Compare multiple algorithms for product adoption prediction
- Implement proper evaluation framework with business metrics
- Create model interpretation and feature importance analysis
- Prepare foundation for advanced modeling techniques

## Modeling Strategy
We'll implement several baseline models:
1. **Logistic Regression** - Linear baseline with interpretability
2. **Random Forest** - Ensemble method for feature importance
3. **XGBoost** - Gradient boosting for performance
4. **Neural Network** - Deep learning baseline
5. **Naive Bayes** - Probabilistic baseline

## Evaluation Framework
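- **Primary Metrics**: Precision, Recall, F1-Score, and ROC-AUC on the validation and test sets (Precision@K and Recall@K are planned for a later iteration)
- **Business Metrics**: Targeting efficiency and estimated cost savings (Conversion Lift and Revenue Impact are planned)
- **Statistical Tests**: McNemar's test for model comparison (planned; not yet implemented in this notebook)
- **Validation Strategy**: A single hold-out split ordered by `tenure_days` (falling back to a stratified random split when that column is absent), plus a stratified validation set carved from the training data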

In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    precision_recall_curve, roc_curve, average_precision_score,
    precision_score, recall_score, f1_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Advanced ML Libraries
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("⚠️ XGBoost not available. Install with: pip install xgboost")

# Utilities
import warnings
import json
from datetime import datetime
from IPython.display import display
import pickle

# Suppress every warning category so cell output stays readable
warnings.filterwarnings('ignore')

# Plotting style: use the first seaborn theme this matplotlib build accepts,
# otherwise fall back to matplotlib's stock style
for _style_name in ('seaborn-v0_8', 'seaborn'):
    try:
        plt.style.use(_style_name)
    except OSError:
        # Style name unknown to this matplotlib version; try the next one
        continue
    break
else:
    plt.style.use('default')

# Reproducibility: a single seed, also applied to NumPy's global RNG
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("🚀 Baseline Modeling Environment Initialized")
print(f"XGBoost Available: {XGBOOST_AVAILABLE}")

⚠️ XGBoost not available. Install with: pip install xgboost
🚀 Baseline Modeling Environment Initialized
XGBoost Available: False


## 1. Data Loading and Preparation

In [2]:
# Load the datasets produced by the EDA step, falling back to the raw CSVs.
#
# Defines: adoption_data, products_data, PROCESSED_DATA_AVAILABLE,
#          selected_features, FEATURE_SELECTION_AVAILABLE
#          (and customers_data, only on the raw-data path).
print("📊 LOADING PROCESSED DATASETS")
print("=" * 50)

try:
    # Preferred path: both processed files must exist, otherwise the raw
    # files are used for everything
    adoption_data = pd.read_csv('data/processed_adoption_logs.csv')
    print(f"✓ Loaded processed adoption logs: {adoption_data.shape}")

    products_data = pd.read_csv('data/processed_products.csv')
    print(f"✓ Loaded processed products: {products_data.shape}")

    PROCESSED_DATA_AVAILABLE = True

except FileNotFoundError:
    print("⚠️ Processed datasets not found. Loading raw data...")
    PROCESSED_DATA_AVAILABLE = False

    # Fallback path: raw datasets
    try:
        adoption_data = pd.read_csv('data/data_adoption_logs.csv')
        products_data = pd.read_csv('data/data_products.csv')
        customers_data = pd.read_csv('data/data_customers.csv')

        print(f"✓ Loaded raw adoption logs: {adoption_data.shape}")
        print(f"✓ Loaded raw products: {products_data.shape}")
        print(f"✓ Loaded raw customers: {customers_data.shape}")

    except FileNotFoundError as e:
        print(f"❌ Error loading datasets: {e}")
        print("Please ensure data files are available in the data/ directory")
        # Re-raise instead of continuing: without these frames the following
        # cells would fail with a NameError, or silently reuse frames left
        # over from an earlier kernel run.
        raise

# Optional feature whitelist, one feature name per line
try:
    with open('selected_features.txt', 'r') as f:
        # Skip blank lines (e.g. a trailing newline) so they are neither
        # counted nor treated as feature names
        selected_features = [name for name in (line.strip() for line in f) if name]
    print(f"✓ Loaded {len(selected_features)} selected features")
    FEATURE_SELECTION_AVAILABLE = True
except FileNotFoundError:
    print("⚠️ Selected features file not found. Will use all features.")
    FEATURE_SELECTION_AVAILABLE = False
    selected_features = []

print(f"\nPreprocessed data available: {PROCESSED_DATA_AVAILABLE}")
print(f"Feature selection available: {FEATURE_SELECTION_AVAILABLE}")

📊 LOADING PROCESSED DATASETS
✓ Loaded processed adoption logs: (949650, 12)
✓ Loaded processed products: (1000, 70)
⚠️ Selected features file not found. Will use all features.

Preprocessed data available: True
Feature selection available: False
✓ Loaded processed adoption logs: (949650, 12)
✓ Loaded processed products: (1000, 70)
⚠️ Selected features file not found. Will use all features.

Preprocessed data available: True
Feature selection available: False


In [3]:
# Build the feature matrix X and target vector y from the adoption logs.
#
# Defines: modeling_data, and when the 'adopted' column exists also
#          target_col, feature_cols, X, y. X and y are set to None otherwise.
print("\n🔧 PREPARING MODELING DATASET")
print("=" * 40)

if PROCESSED_DATA_AVAILABLE:
    # Processed data is used as-is
    modeling_data = adoption_data.copy()
    print("Using processed adoption logs as base dataset")

else:
    # Raw data needs a minimal amount of encoding / imputation first
    print("Applying basic preprocessing to raw data...")

    modeling_data = adoption_data.copy()

    # Encode string columns, leaving the identifier columns untouched
    categorical_cols = modeling_data.select_dtypes(include=['object']).columns.tolist()
    categorical_cols = [col for col in categorical_cols if col not in ['user_id', 'product_id']]

    for col in categorical_cols:
        if modeling_data[col].nunique() <= 10:
            # Low cardinality: one-hot encode (first level dropped)
            dummies = pd.get_dummies(modeling_data[col], prefix=col, drop_first=True)
            modeling_data = pd.concat([modeling_data.drop(columns=[col]), dummies], axis=1)
        else:
            # High cardinality: drop rather than explode the feature space
            modeling_data = modeling_data.drop(columns=[col])

    # Median-impute numeric columns. The result is assigned back explicitly:
    # `df[col].fillna(..., inplace=True)` operates on a temporary selection,
    # which warns in recent pandas and has no effect under copy-on-write.
    numeric_cols = modeling_data.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        if modeling_data[col].isnull().any():
            modeling_data[col] = modeling_data[col].fillna(modeling_data[col].median())

# Prepare features and target
if 'adopted' in modeling_data.columns:
    target_col = 'adopted'

    # Every column except the target and the two known ID columns is a feature.
    # NOTE(review): identifier columns under any other name would be kept as
    # features here - confirm against the processed schema.
    id_cols = ['user_id', 'product_id']
    feature_cols = [col for col in modeling_data.columns if col not in [target_col] + id_cols]

    # Narrow to the pre-selected features, if any of them are present
    if FEATURE_SELECTION_AVAILABLE and selected_features:
        available_selected = [f for f in selected_features if f in feature_cols]
        if available_selected:
            feature_cols = available_selected
            print(f"Applied feature selection: {len(feature_cols)} features")

    X = modeling_data[feature_cols]
    y = modeling_data[target_col]

    # Convert boolean target to integer
    if y.dtype == 'bool':
        y = y.astype(int)

    print(f"\nModelingDataset prepared:")
    print(f"  Features: {X.shape[1]}")
    print(f"  Samples: {X.shape[0]:,}")
    print(f"  Target distribution: {y.value_counts().to_dict()}")
    print(f"  Positive rate: {y.mean():.4f}")

else:
    print("❌ Target variable 'adopted' not found in dataset")
    X, y = None, None


🔧 PREPARING MODELING DATASET
Using processed adoption logs as base dataset

ModelingDataset prepared:
  Features: 11
  Samples: 949,650
  Target distribution: {0: 711603, 1: 238047}
  Positive rate: 0.2507


## 2. Data Splitting and Validation Strategy

In [4]:
# Split the modeling data into train / validation / test sets and
# standardize the features.
#
# Defines: X_train, X_test, y_train, y_test, X_train_final, X_val,
#          y_train_final, y_val, scaler, X_train_scaled, X_val_scaled,
#          X_test_scaled (only when X and y are available).
print("🔀 IMPLEMENTING VALIDATION STRATEGY")
print("=" * 40)

if X is not None and y is not None:
    if 'tenure_days' in modeling_data.columns:
        print("Using tenure_days for time-based validation split")

        # Rank rows by tenure through positional indices rather than sorting
        # a copy of the whole frame: on the full dataset the extra sorted
        # copy plus its train/test slices pushed this cell into a
        # MemoryError. A stable sort keeps ties in their original order.
        tenure_order = np.argsort(modeling_data['tenure_days'].to_numpy(), kind='stable')

        # The 20% of rows with the largest tenure_days form the test set.
        # NOTE(review): this is only a proxy for a temporal split, and if
        # tenure_days is also in feature_cols the train and test sets cover
        # disjoint ranges of that feature - confirm this is intended.
        split_idx = int(0.8 * len(tenure_order))
        train_positions = tenure_order[:split_idx]
        test_positions = tenure_order[split_idx:]

        X_train = X.iloc[train_positions]
        y_train = y.iloc[train_positions].astype(int)
        X_test = X.iloc[test_positions]
        y_test = y.iloc[test_positions].astype(int)

        split_label = "Time-based"

    else:
        # No tenure column: fall back to a stratified random split
        print("Using stratified random split")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
        )

        split_label = "Stratified"

    print(f"{split_label} split applied:")
    print(f"  Train set: {X_train.shape[0]:,} samples")
    print(f"  Test set: {X_test.shape[0]:,} samples")
    print(f"  Train positive rate: {y_train.mean():.4f}")
    print(f"  Test positive rate: {y_test.mean():.4f}")

    # Carve a stratified validation set out of the training data
    X_train_final, X_val, y_train_final, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=RANDOM_STATE, stratify=y_train
    )

    print(f"\nFinal data splits:")
    print(f"  Training: {X_train_final.shape[0]:,} samples")
    print(f"  Validation: {X_val.shape[0]:,} samples")
    print(f"  Test: {X_test.shape[0]:,} samples")

    # Fit the scaler on the training split only, then apply it to the
    # validation and test splits
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_final)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    print("✓ Feature scaling applied")

else:
    print("❌ Cannot proceed with modeling - data preparation failed")

🔀 IMPLEMENTING VALIDATION STRATEGY
Using tenure_days for time-based validation split


MemoryError: Unable to allocate 58.0 MiB for an array with shape (10, 759720) and data type float64

## 3. Baseline Models Implementation

In [None]:
# Fit each baseline estimator on the final training split.
#
# Defines: models (name -> fitted estimator) and model_results.
print("🤖 TRAINING BASELINE MODELS")
print("=" * 40)

if X is None or y is None:
    print("❌ Cannot train models - data not available")
    models = {}
else:
    # Initialize models
    models = {}
    model_results = {}

    def _build_xgboost():
        """Create the XGBoost classifier, weighting positives by the class ratio."""
        # Negative/positive ratio of the training labels, used to offset imbalance
        neg_to_pos = (y_train_final == 0).sum() / (y_train_final == 1).sum()
        return xgb.XGBClassifier(
            random_state=RANDOM_STATE,
            scale_pos_weight=neg_to_pos,
            eval_metric='logloss'
        )

    # One entry per model: (key, badge, display name, builder, fit on scaled features?)
    baseline_specs = [
        (
            'logistic', "1️⃣", "Logistic Regression",
            lambda: LogisticRegression(
                random_state=RANDOM_STATE,
                class_weight='balanced',
                max_iter=1000
            ),
            True,
        ),
        (
            'random_forest', "2️⃣", "Random Forest",
            lambda: RandomForestClassifier(
                n_estimators=100,
                random_state=RANDOM_STATE,
                class_weight='balanced',
                n_jobs=-1
            ),
            False,
        ),
    ]

    # XGBoost is optional and keeps its third position when installed
    if XGBOOST_AVAILABLE:
        baseline_specs.append(('xgboost', "3️⃣", "XGBoost", _build_xgboost, False))

    baseline_specs += [
        ('naive_bayes', "4️⃣", "Naive Bayes", GaussianNB, True),
        (
            'neural_network', "5️⃣", "Neural Network",
            lambda: MLPClassifier(
                hidden_layer_sizes=(100, 50),
                random_state=RANDOM_STATE,
                max_iter=500,
                early_stopping=True,
                validation_fraction=0.1
            ),
            True,
        ),
    ]

    for key, badge, label, build, needs_scaling in baseline_specs:
        print(f"\n{badge} Training {label}...")
        # Register the estimator before fitting
        models[key] = build()
        train_features = X_train_scaled if needs_scaling else X_train_final
        models[key].fit(train_features, y_train_final)
        print(f"   ✓ {label} trained")

    print(f"\n✅ Trained {len(models)} baseline models")

## 4. Model Evaluation and Comparison

In [None]:
# Score every fitted model on the validation and test splits.
#
# Defines: evaluation_results (model -> split -> metrics) and
#          predictions (model -> hard labels / positive-class probabilities).
print("📊 COMPREHENSIVE MODEL EVALUATION")
print("=" * 50)

if not models or X is None:
    print("❌ Cannot evaluate models - models not trained")
    evaluation_results = {}
    predictions = {}
else:
    evaluation_results = {}
    predictions = {}

    # Models that were fitted on standardized inputs are scored on them too
    scaled_input_models = ('logistic', 'naive_bayes', 'neural_network')

    def _summarize(y_true, y_hat, y_score):
        """Return precision, recall, F1 and (when scores are given) ROC-AUC."""
        return {
            'precision': precision_score(y_true, y_hat),
            'recall': recall_score(y_true, y_hat),
            'f1': f1_score(y_true, y_hat),
            'roc_auc': roc_auc_score(y_true, y_score) if y_score is not None else None
        }

    for model_name, model in models.items():
        print(f"\n🔍 Evaluating {model_name.upper()}...")

        if model_name in scaled_input_models:
            val_features, test_features = X_val_scaled, X_test_scaled
        else:
            val_features, test_features = X_val, X_test

        # Probabilities are optional; hard labels are always produced
        has_proba = hasattr(model, 'predict_proba')

        # Validation predictions
        y_val_pred = model.predict(val_features)
        y_val_proba = model.predict_proba(val_features)[:, 1] if has_proba else None

        # Test predictions
        y_test_pred = model.predict(test_features)
        y_test_proba = model.predict_proba(test_features)[:, 1] if has_proba else None

        predictions[model_name] = dict(
            val_pred=y_val_pred,
            val_proba=y_val_proba,
            test_pred=y_test_pred,
            test_proba=y_test_proba,
        )

        val_metrics = _summarize(y_val, y_val_pred, y_val_proba)
        test_metrics = _summarize(y_test, y_test_pred, y_test_proba)
        evaluation_results[model_name] = {'validation': val_metrics, 'test': test_metrics}

        print(f"   Validation - Precision: {val_metrics['precision']:.4f}, Recall: {val_metrics['recall']:.4f}, F1: {val_metrics['f1']:.4f}")
        print(f"   Test - Precision: {test_metrics['precision']:.4f}, Recall: {test_metrics['recall']:.4f}, F1: {test_metrics['f1']:.4f}")

    print("\n✅ Model evaluation completed")

In [None]:
# Tabulate the metrics of every model, pick the model to carry forward, and
# plot the test-set metrics.
#
# Defines: results_df, pivot_results, best_model_name, best_f1 (test F1 of
#          the selected model), test_results, fig.
if evaluation_results:
    print("\n📈 MODEL PERFORMANCE COMPARISON")
    print("=" * 50)

    # Long-format table: one row per (model, split)
    results_data = [
        {'Model': model_name, 'Split': split, **metrics}
        for model_name, results in evaluation_results.items()
        for split, metrics in results.items()
    ]
    results_df = pd.DataFrame(results_data)

    # Display results table
    print("\n📊 Detailed Results:")
    pivot_results = results_df.pivot_table(
        index='Model',
        columns='Split',
        values=['precision', 'recall', 'f1', 'roc_auc'],
        aggfunc='first'
    ).round(4)

    display(pivot_results)

    validation_results = results_df[results_df['Split'] == 'validation']
    test_results = results_df[results_df['Split'] == 'test']

    # Select the best model on the VALIDATION split. Choosing by test F1
    # would leak the test set into model selection and make the reported
    # test score optimistically biased; the test split is only used to
    # report the selected model's performance.
    best_model_idx = validation_results['f1'].idxmax()
    best_model_name = results_df.loc[best_model_idx, 'Model']
    best_val_f1 = results_df.loc[best_model_idx, 'f1']
    best_f1 = test_results.loc[test_results['Model'] == best_model_name, 'f1'].iloc[0]

    print(f"\n🏆 Best Model: {best_model_name.upper()} (Validation F1: {best_val_f1:.4f}, Test F1: {best_f1:.4f})")

    # Visualize test-set results, one panel per metric
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Precision Comparison', 'Recall Comparison', 'F1-Score Comparison', 'ROC-AUC Comparison')
    )

    panels = [
        ('precision', 'Precision', 1, 1),
        ('recall', 'Recall', 1, 2),
        ('f1', 'F1-Score', 2, 1),
        ('roc_auc', 'ROC-AUC', 2, 2),
    ]
    for metric, label, row, col in panels:
        # Models without a value for this metric (roc_auc can be None) are left out
        panel_data = test_results.dropna(subset=[metric])
        fig.add_trace(
            go.Bar(x=panel_data['Model'], y=panel_data[metric], name=label),
            row=row, col=col
        )

    fig.update_layout(height=800, title_text="Model Performance Comparison (Test Set)", showlegend=False)
    fig.show()

## 5. Feature Importance Analysis

In [None]:
# Rank features by importance for the tree models and by |coefficient| for
# logistic regression.
#
# Defines: feature_importance_results (model name -> ranked DataFrame).
print("🔍 FEATURE IMPORTANCE ANALYSIS")
print("=" * 40)

if not models or X is None:
    print("❌ Cannot analyze feature importance - models not available")
    feature_importance_results = {}
else:
    feature_importance_results = {}

    def _print_top(table, value_col, limit=10):
        """Print the first `limit` rows of a ranked table as 'feature: value'."""
        top = table.head(limit)
        for name, value in zip(top['feature'], top[value_col]):
            print(f"   {name}: {value:.4f}")

    # Random Forest: impurity-based importances exposed by the estimator
    if 'random_forest' in models:
        rf_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': models['random_forest'].feature_importances_
        })
        rf_importance = rf_importance.sort_values('importance', ascending=False)
        feature_importance_results['random_forest'] = rf_importance

        print("\n🌲 Random Forest - Top 10 Important Features:")
        _print_top(rf_importance, 'importance')

    # XGBoost: only present when the optional dependency was installed
    if 'xgboost' in models and XGBOOST_AVAILABLE:
        xgb_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': models['xgboost'].feature_importances_
        })
        xgb_importance = xgb_importance.sort_values('importance', ascending=False)
        feature_importance_results['xgboost'] = xgb_importance

        print("\n⚡ XGBoost - Top 10 Important Features:")
        _print_top(xgb_importance, 'importance')

    # Logistic regression: rank by absolute coefficient, print the signed value
    if 'logistic' in models:
        lr_coefs = pd.DataFrame({
            'feature': feature_cols,
            'coefficient': models['logistic'].coef_[0]
        })
        lr_coefs = lr_coefs.assign(
            abs_coefficient=lr_coefs['coefficient'].abs()
        ).sort_values('abs_coefficient', ascending=False)
        feature_importance_results['logistic'] = lr_coefs

        print("\n📈 Logistic Regression - Top 10 Important Features (by |coefficient|):")
        _print_top(lr_coefs, 'coefficient')

    # Bar chart of the 15 highest-ranked Random Forest features
    if 'random_forest' in feature_importance_results:
        top_features = feature_importance_results['random_forest'].head(15)
        plt.figure(figsize=(10, 8))
        sns.barplot(data=top_features, x='importance', y='feature')
        plt.title('Top 15 Feature Importances (Random Forest)')
        plt.xlabel('Importance')
        plt.tight_layout()
        plt.show()

    print("\n✅ Feature importance analysis completed")

## 6. Model Interpretation and Business Insights

In [None]:
# Translate the evaluation results into business-facing statements and
# write them to 'baseline_modeling_insights.json'.
print("💼 BUSINESS INSIGHTS AND MODEL INTERPRETATION")
print("=" * 60)

if evaluation_results and feature_importance_results:
    business_insights = []

    # --- Performance of the selected model ---------------------------------
    print("\n📊 Performance Insights:")

    # Fall back to the first trained model when no best model was selected earlier
    if 'best_model_name' not in locals():
        best_model_name = list(models.keys())[0]
    best_model_results = evaluation_results[best_model_name]['test']

    precision, recall, f1 = (
        best_model_results[metric] for metric in ('precision', 'recall', 'f1')
    )

    print(f"   • Best performing model: {best_model_name}")
    print(f"   • Precision: {precision:.4f} ({precision*100:.1f}% of predicted adoptions are correct)")
    print(f"   • Recall: {recall:.4f} ({recall*100:.1f}% of actual adoptions are captured)")
    print(f"   • F1-Score: {f1:.4f} (balanced precision-recall performance)")

    business_insights.append(f"Best model achieves {precision*100:.1f}% precision and {recall*100:.1f}% recall")

    # --- Top predictive features -------------------------------------------
    if 'random_forest' in feature_importance_results:
        print("\n🎯 Key Predictive Features:")
        top_features = feature_importance_results['random_forest'].head(5)

        # Keyword -> narrative, checked in this order; first match wins
        interpretation_by_keyword = (
            ('monetary', "Customer spending behavior is a strong predictor"),
            ('activity', "Customer engagement level drives adoption"),
            ('tenure', "Customer relationship length affects adoption likelihood"),
            ('risk', "Risk profile influences product adoption decisions"),
        )
        default_interpretation = "Significant predictor of adoption behavior"

        for feature_name, importance in zip(top_features['feature'], top_features['importance']):
            print(f"   • {feature_name}: {importance:.4f} importance")

            lowered_name = feature_name.lower()
            interpretation = next(
                (text for keyword, text in interpretation_by_keyword if keyword in lowered_name),
                default_interpretation,
            )

            print(f"     → {interpretation}")
            business_insights.append(f"{feature_name} is a key predictor: {interpretation}")

    # --- Rough business impact on the test split ----------------------------
    print("\n💰 Estimated Business Impact:")

    if 'y_test' in locals():
        total_customers = len(y_test)
        actual_adoptions = y_test.sum()

        if best_model_name in predictions:
            test_pred = predictions[best_model_name]['test_pred']
            predicted_adoptions = test_pred.sum()
            true_positives = ((test_pred == 1) & (y_test == 1)).sum()

            print(f"   • Total customers evaluated: {total_customers:,}")
            print(f"   • Actual adoptions: {actual_adoptions:,}")
            print(f"   • Model predictions: {predicted_adoptions:,}")
            print(f"   • Correctly identified adoptions: {true_positives:,}")

            # Share of targeted customers who actually adopted
            if predicted_adoptions > 0:
                targeting_efficiency = true_positives / predicted_adoptions
                print(f"   • Targeting efficiency: {targeting_efficiency:.2%}")

                # Savings from contacting only predicted adopters instead of
                # everyone, at an illustrative flat cost per contact
                cost_per_target = 10  # Example cost per customer targeted
                baseline_cost = total_customers * cost_per_target
                optimized_cost = predicted_adoptions * cost_per_target
                cost_savings = baseline_cost - optimized_cost

                print(f"   • Estimated cost savings: ${cost_savings:,.0f} (vs. targeting all customers)")
                business_insights.append(f"Model-based targeting could save ${cost_savings:,.0f} in marketing costs")

    # --- Recommendations -----------------------------------------------------
    print("\n🎯 Strategic Recommendations:")
    recommendations = [
        f"Deploy {best_model_name} model for customer targeting",
        "Focus marketing on high-importance feature segments",
        "Implement A/B testing to validate model performance",
        "Monitor model performance for drift over time",
        "Collect additional data on top predictive features"
    ]

    for position, rec in enumerate(recommendations, start=1):
        print(f"   {position}. {rec}")

    # --- Persist the summary -------------------------------------------------
    business_summary = {
        'analysis_date': datetime.now().isoformat(),
        'best_model': best_model_name,
        'performance_metrics': best_model_results,
        'key_insights': business_insights,
        'recommendations': recommendations
    }

    with open('baseline_modeling_insights.json', 'w') as f:
        json.dump(business_summary, f, indent=2, default=str)

    print("\n📁 Business insights saved to 'baseline_modeling_insights.json'")

else:
    print("❌ Cannot generate business insights - evaluation results not available")

## 7. Model Persistence and Next Steps

In [None]:
# Persist the trained models and a JSON summary, then print the roadmap.
#
# Writes: best_model_<name>.pkl, all_baseline_models.pkl,
#         baseline_modeling_summary.json (only when models were trained AND
#         evaluated).
print("💾 MODEL PERSISTENCE AND DEPLOYMENT PREPARATION")
print("=" * 60)

# Saving needs both fitted models and their metrics; guarding on `models`
# alone raised NameError/KeyError when the evaluation cells were skipped.
baseline_ready = bool(models) and bool(globals().get('evaluation_results'))

if baseline_ready:
    # Fall back to the first evaluated model when no best model was selected
    # earlier, or when the selected name has no metrics
    if 'best_model_name' not in globals() or best_model_name not in evaluation_results:
        best_model_name = next(iter(evaluation_results))

    best_model = models[best_model_name]

    # The scaler is bundled only for models that were trained on scaled inputs
    needs_scaler = best_model_name in ['logistic', 'naive_bayes', 'neural_network']
    best_model_metrics = evaluation_results[best_model_name]['test']

    # Create model package
    model_package = {
        'model': best_model,
        'scaler': scaler if needs_scaler else None,
        'feature_columns': feature_cols,
        'model_type': best_model_name,
        'performance_metrics': best_model_metrics,
        'training_date': datetime.now().isoformat()
    }

    # Save model package
    with open(f'best_model_{best_model_name}.pkl', 'wb') as f:
        pickle.dump(model_package, f)

    print(f"✅ Best model ({best_model_name}) saved to 'best_model_{best_model_name}.pkl'")

    # Save all models for comparison. Feature importances are optional:
    # an empty dict is stored when that analysis was not run.
    all_models_package = {
        'models': models,
        'scaler': scaler,
        'feature_columns': feature_cols,
        'evaluation_results': evaluation_results,
        'feature_importance': globals().get('feature_importance_results', {}),
        'training_date': datetime.now().isoformat()
    }

    with open('all_baseline_models.pkl', 'wb') as f:
        pickle.dump(all_models_package, f)

    print("✅ All baseline models saved to 'all_baseline_models.pkl'")

    # Create model summary report
    model_summary = {
        'baseline_modeling_complete': True,
        'models_trained': list(models.keys()),
        'best_model': best_model_name,
        'best_model_metrics': best_model_metrics,
        'dataset_info': {
            'total_samples': len(X),
            'features': len(feature_cols),
            'positive_rate': float(y.mean())
        },
        'next_steps': [
            "Implement advanced feature engineering",
            "Experiment with ensemble methods",
            "Optimize hyperparameters",
            "Deploy model for A/B testing",
            "Set up monitoring and retraining pipeline"
        ]
    }

    with open('baseline_modeling_summary.json', 'w') as f:
        json.dump(model_summary, f, indent=2, default=str)

    print("✅ Modeling summary saved to 'baseline_modeling_summary.json'")

else:
    print("❌ Cannot save models - no trained and evaluated models available")

print("\n🎯 NEXT STEPS FOR ADVANCED MODELING:")
print("=" * 40)
next_steps = [
    "1. Advanced Feature Engineering",
    "   - Create interaction features between customer and product attributes",
    "   - Implement temporal features and seasonality patterns",
    "   - Develop customer lifetime value predictions",
    "",
    "2. Model Optimization",
    "   - Hyperparameter tuning with GridSearch/RandomSearch",
    "   - Ensemble methods (Voting, Stacking, Blending)",
    "   - Advanced algorithms (LightGBM, CatBoost)",
    "",
    "3. Evaluation Enhancement",
    "   - Business-specific metrics (Revenue lift, Customer acquisition cost)",
    "   - Time-series validation for temporal robustness",
    "   - Fairness and bias analysis",
    "",
    "4. Production Deployment",
    "   - Model serving infrastructure",
    "   - A/B testing framework",
    "   - Monitoring and alerting system",
    "   - Automated retraining pipeline"
]

for step in next_steps:
    print(step)

# Only claim success when a baseline actually exists; previously these lines
# were printed even after an upstream cell had failed.
if baseline_ready:
    print("\n🚀 Ready to proceed with advanced modeling techniques!")
    print("📊 Baseline established - all models trained and evaluated")
    print("✅ Foundation prepared for production deployment")
else:
    print("\n⚠️ Baseline not established - fix the earlier cells and re-run before proceeding")