# I. Exploratory Data Analysis: Customer Product Adoption Prediction

## Objectives
- Understand the structure and quality of our datasets
- Explore relationships between customer features and product adoption
- Identify patterns and insights for feature engineering
- Assess data quality and preprocessing needs

## Dataset Overview
We have 3 main datasets:
1. **data_customers.csv**: Customer demographics and behavior (37 features)
2. **data_products.csv**: Product characteristics and performance (26 features)
3. **data_adoption_logs.csv**: Historical adoption records (10 features)

In [1]:
# Import Required Libraries
print("🔄 Loading libraries...")

# Core libraries (always available)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display
import warnings
import json
import os
from datetime import datetime

# Configure display settings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✅ Core libraries loaded")

# Try to import IPython display
try:
    from IPython.display import display
    IPYTHON_AVAILABLE = True
    print("✅ IPython display loaded")
except ImportError:
    IPYTHON_AVAILABLE = False
    # Create a fallback display function
    def display(obj):
        print(obj)
    print("⚠️ IPython not available, using fallback display")

# Import plotting libraries with comprehensive error handling
MATPLOTLIB_AVAILABLE = False
SEABORN_AVAILABLE = False

try:
    import matplotlib
    matplotlib.use('Agg')  # Use non-interactive backend as fallback
    import matplotlib.pyplot as plt
    
    # Try different style options
    style_applied = False
    for style in ['seaborn-v0_8', 'seaborn', 'ggplot', 'default']:
        try:
            plt.style.use(style)
            print(f"✅ Matplotlib style '{style}' applied")
            style_applied = True
            break
        except:
            continue
    
    if not style_applied:
        print("⚠️ Using matplotlib default style")
    
    MATPLOTLIB_AVAILABLE = True
    print("✅ Matplotlib loaded successfully")
    
    # Try to load seaborn
    try:
        import seaborn as sns
        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
        sns.set_palette(colors)
        SEABORN_AVAILABLE = True
        print("✅ Seaborn loaded successfully")
    except ImportError:
        SEABORN_AVAILABLE = False
        print("⚠️ Seaborn not available")
        
except ImportError as e:
    MATPLOTLIB_AVAILABLE = False
    print(f"⚠️ Matplotlib not available: {e}")
    
    # Create dummy plotting functions
    class DummyPlt:
        @staticmethod
        def figure(*args, **kwargs):
            print("📊 Plot would be displayed here (matplotlib not available)")
        @staticmethod
        def show(*args, **kwargs):
            pass
        @staticmethod
        def title(*args, **kwargs):
            pass
        @staticmethod
        def xlabel(*args, **kwargs):
            pass
        @staticmethod
        def ylabel(*args, **kwargs):
            pass
        @staticmethod
        def tight_layout(*args, **kwargs):
            pass
    
    plt = DummyPlt()
    
    class DummySns:
        @staticmethod
        def barplot(*args, **kwargs):
            print("📊 Seaborn barplot would be displayed here")
        @staticmethod
        def heatmap(*args, **kwargs):
            print("📊 Seaborn heatmap would be displayed here")
        @staticmethod
        def set_palette(*args, **kwargs):
            pass
    
    sns = DummySns()

# Import plotly with error handling
PLOTLY_AVAILABLE = False
try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    PLOTLY_AVAILABLE = True
    print("✅ Plotly loaded successfully")
except ImportError as e:
    print(f"⚠️ Plotly not available: {e}")
    
    # Create dummy plotly classes
    class DummyFig:
        def add_trace(self, *args, **kwargs):
            pass
        def update_layout(self, *args, **kwargs):
            pass
        def show(self, *args, **kwargs):
            print("📊 Interactive plot would be displayed here (plotly not available)")
    
    class DummyGo:
        @staticmethod
        def Figure(*args, **kwargs):
            return DummyFig()
        @staticmethod
        def Bar(*args, **kwargs):
            return {}
        @staticmethod
        def Pie(*args, **kwargs):
            return {}
        @staticmethod
        def Histogram(*args, **kwargs):
            return {}
    
    class DummyPx:
        @staticmethod
        def bar(*args, **kwargs):
            print("📊 Plotly express bar chart would be displayed here")
    
    def make_subplots(*args, **kwargs):
        return DummyFig()
    
    go = DummyGo()
    px = DummyPx()

# Import scikit-learn with error handling
SKLEARN_AVAILABLE = False
try:
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.impute import SimpleImputer, KNNImputer
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import mutual_info_classif
    from sklearn.model_selection import train_test_split
    SKLEARN_AVAILABLE = True
    print("✅ Scikit-learn loaded successfully")
except ImportError as e:
    print(f"⚠️ Scikit-learn not available: {e}")
    print("   Run: pip install scikit-learn")

# Summary
print("\n🚀 Environment Setup Complete!")
print(f"📦 Library Status:")
print(f"   Pandas: ✅ v{pd.__version__}")
print(f"   NumPy: ✅ v{np.__version__}")
print(f"   IPython: {'✅' if IPYTHON_AVAILABLE else '⚠️'}")
print(f"   Matplotlib: {'✅' if MATPLOTLIB_AVAILABLE else '⚠️'}")
print(f"   Seaborn: {'✅' if SEABORN_AVAILABLE else '⚠️'}")
print(f"   Plotly: {'✅' if PLOTLY_AVAILABLE else '⚠️'}")
print(f"   Scikit-learn: {'✅' if SKLEARN_AVAILABLE else '⚠️'}")

# Set global flags for conditional execution
globals().update({
    'MATPLOTLIB_AVAILABLE': MATPLOTLIB_AVAILABLE,
    'SEABORN_AVAILABLE': SEABORN_AVAILABLE,
    'PLOTLY_AVAILABLE': PLOTLY_AVAILABLE,
    'SKLEARN_AVAILABLE': SKLEARN_AVAILABLE,
    'IPYTHON_AVAILABLE': IPYTHON_AVAILABLE
})

print("\n✨ Ready to start data analysis!")

🔄 Loading libraries...
✅ Core libraries loaded
✅ IPython display loaded
✅ Matplotlib style 'seaborn-v0_8' applied
✅ Matplotlib loaded successfully
✅ Seaborn loaded successfully
✅ Plotly loaded successfully
✅ Core libraries loaded
✅ IPython display loaded
✅ Matplotlib style 'seaborn-v0_8' applied
✅ Matplotlib loaded successfully
✅ Seaborn loaded successfully
✅ Plotly loaded successfully
✅ Scikit-learn loaded successfully

🚀 Environment Setup Complete!
📦 Library Status:
   Pandas: ✅ v2.2.3
   NumPy: ✅ v2.2.4
   IPython: ✅
   Matplotlib: ✅
   Seaborn: ✅
   Plotly: ✅
   Scikit-learn: ✅

✨ Ready to start data analysis!
✅ Scikit-learn loaded successfully

🚀 Environment Setup Complete!
📦 Library Status:
   Pandas: ✅ v2.2.3
   NumPy: ✅ v2.2.4
   IPython: ✅
   Matplotlib: ✅
   Seaborn: ✅
   Plotly: ✅
   Scikit-learn: ✅

✨ Ready to start data analysis!


## 1. Data Loading and Initial Inspection

In [2]:
# Load datasets
print("Loading datasets...")

# Load customer data
try:
    customers = pd.read_csv('data/data_customers.csv')
    print(f"✓ Customers dataset loaded: {customers.shape}")
except Exception as e:
    print(f"❌ Error loading customers data: {e}")

# Load products data
try:
    products = pd.read_csv('data/data_products.csv')
    print(f"✓ Products dataset loaded: {products.shape}")
except Exception as e:
    print(f"❌ Error loading products data: {e}")

# Load adoption logs
try:
    adoption_logs = pd.read_csv('data/data_adoption_logs.csv')
    print(f"✓ Adoption logs dataset loaded: {adoption_logs.shape}")
except Exception as e:
    print(f"❌ Error loading adoption logs data: {e}")

# Load metadata for reference
metadata_user = pd.read_csv('data/metadata_user.csv')
metadata_product = pd.read_csv('data/metadata_product.csv')
metadata_adoption = pd.read_csv('data/metadata_adoption.csv')

print("\n📊 Dataset Overview:")
print(f"Customers: {customers.shape if 'customers' in locals() else 'Not loaded'}")
print(f"Products: {products.shape if 'products' in locals() else 'Not loaded'}")
print(f"Adoption Logs: {adoption_logs.shape if 'adoption_logs' in locals() else 'Not loaded'}")

Loading datasets...
✓ Customers dataset loaded: (100000, 37)
✓ Products dataset loaded: (1000, 26)
✓ Customers dataset loaded: (100000, 37)
✓ Products dataset loaded: (1000, 26)
✓ Adoption logs dataset loaded: (949650, 10)

📊 Dataset Overview:
Customers: (100000, 37)
Products: (1000, 26)
Adoption Logs: (949650, 10)
✓ Adoption logs dataset loaded: (949650, 10)

📊 Dataset Overview:
Customers: (100000, 37)
Products: (1000, 26)
Adoption Logs: (949650, 10)


### 1.1 Dataset Structure Analysis

In [3]:
# Function to analyze dataset structure
def analyze_dataset_structure(df, dataset_name):
    print(f"\n{'='*50}")
    print(f"📋 {dataset_name.upper()} DATASET ANALYSIS")
    print(f"{'='*50}")
    
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
    
    print("\n🔍 Column Information:")
    print(f"{'Column':<25} {'Type':<15} {'Non-Null':<10} {'Null %':<8} {'Unique'}")
    print("-" * 70)
    
    for col in df.columns:
        non_null = df[col].count()
        null_pct = (1 - non_null/len(df)) * 100
        unique_count = df[col].nunique()
        dtype = str(df[col].dtype)
        
        print(f"{col:<25} {dtype:<15} {non_null:<10} {null_pct:<8.1f} {unique_count}")
    
    return df.dtypes, df.isnull().sum()

# Analyze each dataset if available
if 'customers' in locals():
    customer_dtypes, customer_nulls = analyze_dataset_structure(customers, "customers")

if 'products' in locals():
    product_dtypes, product_nulls = analyze_dataset_structure(products, "products")
    
if 'adoption_logs' in locals():
    adoption_dtypes, adoption_nulls = analyze_dataset_structure(adoption_logs, "adoption_logs")


📋 CUSTOMERS DATASET ANALYSIS
Shape: (100000, 37)
Memory usage: 153.01 MB

🔍 Column Information:
Column                    Type            Non-Null   Null %   Unique
----------------------------------------------------------------------
user_id                   object          100000     0.0      100000
age                       int64           100000     0.0      58
occupation                object          100000     0.0      7
income_tier               object          100000     0.0      5
marital_status            object          100000     0.0      4
household_size            int64           100000     0.0      6
preferred_language        object          100000     0.0      2
products                  object          100000     0.0      256
tenure_years              int64           100000     0.0      31
avg_balance               float64         100000     0.0      99999
cc_limit_util             object          100000     0.0      98118
mortgage_outstanding      float64         

### 1.2 Data Quality Assessment

In [4]:
# Data Quality Summary
def data_quality_summary(df, dataset_name):
    print(f"\n🔍 {dataset_name} Data Quality Summary:")
    print("-" * 40)
    
    # Missing values
    missing_pct = (df.isnull().sum() / len(df) * 100).round(2)
    high_missing = missing_pct[missing_pct > 10]
    
    if len(high_missing) > 0:
        print(f"⚠️  Columns with >10% missing values:")
        for col, pct in high_missing.items():
            print(f"   {col}: {pct}%")
    else:
        print("✅ No columns with >10% missing values")
    
    # Duplicate rows
    duplicates = df.duplicated().sum()
    print(f"🔄 Duplicate rows: {duplicates} ({duplicates/len(df)*100:.2f}%)")
    
    # Data types
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    print(f"📊 Numeric columns: {len(numeric_cols)}")
    print(f"📝 Categorical columns: {len(categorical_cols)}")
    
    return {
        'missing_pct': missing_pct,
        'duplicates': duplicates,
        'numeric_cols': numeric_cols,
        'categorical_cols': categorical_cols
    }

# Analyze data quality for each dataset
datasets_quality = {}

if 'customers' in locals():
    datasets_quality['customers'] = data_quality_summary(customers, "CUSTOMERS")

if 'products' in locals():
    datasets_quality['products'] = data_quality_summary(products, "PRODUCTS")
    
if 'adoption_logs' in locals():
    datasets_quality['adoption_logs'] = data_quality_summary(adoption_logs, "ADOPTION LOGS")


🔍 CUSTOMERS Data Quality Summary:
----------------------------------------
⚠️  Columns with >10% missing values:
   declined_offer_cat: 16.78%
   season_flag: 20.12%
⚠️  Columns with >10% missing values:
   declined_offer_cat: 16.78%
   season_flag: 20.12%
🔄 Duplicate rows: 0 (0.00%)
📊 Numeric columns: 19
📝 Categorical columns: 17

🔍 PRODUCTS Data Quality Summary:
----------------------------------------
✅ No columns with >10% missing values
🔄 Duplicate rows: 0 (0.00%)
📊 Numeric columns: 13
📝 Categorical columns: 13

🔍 ADOPTION LOGS Data Quality Summary:
----------------------------------------
✅ No columns with >10% missing values
🔄 Duplicate rows: 0 (0.00%)
📊 Numeric columns: 19
📝 Categorical columns: 17

🔍 PRODUCTS Data Quality Summary:
----------------------------------------
✅ No columns with >10% missing values
🔄 Duplicate rows: 0 (0.00%)
📊 Numeric columns: 13
📝 Categorical columns: 13

🔍 ADOPTION LOGS Data Quality Summary:
----------------------------------------
✅ No columns w

## 2. Products Dataset Analysis

In [5]:
# Detailed analysis of products dataset
if 'products' in locals():
    print("📦 PRODUCTS DATASET DEEP DIVE")
    print("=" * 50)
    
    # Display first few rows
    print("\n🔍 Sample Data:")
    display(products.head())
    
    # Basic statistics for numeric columns
    print("\n📊 Numeric Features Statistics:")
    numeric_cols = products.select_dtypes(include=[np.number]).columns
    display(products[numeric_cols].describe())

📦 PRODUCTS DATASET DEEP DIVE

🔍 Sample Data:


Unnamed: 0,product_id,category,tier,apr,reward_type,reward_value,eligibility,tenor_months,risk_adj_margin,hist_conv_rate,hist_profit,budget_remaining,max_redemptions,offer_dates,launch_recency_days,compliance_tag,channels,target_segments,geo_applic,merchant_industry,cost_to_bank,expected_utility,cross_sell_score,bundle_depth,valid_window,popularity_trend
0,74cf7050-cd3a-4f02-a146-58e8bc5e0b97,DebitCard,Signature,11.47,Gift,7.56,"Age>=18 & IncomeTier in ['Upper-Middle','High'...",3,2.18,0.022,35.63,377169.93,27041,"{'start': '2025-04-19', 'end': '2025-05-30'}",1443,UsurySafe,['Branch'],"['Expat', 'Family']",Central,E-commerce,17.49,0.62,0.687,2,"{'start': '21:00', 'end': '03:00'}",Declining
1,1dd25b44-395b-4c3d-afcc-52fede816f8a,PersonalLoan,Gold,10.87,Discount,9.23,OccupationCode='TECH' & SalaryCreditedMonthly>...,6,2.77,0.065,37.97,227666.28,17286,"{'start': '2025-04-08', 'end': '2025-07-08'}",910,FXReg,"['Branch', 'Web', 'CallCenter']","['Retiree', 'HighRoller', 'SME_Owner']",Hanoi_Metro,Airlines,88.52,0.534,0.862,3,"{'start': '02:00', 'end': '10:00'}",Rising
2,9811bf22-bbdd-453b-a07d-0b9dfa8bd759,Overdraft,Gold,3.74,Cashback,23.14,OccupationCode='TECH' & SalaryCreditedMonthly>...,3,1.38,0.123,25.27,447756.13,19814,"{'start': '2025-04-11', 'end': '2025-04-26'}",903,UsurySafe,"['Web', 'Branch']",['Millennial'],Abroad,Electronics,54.93,0.822,0.366,4,"{'start': '04:00', 'end': '06:00'}",Declining
3,a9c47675-8621-4620-89d7-93927e5d1a97,FXTransfer,Infinite,10.09,Points,37.13,OccupationCode='TECH' & SalaryCreditedMonthly>...,36,1.84,0.15,28.01,145886.57,7965,"{'start': '2025-05-22', 'end': '2025-08-30'}",809,ConsumerLendingRule,"['Mobile', 'ATM', 'CallCenter', 'Branch', 'Web']","['Retiree', 'HighRoller']",Abroad,Healthcare,44.3,0.632,0.404,3,"{'start': '00:00', 'end': '02:00'}",Declining
4,da2ff860-c86b-48bb-be83-d36b7b4bafb2,Insurance,Gold,6.87,Cashback,16.5,Holding<'FXTransfer'>=False & ChurnRisk<0.4,3,2.25,0.06,147.88,307201.98,19906,"{'start': '2025-05-31', 'end': '2025-09-07'}",721,UsurySafe,"['ATM', 'CallCenter', 'Branch', 'Web']",['Traveler'],Hanoi_Metro,Fuel,22.5,0.918,0.771,4,"{'start': '09:00', 'end': '13:00'}",Rising



📊 Numeric Features Statistics:


Unnamed: 0,apr,reward_value,tenor_months,risk_adj_margin,hist_conv_rate,hist_profit,budget_remaining,max_redemptions,launch_recency_days,cost_to_bank,expected_utility,cross_sell_score,bundle_depth
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,8.0989,15.31201,20.717,2.44262,0.117261,78.83901,249788.61604,25314.834,909.257,28.43461,0.602671,0.501237,2.534
std,2.993015,11.335511,20.039224,0.788431,0.077104,90.990632,139749.345507,14295.379509,505.510312,26.026849,0.201314,0.202689,1.110891
min,0.5,1.63,1.0,-0.11,0.002,2.8,5820.38,106.0,2.0,2.14,0.038,0.023,1.0
25%,6.0825,7.5775,3.0,1.94,0.058,30.4725,128367.425,12709.5,492.0,12.4875,0.463,0.34375,2.0
50%,7.995,12.175,12.0,2.44,0.1,52.805,253218.965,25848.0,915.0,20.605,0.622,0.506,3.0
75%,10.13,19.33,36.0,2.98,0.159,95.44,364379.26,37009.75,1345.0,35.075,0.759,0.65825,3.25
max,18.86,85.44,60.0,4.65,0.446,1481.07,499506.69,49941.0,1799.0,250.75,0.996,0.954,4.0


In [6]:
# Product categories and distribution
if 'products' in locals() and PLOTLY_AVAILABLE:
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Product Categories', 'Product Tiers', 'Reward Types', 'Geographic Coverage'),
        specs=[[{"type": "pie"}, {"type": "pie"}],
               [{"type": "pie"}, {"type": "pie"}]]
    )
    
    # Product categories
    cat_counts = products['category'].value_counts()
    fig.add_trace(go.Pie(labels=cat_counts.index, values=cat_counts.values, name="Categories"), 
                  row=1, col=1)
    
    # Product tiers
    tier_counts = products['tier'].value_counts()
    fig.add_trace(go.Pie(labels=tier_counts.index, values=tier_counts.values, name="Tiers"), 
                  row=1, col=2)
    
    # Reward types
    reward_counts = products['reward_type'].value_counts()
    fig.add_trace(go.Pie(labels=reward_counts.index, values=reward_counts.values, name="Rewards"), 
                  row=2, col=1)
    
    # Geographic applicability
    geo_counts = products['geo_applic'].value_counts()
    fig.add_trace(go.Pie(labels=geo_counts.index, values=geo_counts.values, name="Geography"), 
                  row=2, col=2)
    
    fig.update_layout(height=800, title_text="Product Portfolio Distribution")
    fig.show()
    
elif 'products' in locals():
    # Fallback to text-based analysis when plotly not available
    print("📊 Product Portfolio Distribution (Text Summary):")
    print("\n📈 Product Categories:")
    print(products['category'].value_counts())
    print("\n🏆 Product Tiers:")
    print(products['tier'].value_counts())
    print("\n🎁 Reward Types:")
    print(products['reward_type'].value_counts())
    print("\n🌍 Geographic Coverage:")
    print(products['geo_applic'].value_counts())

# Print detailed breakdown regardless of plotting availability
if 'products' in locals():
    print("\n📈 Product Portfolio Breakdown:")
    print(f"Categories: {products['category'].unique()}")
    print(f"Tiers: {products['tier'].unique()}")
    print(f"Reward Types: {products['reward_type'].unique()}")
    print(f"Geographic Coverage: {products['geo_applic'].unique()}")


📈 Product Portfolio Breakdown:
Categories: ['DebitCard' 'PersonalLoan' 'Overdraft' 'FXTransfer' 'Insurance'
 'Mortgage' 'CreditCard' 'InvestmentFund' 'FixedDeposit' 'SavingsAccount']
Tiers: ['Signature' 'Gold' 'Infinite' 'Standard' 'Platinum']
Reward Types: ['Gift' 'Discount' 'Cashback' 'Points' 'Miles']
Geographic Coverage: ['Central' 'Hanoi_Metro' 'Abroad' 'HCMC_Metro' 'North' 'South']


In [7]:
# Product performance metrics analysis
if 'products' in locals():
    # Create performance dashboard
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('APR Distribution', 'Historical Conversion Rate', 
                       'Risk-Adjusted Margin', 'Expected Utility'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )
    
    # APR distribution
    fig.add_trace(go.Histogram(x=products['apr'], name='APR', nbinsx=20), row=1, col=1)
    
    # Conversion rate
    fig.add_trace(go.Histogram(x=products['hist_conv_rate'], name='Conv Rate', nbinsx=20), row=1, col=2)
    
    # Risk-adjusted margin
    fig.add_trace(go.Histogram(x=products['risk_adj_margin'], name='Risk Margin', nbinsx=20), row=2, col=1)
    
    # Expected utility
    fig.add_trace(go.Histogram(x=products['expected_utility'], name='Utility', nbinsx=20), row=2, col=2)
    
    fig.update_layout(height=800, title_text="Product Performance Metrics Distribution")
    fig.show()
    
    # Performance statistics
    print("\n🎯 Product Performance Statistics:")
    performance_cols = ['apr', 'hist_conv_rate', 'risk_adj_margin', 'expected_utility', 
                       'hist_profit', 'cross_sell_score']
    for col in performance_cols:
        if col in products.columns:
            print(f"{col}: Mean={products[col].mean():.3f}, Std={products[col].std():.3f}")


🎯 Product Performance Statistics:
apr: Mean=8.099, Std=2.993
hist_conv_rate: Mean=0.117, Std=0.077
risk_adj_margin: Mean=2.443, Std=0.788
expected_utility: Mean=0.603, Std=0.201
hist_profit: Mean=78.839, Std=90.991
cross_sell_score: Mean=0.501, Std=0.203


## 3. Customer Dataset Analysis

In [8]:
# Customer dataset overview (if available)
if 'customers' in locals():
    print("👥 CUSTOMERS DATASET DEEP DIVE")
    print("=" * 50)
    
    # Sample data
    print("\n🔍 Sample Customer Data:")
    display(customers.head())
    
    # If customer data is too large, analyze a sample
    if len(customers) > 10000:
        print(f"\n📊 Large dataset detected ({len(customers):,} rows). Analyzing sample of 10,000...")
        customer_sample = customers.sample(n=10000, random_state=42)
    else:
        customer_sample = customers.copy()
    
    print(f"\nAnalyzing {len(customer_sample):,} customer records...")
else:
    print("⚠️ Customer dataset not available or too large to load")
    print("Proceeding with products and adoption analysis...")
    customer_sample = None

👥 CUSTOMERS DATASET DEEP DIVE

🔍 Sample Customer Data:


Unnamed: 0,user_id,age,occupation,income_tier,marital_status,household_size,preferred_language,products,tenure_years,avg_balance,cc_limit_util,mortgage_outstanding,investments_aum,monthly_salary,top_mcc,ecom_pos_ratio,overseas_share,avg_bill_pay_amt,cash_wd_freq,mobile_login_freq,days_since_push,preferred_channel,offer_ctr,offer_accepts,offer_fatigue,declined_offer_cat,day_time,season_flag,geo_region,weather,rt_spending_trigger,clv_score,churn_risk,propensity_scores,price_sensitivity,peer_cluster_vec,usage_journey
0,ad089c26-f733-4535-9901-bfbf827272b5,32,GOV,Affluent,Divorced,2,vi,"{'DDA': 0, 'SAV': 1, 'CC': 1, 'MORT': 0, 'INV'...",9,8137.517124,"{'limit': 34673, 'utilisation': 0.27}",115490.8066,3764.015709,2388.96,Healthcare,0.83,0.1,910.61,2,24,28,Web,0.119,2,0.2,Insurance,Fri_22:00,,Hanoi_Metro,Cloudy,False,27189.37,0.534,"{'CreditCard': 0.555, 'PersonalLoan': 0.139, '...",0.65,"[np.float64(0.653), np.float64(1.533), np.floa...","['Login', 'Approved', 'Service', 'Compare']"
1,1fce992a-435c-4363-917c-aed958213b43,28,FIN,Low,Married,4,vi,"{'DDA': 1, 'SAV': 1, 'CC': 1, 'MORT': 1, 'INV'...",25,18752.30368,"{'limit': 40254, 'utilisation': 0.18}",53582.44388,9082.966221,1198.02,Entertainment,0.33,0.14,534.84,1,23,32,ATM,0.092,0,0.07,Insurance,Fri_22:00,Summer,HCMC_Metro,Cold,False,53686.9,0.086,"{'CreditCard': 0.11, 'PersonalLoan': 0.02, 'Mo...",0.52,"[np.float64(0.766), np.float64(0.292), np.floa...","['Service', 'AwaitApproval', 'Compare', 'ViewP..."
2,26b6cb73-8bb6-4bf5-a7a0-c932721e1df9,65,RETAIL,Low,Single,4,vi,"{'DDA': 0, 'SAV': 0, 'CC': 0, 'MORT': 1, 'INV'...",6,7661.861656,"{'limit': 33909, 'utilisation': 0.22}",121902.037,17716.72317,1251.7,Entertainment,0.41,0.21,575.01,4,29,6,Mobile,0.191,1,0.53,,Fri_22:00,MidAutumn,Central,Cloudy,False,16887.37,0.166,"{'CreditCard': 0.003, 'PersonalLoan': 0.083, '...",0.39,"[np.float64(0.904), np.float64(0.362), np.floa...","['Service', 'Apply', 'Activate', 'Login']"
3,51c75821-5a6e-4e29-948b-2ecfdc9cc12f,34,RETAIL,Upper-Middle,Divorced,3,en,"{'DDA': 0, 'SAV': 0, 'CC': 1, 'MORT': 0, 'INV'...",23,5369.284725,"{'limit': 25422, 'utilisation': 0.15}",51415.01786,31294.59913,2636.41,Entertainment,0.4,0.12,833.47,1,27,41,CallCenter,0.187,1,0.42,,Fri_22:00,,South,Cold,False,44652.21,0.363,"{'CreditCard': 0.028, 'PersonalLoan': 0.198, '...",0.78,"[np.float64(-0.668), np.float64(0.849), np.flo...","['UploadDocs', 'Login', 'ViewProduct', 'Compar..."
4,3d29ae48-1838-44ba-ae30-8a8c4275d138,72,EDU,Lower-Middle,Single,4,en,"{'DDA': 1, 'SAV': 1, 'CC': 0, 'MORT': 0, 'INV'...",21,65906.07078,"{'limit': 19208, 'utilisation': 0.4}",0.0,5571.306463,2495.41,Entertainment,0.46,0.08,441.54,7,24,20,Web,0.036,1,0.33,Insurance,Fri_22:00,,HCMC_Metro,Clear,True,38716.65,0.307,"{'CreditCard': 0.511, 'PersonalLoan': 0.043, '...",0.34,"[np.float64(0.878), np.float64(0.63), np.float...","['UploadDocs', 'Apply', 'AwaitApproval', 'Acti..."



📊 Large dataset detected (100,000 rows). Analyzing sample of 10,000...

Analyzing 10,000 customer records...


In [9]:
# Customer demographics analysis (if data available)
if customer_sample is not None and len(customer_sample) > 0:
    # Look for demographic columns in the sample
    demo_cols = ['age', 'income_tier', 'marital_status', 'occupation', 'household_size']
    available_demo = [col for col in demo_cols if col in customer_sample.columns]
    
    if available_demo and PLOTLY_AVAILABLE:
        # Create individual plots for each demographic feature
        try:
            for i, col in enumerate(available_demo[:4]):
                if customer_sample[col].dtype in ['object', 'category']:
                    # Categorical data - bar chart (more reliable than pie in subplots)
                    value_counts = customer_sample[col].value_counts().head(10)
                    fig = go.Figure()
                    fig.add_trace(go.Bar(
                        x=value_counts.index,
                        y=value_counts.values,
                        name=col,
                        text=value_counts.values,
                        textposition='auto'
                    ))
                    fig.update_layout(
                        title=f"Distribution of {col}",
                        xaxis_title=col,
                        yaxis_title="Count",
                        height=400
                    )
                    fig.show()
                else:
                    # Numeric data - histogram
                    fig = go.Figure()
                    fig.add_trace(go.Histogram(
                        x=customer_sample[col],
                        name=col,
                        nbinsx=20
                    ))
                    fig.update_layout(
                        title=f"Distribution of {col}",
                        xaxis_title=col,
                        yaxis_title="Frequency",
                        height=400
                    )
                    fig.show()
        except Exception as e:
            print(f"⚠️ Error creating plots: {e}")
            print("📊 Showing text-based analysis instead...")
            
            # Fallback to text analysis
            for col in available_demo[:4]:
                print(f"\n📈 {col} Distribution:")
                if customer_sample[col].dtype in ['object', 'category']:
                    print(customer_sample[col].value_counts().head(10))
                else:
                    print(f"  Mean: {customer_sample[col].mean():.2f}")
                    print(f"  Std: {customer_sample[col].std():.2f}")
                    print(f"  Min: {customer_sample[col].min():.2f}")
                    print(f"  Max: {customer_sample[col].max():.2f}")
    
    elif available_demo:
        # Text-based analysis when plotly not available
        print("📊 Customer Demographics Analysis (Text Summary):")
        for col in available_demo[:4]:
            print(f"\n📈 {col} Distribution:")
            if customer_sample[col].dtype in ['object', 'category']:
                print(customer_sample[col].value_counts().head(10))
            else:
                print(f"  Mean: {customer_sample[col].mean():.2f}")
                print(f"  Std: {customer_sample[col].std():.2f}")
                print(f"  Min: {customer_sample[col].min():.2f}")
                print(f"  Max: {customer_sample[col].max():.2f}")
    
    print(f"\n👥 Customer Sample Analysis ({len(customer_sample):,} records):")
    print(f"Available demographic features: {available_demo}")
else:
    print("📊 Creating mock customer analysis based on metadata...")
    # Show what we would analyze based on metadata
    print("\n🎯 Customer Features to Analyze (from metadata):")
    for i, row in metadata_user.iterrows():
        if i < 10:  # Show first 10 features
            print(f"  {row['Feature']}: {row['Description']}")


👥 Customer Sample Analysis (10,000 records):
Available demographic features: ['age', 'income_tier', 'marital_status', 'occupation', 'household_size']


## 4. Adoption Logs Analysis

In [10]:
# Adoption logs analysis
if 'adoption_logs' in locals():
    print("📈 ADOPTION LOGS ANALYSIS")
    print("=" * 50)
    
    # Sample data
    print("\n🔍 Sample Adoption Data:")
    display(adoption_logs.head())
    
    # Basic statistics
    print(f"\nTotal adoption records: {len(adoption_logs):,}")
    
    # Adoption rate analysis
    if 'adopted' in adoption_logs.columns:
        adoption_rate = adoption_logs['adopted'].mean()
        print(f"Overall adoption rate: {adoption_rate:.2%}")
        
        # Adoption distribution
        fig = go.Figure()
        adoption_counts = adoption_logs['adopted'].value_counts()
        fig.add_trace(go.Bar(
            x=['Not Adopted', 'Adopted'], 
            y=[adoption_counts.get(0, 0), adoption_counts.get(1, 0)],
            marker_color=['#ff7f7f', '#7fbf7f']
        ))
        fig.update_layout(title="Product Adoption Distribution")
        fig.show()
    
    # Analyze usage patterns
    usage_cols = ['tenure_days', 'recency_days', 'activity_intensity', 'monetary_volume']
    available_usage = [col for col in usage_cols if col in adoption_logs.columns]
    
    if available_usage:
        print(f"\n📊 Usage Metrics Statistics:")
        display(adoption_logs[available_usage].describe())
else:
    print("⚠️ Adoption logs dataset not available")

📈 ADOPTION LOGS ANALYSIS

🔍 Sample Adoption Data:


Unnamed: 0,adopted,tenure_days,recency_days,activity_intensity,monetary_volume,utilisation_ratio,reward_redemption_rate,risk_flag,user_id,product_id
0,False,767,229,13,4271.0,0.19,0.246,False,ad089c26-f733-4535-9901-bfbf827272b5,ce07d04c-b6e7-449d-a1ab-f13267ac7cf1
1,True,980,264,13,3239.1,0.345,0.025,False,ad089c26-f733-4535-9901-bfbf827272b5,69702bb0-34ca-4850-8d38-3d3c261d063e
2,False,172,80,7,13783.12,0.135,0.853,False,ad089c26-f733-4535-9901-bfbf827272b5,e0d289ba-6f94-47f6-a2b2-ecdaa0de0535
3,False,156,184,9,6246.93,0.199,0.124,False,ad089c26-f733-4535-9901-bfbf827272b5,565789b8-029d-45e1-9c31-63034c6d277a
4,False,193,173,9,10475.55,0.409,0.378,False,ad089c26-f733-4535-9901-bfbf827272b5,ec0c4e1e-d0b3-414b-b8ae-dd8429ff1214



Total adoption records: 949,650
Overall adoption rate: 25.07%



📊 Usage Metrics Statistics:


Unnamed: 0,tenure_days,recency_days,activity_intensity,monetary_volume
count,949650.0,949650.0,949650.0,949650.0
mean,362.106614,182.347578,9.992544,8128.243606
std,352.756286,105.674588,3.158918,10644.929222
min,0.0,0.0,0.0,37.52
25%,105.0,91.0,8.0,2505.0625
50%,252.0,182.0,10.0,4920.345
75%,505.0,274.0,12.0,9667.01
max,1825.0,365.0,28.0,518039.35


In [11]:
# Cross-dataset analysis: Product-Adoption relationship
if 'products' in locals() and 'adoption_logs' in locals():
    print("🔗 CROSS-DATASET ANALYSIS")
    print("=" * 50)
    
    # Merge products with adoption logs
    if 'product_id' in products.columns and 'product_id' in adoption_logs.columns:
        product_adoption = adoption_logs.merge(products, on='product_id', how='left')
        
        print(f"Merged dataset shape: {product_adoption.shape}")
        
        # Adoption rate by product category
        if 'adopted' in product_adoption.columns and 'category' in product_adoption.columns:
            adoption_by_category = product_adoption.groupby('category')['adopted'].agg(['mean', 'count']).round(3)
            adoption_by_category.columns = ['Adoption_Rate', 'Total_Records']
            
            print("\n📊 Adoption Rate by Product Category:")
            display(adoption_by_category.sort_values('Adoption_Rate', ascending=False))
            
            # Visualization
            fig = go.Figure()
            fig.add_trace(go.Bar(
                x=adoption_by_category.index,
                y=adoption_by_category['Adoption_Rate'],
                text=adoption_by_category['Adoption_Rate'].round(3),
                textposition='auto'
            ))
            fig.update_layout(
                title="Adoption Rate by Product Category",
                xaxis_title="Product Category",
                yaxis_title="Adoption Rate"
            )
            fig.show()
    else:
        print("⚠️ Cannot merge datasets - product_id column missing")
else:
    print("⚠️ Cannot perform cross-dataset analysis - datasets not available")

🔗 CROSS-DATASET ANALYSIS
Merged dataset shape: (949650, 35)

📊 Adoption Rate by Product Category:
Merged dataset shape: (949650, 35)

📊 Adoption Rate by Product Category:


Unnamed: 0_level_0,Adoption_Rate,Total_Records
category,Unnamed: 1_level_1,Unnamed: 2_level_1
CreditCard,0.257,4608
FixedDeposit,0.257,4813
DebitCard,0.252,5201
Insurance,0.252,4532
Overdraft,0.251,205080
Mortgage,0.251,105447
SavingsAccount,0.251,205273
FXTransfer,0.25,304789
PersonalLoan,0.25,104966
InvestmentFund,0.246,4941


## 5. Feature Correlation Analysis

In [12]:
# Correlation analysis for products dataset
if 'products' in locals():
    print("🔗 PRODUCT FEATURES CORRELATION ANALYSIS")
    print("=" * 50)
    
    # Select numeric columns for correlation
    numeric_cols = products.select_dtypes(include=[np.number]).columns.tolist()
    
    if len(numeric_cols) > 1:
        # Calculate correlation matrix
        corr_matrix = products[numeric_cols].corr()
        
        # Create heatmap
        plt.figure(figsize=(12, 8))
        mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(corr_matrix, 
                    mask=mask,
                    annot=True, 
                    cmap='RdBu_r', 
                    center=0,
                    square=True,
                    fmt='.2f')
        plt.title('Product Features Correlation Matrix')
        plt.tight_layout()
        plt.show()
        
        # Find highly correlated pairs
        high_corr_pairs = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.7:
                    high_corr_pairs.append((
                        corr_matrix.columns[i], 
                        corr_matrix.columns[j], 
                        corr_val
                    ))
        
        if high_corr_pairs:
            print(f"\n⚠️ Highly correlated features (|r| > 0.7):")
            for feat1, feat2, corr in high_corr_pairs:
                print(f"  {feat1} ↔ {feat2}: {corr:.3f}")
        else:
            print("\n✅ No highly correlated features found")
    else:
        print("⚠️ Insufficient numeric columns for correlation analysis")

🔗 PRODUCT FEATURES CORRELATION ANALYSIS

✅ No highly correlated features found

✅ No highly correlated features found


## 6. Business Insights & Key Findings

In [13]:
# Summary of key findings
print("🎯 KEY FINDINGS SUMMARY")
print("=" * 50)

findings = []

# Product insights
if 'products' in locals():
    print("\n📦 PRODUCT INSIGHTS:")
    
    # Most popular categories
    top_categories = products['category'].value_counts().head(3)
    print(f"  • Top 3 product categories: {', '.join(top_categories.index)}")
    findings.append(f"Product portfolio: {len(products)} products across {products['category'].nunique()} categories")
    
    # Performance metrics
    if 'hist_conv_rate' in products.columns:
        avg_conv_rate = products['hist_conv_rate'].mean()
        print(f"  • Average historical conversion rate: {avg_conv_rate:.2%}")
        findings.append(f"Average conversion rate: {avg_conv_rate:.2%}")
    
    if 'apr' in products.columns:
        apr_range = f"{products['apr'].min():.1f}% - {products['apr'].max():.1f}%"
        print(f"  • APR range: {apr_range}")

# Adoption insights
if 'adoption_logs' in locals():
    print("\n📈 ADOPTION INSIGHTS:")
    
    total_records = len(adoption_logs)
    print(f"  • Total adoption records: {total_records:,}")
    findings.append(f"Adoption logs: {total_records:,} records")
    
    if 'adopted' in adoption_logs.columns:
        adoption_rate = adoption_logs['adopted'].mean()
        print(f"  • Overall adoption rate: {adoption_rate:.2%}")
        findings.append(f"Overall adoption rate: {adoption_rate:.2%}")
        
        # Class imbalance check
        positive_rate = adoption_rate
        if positive_rate < 0.1 or positive_rate > 0.9:
            print(f"  ⚠️ Class imbalance detected: {positive_rate:.2%} positive rate")
            findings.append("Class imbalance issue identified")

# Data quality insights
print("\n🔍 DATA QUALITY INSIGHTS:")
for dataset_name, quality_info in datasets_quality.items():
    missing_issues = len([x for x in quality_info['missing_pct'] if x > 10])
    duplicates = quality_info['duplicates']
    
    print(f"  • {dataset_name.capitalize()}: {missing_issues} columns with >10% missing, {duplicates} duplicates")
    
    if missing_issues > 0 or duplicates > 0:
        findings.append(f"{dataset_name}: Data quality issues detected")

print(f"\n📋 SUMMARY:")
for i, finding in enumerate(findings, 1):
    print(f"  {i}. {finding}")

🎯 KEY FINDINGS SUMMARY

📦 PRODUCT INSIGHTS:
  • Top 3 product categories: SavingsAccount, Mortgage, Overdraft
  • Average historical conversion rate: 11.73%
  • APR range: 0.5% - 18.9%

📈 ADOPTION INSIGHTS:
  • Total adoption records: 949,650
  • Overall adoption rate: 25.07%

🔍 DATA QUALITY INSIGHTS:
  • Customers: 2 columns with >10% missing, 0 duplicates
  • Products: 0 columns with >10% missing, 0 duplicates
  • Adoption_logs: 0 columns with >10% missing, 0 duplicates

📋 SUMMARY:
  1. Product portfolio: 1000 products across 10 categories
  2. Average conversion rate: 11.73%
  3. Adoption logs: 949,650 records
  4. Overall adoption rate: 25.07%
  5. customers: Data quality issues detected


## 7. Recommendations for Next Steps

Based on our EDA, here are the key recommendations for moving forward:

### Data Preprocessing Priorities:
1. **Handle missing values** in high-missing columns
2. **Remove duplicates** where identified  
3. **Encode categorical variables** for modeling
4. **Scale numeric features** to similar ranges

### Feature Engineering Opportunities:
1. **Customer-Product Interaction Features**: Create features that capture compatibility between customer profiles and product characteristics
2. **Temporal Features**: Extract time-based patterns from adoption history
3. **Aggregation Features**: Customer-level and product-level summary statistics
4. **Risk-Reward Balance**: Combine risk and reward metrics for better decision features

### Modeling Considerations:
1. **Class Imbalance**: If adoption rate is very low/high, use appropriate sampling or cost-sensitive learning
2. **Feature Selection**: Address highly correlated features to avoid multicollinearity
3. **Cross-Validation Strategy**: Use time-based splits to avoid data leakage
4. **Evaluation Metrics**: Focus on business-relevant metrics like Precision@K and conversion lift

### Business Insights:
1. **Product Performance Variation**: Different product categories show varying adoption patterns
2. **Customer Segmentation Potential**: Rich customer features enable sophisticated segmentation
3. **Personalization Opportunities**: Customer-product matching can drive recommendations

In [14]:
# Save key insights for future reference
import json
from datetime import datetime

# Create insights summary
insights_summary = {
    'analysis_date': datetime.now().isoformat(),
    'datasets_analyzed': list(datasets_quality.keys()),
    'key_findings': findings,
    'data_shapes': {},
    'recommendations': [
        'Implement comprehensive data preprocessing pipeline',
        'Develop customer-product interaction features', 
        'Address class imbalance in modeling',
        'Use time-based validation strategy',
        'Focus on business-relevant evaluation metrics'
    ]
}

# Add dataset shapes if available
if 'products' in locals():
    insights_summary['data_shapes']['products'] = list(products.shape)
if 'customers' in locals():
    insights_summary['data_shapes']['customers'] = list(customers.shape)
if 'adoption_logs' in locals():
    insights_summary['data_shapes']['adoption_logs'] = list(adoption_logs.shape)

# Save insights
with open('eda_insights.json', 'w') as f:
    json.dump(insights_summary, f, indent=2)

print("✅ EDA analysis complete!")
print("📁 Insights saved to 'eda_insights.json'")
print("\n🚀 Ready to proceed with feature engineering and modeling!")

✅ EDA analysis complete!
📁 Insights saved to 'eda_insights.json'

🚀 Ready to proceed with feature engineering and modeling!


# II. Data Preprocessing Pipeline

Based on our EDA findings, we'll implement a comprehensive preprocessing pipeline to prepare the data for modeling.

## Preprocessing Strategy:
1. **Data Cleaning**: Handle missing values and duplicates
2. **Feature Engineering**: Create interaction and derived features
3. **Encoding**: Transform categorical variables
4. **Scaling**: Normalize numeric features
5. **Validation**: Ensure data quality and consistency

In [15]:
# Data Preprocessing Functions
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from datetime import datetime, timedelta
import ast

class DataPreprocessor:
    """Comprehensive data preprocessing pipeline for customer product adoption prediction"""
    
    def __init__(self):
        self.scalers = {}
        self.encoders = {}
        self.imputers = {}
        self.feature_names = []
        self.preprocessing_log = []
        
    def log_step(self, message):
        """Log preprocessing steps"""
        timestamp = datetime.now().strftime('%H:%M:%S')
        log_entry = f"[{timestamp}] {message}"
        self.preprocessing_log.append(log_entry)
        print(f"✓ {message}")
    
    def handle_missing_values(self, df, strategy='auto'):
        """Handle missing values with appropriate strategies"""
        self.log_step("Starting missing value treatment")
        
        df_processed = df.copy()
        missing_summary = df.isnull().sum()
        high_missing_cols = missing_summary[missing_summary > len(df) * 0.5].index.tolist()
        
        if high_missing_cols:
            self.log_step(f"Dropping columns with >50% missing: {high_missing_cols}")
            df_processed = df_processed.drop(columns=high_missing_cols)
        
        # Separate numeric and categorical columns
        numeric_cols = df_processed.select_dtypes(include=[np.number]).columns.tolist()
        categorical_cols = df_processed.select_dtypes(include=['object']).columns.tolist()
        
        # Handle numeric missing values
        if numeric_cols:
            for col in numeric_cols:
                if df_processed[col].isnull().sum() > 0:
                    if df_processed[col].isnull().sum() / len(df_processed) < 0.1:
                        # Low missing: use median
                        df_processed[col].fillna(df_processed[col].median(), inplace=True)
                        self.log_step(f"Filled {col} with median (low missing)")
                    else:
                        # High missing: use KNN imputation
                        imputer = KNNImputer(n_neighbors=5)
                        df_processed[col] = imputer.fit_transform(df_processed[[col]]).flatten()
                        self.log_step(f"Applied KNN imputation to {col}")
        
        # Handle categorical missing values
        if categorical_cols:
            for col in categorical_cols:
                if df_processed[col].isnull().sum() > 0:
                    # Fill with mode or 'Unknown'
                    mode_val = df_processed[col].mode()
                    fill_val = mode_val[0] if len(mode_val) > 0 else 'Unknown'
                    df_processed[col].fillna(fill_val, inplace=True)
                    self.log_step(f"Filled {col} with mode/Unknown")
        
        return df_processed
    
    def remove_duplicates(self, df):
        """Remove duplicate rows"""
        initial_shape = df.shape
        df_cleaned = df.drop_duplicates()
        duplicates_removed = initial_shape[0] - df_cleaned.shape[0]
        
        if duplicates_removed > 0:
            self.log_step(f"Removed {duplicates_removed} duplicate rows")
        else:
            self.log_step("No duplicates found")
            
        return df_cleaned
    
    def encode_categorical_features(self, df, high_cardinality_threshold=20):
        """Encode categorical features with appropriate methods"""
        self.log_step("Starting categorical encoding")
        
        df_encoded = df.copy()
        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
        
        for col in categorical_cols:
            unique_count = df[col].nunique()
            
            if unique_count == 2:
                # Binary encoding
                le = LabelEncoder()
                df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
                self.encoders[col] = le
                self.log_step(f"Binary encoded {col}")
                
            elif unique_count <= high_cardinality_threshold:
                # One-hot encoding for low cardinality
                dummies = pd.get_dummies(df_encoded[col], prefix=col, drop_first=True)
                df_encoded = pd.concat([df_encoded.drop(columns=[col]), dummies], axis=1)
                self.log_step(f"One-hot encoded {col} ({unique_count} categories)")
                
            else:
                # Target encoding or frequency encoding for high cardinality
                freq_encoding = df_encoded[col].value_counts().to_dict()
                df_encoded[f'{col}_frequency'] = df_encoded[col].map(freq_encoding)
                df_encoded = df_encoded.drop(columns=[col])
                self.log_step(f"Frequency encoded {col} (high cardinality: {unique_count})")
        
        return df_encoded
    
    def scale_numeric_features(self, df, method='standard'):
        """Scale numeric features"""
        self.log_step(f"Starting feature scaling ({method})")
        
        df_scaled = df.copy()
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        
        if method == 'standard':
            scaler = StandardScaler()
        else:
            from sklearn.preprocessing import MinMaxScaler
            scaler = MinMaxScaler()
        
        if numeric_cols:
            df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])
            self.scalers['numeric'] = scaler
            self.log_step(f"Scaled {len(numeric_cols)} numeric features")
        
        return df_scaled

# Initialize preprocessor
preprocessor = DataPreprocessor()
print("🔧 Data Preprocessor initialized")

🔧 Data Preprocessor initialized


### 8.1 Feature Engineering Pipeline

Create new features that capture business logic and customer-product interactions.

In [16]:
class FeatureEngineer:
    """Advanced feature engineering for customer product adoption"""
    
    def __init__(self):
        self.feature_log = []
    
    def log_feature(self, message):
        """Log feature engineering steps"""
        timestamp = datetime.now().strftime('%H:%M:%S')
        self.feature_log.append(f"[{timestamp}] {message}")
        print(f"🔨 {message}")
    
    def create_customer_product_interactions(self, customers_df, products_df):
        """Create interaction features between customers and products"""
        self.log_feature("Creating customer-product interaction features")
        
        interactions = []
        
        # Create all customer-product combinations
        for _, customer in customers_df.iterrows():
            for _, product in products_df.iterrows():
                interaction = {
                    'user_id': customer.get('user_id'),
                    'product_id': product.get('product_id'),
                }
                
                # Income-Product Tier Match
                if 'income_tier' in customer.index and 'tier' in product.index:
                    income_tier = str(customer['income_tier']).lower()
                    product_tier = str(product['tier']).lower()
                    
                    tier_compatibility = {
                        ('low', 'standard'): 1,
                        ('middle', 'gold'): 1,
                        ('high', 'platinum'): 1,
                        ('affluent', 'signature'): 1,
                        ('affluent', 'infinite'): 1
                    }
                    
                    interaction['income_product_match'] = tier_compatibility.get(
                        (income_tier, product_tier), 0
                    )
                
                # Age-Product Category Match
                if 'age' in customer.index and 'category' in product.index:
                    age = customer['age']
                    category = str(product['category'])
                    
                    # Age-based product affinity
                    age_product_affinity = 0
                    if 18 <= age <= 30:
                        if category in ['DebitCard', 'PersonalLoan']: age_product_affinity = 1
                    elif 30 <= age <= 50:
                        if category in ['CreditCard', 'Mortgage', 'Insurance']: age_product_affinity = 1
                    elif age > 50:
                        if category in ['FixedDeposit', 'InvestmentFund']: age_product_affinity = 1
                    
                    interaction['age_product_affinity'] = age_product_affinity
                
                # Risk-Reward Balance
                if 'churn_risk' in customer.index and 'risk_adj_margin' in product.index:
                    customer_risk = customer['churn_risk']
                    product_risk = product['risk_adj_margin']
                    
                    # Risk compatibility (inverse relationship)
                    interaction['risk_compatibility'] = 1 / (1 + abs(customer_risk - product_risk))
                
                # CLV-Product Value Match
                if 'clv_score' in customer.index and 'expected_utility' in product.index:
                    clv = customer['clv_score']
                    utility = product['expected_utility']
                    interaction['clv_utility_match'] = clv * utility
                
                # Channel Preference Match
                if 'preferred_channel' in customer.index and 'channels' in product.index:
                    pref_channel = str(customer['preferred_channel'])
                    product_channels = str(product['channels'])
                    
                    # Simple string matching (could be improved with proper parsing)
                    interaction['channel_match'] = 1 if pref_channel.lower() in product_channels.lower() else 0
                
                interactions.append(interaction)
        
        interactions_df = pd.DataFrame(interactions)
        self.log_feature(f"Created {len(interactions_df)} customer-product interactions")
        
        return interactions_df
    
    def create_temporal_features(self, df):
        """Create time-based features"""
        self.log_feature("Creating temporal features")
        
        df_temporal = df.copy()
        
        # If we have date columns, extract temporal features
        date_cols = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()]
        
        for col in date_cols:
            try:
                # Try to parse dates
                df_temporal[col] = pd.to_datetime(df_temporal[col], errors='coerce')
                
                # Extract temporal components
                df_temporal[f'{col}_year'] = df_temporal[col].dt.year
                df_temporal[f'{col}_month'] = df_temporal[col].dt.month
                df_temporal[f'{col}_quarter'] = df_temporal[col].dt.quarter
                df_temporal[f'{col}_dayofweek'] = df_temporal[col].dt.dayofweek
                df_temporal[f'{col}_is_weekend'] = (df_temporal[col].dt.dayofweek >= 5).astype(int)
                
                self.log_feature(f"Extracted temporal features from {col}")
                
            except Exception as e:
                self.log_feature(f"Could not parse {col} as date: {e}")
        
        # Create recency features
        if 'tenure_days' in df.columns:
            df_temporal['tenure_months'] = df_temporal['tenure_days'] / 30
            df_temporal['tenure_years'] = df_temporal['tenure_days'] / 365
            self.log_feature("Created tenure-based features")
        
        return df_temporal
    
    def create_aggregation_features(self, adoption_df, customers_df, products_df):
        """Create customer and product level aggregations"""
        self.log_feature("Creating aggregation features")
        
        # Customer-level aggregations
        customer_aggs = adoption_df.groupby('user_id').agg({
            'adopted': ['mean', 'sum', 'count'],
            'tenure_days': ['mean', 'max'],
            'monetary_volume': ['mean', 'sum', 'std'],
            'activity_intensity': ['mean', 'max']
        }).round(3)
        
        # Flatten column names
        customer_aggs.columns = ['_'.join(col) for col in customer_aggs.columns]
        customer_aggs = customer_aggs.add_prefix('customer_')
        customer_aggs.reset_index(inplace=True)
        
        # Product-level aggregations
        product_aggs = adoption_df.groupby('product_id').agg({
            'adopted': ['mean', 'sum', 'count'],
            'monetary_volume': ['mean', 'sum'],
            'utilisation_ratio': ['mean', 'std']
        }).round(3)
        
        product_aggs.columns = ['_'.join(col) for col in product_aggs.columns]
        product_aggs = product_aggs.add_prefix('product_')
        product_aggs.reset_index(inplace=True)
        
        self.log_feature(f"Created customer aggregations: {customer_aggs.shape}")
        self.log_feature(f"Created product aggregations: {product_aggs.shape}")
        
        return customer_aggs, product_aggs
    
    def create_business_logic_features(self, df):
        """Create features based on business logic"""
        self.log_feature("Creating business logic features")
        
        df_business = df.copy()
        
        # Financial stability indicators
        if 'avg_balance' in df.columns and 'monthly_salary' in df.columns:
            df_business['financial_stability'] = df_business['avg_balance'] / (df_business['monthly_salary'] + 1)
            self.log_feature("Created financial_stability ratio")
        
        # Digital engagement score
        digital_cols = ['mobile_login_freq', 'ecom_pos_ratio', 'offer_ctr']
        available_digital = [col for col in digital_cols if col in df.columns]
        
        if available_digital:
            df_business['digital_engagement_score'] = df_business[available_digital].mean(axis=1)
            self.log_feature("Created digital_engagement_score")
        
        # Risk-adjusted value
        if 'clv_score' in df.columns and 'churn_risk' in df.columns:
            df_business['risk_adjusted_value'] = df_business['clv_score'] * (1 - df_business['churn_risk'])
            self.log_feature("Created risk_adjusted_value")
        
        # Product portfolio diversity
        if 'products' in df.columns:
            # Assuming products is a string representation of list
            try:
                df_business['portfolio_diversity'] = df_business['products'].apply(
                    lambda x: len(str(x).split(',')) if pd.notna(x) else 0
                )
                self.log_feature("Created portfolio_diversity")
            except:
                self.log_feature("Could not create portfolio_diversity")
        
        return df_business

# Initialize feature engineer
feature_engineer = FeatureEngineer()
print("🔨 Feature Engineer initialized")

🔨 Feature Engineer initialized


### 8.2 Execute Preprocessing Pipeline

Now let's apply our preprocessing pipeline to the actual datasets.

In [17]:
# Execute preprocessing pipeline
print("🚀 STARTING DATA PREPROCESSING PIPELINE")
print("=" * 60)

# Store original datasets for comparison
original_shapes = {}
processed_datasets = {}

# Process Products Dataset
if 'products' in locals():
    print("\n📦 PREPROCESSING PRODUCTS DATASET")
    print("-" * 40)
    
    original_shapes['products'] = products.shape
    
    # Step 1: Handle missing values
    products_clean = preprocessor.handle_missing_values(products)
    
    # Step 2: Remove duplicates
    products_clean = preprocessor.remove_duplicates(products_clean)
    
    # Step 3: Feature engineering
    products_engineered = feature_engineer.create_business_logic_features(products_clean)
    products_engineered = feature_engineer.create_temporal_features(products_engineered)
    
    # Step 4: Encode categorical features
    products_encoded = preprocessor.encode_categorical_features(products_engineered)
    
    # Step 5: Scale numeric features
    products_final = preprocessor.scale_numeric_features(products_encoded)
    
    processed_datasets['products'] = products_final
    print(f"\n✅ Products preprocessing complete: {original_shapes['products']} → {products_final.shape}")

# Process Adoption Logs Dataset
if 'adoption_logs' in locals():
    print("\n📈 PREPROCESSING ADOPTION LOGS DATASET")
    print("-" * 40)
    
    original_shapes['adoption_logs'] = adoption_logs.shape
    
    # Step 1: Handle missing values
    adoption_clean = preprocessor.handle_missing_values(adoption_logs)
    
    # Step 2: Remove duplicates
    adoption_clean = preprocessor.remove_duplicates(adoption_clean)
    
    # Step 3: Feature engineering
    adoption_engineered = feature_engineer.create_temporal_features(adoption_clean)
    adoption_engineered = feature_engineer.create_business_logic_features(adoption_engineered)
    
    # Step 4: Encode categorical features (preserve target variable)
    target_col = 'adopted' if 'adopted' in adoption_engineered.columns else None
    if target_col:
        target_values = adoption_engineered[target_col].copy()
        adoption_features = adoption_engineered.drop(columns=[target_col])
        adoption_encoded = preprocessor.encode_categorical_features(adoption_features)
        adoption_encoded[target_col] = target_values
    else:
        adoption_encoded = preprocessor.encode_categorical_features(adoption_engineered)
    
    # Step 5: Scale numeric features (except target)
    if target_col:
        target_values = adoption_encoded[target_col].copy()
        adoption_features = adoption_encoded.drop(columns=[target_col])
        adoption_scaled = preprocessor.scale_numeric_features(adoption_features)
        adoption_scaled[target_col] = target_values
        adoption_final = adoption_scaled
    else:
        adoption_final = preprocessor.scale_numeric_features(adoption_encoded)
    
    processed_datasets['adoption_logs'] = adoption_final
    print(f"\n✅ Adoption logs preprocessing complete: {original_shapes['adoption_logs']} → {adoption_final.shape}")

# Process Customers Dataset (if available and not too large)
if 'customers' in locals() and len(customers) <= 50000:
    print("\n👥 PREPROCESSING CUSTOMERS DATASET")
    print("-" * 40)
    
    original_shapes['customers'] = customers.shape
    
    # Use sample for large datasets
    if len(customers) > 10000:
        customers_sample = customers.sample(n=10000, random_state=42)
        preprocessor.log_step(f"Using sample of {len(customers_sample)} customers")
    else:
        customers_sample = customers.copy()
    
    # Step 1: Handle missing values
    customers_clean = preprocessor.handle_missing_values(customers_sample)
    
    # Step 2: Remove duplicates
    customers_clean = preprocessor.remove_duplicates(customers_clean)
    
    # Step 3: Feature engineering
    customers_engineered = feature_engineer.create_business_logic_features(customers_clean)
    customers_engineered = feature_engineer.create_temporal_features(customers_engineered)
    
    # Step 4: Encode categorical features
    customers_encoded = preprocessor.encode_categorical_features(customers_engineered)
    
    # Step 5: Scale numeric features
    customers_final = preprocessor.scale_numeric_features(customers_encoded)
    
    processed_datasets['customers'] = customers_final
    print(f"\n✅ Customers preprocessing complete: {customers_sample.shape} → {customers_final.shape}")

else:
    print("\n⚠️ Customers dataset too large or not available - skipping detailed preprocessing")

print("\n" + "=" * 60)
print("🎉 PREPROCESSING PIPELINE COMPLETED!")
print("=" * 60)

🚀 STARTING DATA PREPROCESSING PIPELINE

📦 PREPROCESSING PRODUCTS DATASET
----------------------------------------
✓ Starting missing value treatment
✓ No duplicates found
🔨 Creating business logic features
🔨 Creating temporal features
🔨 Extracted temporal features from offer_dates
✓ Starting categorical encoding
✓ Frequency encoded product_id (high cardinality: 1000)
✓ One-hot encoded category (10 categories)
✓ One-hot encoded tier (5 categories)
✓ One-hot encoded reward_type (5 categories)
✓ One-hot encoded eligibility (13 categories)
✓ One-hot encoded compliance_tag (5 categories)
✓ Frequency encoded channels (high cardinality: 273)
✓ Frequency encoded target_segments (high cardinality: 277)
🔨 Extracted temporal features from offer_dates
✓ Starting categorical encoding
✓ Frequency encoded product_id (high cardinality: 1000)
✓ One-hot encoded category (10 categories)
✓ One-hot encoded tier (5 categories)
✓ One-hot encoded reward_type (5 categories)
✓ One-hot encoded eligibility (13 ca

### 8.3 Preprocessing Results Analysis

Analyze the impact of our preprocessing pipeline.

In [18]:
# Analyze preprocessing results
print("📊 PREPROCESSING RESULTS ANALYSIS")
print("=" * 50)

# Compare before and after shapes
print("\n📏 Dataset Shape Changes:")
for dataset_name, original_shape in original_shapes.items():
    if dataset_name in processed_datasets:
        new_shape = processed_datasets[dataset_name].shape
        print(f"  {dataset_name}:")
        print(f"    Before: {original_shape[0]:,} rows × {original_shape[1]} columns")
        print(f"    After:  {new_shape[0]:,} rows × {new_shape[1]} columns")
        print(f"    Change: {new_shape[0]-original_shape[0]:+,} rows, {new_shape[1]-original_shape[1]:+} columns")
        print()

# Feature engineering summary
print("\n🔨 Feature Engineering Summary:")
for log_entry in feature_engineer.feature_log:
    print(f"  {log_entry}")

# Preprocessing steps summary
print("\n🔧 Preprocessing Steps Summary:")
for log_entry in preprocessor.preprocessing_log:
    print(f"  {log_entry}")

# Check data quality after preprocessing
print("\n🔍 Post-Processing Data Quality:")
for dataset_name, df in processed_datasets.items():
    missing_count = df.isnull().sum().sum()
    duplicate_count = df.duplicated().sum()
    
    print(f"  {dataset_name}:")
    print(f"    Missing values: {missing_count}")
    print(f"    Duplicates: {duplicate_count}")
    print(f"    Data types: {df.dtypes.value_counts().to_dict()}")
    print()

# Memory usage comparison
print("\n💾 Memory Usage Analysis:")
for dataset_name in original_shapes.keys():
    if dataset_name in processed_datasets:
        if dataset_name == 'products' and 'products' in locals():
            original_memory = products.memory_usage(deep=True).sum() / 1024**2
        elif dataset_name == 'adoption_logs' and 'adoption_logs' in locals():
            original_memory = adoption_logs.memory_usage(deep=True).sum() / 1024**2
        elif dataset_name == 'customers' and 'customers' in locals():
            original_memory = customers.memory_usage(deep=True).sum() / 1024**2
        else:
            continue
            
        new_memory = processed_datasets[dataset_name].memory_usage(deep=True).sum() / 1024**2
        
        print(f"  {dataset_name}:")
        print(f"    Before: {original_memory:.2f} MB")
        print(f"    After:  {new_memory:.2f} MB")
        print(f"    Change: {new_memory-original_memory:+.2f} MB")
        print()

📊 PREPROCESSING RESULTS ANALYSIS

📏 Dataset Shape Changes:
  products:
    Before: 1,000 rows × 26 columns
    After:  1,000 rows × 70 columns
    Change: +0 rows, +44 columns

  adoption_logs:
    Before: 949,650 rows × 10 columns
    After:  949,650 rows × 12 columns
    Change: +0 rows, +2 columns


🔨 Feature Engineering Summary:
  [17:08:39] Creating business logic features
  [17:08:39] Creating temporal features
  [17:08:39] Extracted temporal features from offer_dates
  [17:08:40] Creating temporal features
  [17:08:40] Created tenure-based features
  [17:08:40] Creating business logic features

🔧 Preprocessing Steps Summary:
  [17:08:39] Starting missing value treatment
  [17:08:39] No duplicates found
  [17:08:39] Starting categorical encoding
  [17:08:39] Frequency encoded product_id (high cardinality: 1000)
  [17:08:39] One-hot encoded category (10 categories)
  [17:08:39] One-hot encoded tier (5 categories)
  [17:08:39] One-hot encoded reward_type (5 categories)
  [17:08:39]

### 8.4 Feature Importance & Selection

Analyze feature importance and prepare for modeling.

In [19]:
# Feature Analysis for Modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA

print("🎯 FEATURE ANALYSIS FOR MODELING")
print("=" * 50)

# Prepare data for feature analysis
if 'adoption_logs' in processed_datasets and 'adopted' in processed_datasets['adoption_logs'].columns:
    analysis_df = processed_datasets['adoption_logs'].copy()
    
    # Separate features and target
    target_col = 'adopted'
    X = analysis_df.drop(columns=[target_col])
    y = analysis_df[target_col]
    
    # Remove any remaining non-numeric columns
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    X_numeric = X[numeric_features]
    
    print(f"\n📊 Feature Analysis Dataset:")
    print(f"  Features: {X_numeric.shape[1]}")
    print(f"  Samples: {X_numeric.shape[0]:,}")
    print(f"  Target distribution: {y.value_counts().to_dict()}")
    
    if len(X_numeric.columns) > 0 and len(X_numeric) > 100:
        # 1. Random Forest Feature Importance
        print("\n🌲 Random Forest Feature Importance:")
        try:
            rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
            rf.fit(X_numeric.fillna(0), y)
            
            feature_importance = pd.DataFrame({
                'feature': X_numeric.columns,
                'importance': rf.feature_importances_
            }).sort_values('importance', ascending=False)
            
            print("\n  Top 10 Most Important Features:")
            for i, row in feature_importance.head(10).iterrows():
                print(f"    {row['feature']}: {row['importance']:.4f}")
            
            # Visualize feature importance
            plt.figure(figsize=(10, 6))
            sns.barplot(data=feature_importance.head(15), x='importance', y='feature')
            plt.title('Top 15 Feature Importances (Random Forest)')
            plt.tight_layout()
            plt.show()
            
        except Exception as e:
            print(f"  ❌ Could not compute RF importance: {e}")
        
        # 2. Mutual Information
        print("\n🔗 Mutual Information Analysis:")
        try:
            mi_scores = mutual_info_classif(X_numeric.fillna(0), y, random_state=42)
            mi_df = pd.DataFrame({
                'feature': X_numeric.columns,
                'mutual_info': mi_scores
            }).sort_values('mutual_info', ascending=False)
            
            print("\n  Top 10 Features by Mutual Information:")
            for i, row in mi_df.head(10).iterrows():
                print(f"    {row['feature']}: {row['mutual_info']:.4f}")
                
        except Exception as e:
            print(f"  ❌ Could not compute mutual information: {e}")
        
        # 3. Correlation with target
        print("\n📈 Correlation with Target:")
        try:
            correlations = X_numeric.corrwith(y).abs().sort_values(ascending=False)
            
            print("\n  Top 10 Features by Correlation:")
            for feature, corr in correlations.head(10).items():
                print(f"    {feature}: {corr:.4f}")
                
        except Exception as e:
            print(f"  ❌ Could not compute correlations: {e}")
        
        # 4. Feature Selection Recommendations
        print("\n🎯 Feature Selection Recommendations:")
        
        # High importance features
        if 'feature_importance' in locals():
            high_importance_features = feature_importance[feature_importance['importance'] > 0.01]['feature'].tolist()
            print(f"  • High RF importance features: {len(high_importance_features)}")
        
        # High mutual information features
        if 'mi_df' in locals():
            high_mi_features = mi_df[mi_df['mutual_info'] > 0.01]['feature'].tolist()
            print(f"  • High mutual information features: {len(high_mi_features)}")
        
        # High correlation features
        if 'correlations' in locals():
            high_corr_features = correlations[correlations > 0.1].index.tolist()
            print(f"  • High correlation features: {len(high_corr_features)}")
        
        # Combined feature selection
        if all(x in locals() for x in ['high_importance_features', 'high_mi_features', 'high_corr_features']):
            selected_features = list(set(high_importance_features + high_mi_features + high_corr_features))
            print(f"  • Recommended features for modeling: {len(selected_features)}")
            print(f"  • Feature reduction: {len(X_numeric.columns)} → {len(selected_features)} ({len(selected_features)/len(X_numeric.columns)*100:.1f}%)")
            
            # Store selected features
            with open('selected_features.txt', 'w') as f:
                f.write('\n'.join(selected_features))
            print("  • Selected features saved to 'selected_features.txt'")
    
    else:
        print("\n⚠️ Insufficient data for feature analysis")

else:
    print("\n⚠️ Target variable not available for feature analysis")

🎯 FEATURE ANALYSIS FOR MODELING

📊 Feature Analysis Dataset:
  Features: 10
  Samples: 949,650
  Target distribution: {False: 711603, True: 238047}

🌲 Random Forest Feature Importance:

📊 Feature Analysis Dataset:
  Features: 10
  Samples: 949,650
  Target distribution: {False: 711603, True: 238047}

🌲 Random Forest Feature Importance:
  ❌ Could not compute RF importance: could not allocate 33554432 bytes

🔗 Mutual Information Analysis:
  ❌ Could not compute RF importance: could not allocate 33554432 bytes

🔗 Mutual Information Analysis:

  Top 10 Features by Mutual Information:
    user_id_frequency: 0.0435
    activity_intensity: 0.0060
    product_id_frequency: 0.0030
    recency_days: 0.0007
    tenure_years: 0.0005
    tenure_days: 0.0002
    reward_redemption_rate: 0.0000
    monetary_volume: 0.0000
    utilisation_ratio: 0.0000
    tenure_months: 0.0000

📈 Correlation with Target:

  Top 10 Features by Correlation:
    utilisation_ratio: 0.0020
    recency_days: 0.0017
    activ

### 8.5 Preprocessing Summary & Next Steps

Final summary of preprocessing results and recommendations for modeling.

In [20]:
# Final Preprocessing Summary
import json
from datetime import datetime

print("📋 FINAL PREPROCESSING SUMMARY")
print("=" * 60)

# Create comprehensive summary
preprocessing_summary = {
    'preprocessing_date': datetime.now().isoformat(),
    'datasets_processed': list(processed_datasets.keys()),
    'original_shapes': original_shapes,
    'processed_shapes': {name: df.shape for name, df in processed_datasets.items()},
    'preprocessing_steps': preprocessor.preprocessing_log,
    'feature_engineering_steps': feature_engineer.feature_log,
    'data_quality_metrics': {},
    'recommendations': []
}

# Calculate data quality metrics
for dataset_name, df in processed_datasets.items():
    preprocessing_summary['data_quality_metrics'][dataset_name] = {
        'missing_values': int(df.isnull().sum().sum()),
        'duplicates': int(df.duplicated().sum()),
        'numeric_features': len(df.select_dtypes(include=[np.number]).columns),
        'categorical_features': len(df.select_dtypes(include=['object']).columns),
        'total_features': len(df.columns)
    }

# Generate recommendations
recommendations = [
    "Data is now ready for machine learning modeling",
    "Consider ensemble methods due to feature diversity",
    "Use stratified sampling due to potential class imbalance",
    "Implement time-based cross-validation for robust evaluation",
    "Monitor for data drift in production deployment"
]

if 'selected_features' in locals():
    recommendations.append(f"Use the {len(selected_features)} selected high-importance features")

preprocessing_summary['recommendations'] = recommendations

# Display summary
print("\n🎯 Key Achievements:")
for dataset_name in processed_datasets.keys():
    metrics = preprocessing_summary['data_quality_metrics'][dataset_name]
    original_shape = original_shapes[dataset_name]
    new_shape = processed_datasets[dataset_name].shape
    
    print(f"\n  📊 {dataset_name.upper()}:")
    print(f"    ✓ Shape: {original_shape} → {new_shape}")
    print(f"    ✓ Missing values: {metrics['missing_values']}")
    print(f"    ✓ Duplicates: {metrics['duplicates']}")
    print(f"    ✓ Features: {metrics['total_features']} ({metrics['numeric_features']} numeric, {metrics['categorical_features']} categorical)")

print("\n🚀 Recommendations for Modeling:")
for i, rec in enumerate(recommendations, 1):
    print(f"  {i}. {rec}")

# Save processed datasets
print("\n💾 Saving Processed Datasets:")
for dataset_name, df in processed_datasets.items():
    filename = f'data/processed_{dataset_name}.csv'
    try:
        df.to_csv(filename, index=False)
        print(f"  ✓ Saved {filename} ({df.shape[0]:,} × {df.shape[1]})")
    except Exception as e:
        print(f"  ❌ Failed to save {filename}: {e}")

# Save preprocessing summary
with open('preprocessing_summary.json', 'w') as f:
    json.dump(preprocessing_summary, f, indent=2, default=str)
print("\n📁 Preprocessing summary saved to 'preprocessing_summary.json'")

print("\n" + "=" * 60)
print("🎉 DATA PREPROCESSING COMPLETE!")
print("✅ Ready for model development and training")
print("=" * 60)

📋 FINAL PREPROCESSING SUMMARY

🎯 Key Achievements:

  📊 PRODUCTS:
    ✓ Shape: (1000, 26) → (1000, 70)
    ✓ Missing values: 5000
    ✓ Duplicates: 0
    ✓ Features: 70 (22 numeric, 0 categorical)

  📊 ADOPTION_LOGS:
    ✓ Shape: (949650, 10) → (949650, 12)
    ✓ Missing values: 0
    ✓ Duplicates: 0
    ✓ Features: 12 (10 numeric, 0 categorical)

🚀 Recommendations for Modeling:
  1. Data is now ready for machine learning modeling
  2. Consider ensemble methods due to feature diversity
  3. Use stratified sampling due to potential class imbalance
  4. Implement time-based cross-validation for robust evaluation
  5. Monitor for data drift in production deployment

💾 Saving Processed Datasets:
  ✓ Saved data/processed_products.csv (1,000 × 70)

🎯 Key Achievements:

  📊 PRODUCTS:
    ✓ Shape: (1000, 26) → (1000, 70)
    ✓ Missing values: 5000
    ✓ Duplicates: 0
    ✓ Features: 70 (22 numeric, 0 categorical)

  📊 ADOPTION_LOGS:
    ✓ Shape: (949650, 10) → (949650, 12)
    ✓ Missing values