<a href="https://colab.research.google.com/github/joshtjoyce3/Interactive-ML-Workbench/blob/main/ShopNow_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎯 Interactive Machine Learning Workbench
## Build Predictive Models Step-by-Step

This notebook guides you through building machine learning models using a **point-and-click interface**.

**How to use:**
1. Run each cell once (Shift+Enter or click ▶️)
2. Use the **dropdowns and sliders** to make your choices
3. Click the **buttons** to execute each step
4. Read the theory boxes to understand what you're doing

💡 **To see the Python code:** Double-click any cell or click `Show code`

---

In [1]:
#@title ‚öôÔ∏è **Run This First - Setup** { display-mode: "form" }
#@markdown **Click ‚ñ∂Ô∏è to load all libraries. Wait for ‚úÖ before proceeding.**

# ============================================================
# IMPORTS AND SETUP
# ============================================================
from IPython.display import HTML, display, clear_output
import ipywidgets as widgets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from google.colab import files
import io

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, roc_curve,
                             mean_squared_error, mean_absolute_error, r2_score)

import warnings
warnings.filterwarnings('ignore')

# ============================================================
# GLOBAL STATE
# ============================================================
class MLState:
    """Shared, mutable session state passed between the notebook's steps.

    Holds the loaded data, the chosen target and problem type, the
    train/test partitions, fitted models with their results, a timestamped
    log, and the four cost-matrix values.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Put every field back to its initial, nothing-loaded value."""
        # Data and column bookkeeping
        self.df_original = self.df = None
        self.dropped_columns = []
        self.target = self.problem_type = None
        # Train/test partitions and scaling artifacts
        self.X_train = self.X_test = None
        self.y_train = self.y_test = None
        self.X_train_scaled = self.X_test_scaled = None
        self.scaler = None
        self.scaled_columns = []
        self.feature_names = []
        # Fitted models, their metrics, and the running log
        self.models = {}
        self.results = {}
        self.analysis_log = []
        # Cost matrix
        self.cost_tp, self.cost_fp, self.cost_tn, self.cost_fn = 15, -5, 0, -10

    def log(self, message):
        """Append `message` to the analysis log, prefixed with HH:MM:SS."""
        stamp = datetime.now().strftime("%H:%M:%S")
        self.analysis_log.append(f"[{stamp}] {message}")

    def get_available_columns(self):
        """Return the columns of `df` that are neither dropped nor the target."""
        if self.df is None:
            return []
        remaining = [name for name in self.df.columns if name not in self.dropped_columns]
        if self.target:
            try:
                remaining.remove(self.target)
            except ValueError:
                pass  # target is not among the remaining columns
        return remaining

    def _typed_feature_columns(self, include):
        """Return non-dropped, non-target columns whose dtype matches `include`."""
        if self.df is None:
            return []
        matches = self.df.select_dtypes(include=include).columns.tolist()
        return [name for name in matches
                if name not in self.dropped_columns and name != self.target]

    def get_categorical_columns(self):
        """Return object/category feature columns (dropped and target excluded)."""
        return self._typed_feature_columns(['object', 'category'])

    def get_numeric_columns(self):
        """Return numeric feature columns (dropped and target excluded)."""
        return self._typed_feature_columns([np.number])


# Single shared instance used by every step's callbacks.
state = MLState()

# ============================================================
# HELPER: INFO BOX
# ============================================================
def info_box(text, box_type="theory"):
    """Wrap `text` in a colored call-out and return it as an HTML widget.

    Args:
        text: HTML fragment placed inside the box.
        box_type: One of "info", "warning", "success" or "theory"; any
            other value falls back to the "theory" colors.

    Returns:
        A `widgets.HTML` instance containing the styled `<div>`.
    """
    # box_type -> (background color, left-border color)
    palette = {
        "info": ("#e7f3fe", "#2196F3"),
        "warning": ("#fff3cd", "#ff9800"),
        "success": ("#d4edda", "#28a745"),
        "theory": ("#f0f4f8", "#5c6bc0")
    }
    background, accent = palette.get(box_type, palette["theory"])
    markup = (
        f'<div style="background-color:{background}; border-left:4px solid {accent}; '
        f'padding:12px; margin:10px 0; border-radius:4px;">{text}</div>'
    )
    return widgets.HTML(value=markup)

# Confirmation banner: reached only after the imports and definitions above ran.
print("‚úÖ Setup complete! All libraries loaded.")
print("\nüìö Proceed to Step 1 below.")

‚úÖ Setup complete! All libraries loaded.

üìö Proceed to Step 1 below.


---
# 📊 STEP 1: Load Your Data

In [3]:
#@title üìÇ **Step 1: Load Data** { display-mode: "form" }
#@markdown **Run this cell, then click the button to upload your CSV file.**

# Theory box: explanatory call-out shown above the upload button
# (default "theory" styling from info_box).
theory = info_box("""
<b>üí° Why this matters:</b><br>
The first step in any ML project is loading and understanding your data:
<ul>
<li><b>Shape</b>: How many observations (rows) and variables (columns)?</li>
<li><b>Data Types</b>: Are variables numeric or categorical?</li>
<li><b>Missing Values</b>: Are there gaps in the data?</li>
</ul>
""")

# Upload button: its click handler (upload_clicked) is attached further down.
upload_btn = widgets.Button(
    description='üìÅ Upload CSV File',
    button_style='primary',
    layout=widgets.Layout(width='200px', height='40px')
)

# Output area that upload_clicked prints the load summary into.
load_output = widgets.Output()

def upload_clicked(b):
    """Upload a CSV through Colab's file picker and load it into `state`.

    The file is parsed before the shared state is reset, so an unreadable
    file leaves any previously loaded dataset untouched. On success the
    state is reset, the parsed frame is stored in `state.df_original` with
    a working copy in `state.df`, and a summary (shape, missing values,
    per-column dtype, preview) is printed into `load_output`.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    with load_output:
        clear_output()
        print("üì§ Select your CSV file...")
        uploaded = files.upload()

        if not uploaded:
            return

        # Only the first uploaded file is used.
        filename = next(iter(uploaded))

        # Parse first: resetting before a failed parse would discard the
        # current dataset and everything derived from it.
        try:
            loaded_df = pd.read_csv(io.BytesIO(uploaded[filename]))
        except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError) as err:
            print(f"\n‚ùå Could not read '{filename}' as a CSV file: {err}")
            return

        state.reset()
        state.df_original = loaded_df
        state.df = loaded_df.copy()
        state.log(f"Loaded data: {filename}")

        print("\n" + "="*60)
        print("‚úÖ DATA LOADED SUCCESSFULLY!")
        print("="*60)
        print(f"\nüìä Shape: {state.df.shape[0]:,} rows √ó {state.df.shape[1]} columns")

        # Per-column null counts, computed once and reused in the listing below.
        missing = state.df.isnull().sum()
        total_missing = missing.sum()
        if total_missing > 0:
            print(f"‚ö†Ô∏è  Missing Values: {total_missing:,} total")
        else:
            print(f"‚úÖ Missing Values: None found!")

        print(f"\nüìã Columns ({len(state.df.columns)}):")
        column_info = zip(state.df.columns, state.df.dtypes, missing)
        for i, (col, dtype, missing_count) in enumerate(column_info, 1):
            missing_str = f" ‚ö†Ô∏è {missing_count} missing" if missing_count > 0 else ""
            print(f"   {i}. {col} ({dtype}){missing_str}")

        print(f"\nüîç Preview:")
        display(state.df.head())

# Run upload_clicked whenever the upload button is pressed.
upload_btn.on_click(upload_clicked)

# Render the step as one vertical stack: explanation, button, output area.
display(widgets.VBox([
    theory,
    upload_btn,
    load_output
]))

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

---
# 🎯 STEP 2: Select Target Variable & Problem Type

In [4]:
#@title üéØ **Step 2: Set Target & Problem Type** { display-mode: "form" }
#@markdown **Run this cell, select your target variable, then click Confirm.**

# Explanatory call-out for this step (default "theory" styling).
theory = info_box("""
<b>üí° Classification vs Regression:</b><br>
<ul>
<li><b>Classification</b>: Target is a <b>category</b> (yes/no, spam/not spam)<br>
   ‚Üí We predict <i>which group</i> something belongs to</li>
<li><b>Regression</b>: Target is a <b>continuous number</b> (price, sales)<br>
   ‚Üí We predict <i>how much</i> or <i>what value</i></li>
</ul>
<b>Example:</b> Predicting if a customer will buy (yes/no) = Classification.<br>
Predicting how much they'll spend = Regression.
""")

# Target selector. Starts with a placeholder option; refresh_target replaces
# the options with the loaded frame's column names.
target_dropdown = widgets.Dropdown(
    options=['-- Load data first --'],
    description='Target:',
    style={'description_width': '80px'},
    layout=widgets.Layout(width='300px')
)

# Problem-type selector; confirm_target stores the lower-cased choice.
problem_dropdown = widgets.Dropdown(
    options=['Classification', 'Regression'],
    value='Classification',
    description='Type:',
    style={'description_width': '80px'},
    layout=widgets.Layout(width='200px')
)

# Action buttons; handlers are attached after the functions are defined.
refresh_btn = widgets.Button(description='üîÑ Refresh Options', button_style='info', layout=widgets.Layout(width='150px'))
confirm_btn = widgets.Button(description='‚úÖ Confirm Selection', button_style='success', layout=widgets.Layout(width='170px'))

# Output area shared by refresh_target and confirm_target.
target_output = widgets.Output()

def refresh_target(b):
    """Fill the target dropdown with the loaded frame's column names.

    Does nothing when no data has been loaded yet.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    if state.df is None:
        return
    target_dropdown.options = list(state.df.columns)
    with target_output:
        clear_output()
        print("‚úÖ Options refreshed!")

def confirm_target(b):
    """Validate and store the chosen target column and problem type.

    The selection is checked before anything is written to `state`: the
    dropdown value must be a column of `state.df` (it can still be the
    placeholder option if the list was never refreshed), and a regression
    target must be numeric. On success `state.target` and
    `state.problem_type` are set and a summary is printed: the class
    distribution (with an imbalance warning for two-class targets) for
    classification, or mean/std/min/max for regression.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    with target_output:
        clear_output()
        if state.df is None:
            print("‚ùå Please load data first (Step 1)!")
            return

        chosen = target_dropdown.value
        if chosen not in state.df.columns:
            print("‚ùå Select a valid target column (click Refresh Options first)!")
            return

        problem_type = problem_dropdown.value.lower()
        target_values = state.df[chosen]

        # describe() only yields mean/std/min/max for non-boolean numeric
        # data, so reject other targets before committing the selection.
        is_number = (pd.api.types.is_numeric_dtype(target_values)
                     and not pd.api.types.is_bool_dtype(target_values))
        if problem_type == 'regression' and not is_number:
            print(f"‚ùå Regression needs a numeric target, but '{chosen}' is {target_values.dtype}.")
            return

        state.target = chosen
        state.problem_type = problem_type
        state.log(f"Target: {state.target} ({state.problem_type})")

        print("="*60)
        print("‚úÖ TARGET CONFIGURED")
        print("="*60)
        print(f"\nüéØ Target Variable: {state.target}")
        print(f"üìã Problem Type: {state.problem_type.upper()}")

        if state.problem_type == 'classification':
            print(f"\nüìä Class Distribution:")
            dist = target_values.value_counts()
            for cls, count in dist.items():
                pct = count / len(state.df) * 100
                print(f"   {cls}: {count:,} ({pct:.1f}%)")
            # A zero count (unused category) would make the ratio infinite.
            if len(dist) == 2 and dist.min() > 0:
                ratio = dist.max() / dist.min()
                if ratio > 3:
                    print(f"\n‚ö†Ô∏è  Class Imbalance! Ratio: {ratio:.1f}:1")
        else:
            stats = target_values.describe()
            print(f"\nüìä Target Statistics:")
            print(f"   Mean: {stats['mean']:.2f}, Std: {stats['std']:.2f}")
            print(f"   Min: {stats['min']:.2f}, Max: {stats['max']:.2f}")

        # The missing-value step only treats feature columns, so surface
        # gaps in the target here.
        missing_target = int(target_values.isnull().sum())
        if missing_target > 0:
            print(f"\n‚ö†Ô∏è  Target has {missing_target:,} missing values")

# Attach the click handlers defined above.
refresh_btn.on_click(refresh_target)
confirm_btn.on_click(confirm_target)

# Layout: explanation, refresh button, the two dropdowns side by side,
# confirm button, then the shared output area.
display(widgets.VBox([
    theory,
    refresh_btn,
    widgets.HBox([target_dropdown, problem_dropdown]),
    confirm_btn,
    target_output
]))

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

---
# 🗑️ STEP 3: Drop Unnecessary Columns

In [5]:
#@title üóëÔ∏è **Step 3: Drop Columns** { display-mode: "form" }
#@markdown **Run this cell, select columns to drop, then click Drop.**

# Explanatory call-out for this step (default "theory" styling).
theory = info_box("""
<b>üí° What columns should you drop?</b><br>
<ul>
<li><b>Identifiers</b>: customer_id, transaction_id ‚Üí unique per row, no predictive value</li>
<li><b>Free Text</b>: comments, descriptions ‚Üí requires special NLP processing</li>
<li><b>Redundant</b>: If two columns contain the same information</li>
<li><b>Data Leakage</b>: Variables not available at prediction time</li>
</ul>
<b>‚ö†Ô∏è Once dropped, columns won't appear in later steps!</b>
""")

# Multi-select list of droppable columns. Starts with a placeholder option;
# refresh_drop fills it from state.get_available_columns().
drop_select = widgets.SelectMultiple(
    options=['-- Load data first --'],
    description='Select:',
    layout=widgets.Layout(width='400px', height='150px'),
    style={'description_width': '60px'}
)

# Action buttons; handlers are attached after the functions are defined.
refresh_drop_btn = widgets.Button(description='üîÑ Refresh List', button_style='info', layout=widgets.Layout(width='130px'))
drop_btn = widgets.Button(description='üóëÔ∏è Drop Selected', button_style='warning', layout=widgets.Layout(width='150px'))

# Output area shared by refresh_drop and drop_columns.
drop_output = widgets.Output()

def refresh_drop(b):
    """Reload the drop list with the columns that are still available.

    Does nothing when no data has been loaded yet.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    if state.df is None:
        return
    drop_select.options = state.get_available_columns()
    with drop_output:
        clear_output()
        available_count = len(drop_select.options)
        print(f"‚úÖ {available_count} columns available")

def drop_columns(b):
    """Remove the selected columns from `state.df` and record them.

    Only selections that are real columns of the current frame are dropped,
    and only those are logged and reported, so the placeholder option or a
    stale entry is never announced as dropped. The configured target column
    is kept even if it was selected, because later steps index the frame by
    it.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    with drop_output:
        clear_output()
        if state.df is None:
            print("‚ùå Load data first!")
            return
        if len(drop_select.value) == 0:
            print("‚ÑπÔ∏è No columns selected")
            return

        # Snapshot the selection: reassigning the options below clears it.
        selected = list(drop_select.value)
        protected = [col for col in selected if col == state.target]
        to_drop = [col for col in selected
                   if col in state.df.columns and col != state.target]

        if protected:
            print(f"‚ö†Ô∏è Kept target column: {protected}")
        if not to_drop:
            print("‚ÑπÔ∏è No columns dropped")
            return

        state.df = state.df.drop(columns=to_drop)
        state.dropped_columns.extend(to_drop)
        state.log(f"Dropped: {to_drop}")

        remaining = state.get_available_columns()
        print("‚úÖ Columns dropped!")
        print(f"\nüóëÔ∏è Dropped: {to_drop}")
        print(f"\nüìã Remaining ({len(remaining)}):")
        for col in remaining:
            print(f"   ‚Ä¢ {col}")

        # Refresh the list
        drop_select.options = remaining

# Attach the click handlers defined above.
refresh_drop_btn.on_click(refresh_drop)
drop_btn.on_click(drop_columns)

# Layout: explanation, selection hint, refresh button, column list,
# drop button, then the shared output area.
display(widgets.VBox([
    theory,
    widgets.HTML("<i>Hold Ctrl/Cmd to select multiple columns:</i>"),
    refresh_drop_btn,
    drop_select,
    drop_btn,
    drop_output
]))

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

---
# 🔧 STEP 4: Handle Missing Values

In [6]:
#@title üîß **Step 4: Handle Missing Values** { display-mode: "form" }
#@markdown **Run this cell, select strategies, then click Apply.**

# Explanatory call-out for this step (default "theory" styling).
theory = info_box("""
<b>üí° Missing Value Strategies:</b><br><br>
<b>For Numeric Variables:</b>
<ul>
<li><b>Mean</b>: Replace with average ‚Üí Good for normal distributions</li>
<li><b>Median</b>: Replace with middle value ‚Üí Better for skewed data</li>
<li><b>Zero</b>: Replace with 0 ‚Üí Only if 0 is meaningful</li>
</ul>
<b>For Categorical Variables:</b>
<ul>
<li><b>Mode</b>: Replace with most frequent category</li>
<li><b>Unknown</b>: Create a new "Unknown" category</li>
</ul>
""")

# Strategy for numeric columns. Each option is (label shown, value read by
# apply_missing): 'median', 'mean', 'zero' or 'drop'.
numeric_strategy = widgets.Dropdown(
    options=[('Median (recommended)', 'median'), ('Mean', 'mean'), ('Zero', 'zero'), ('Drop Rows', 'drop')],
    value='median',
    description='Numeric:',
    style={'description_width': '80px'},
    layout=widgets.Layout(width='250px')
)

# Strategy for categorical columns: 'mode', 'unknown' or 'drop'.
categorical_strategy = widgets.Dropdown(
    options=[('Mode (most frequent)', 'mode'), ('Mark as "Unknown"', 'unknown'), ('Drop Rows', 'drop')],
    value='mode',
    description='Categorical:',
    style={'description_width': '80px'},
    layout=widgets.Layout(width='250px')
)

# Action buttons; handlers are attached after the functions are defined.
check_missing_btn = widgets.Button(description='üîç Check Missing', button_style='info', layout=widgets.Layout(width='140px'))
apply_missing_btn = widgets.Button(description='‚ú® Apply Strategy', button_style='success', layout=widgets.Layout(width='150px'))

# Output area shared by check_missing and apply_missing.
missing_output = widgets.Output()

def check_missing(b):
    """Print the count and percentage of missing values for each column.

    Only columns with at least one missing value are listed; a single
    confirmation line is printed when there are none.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    with missing_output:
        clear_output()
        if state.df is None:
            print("‚ùå Load data first!")
            return

        null_counts = state.df.isnull().sum()
        with_gaps = null_counts[null_counts > 0]
        if with_gaps.empty:
            print("‚úÖ No missing values!")
            return

        print("‚ö†Ô∏è MISSING VALUES:")
        n_rows = len(state.df)
        for col, count in with_gaps.items():
            pct = count / n_rows * 100
            print(f"   {col}: {count:,} ({pct:.1f}%)")

def apply_missing(b):
    """Apply the chosen missing-value strategies to the feature columns.

    Numeric columns are handled first, then categorical ones. Both column
    lists come from `state` and therefore exclude dropped columns and the
    target. Afterwards the dataset shape and the number of missing values
    still present are printed.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    with missing_output:
        clear_output()
        if state.df is None:
            print("‚ùå Load data first!")
            return

        numeric_cols = state.get_numeric_columns()
        categorical_cols = state.get_categorical_columns()

        # Numeric columns: drop incomplete rows, or fill with a statistic / 0.
        numeric_choice = numeric_strategy.value
        if numeric_choice == 'drop':
            state.df = state.df.dropna(subset=numeric_cols)
        elif numeric_choice in ('mean', 'median', 'zero'):
            for name in numeric_cols:
                column = state.df[name]
                if numeric_choice == 'mean':
                    replacement = column.mean()
                elif numeric_choice == 'median':
                    replacement = column.median()
                else:
                    replacement = 0
                state.df[name] = column.fillna(replacement)

        # Categorical columns: drop incomplete rows, or fill with a label.
        categorical_choice = categorical_strategy.value
        if categorical_choice == 'drop':
            state.df = state.df.dropna(subset=categorical_cols)
        elif categorical_choice == 'unknown':
            for name in categorical_cols:
                state.df[name] = state.df[name].fillna('Unknown')
        elif categorical_choice == 'mode':
            for name in categorical_cols:
                most_common = state.df[name].mode()
                # mode() is empty when the column has no non-null values.
                filler = most_common[0] if len(most_common) > 0 else 'Unknown'
                state.df[name] = state.df[name].fillna(filler)

        state.log(f"Missing values handled")
        print("‚úÖ Missing values handled!")
        print(f"\nüìè Dataset: {state.df.shape[0]:,} rows √ó {state.df.shape[1]} columns")
        remaining = state.df.isnull().sum().sum()
        status = '‚úÖ No' if remaining == 0 else '‚ö†Ô∏è ' + str(remaining)
        print(f"{status} missing values remain")

# Attach the click handlers defined above.
check_missing_btn.on_click(check_missing)
apply_missing_btn.on_click(apply_missing)

# Layout: explanation, the two strategy dropdowns side by side, the two
# buttons side by side, then the shared output area.
display(widgets.VBox([
    theory,
    widgets.HBox([numeric_strategy, categorical_strategy]),
    widgets.HBox([check_missing_btn, apply_missing_btn]),
    missing_output
]))

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

---
# 📊 STEP 5: Handle Outliers

In [None]:
#@title üìä **Step 5: Handle Outliers** { display-mode: "form" }
#@markdown **Run this cell, select strategy, then click Apply.**

# Explanatory call-out for this step (default "theory" styling).
theory = info_box("""
<b>üí° Outlier Handling Strategies:</b><br>
<ul>
<li><b>None</b>: Keep all data ‚Üí Use if outliers are meaningful (e.g., fraud)</li>
<li><b>Clip (IQR)</b>: Cap values at 1.5√óIQR ‚Üí Standard statistical method</li>
<li><b>Clip (Percentile)</b>: Cap at specific percentiles ‚Üí More control</li>
<li><b>Remove</b>: Delete outlier rows ‚Üí Use cautiously, loses data</li>
</ul>
<b>‚ö†Ô∏è Tree models (RF, XGBoost) are robust to outliers. Linear models are sensitive.</b>
""")

# Strategy selector. Each option is (label shown, value read by
# apply_outliers): 'none', 'clip_iqr', 'clip_pct' or 'remove'.
outlier_dropdown = widgets.Dropdown(
    options=[
        ('None - Keep all data', 'none'),
        ('Clip using IQR (1.5√ó)', 'clip_iqr'),
        ('Clip at percentiles', 'clip_pct'),
        ('Remove outlier rows', 'remove')
    ],
    value='none',
    description='Strategy:',
    style={'description_width': '70px'},
    layout=widgets.Layout(width='280px')
)

# Percentile bounds, read only by the 'clip_pct' strategy. The slider ranges
# (0-10 and 90-100) do not overlap, so the lower bound cannot exceed the upper.
lower_pct = widgets.IntSlider(value=1, min=0, max=10, description='Lower %:', style={'description_width': '70px'})
upper_pct = widgets.IntSlider(value=99, min=90, max=100, description='Upper %:', style={'description_width': '70px'})

# Apply button and the output area apply_outliers prints into.
apply_outlier_btn = widgets.Button(description='üîß Apply', button_style='success', layout=widgets.Layout(width='120px'))
outlier_output = widgets.Output()

def _iqr_bounds(series):
    """Return the (lower, upper) 1.5*IQR fences of `series`, or None.

    None is returned when the interquartile range is zero or undefined
    (e.g. a mostly-constant 0/1 flag, or an all-missing column). Fences of
    zero width would otherwise flag every differing value as an outlier.
    """
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    if not iqr > 0:  # False for both 0 and NaN
        return None
    return q1 - 1.5 * iqr, q3 + 1.5 * iqr


def apply_outliers(b):
    """Apply the selected outlier strategy to the numeric feature columns.

    Strategies (value of `outlier_dropdown`):
      - 'none': leave the data unchanged.
      - 'clip_iqr': clip each column to its 1.5*IQR fences.
      - 'clip_pct': clip each column to the percentiles chosen on the sliders.
      - 'remove': drop rows that fall outside the IQR fences of any column.

    For the IQR-based strategies, columns whose IQR is zero or undefined
    are skipped and listed. In 'remove', all fences are computed on the
    frame as it was before any row is removed, so the result does not
    depend on column order, and missing values are not treated as outliers.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    with outlier_output:
        clear_output()
        if state.df is None:
            print("‚ùå Load data first!")
            return

        num_cols = state.get_numeric_columns()
        original_len = len(state.df)
        strategy = outlier_dropdown.value
        skipped = []  # columns left untouched because their IQR is unusable

        if strategy == 'none':
            print("‚ÑπÔ∏è No outlier handling applied")
        elif strategy == 'clip_iqr':
            for col in num_cols:
                bounds = _iqr_bounds(state.df[col])
                if bounds is None:
                    skipped.append(col)
                    continue
                state.df[col] = state.df[col].clip(*bounds)
            print("‚úÖ Outliers clipped using IQR")
        elif strategy == 'clip_pct':
            for col in num_cols:
                lo = state.df[col].quantile(lower_pct.value / 100)
                hi = state.df[col].quantile(upper_pct.value / 100)
                state.df[col] = state.df[col].clip(lo, hi)
            print(f"‚úÖ Clipped at {lower_pct.value}th and {upper_pct.value}th percentiles")
        elif strategy == 'remove':
            # Build one combined keep-mask from the untrimmed frame, then
            # filter once.
            keep = np.ones(len(state.df), dtype=bool)
            for col in num_cols:
                values = state.df[col]
                bounds = _iqr_bounds(values)
                if bounds is None:
                    skipped.append(col)
                    continue
                # NaN fails every comparison, so keep missing values explicitly.
                in_range = values.between(*bounds) | values.isna()
                keep &= in_range.to_numpy(dtype=bool)
            state.df = state.df[keep]
            print(f"‚úÖ Removed {original_len - len(state.df):,} rows")

        if skipped:
            print(f"‚ÑπÔ∏è Skipped (IQR is zero or undefined): {skipped}")

        state.log(f"Outliers: {strategy}")
        print(f"\nüìè Dataset: {state.df.shape[0]:,} rows")

# Attach the click handler defined above.
apply_outlier_btn.on_click(apply_outliers)

# Layout: explanation, strategy dropdown, hint, percentile sliders side by
# side, apply button, then the output area.
display(widgets.VBox([
    theory,
    outlier_dropdown,
    widgets.HTML("<i>Percentile bounds (only for 'Clip at percentiles'):</i>"),
    widgets.HBox([lower_pct, upper_pct]),
    apply_outlier_btn,
    outlier_output
]))

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

---
# 🔠 STEP 6: Encode Categorical Variables

In [7]:
#@title üî† **Step 6: Encode Categorical Variables** { display-mode: "form" }
#@markdown **Run this cell, select columns and encoding type, then click Apply.**

# Explanatory call-out for this step (default "theory" styling).
theory = info_box("""
<b>üí° Encoding Strategies:</b><br><br>
<b>One-Hot Encoding</b> (for unordered categories):
<ul>
<li>Creates binary columns for each category</li>
<li>Example: Color (Red, Blue) ‚Üí Color_Red, Color_Blue</li>
<li>Use for: gender, country, product type</li>
</ul>
<b>Ordinal Encoding</b> (for ordered categories):
<ul>
<li>Converts to numbers: Low=0, Medium=1, High=2</li>
<li>Use for: education level, satisfaction rating</li>
<li>‚ö†Ô∏è You must specify the correct order!</li>
</ul>
""")

# Columns to one-hot encode. Starts with a placeholder option;
# refresh_encode fills it with the categorical columns.
onehot_select = widgets.SelectMultiple(
    options=['-- Refresh to see columns --'],
    description='One-Hot:',
    layout=widgets.Layout(width='250px', height='120px'),
    style={'description_width': '70px'}
)

# Single column to ordinal-encode. '-- Select --' is the "nothing chosen"
# sentinel that apply_ordinal checks for.
ordinal_dropdown = widgets.Dropdown(
    options=['-- Select --'],
    description='Ordinal:',
    style={'description_width': '70px'},
    layout=widgets.Layout(width='250px')
)

# Comma-separated category order typed by the user; apply_ordinal splits it
# on ',' and maps each entry to its position.
ordinal_order_text = widgets.Text(
    placeholder='e.g., Low, Medium, High (low to high)',
    description='Order:',
    style={'description_width': '70px'},
    layout=widgets.Layout(width='350px')
)

# Action buttons; handlers are attached after the functions are defined.
refresh_encode_btn = widgets.Button(description='üîÑ Refresh', button_style='info', layout=widgets.Layout(width='100px'))
apply_onehot_btn = widgets.Button(description='üî† Apply One-Hot', button_style='success', layout=widgets.Layout(width='150px'))
apply_ordinal_btn = widgets.Button(description='üî¢ Apply Ordinal', button_style='success', layout=widgets.Layout(width='150px'))

# Output area shared by all three encoding handlers.
encode_output = widgets.Output()

def refresh_encode(b):
    """Reload both encoding selectors and preview each categorical column.

    Clears the encoding output area, then prints the categorical column
    names followed by up to five distinct values per column. Does nothing
    when no data has been loaded yet.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    if state.df is None:
        return

    categorical = state.get_categorical_columns()
    onehot_select.options = categorical or ['-- No categorical columns --']
    ordinal_dropdown.options = ['-- Select --'] + categorical

    with encode_output:
        clear_output()
        if not categorical:
            return
        print(f"üìã Categorical columns: {categorical}")
        for name in categorical:
            distinct = state.df[name].unique()
            preview = list(distinct)[:5]
            more = '...' if len(distinct) > 5 else ''
            print(f"   {name}: {preview}{more}")

def apply_onehot(b):
    """One-hot encode the selected categorical columns in `state.df`.

    Uses `pd.get_dummies(..., drop_first=True)`. The selectors are updated
    directly instead of through `refresh_encode`, because that function
    clears `encode_output` and would erase the confirmation printed here.

    Args:
        b: The clicked button, as passed by `on_click`; may be None.
    """
    with encode_output:
        clear_output()
        if state.df is None:
            print("‚ùå Load data first!")
            return
        if len(onehot_select.value) == 0:
            print("‚ÑπÔ∏è No columns selected")
            return

        # Ignore placeholder entries and columns that no longer exist.
        cols = [c for c in onehot_select.value if c in state.df.columns]
        if not cols:
            print("‚ÑπÔ∏è No columns selected")
            return

        state.df = pd.get_dummies(state.df, columns=cols, drop_first=True)
        state.log(f"One-hot encoded: {cols}")

        # Refresh the lists
        remaining = state.get_categorical_columns()
        onehot_select.options = remaining if remaining else ['-- No categorical columns --']
        ordinal_dropdown.options = ['-- Select --'] + remaining

        print(f"‚úÖ One-hot encoded: {cols}")
        print(f"\nüìè Dataset: {state.df.shape[1]} columns now")

def apply_ordinal(b):
    """Replace a categorical column with integer ranks in a user-given order.

    The order is read from `ordinal_order_text` as a comma-separated list
    (lowest first); blank entries are ignored. The column is modified only
    if the order has no duplicates and names every non-missing value in the
    column. Otherwise unlisted categories would silently become missing
    values. The selectors are updated directly instead of through
    `refresh_encode`, which would clear the confirmation printed here.

    Args:
        b: The clicked button, as passed by `on_click`; may be None.
    """
    with encode_output:
        clear_output()
        if state.df is None:
            print("‚ùå Load data first!")
            return

        col = ordinal_dropdown.value
        if col == '-- Select --' or not ordinal_order_text.value:
            print("‚ÑπÔ∏è Select column and enter order")
            return
        if col not in state.df.columns:
            print(f"‚ùå Column '{col}' is not in the dataset. Click Refresh!")
            return

        order = [x.strip() for x in ordinal_order_text.value.split(',') if x.strip()]
        if not order:
            print("‚ÑπÔ∏è Select column and enter order")
            return
        if len(set(order)) != len(order):
            print(f"‚ùå Order contains duplicate entries: {order}")
            return

        order_map = {val: idx for idx, val in enumerate(order)}

        # Every category present in the data must have a rank.
        present = state.df[col].dropna().unique()
        unmapped = [val for val in present if val not in order_map]
        if unmapped:
            print(f"‚ùå Order is missing {len(unmapped)} value(s) found in '{col}': {unmapped[:10]}")
            print("   Nothing was changed. List every category, lowest to highest.")
            return

        state.df[col] = state.df[col].map(order_map)
        state.log(f"Ordinal encoded: {col}")

        # Refresh the lists
        remaining = state.get_categorical_columns()
        onehot_select.options = remaining if remaining else ['-- No categorical columns --']
        ordinal_dropdown.options = ['-- Select --'] + remaining

        print(f"‚úÖ Ordinal encoded '{col}': {order_map}")

# Attach the click handlers defined above.
refresh_encode_btn.on_click(refresh_encode)
apply_onehot_btn.on_click(apply_onehot)
apply_ordinal_btn.on_click(apply_ordinal)

# Layout: explanation, refresh button, the one-hot section, the ordinal
# section (column + order on one row), then the shared output area.
display(widgets.VBox([
    theory,
    refresh_encode_btn,
    widgets.HTML("<b>One-Hot Encoding:</b> <i>(Hold Ctrl/Cmd for multiple)</i>"),
    onehot_select,
    apply_onehot_btn,
    widgets.HTML("<br><b>Ordinal Encoding:</b> <i>(one at a time)</i>"),
    widgets.HBox([ordinal_dropdown, ordinal_order_text]),
    apply_ordinal_btn,
    encode_output
]))

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

---
# ⚖️ STEP 7: Scale Features & Split Data

In [8]:
#@title ‚öñÔ∏è **Step 7: Scale & Split Data** { display-mode: "form" }
#@markdown **Run this cell, select columns to scale, set test size, then click Split.**

# Explanatory call-out for this step (default "theory" styling).
theory = info_box("""
<b>üí° Why Scale?</b>
<ul>
<li><b>Linear models (Ridge, Lasso)</b>: REQUIRE scaling - variables with larger ranges dominate</li>
<li><b>Tree models (RF, XGBoost)</b>: DON'T need scaling - they split on values</li>
</ul>
<b>üí° Why Split?</b>
<ul>
<li><b>Training Set</b>: Model learns from this data</li>
<li><b>Test Set</b>: Held out to evaluate generalization</li>
</ul>
<b>‚ö†Ô∏è Always fit scaler on TRAINING data only, then transform both train and test!</b>
""")

# Columns to standardize. Starts with a placeholder option; refresh_scale
# fills it with the numeric feature columns.
scale_select = widgets.SelectMultiple(
    options=['-- Refresh to see columns --'],
    description='Scale:',
    layout=widgets.Layout(width='300px', height='120px'),
    style={'description_width': '50px'}
)

# Fraction of rows held out for testing (10%-40% in 5% steps); passed to
# train_test_split as test_size.
test_size_slider = widgets.FloatSlider(
    value=0.2, min=0.1, max=0.4, step=0.05,
    description='Test Size:',
    readout_format='.0%',
    style={'description_width': '80px'},
    layout=widgets.Layout(width='350px')
)

# Seed passed to train_test_split as random_state.
random_seed_input = widgets.IntText(value=42, description='Seed:', style={'description_width': '50px'}, layout=widgets.Layout(width='120px'))

# Action buttons; handlers are attached after the functions are defined.
refresh_scale_btn = widgets.Button(description='üîÑ Refresh', button_style='info', layout=widgets.Layout(width='100px'))
split_btn = widgets.Button(description='‚úÇÔ∏è Split Data', button_style='success', layout=widgets.Layout(width='150px'))

# Output area shared by refresh_scale and split_data.
split_output = widgets.Output()

def refresh_scale(b):
    """Reload the scaling list with the numeric feature columns.

    Does nothing when no data has been loaded yet.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    if state.df is None:
        return
    scale_select.options = state.get_numeric_columns()
    with split_output:
        clear_output()
        numeric_count = len(scale_select.options)
        print(f"üìä {numeric_count} numeric columns available")

def split_data(b):
    """Split `state.df` into train/test sets and standardize chosen columns.

    The split is stratified on the target for classification. The scaler is
    fitted on the training rows only and then applied to both partitions;
    scaled copies are stored alongside the unscaled ones. Nothing in
    `state` is modified unless the split itself succeeds. After a
    successful split, warnings are printed for feature columns that are
    still non-numeric and for remaining missing values.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    with split_output:
        clear_output()
        if state.df is None or state.target is None:
            print("‚ùå Complete Steps 1-2 first!")
            return
        if state.target not in state.df.columns:
            print(f"‚ùå Target '{state.target}' is not in the dataset. Re-run Step 2!")
            return

        X = state.df.drop(columns=[state.target])
        y = state.df[state.target]

        split_kwargs = {
            'test_size': test_size_slider.value,
            'random_state': random_seed_input.value,
        }
        if state.problem_type == 'classification':
            split_kwargs['stratify'] = y

        # Stratifying fails when a class has too few rows; report it here
        # instead of leaving a traceback in the output area.
        try:
            X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
        except ValueError as err:
            print(f"‚ùå Could not split the data: {err}")
            return

        state.feature_names = list(X.columns)
        state.X_train, state.X_test = X_train, X_test
        state.y_train, state.y_test = y_train, y_test

        # Scaling
        state.scaled_columns = [c for c in scale_select.value if c in X_train.columns]
        state.X_train_scaled = X_train.copy()
        state.X_test_scaled = X_test.copy()
        state.scaler = None  # do not keep a scaler fitted by an earlier split

        if state.scaled_columns:
            state.scaler = StandardScaler()
            state.X_train_scaled[state.scaled_columns] = state.scaler.fit_transform(X_train[state.scaled_columns])
            state.X_test_scaled[state.scaled_columns] = state.scaler.transform(X_test[state.scaled_columns])

        state.log(f"Split: {len(X_train)} train, {len(X_test)} test")

        print("="*60)
        print("‚úÖ DATA SPLIT COMPLETE!")
        print("="*60)
        print(f"\nüìä Training: {len(X_train):,} samples ({(1-test_size_slider.value)*100:.0f}%)")
        print(f"üìä Test: {len(X_test):,} samples ({test_size_slider.value*100:.0f}%)")
        print(f"üìê Features: {len(state.feature_names)}")
        if state.scaled_columns:
            print(f"\n‚öñÔ∏è Scaled: {state.scaled_columns}")

        # Flag preparation steps that still look unfinished.
        ready = True
        non_numeric = X.select_dtypes(exclude=[np.number, 'bool']).columns.tolist()
        if non_numeric:
            ready = False
            print(f"\n‚ö†Ô∏è Non-numeric features (encode or drop them first): {non_numeric}")
        missing_total = int(X.isnull().sum().sum()) + int(y.isnull().sum())
        if missing_total > 0:
            ready = False
            print(f"\n‚ö†Ô∏è {missing_total:,} missing values remain in the features/target")
        if ready:
            print("\nüéâ Ready to train models!")

# Attach the click handlers defined above.
refresh_scale_btn.on_click(refresh_scale)
split_btn.on_click(split_data)

# Layout: explanation, hint, refresh button, column list, test-size slider
# and seed on one row, split button, then the shared output area.
display(widgets.VBox([
    theory,
    widgets.HTML("<b>Select columns to scale</b> <i>(for Ridge/Lasso)</i>:"),
    refresh_scale_btn,
    scale_select,
    widgets.HBox([test_size_slider, random_seed_input]),
    split_btn,
    split_output
]))

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

---
# 💰 STEP 8: Define Cost Matrix

In [9]:
#@title üí∞ **Step 8: Define Cost Matrix** { display-mode: "form" }
#@markdown **Run this cell, enter costs/benefits, then click Save.**

# Explanatory call-out: an HTML table laying out the four confusion-matrix
# cells, followed by a worked marketing example.
theory = info_box("""
<b>üí° Understanding the Cost Matrix:</b><br><br>
<table style="border-collapse:collapse; width:100%;">
<tr><th></th><th style="border:1px solid #ddd; padding:8px;">Predicted Positive</th><th style="border:1px solid #ddd; padding:8px;">Predicted Negative</th></tr>
<tr><td style="border:1px solid #ddd; padding:8px;"><b>Actual Positive</b></td><td style="border:1px solid #ddd; padding:8px; background:#d4edda;">TP ‚úÖ Correct</td><td style="border:1px solid #ddd; padding:8px; background:#f8d7da;">FN ‚ùå Missed</td></tr>
<tr><td style="border:1px solid #ddd; padding:8px;"><b>Actual Negative</b></td><td style="border:1px solid #ddd; padding:8px; background:#fff3cd;">FP ‚ö†Ô∏è Wasted</td><td style="border:1px solid #ddd; padding:8px; background:#d4edda;">TN ‚úÖ Correct</td></tr>
</table>
<br><b>Example (Marketing):</b> TP=+\$15 (revenue), FP=-\$5 (wasted cost), TN=\$0, FN=-\$10 (missed sale)
""")

# One numeric input per confusion-matrix cell. The initial values match the
# defaults set in MLState.reset() (15, -5, 0, -10).
cost_tp_input = widgets.FloatText(value=15, description='TP Value ($):', style={'description_width': '100px'}, layout=widgets.Layout(width='180px'))
cost_fp_input = widgets.FloatText(value=-5, description='FP Cost ($):', style={'description_width': '100px'}, layout=widgets.Layout(width='180px'))
cost_tn_input = widgets.FloatText(value=0, description='TN Value ($):', style={'description_width': '100px'}, layout=widgets.Layout(width='180px'))
cost_fn_input = widgets.FloatText(value=-10, description='FN Cost ($):', style={'description_width': '100px'}, layout=widgets.Layout(width='180px'))

# Save button and the output area save_costs prints into.
save_value_btn = widgets.Button(description='üíæ Save Costs', button_style='success', layout=widgets.Layout(width='130px'))
cost_output = widgets.Output()

def save_costs(b):
    """Copy the four cost inputs into `state` and echo the saved matrix.

    Args:
        b: The clicked button, as passed by `on_click`; unused.
    """
    # state attribute -> input widget holding its value
    sources = {
        'cost_tp': cost_tp_input,
        'cost_fp': cost_fp_input,
        'cost_tn': cost_tn_input,
        'cost_fn': cost_fn_input,
    }
    with cost_output:
        clear_output()
        for attribute, field in sources.items():
            setattr(state, attribute, field.value)
        print("‚úÖ Cost matrix saved!")
        print(f"\n   TP: ${state.cost_tp}  |  FN: ${state.cost_fn}")
        print(f"   FP: ${state.cost_fp}  |  TN: ${state.cost_tn}")

# Attach the click handler defined above.
save_value_btn.on_click(save_costs)

# Layout mirrors the confusion matrix: TP/FN on the first row, FP/TN on the
# second, followed by the save button and the output area.
display(widgets.VBox([
    theory,
    widgets.HTML("<b>Enter positive values for benefits, negative for costs:</b>"),
    widgets.HBox([cost_tp_input, cost_fn_input]),
    widgets.HBox([cost_fp_input, cost_tn_input]),
    save_value_btn,
    cost_output
]))

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

---
---
# 🤖 MODEL TRAINING
## Train each model and see results immediately
---

In [10]:
#@title üîµ **Ridge Model** { display-mode: "form" }
#@markdown **Ridge (L2 Regularization):** Shrinks coefficients but keeps all features.

# Explanatory call-out for the Ridge model (default "theory" styling).
theory = info_box("""
<b>üí° Ridge (L2 Regularization):</b>
<ul>
<li>Adds penalty to large coefficients ‚Üí prevents overfitting</li>
<li>Shrinks coefficients toward zero but <b>never exactly to zero</b></li>
<li>Best when <b>all features are somewhat relevant</b></li>
<li><b>C</b> = inverse regularization (lower C = stronger penalty)</li>
</ul>
""")

# Regularization control; train_ridge passes its value as C to
# LogisticRegression for classification problems.
ridge_c_slider = widgets.FloatSlider(value=1.0, min=0.01, max=10, step=0.1, description='C:',
                                      style={'description_width': '30px'}, layout=widgets.Layout(width='350px'))
# Checkbox for optionally showing feature importance after training.
ridge_importance_check = widgets.Checkbox(value=True, description='Show Feature Importance', style={'description_width': 'initial'})

# Train button and the output area train_ridge prints into.
train_ridge_btn = widgets.Button(description='üöÄ Train Ridge', button_style='primary', layout=widgets.Layout(width='150px', height='40px'))
ridge_output = widgets.Output()

def train_ridge(b):
    """Fit the Ridge (L2-regularised) model and report its test-set results.

    Button callback; ``b`` (the clicked button) is unused. All output is
    printed into ``ridge_output``, replacing whatever it showed before.

    * ``state.problem_type == 'classification'``: fits an L2-penalised
      ``LogisticRegression`` with ``C`` taken from ``ridge_c_slider``.
    * any other problem type: fits a ``Ridge`` regressor with ``alpha = 1 / C``.

    Both variants train and predict on the scaled matrices. The fitted model
    is stored in ``state.models['Ridge']`` and the predictions and metrics in
    ``state.results['Ridge']``.
    """
    with ridge_output:
        clear_output()
        # Training uses the *scaled* matrices, so they must be present as well.
        # Checking X_train alone let a missing scaled split fail inside fit()
        # with a raw traceback instead of this message.
        needed = (
            state.X_train,
            getattr(state, 'X_train_scaled', None),
            getattr(state, 'X_test_scaled', None),
        )
        if any(item is None for item in needed):
            print("‚ùå Complete data preparation first (Steps 1-7)!")
            return

        print("üîÑ Training Ridge...")

        c_value = ridge_c_slider.value
        rule = "="*60

        if state.problem_type == 'classification':
            model = LogisticRegression(penalty='l2', C=c_value, solver='lbfgs', max_iter=1000, random_state=42)
            model.fit(state.X_train_scaled, state.y_train)
            y_pred = model.predict(state.X_test_scaled)
            # Column 1 of predict_proba is the score used for AUC / ROC.
            y_prob = model.predict_proba(state.X_test_scaled)[:, 1]

            # The four-way unpack assumes a 2x2 (binary) confusion matrix.
            tn, fp, fn, tp = confusion_matrix(state.y_test, y_pred).ravel()

            state.models['Ridge'] = model
            entry = {
                'y_pred': y_pred, 'y_prob': y_prob,
                'accuracy': accuracy_score(state.y_test, y_pred),
                'precision': precision_score(state.y_test, y_pred, zero_division=0),
                'recall': recall_score(state.y_test, y_pred, zero_division=0),
                'f1': f1_score(state.y_test, y_pred, zero_division=0),
                'auc': roc_auc_score(state.y_test, y_prob),
                'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn
            }
            state.results['Ridge'] = entry

            # Value = confusion-matrix counts weighted by the saved cost matrix.
            total_cost = tp*state.cost_tp + fp*state.cost_fp + tn*state.cost_tn + fn*state.cost_fn
            avg_cost = total_cost / len(state.y_test)
            entry['total_cost'] = total_cost
            entry['avg_cost'] = avg_cost

            print(rule)
            print("‚úÖ RIDGE TRAINED!")
            print(rule)
            print(f"\nüìä METRICS:")
            print(f"   Accuracy: {entry['accuracy']:.4f}  |  AUC: {entry['auc']:.4f}")
            print(f"   Precision: {entry['precision']:.4f}  |  Recall: {entry['recall']:.4f}")
            print(f"\nüìã CONFUSION MATRIX:  TP={tp} | FP={fp} | TN={tn} | FN={fn}")
            print(f"\nüí∞ VALUE: Total=${total_cost:,.2f}  |  Avg=${avg_cost:.2f}")

            if ridge_importance_check.value:
                print(f"\nüìä TOP 10 FEATURES:")
                # Rank by absolute coefficient but display the signed value.
                ranked = (
                    pd.DataFrame({'Feature': state.feature_names, 'Coefficient': model.coef_.flatten()})
                    .assign(**{'|Coef|': lambda frame: frame['Coefficient'].abs()})
                    .sort_values('|Coef|', ascending=False)
                    .head(10)
                )
                display(ranked[['Feature', 'Coefficient']].reset_index(drop=True))
        else:
            # The slider exposes C (inverse strength); Ridge expects alpha.
            model = Ridge(alpha=1/c_value, random_state=42)
            model.fit(state.X_train_scaled, state.y_train)
            y_pred = model.predict(state.X_test_scaled)
            state.models['Ridge'] = model
            entry = {
                'y_pred': y_pred,
                'rmse': np.sqrt(mean_squared_error(state.y_test, y_pred)),
                'mae': mean_absolute_error(state.y_test, y_pred),
                'r2': r2_score(state.y_test, y_pred)
            }
            state.results['Ridge'] = entry
            print(rule)
            print("‚úÖ RIDGE TRAINED!")
            print(rule)
            print(f"\nüìä RMSE: {entry['rmse']:.4f}  |  MAE: {entry['mae']:.4f}  |  R¬≤: {entry['r2']:.4f}")

        state.log("Ridge trained")

# Register the handler, then render the Ridge panel.
train_ridge_btn.on_click(train_ridge)

display(
    widgets.VBox(children=[
        theory,
        widgets.HBox(children=[ridge_c_slider, ridge_importance_check]),
        train_ridge_btn,
        ridge_output,
    ])
)

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

In [11]:
#@title üü¢ **Lasso Model** { display-mode: "form" }
#@markdown **Lasso (L1 Regularization):** Can eliminate features by shrinking coefficients to exactly zero.

# Explanatory box shown above the Lasso controls.
theory = info_box("""
<b>üí° Lasso (L1 Regularization):</b>
<ul>
<li>Can shrink coefficients <b>exactly to zero</b></li>
<li>Performs <b>automatic feature selection</b></li>
<li>Best when <b>many features are irrelevant</b></li>
<li><b>Ridge vs Lasso:</b> Ridge keeps all features, Lasso eliminates some</li>
</ul>
""")

# Regularization control; the training handler reads its value as C.
lasso_c_slider = widgets.FloatSlider(
    description='C:',
    value=1.0,
    min=0.01,
    max=10,
    step=0.1,
    style={'description_width': '30px'},
    layout=widgets.Layout(width='350px'),
)

# Toggles the table of non-zero coefficients printed after classification training.
lasso_importance_check = widgets.Checkbox(
    description='Show Feature Importance',
    value=True,
    style={'description_width': 'initial'},
)

# Output area for the training report, and the button that triggers training.
lasso_output = widgets.Output()
train_lasso_btn = widgets.Button(
    description='üöÄ Train Lasso',
    button_style='primary',
    layout=widgets.Layout(width='150px', height='40px'),
)

def train_lasso(b):
    """Fit the Lasso (L1-regularised) model and report its test-set results.

    Button callback; ``b`` (the clicked button) is unused. All output is
    printed into ``lasso_output``, replacing whatever it showed before.

    * ``state.problem_type == 'classification'``: fits an L1-penalised
      ``LogisticRegression`` (``saga`` solver) with ``C`` from ``lasso_c_slider``.
    * any other problem type: fits a ``Lasso`` regressor with ``alpha = 1 / C``.

    Both variants train and predict on the scaled matrices and report how many
    coefficients ended up exactly zero. The fitted model is stored in
    ``state.models['Lasso']`` and the predictions and metrics in
    ``state.results['Lasso']``.
    """
    with lasso_output:
        clear_output()
        # Training uses the *scaled* matrices, so they must be present as well.
        # Checking X_train alone let a missing scaled split fail inside fit()
        # with a raw traceback instead of this message.
        needed = (
            state.X_train,
            getattr(state, 'X_train_scaled', None),
            getattr(state, 'X_test_scaled', None),
        )
        if any(item is None for item in needed):
            print("‚ùå Complete data preparation first!")
            return

        print("üîÑ Training Lasso...")

        c_value = lasso_c_slider.value
        rule = "="*60

        if state.problem_type == 'classification':
            model = LogisticRegression(penalty='l1', C=c_value, solver='saga', max_iter=1000, random_state=42)
            model.fit(state.X_train_scaled, state.y_train)
            y_pred = model.predict(state.X_test_scaled)
            # Column 1 of predict_proba is the score used for AUC / ROC.
            y_prob = model.predict_proba(state.X_test_scaled)[:, 1]

            # The four-way unpack assumes a 2x2 (binary) confusion matrix.
            tn, fp, fn, tp = confusion_matrix(state.y_test, y_pred).ravel()
            # Coefficients that are exactly zero, counted on the first coef_ row.
            eliminated = int(np.count_nonzero(model.coef_[0] == 0))

            state.models['Lasso'] = model
            entry = {
                'y_pred': y_pred, 'y_prob': y_prob,
                'accuracy': accuracy_score(state.y_test, y_pred),
                'precision': precision_score(state.y_test, y_pred, zero_division=0),
                'recall': recall_score(state.y_test, y_pred, zero_division=0),
                'f1': f1_score(state.y_test, y_pred, zero_division=0),
                'auc': roc_auc_score(state.y_test, y_prob),
                'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn
            }
            state.results['Lasso'] = entry

            # Value = confusion-matrix counts weighted by the saved cost matrix.
            total_cost = tp*state.cost_tp + fp*state.cost_fp + tn*state.cost_tn + fn*state.cost_fn
            avg_cost = total_cost / len(state.y_test)
            entry['total_cost'] = total_cost
            entry['avg_cost'] = avg_cost

            print(rule)
            print("‚úÖ LASSO TRAINED!")
            print(rule)
            print(f"\nüîç FEATURE SELECTION: {eliminated}/{len(state.feature_names)} eliminated")
            print(f"\nüìä METRICS:")
            print(f"   Accuracy: {entry['accuracy']:.4f}  |  AUC: {entry['auc']:.4f}")
            print(f"   Precision: {entry['precision']:.4f}  |  Recall: {entry['recall']:.4f}")
            print(f"\nüìã CONFUSION MATRIX:  TP={tp} | FP={fp} | TN={tn} | FN={fn}")
            print(f"\nüí∞ VALUE: Total=${total_cost:,.2f}  |  Avg=${avg_cost:.2f}")

            if lasso_importance_check.value:
                print(f"\nüìä SELECTED FEATURES (non-zero):")
                selected = pd.DataFrame({'Feature': state.feature_names, 'Coefficient': model.coef_.flatten()})
                # .copy() detaches the filtered rows, so adding the helper
                # column below is not a write onto a slice of another frame.
                selected = selected[selected['Coefficient'] != 0].copy()
                selected['|Coef|'] = selected['Coefficient'].abs()
                selected = selected.sort_values('|Coef|', ascending=False)
                display(selected[['Feature', 'Coefficient']].reset_index(drop=True))
        else:
            # The slider exposes C (inverse strength); Lasso expects alpha.
            model = Lasso(alpha=1/c_value, max_iter=1000, random_state=42)
            model.fit(state.X_train_scaled, state.y_train)
            y_pred = model.predict(state.X_test_scaled)
            eliminated = int(np.count_nonzero(model.coef_ == 0))
            state.models['Lasso'] = model
            entry = {
                'y_pred': y_pred,
                'rmse': np.sqrt(mean_squared_error(state.y_test, y_pred)),
                'mae': mean_absolute_error(state.y_test, y_pred),
                'r2': r2_score(state.y_test, y_pred)
            }
            state.results['Lasso'] = entry
            print(rule)
            print("‚úÖ LASSO TRAINED!")
            print(rule)
            print(f"\nüîç Features eliminated: {eliminated}/{len(state.feature_names)}")
            print(f"\nüìä RMSE: {entry['rmse']:.4f}  |  MAE: {entry['mae']:.4f}  |  R¬≤: {entry['r2']:.4f}")

        state.log("Lasso trained")

# Register the handler, then render the Lasso panel.
train_lasso_btn.on_click(train_lasso)

display(
    widgets.VBox(children=[
        theory,
        widgets.HBox(children=[lasso_c_slider, lasso_importance_check]),
        train_lasso_btn,
        lasso_output,
    ])
)

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

In [12]:
#@title üå≤ **Random Forest Model** { display-mode: "form" }
#@markdown **Random Forest:** Builds many trees and averages predictions. No scaling needed!

# Explanatory box shown above the Random Forest controls.
theory = info_box("""
<b>üí° Random Forest:</b>
<ul>
<li>Builds <b>many decision trees</b> and averages predictions</li>
<li>Each tree sees <b>random subset</b> of data and features</li>
<li>Handles non-linear relationships, robust to outliers</li>
<li><b>No scaling needed!</b></li>
</ul>
""")

# Number of trees; the training handler passes it as n_estimators.
rf_trees_slider = widgets.IntSlider(
    description='Trees:',
    value=100,
    min=10,
    max=500,
    step=10,
    style={'description_width': '50px'},
    layout=widgets.Layout(width='300px'),
)

# Maximum tree depth; the training handler passes it as max_depth.
rf_depth_slider = widgets.IntSlider(
    description='Depth:',
    value=10,
    min=2,
    max=30,
    step=1,
    style={'description_width': '50px'},
    layout=widgets.Layout(width='300px'),
)

# Toggles the feature-importance table printed after classification training.
rf_importance_check = widgets.Checkbox(
    description='Show Feature Importance',
    value=True,
    style={'description_width': 'initial'},
)

# Output area for the training report, and the button that triggers training.
rf_output = widgets.Output()
train_rf_btn = widgets.Button(
    description='üöÄ Train Random Forest',
    button_style='primary',
    layout=widgets.Layout(width='180px', height='40px'),
)

def train_rf(b):
    """Fit a random forest on the unscaled split and print its test-set report.

    Button callback; the ``b`` argument (the clicked button) is ignored.
    A ``RandomForestClassifier`` is used when ``state.problem_type`` is
    ``'classification'``, a ``RandomForestRegressor`` otherwise; tree count and
    depth come from the two sliders. The fitted model goes to
    ``state.models['Random Forest']`` and predictions plus metrics to
    ``state.results['Random Forest']``. Output is written to ``rf_output``.
    """
    with rf_output:
        clear_output()
        if state.X_train is None:
            print("‚ùå Complete data preparation first!")
            return

        print("üîÑ Training Random Forest...")

        key = 'Random Forest'
        rule = "="*60

        if state.problem_type != 'classification':
            # Regression variant: error metrics only, no value analysis.
            model = RandomForestRegressor(
                n_estimators=rf_trees_slider.value,
                max_depth=rf_depth_slider.value,
                random_state=42,
            )
            model.fit(state.X_train, state.y_train)
            y_pred = model.predict(state.X_test)
            state.models[key] = model
            entry = {
                'y_pred': y_pred,
                'rmse': np.sqrt(mean_squared_error(state.y_test, y_pred)),
                'mae': mean_absolute_error(state.y_test, y_pred),
                'r2': r2_score(state.y_test, y_pred)
            }
            state.results[key] = entry
            print(rule)
            print("‚úÖ RANDOM FOREST TRAINED!")
            print(rule)
            print(f"\nüìä RMSE: {entry['rmse']:.4f}  |  MAE: {entry['mae']:.4f}  |  R¬≤: {entry['r2']:.4f}")
        else:
            model = RandomForestClassifier(
                n_estimators=rf_trees_slider.value,
                max_depth=rf_depth_slider.value,
                random_state=42,
            )
            model.fit(state.X_train, state.y_train)
            y_pred = model.predict(state.X_test)
            # Column 1 of predict_proba is the score used for AUC / ROC.
            y_prob = model.predict_proba(state.X_test)[:, 1]

            # The four-way unpack assumes a 2x2 (binary) confusion matrix.
            tn, fp, fn, tp = confusion_matrix(state.y_test, y_pred).ravel()

            state.models[key] = model
            entry = {
                'y_pred': y_pred, 'y_prob': y_prob,
                'accuracy': accuracy_score(state.y_test, y_pred),
                'precision': precision_score(state.y_test, y_pred, zero_division=0),
                'recall': recall_score(state.y_test, y_pred, zero_division=0),
                'f1': f1_score(state.y_test, y_pred, zero_division=0),
                'auc': roc_auc_score(state.y_test, y_prob),
                'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn
            }
            state.results[key] = entry

            # Value = confusion-matrix counts weighted by the saved cost matrix.
            total_cost = tp*state.cost_tp + fp*state.cost_fp + tn*state.cost_tn + fn*state.cost_fn
            entry['total_cost'] = total_cost
            entry['avg_cost'] = total_cost / len(state.y_test)

            print(rule)
            print("‚úÖ RANDOM FOREST TRAINED!")
            print(rule)
            print(f"\nüå≤ Config: {rf_trees_slider.value} trees, depth {rf_depth_slider.value}")
            print(f"\nüìä METRICS:")
            print(f"   Accuracy: {entry['accuracy']:.4f}  |  AUC: {entry['auc']:.4f}")
            print(f"   Precision: {entry['precision']:.4f}  |  Recall: {entry['recall']:.4f}")
            print(f"\nüìã CONFUSION MATRIX:  TP={tp} | FP={fp} | TN={tn} | FN={fn}")
            print(f"\nüí∞ VALUE: Total=${total_cost:,.2f}  |  Avg=${total_cost/len(state.y_test):.2f}")

            if rf_importance_check.value:
                print(f"\nüìä TOP 10 FEATURES:")
                top_features = (
                    pd.DataFrame({'Feature': state.feature_names, 'Importance': model.feature_importances_})
                    .sort_values('Importance', ascending=False)
                    .head(10)
                )
                display(top_features.reset_index(drop=True))

        state.log("Random Forest trained")

# Register the handler, then render the Random Forest panel.
train_rf_btn.on_click(train_rf)

display(
    widgets.VBox(children=[
        theory,
        widgets.HBox(children=[rf_trees_slider, rf_depth_slider]),
        rf_importance_check,
        train_rf_btn,
        rf_output,
    ])
)

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

In [13]:
#@title üöÄ **XGBoost Model** { display-mode: "form" }
#@markdown **XGBoost (Gradient Boosting):** Builds trees sequentially, each correcting previous errors.

# Explanatory box shown above the boosting controls.
theory = info_box("""
<b>üí° XGBoost (Gradient Boosting):</b>
<ul>
<li>Builds trees <b>sequentially</b>, each correcting previous errors</li>
<li>Often achieves <b>best predictive accuracy</b></li>
<li><b>No scaling needed!</b></li>
<li><b>RF vs XGBoost:</b> RF builds independently (parallel), XGBoost builds sequentially</li>
</ul>
""")

# Number of boosting rounds; the training handler passes it as n_estimators.
xgb_rounds_slider = widgets.IntSlider(
    description='Rounds:',
    value=100,
    min=10,
    max=500,
    step=10,
    style={'description_width': '60px'},
    layout=widgets.Layout(width='280px'),
)

# Maximum tree depth; the training handler passes it as max_depth.
xgb_depth_slider = widgets.IntSlider(
    description='Depth:',
    value=6,
    min=2,
    max=15,
    step=1,
    style={'description_width': '60px'},
    layout=widgets.Layout(width='280px'),
)

# Learning rate; the training handler passes it as learning_rate.
xgb_lr_slider = widgets.FloatSlider(
    description='Learn Rate:',
    value=0.1,
    min=0.01,
    max=0.5,
    step=0.01,
    style={'description_width': '80px'},
    layout=widgets.Layout(width='280px'),
)

# Toggles the feature-importance table printed after classification training.
xgb_importance_check = widgets.Checkbox(
    description='Show Feature Importance',
    value=True,
    style={'description_width': 'initial'},
)

# Output area for the training report, and the button that triggers training.
xgb_output = widgets.Output()
train_xgb_btn = widgets.Button(
    description='üöÄ Train XGBoost',
    button_style='primary',
    layout=widgets.Layout(width='150px', height='40px'),
)

def train_xgb(b):
    """Fit the gradient-boosting model and print its test-set report.

    Button callback; the ``b`` argument (the clicked button) is ignored.
    Note that, despite the "XGBoost" label, the estimators used here are
    scikit-learn's ``GradientBoostingClassifier`` (classification) and
    ``GradientBoostingRegressor`` (any other problem type). Rounds, depth and
    learning rate come from the three sliders. The fitted model goes to
    ``state.models['XGBoost']`` and predictions plus metrics to
    ``state.results['XGBoost']``. Output is written to ``xgb_output``.
    """
    with xgb_output:
        clear_output()
        if state.X_train is None:
            print("‚ùå Complete data preparation first!")
            return

        print("üîÑ Training XGBoost...")

        key = 'XGBoost'
        rule = "="*60

        if state.problem_type != 'classification':
            # Regression variant: error metrics only, no value analysis.
            model = GradientBoostingRegressor(
                n_estimators=xgb_rounds_slider.value,
                max_depth=xgb_depth_slider.value,
                learning_rate=xgb_lr_slider.value,
                random_state=42,
            )
            model.fit(state.X_train, state.y_train)
            y_pred = model.predict(state.X_test)
            state.models[key] = model
            entry = {
                'y_pred': y_pred,
                'rmse': np.sqrt(mean_squared_error(state.y_test, y_pred)),
                'mae': mean_absolute_error(state.y_test, y_pred),
                'r2': r2_score(state.y_test, y_pred)
            }
            state.results[key] = entry
            print(rule)
            print("‚úÖ XGBOOST TRAINED!")
            print(rule)
            print(f"\nüìä RMSE: {entry['rmse']:.4f}  |  MAE: {entry['mae']:.4f}  |  R¬≤: {entry['r2']:.4f}")
        else:
            model = GradientBoostingClassifier(
                n_estimators=xgb_rounds_slider.value,
                max_depth=xgb_depth_slider.value,
                learning_rate=xgb_lr_slider.value,
                random_state=42,
            )
            model.fit(state.X_train, state.y_train)
            y_pred = model.predict(state.X_test)
            # Column 1 of predict_proba is the score used for AUC / ROC.
            y_prob = model.predict_proba(state.X_test)[:, 1]

            # The four-way unpack assumes a 2x2 (binary) confusion matrix.
            tn, fp, fn, tp = confusion_matrix(state.y_test, y_pred).ravel()

            state.models[key] = model
            entry = {
                'y_pred': y_pred, 'y_prob': y_prob,
                'accuracy': accuracy_score(state.y_test, y_pred),
                'precision': precision_score(state.y_test, y_pred, zero_division=0),
                'recall': recall_score(state.y_test, y_pred, zero_division=0),
                'f1': f1_score(state.y_test, y_pred, zero_division=0),
                'auc': roc_auc_score(state.y_test, y_prob),
                'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn
            }
            state.results[key] = entry

            # Value = confusion-matrix counts weighted by the saved cost matrix.
            total_cost = tp*state.cost_tp + fp*state.cost_fp + tn*state.cost_tn + fn*state.cost_fn
            entry['total_cost'] = total_cost
            entry['avg_cost'] = total_cost / len(state.y_test)

            print(rule)
            print("‚úÖ XGBOOST TRAINED!")
            print(rule)
            print(f"\n‚ö° Config: {xgb_rounds_slider.value} rounds, depth {xgb_depth_slider.value}, LR {xgb_lr_slider.value}")
            print(f"\nüìä METRICS:")
            print(f"   Accuracy: {entry['accuracy']:.4f}  |  AUC: {entry['auc']:.4f}")
            print(f"   Precision: {entry['precision']:.4f}  |  Recall: {entry['recall']:.4f}")
            print(f"\nüìã CONFUSION MATRIX:  TP={tp} | FP={fp} | TN={tn} | FN={fn}")
            print(f"\nüí∞ VALUE: Total=${total_cost:,.2f}  |  Avg=${total_cost/len(state.y_test):.2f}")

            if xgb_importance_check.value:
                print(f"\nüìä TOP 10 FEATURES:")
                top_features = (
                    pd.DataFrame({'Feature': state.feature_names, 'Importance': model.feature_importances_})
                    .sort_values('Importance', ascending=False)
                    .head(10)
                )
                display(top_features.reset_index(drop=True))

        state.log("XGBoost trained")

# Register the handler, then render the boosting panel.
train_xgb_btn.on_click(train_xgb)

display(
    widgets.VBox(children=[
        theory,
        widgets.HBox(children=[xgb_rounds_slider, xgb_depth_slider]),
        widgets.HBox(children=[xgb_lr_slider, xgb_importance_check]),
        train_xgb_btn,
        xgb_output,
    ])
)

VBox(children=(HTML(value='<div style="background-color:#f0f4f8; border-left:4px solid #5c6bc0; padding:12px; ‚Ä¶

---
# 📊 Compare All Models & Calculate Values

In [14]:
#@title üìä **Compare Models & Value Analysis** { display-mode: "form" }
#@markdown **Compare all trained models and calculate value on Train or Test set.**

# Shared output area for both the comparison table and the value analysis.
compare_output = widgets.Output()

# Action buttons.
compare_btn = widgets.Button(
    description='üìä Compare All Models',
    button_style='success',
    layout=widgets.Layout(width='180px'),
)
value_btn = widgets.Button(
    description='üí∞ Calculate Value',
    button_style='info',
    layout=widgets.Layout(width='150px'),
)
refresh_models_btn = widgets.Button(
    description='üîÑ Refresh',
    button_style='warning',
    layout=widgets.Layout(width='100px'),
)

# Selectors for the detailed value analysis. The model list starts with only a
# placeholder; the refresh handler fills in the trained models.
model_dropdown = widgets.Dropdown(
    description='Model:',
    options=['-- Select Model --'],
    layout=widgets.Layout(width='200px'),
    style={'description_width': '60px'},
)
dataset_dropdown = widgets.Dropdown(
    description='Dataset:',
    options=['Test Set', 'Training Set'],
    layout=widgets.Layout(width='180px'),
    style={'description_width': '60px'},
)
metric_dropdown = widgets.Dropdown(
    description='Metric:',
    options=['Total Value', 'Average Value per Prediction'],
    layout=widgets.Layout(width='250px'),
    style={'description_width': '60px'},
)

def refresh_models(b):
    """Reload the model dropdown with the names of all trained models.

    Button callback; ``b`` (the clicked button) is unused. The list is rebuilt
    as a placeholder entry followed by the keys of ``state.models``.
    """
    previous = model_dropdown.value
    trained = list(state.models.keys())
    model_dropdown.options = ['-- Select --'] + trained
    # Assigning new options moves the selection back to the first entry, so
    # put the user's earlier choice back when that model is still available.
    if previous in trained:
        model_dropdown.value = previous

def compare_all(b):
    """Show a side-by-side metric table for the trained models.

    Button callback; ``b`` (the clicked button) is unused. Output is written
    to ``compare_output``, replacing its previous content.

    * Classification: table of accuracy, precision, recall, F1, AUC, total and
      average value; the best model by AUC and by total value; and ROC curves
      drawn against ``state.y_test``.
    * Any other problem type: table of RMSE, MAE and R-squared, and the best
      model by R-squared.

    Entries of ``state.results`` that do not carry the metrics of the current
    problem type are left out of the comparison and listed by name instead of
    raising ``KeyError``.
    """
    with compare_output:
        clear_output()
        if len(state.results) == 0:
            print("‚ùå Train at least one model first!")
            return

        is_classification = state.problem_type == 'classification'
        if is_classification:
            needed = ('accuracy', 'precision', 'recall', 'f1', 'auc', 'y_prob')
        else:
            needed = ('rmse', 'mae', 'r2')

        # Keep only results that have every metric this view reads.
        usable = {name: r for name, r in state.results.items()
                  if all(metric in r for metric in needed)}
        skipped = [str(name) for name in state.results if name not in usable]
        skipped_note = "Skipped (results do not match the current problem type): " + ", ".join(skipped)

        if not usable:
            print(skipped_note)
            print("‚ùå Train at least one model first!")
            return

        print("="*80)
        print("üìä MODEL COMPARISON")
        print("="*80)
        if skipped:
            print(skipped_note)

        if is_classification:
            data = [{'Model': name, 'Accuracy': r['accuracy'], 'Precision': r['precision'],
                     'Recall': r['recall'], 'F1': r['f1'], 'AUC': r['auc'],
                     'Total $': r.get('total_cost', 0), 'Avg $': r.get('avg_cost', 0)}
                    for name, r in usable.items()]
            df = pd.DataFrame(data).round(4)
            display(df)

            best_auc = max(usable, key=lambda x: usable[x]['auc'])
            # -inf guarantees a model without a value never outranks one that has it.
            best_val = max(usable, key=lambda x: usable[x].get('total_cost', float('-inf')))
            print(f"\nüèÜ Best AUC: {best_auc} ({usable[best_auc]['auc']:.4f})")
            print(f"üèÜ Best Value: {best_val} (${usable[best_val].get('total_cost', 0):,.2f})")

            # ROC Curve: one line per model plus the chance diagonal.
            fig, ax = plt.subplots(figsize=(8, 6))
            for name, r in usable.items():
                fpr, tpr, _ = roc_curve(state.y_test, r['y_prob'])
                ax.plot(fpr, tpr, label=f"{name} ({r['auc']:.3f})")
            ax.plot([0, 1], [0, 1], 'k--')
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('ROC Curves')
            ax.legend()
            plt.show()
        else:
            data = [{'Model': name, 'RMSE': r['rmse'], 'MAE': r['mae'], 'R¬≤': r['r2']}
                    for name, r in usable.items()]
            df = pd.DataFrame(data).round(4)
            display(df)
            best_r2 = max(usable, key=lambda x: usable[x]['r2'])
            print(f"\nüèÜ Best R¬≤: {best_r2} ({usable[best_r2]['r2']:.4f})")

def calc_value(b):
    """Print a per-outcome value breakdown for the selected model.

    Button callback; ``b`` (the clicked button) is unused. The model chosen in
    ``model_dropdown`` predicts on the test or training split picked in
    ``dataset_dropdown`` (scaled features for 'Ridge' and 'Lasso', unscaled
    otherwise). The confusion-matrix counts are weighted by the cost matrix
    stored on ``state`` and either the total or the per-prediction average is
    shown, depending on ``metric_dropdown``. Output goes to ``compare_output``.

    The analysis needs a 2x2 confusion matrix, so it reports a message and
    stops for non-classification problems and for non-binary targets.
    """
    with compare_output:
        clear_output()
        name = model_dropdown.value
        if name == '-- Select --' or name not in state.models:
            print("‚ùå Select a trained model!")
            return

        # Continuous predictions cannot be turned into a confusion matrix.
        if state.problem_type != 'classification':
            print("Value analysis is only available for classification models "
                  "(it is computed from the confusion matrix).")
            return

        model = state.models[name]

        # Ridge and Lasso were fitted on scaled features; the others were not.
        needs_scaled = name in ['Ridge', 'Lasso']
        if dataset_dropdown.value == 'Test Set':
            X = state.X_test_scaled if needs_scaled else state.X_test
            y_true = state.y_test
        else:
            X = state.X_train_scaled if needs_scaled else state.X_train
            y_true = state.y_train

        y_pred = model.predict(X)
        cm = confusion_matrix(y_true, y_pred)
        # The TN/FP/FN/TP unpack below is only defined for a binary target.
        if cm.shape != (2, 2):
            print(f"Value analysis requires a binary target; got a confusion matrix of shape {cm.shape}.")
            return
        tn, fp, fn, tp = cm.ravel()

        total = tp*state.cost_tp + fp*state.cost_fp + tn*state.cost_tn + fn*state.cost_fn
        avg = total / len(y_true)

        print("="*60)
        print(f"üí∞ VALUE ANALYSIS: {name}")
        print(f"üìä Dataset: {dataset_dropdown.value} ({len(y_true):,} obs)")
        print("="*60)
        print(f"\n   TP: {tp:,} √ó ${state.cost_tp} = ${tp*state.cost_tp:,.2f}")
        print(f"   FP: {fp:,} √ó ${state.cost_fp} = ${fp*state.cost_fp:,.2f}")
        print(f"   TN: {tn:,} √ó ${state.cost_tn} = ${tn*state.cost_tn:,.2f}")
        print(f"   FN: {fn:,} √ó ${state.cost_fn} = ${fn*state.cost_fn:,.2f}")
        print(f"\n" + "="*40)
        if metric_dropdown.value == 'Total Value':
            print(f"üíµ TOTAL: ${total:,.2f}")
        else:
            print(f"üíµ AVG PER PREDICTION: ${avg:.2f}")

# Connect each button to its handler.
for button, handler in (
    (refresh_models_btn, refresh_models),
    (compare_btn, compare_all),
    (value_btn, calc_value),
):
    button.on_click(handler)

# Panel: comparison controls on top, value-analysis selectors below, and one
# shared output area at the bottom.
display(
    widgets.VBox(children=[
        widgets.HBox(children=[compare_btn, refresh_models_btn]),
        widgets.HTML(value="<br><b>Detailed Value Analysis:</b>"),
        widgets.HBox(children=[model_dropdown, dataset_dropdown, metric_dropdown]),
        value_btn,
        compare_output,
    ])
)

VBox(children=(HBox(children=(Button(button_style='success', description='üìä Compare All Models', layout=Layout‚Ä¶