In [2]:
!pip install voila




In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chi2_contingency, entropy
from itertools import combinations
import random
from google.colab import files
from matplotlib.backends.backend_pdf import PdfPages
import warnings
warnings.filterwarnings('ignore')

# --- Load CSV ---
# files.upload() returns a mapping whose keys are used here as file paths;
# only the first uploaded file is analysed.
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an opaque IndexError when the
    # upload is cancelled or returns nothing.
    raise ValueError("No file was uploaded; upload a CSV file to continue.")
file_path = next(iter(uploaded))
df = pd.read_csv(file_path)

# --- Parse column types and roles ---
# Column headers may carry a measurement-type tag and a role tag.  Tags are
# tested in the order listed below and the first one found wins; a header
# without a tag falls back to 'numeric' / 'in'.
dtype_tags = (
    ('(cat)', 'categorical'),
    ('(text)', 'text'),
    ('(num)', 'numeric'),
    ('(others)', 'others'),
)
role_tags = (
    ('(id)', 'id'),
    ('(target)', 'target'),
    ('(out)', 'target'),  # '(out)' is an alias for '(target)'
    ('(both)', 'both'),
    ('(none)', 'none'),
)

col_info = []
for col in df.columns:
    # The short name is the header text up to the first whitespace.
    base_name = col.split()[0]
    dtype = next((kind for tag, kind in dtype_tags if tag in col), 'numeric')
    role = next((kind for tag, kind in role_tags if tag in col), 'in')
    col_info.append({"original_name": col, "name": base_name, "dtype": dtype, "role": role})

# --- Handle missing values and text cleaning ---
# Columns missing at least this percentage of their values are dropped
# instead of imputed.
MISSING_DROP_THRESHOLD_PCT = 10

df_clean = df.copy()
meta_data = []
columns_to_drop = []

for c in col_info:
    col_name = c['original_name']

    # Treat the placeholder strings '?' and 'NA' as missing values.
    df_clean[col_name] = df_clean[col_name].replace(['?', 'NA'], np.nan)

    if c['dtype'] == 'text':
        # Strip apostrophes but keep missing cells missing: a plain
        # astype(str) turns NaN into the literal string 'nan', which would
        # hide it from the missing-value count and from mode imputation.
        is_missing = df_clean[col_name].isna()
        cleaned_text = df_clean[col_name].astype(str).str.replace("'", "", regex=False)
        df_clean[col_name] = cleaned_text.mask(is_missing, np.nan)

    missing_count = df_clean[col_name].isna().sum()
    pct_missing = missing_count / len(df_clean) * 100
    replacement = 'None'

    if pct_missing >= MISSING_DROP_THRESHOLD_PCT:
        columns_to_drop.append(col_name)
        replacement = 'Dropped (>=10% missing)'
    elif missing_count > 0:
        if c['dtype'] == 'numeric':
            # A column that held '?' placeholders is stored as strings, so
            # convert it to a numeric dtype before taking the mean.
            # pd.to_numeric raises ValueError on values it cannot parse.
            numeric_values = pd.to_numeric(df_clean[col_name])
            df_clean[col_name] = numeric_values.fillna(numeric_values.mean())
            replacement = 'Mean'
        elif c['dtype'] in ['categorical', 'text']:
            mode_val = df_clean[col_name].mode()
            # mode() is empty when the column has no non-missing values.
            if len(mode_val) > 0:
                df_clean[col_name] = df_clean[col_name].fillna(mode_val[0])
                replacement = 'Mode'

    meta_data.append({
        'attribute': col_name,
        'measurement_type': c['dtype'],
        'role': c['role'],
        'missing_count_pct': f"{missing_count} ({pct_missing:.2f}%)",
        'missing_replacement': replacement
    })

# Drop columns with >= 10% missing values
df_clean = df_clean.drop(columns=columns_to_drop)

# Update col_info to remove dropped columns
col_info = [c for c in col_info if c['original_name'] not in columns_to_drop]

# Display metadata (one row per original column, including dropped ones)
meta_df = pd.DataFrame(meta_data)
print("Metadata Table:")
display(meta_df)

# --- Encode categorical for modeling ---
# df_encoded is a copy of df_clean in which categorical/text columns are
# replaced by integer codes; columns with role 'id' or 'none' are left as is.
# Each fitted encoder is kept in label_encoders under its column name.
df_encoded = df_clean.copy()
label_encoders = {}

for c in col_info:
    column = c['original_name']
    is_label_like = c['dtype'] in ('categorical', 'text')
    is_excluded = c['role'] in ('id', 'none')
    if not is_label_like or is_excluded or column not in df_encoded.columns:
        continue

    le = LabelEncoder()
    # Values are stringified first so the encoder sees a single type.
    df_encoded[column] = le.fit_transform(df_encoded[column].astype(str))
    label_encoders[column] = le

# --- Helper functions ---
def compute_2d_entropy(df, x, y, target_col=None, bins_x=20, bins_y=20):
    """Return an entropy score for the 2D grid spanned by columns *x* and *y*.

    With a valid ``target_col`` the (x, y) plane is cut into a
    ``bins_x`` x ``bins_y`` grid of equal-width cells, the Shannon entropy
    (base 2) of the target values is computed inside every occupied cell, and
    the result is the average of those entropies weighted by cell size.
    Lower values mean the cells are more homogeneous in the target.

    Without a target (``None`` or a name that is not a column of *df*) the
    entropy of the 2D histogram of (x, y) itself is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the numeric columns *x*, *y* and, optionally, the target.
    x, y : str
        Names of the two numeric columns.
    target_col : str or None
        Name of the target column.
    bins_x, bins_y : int
        Number of grid cells along each axis.

    Returns
    -------
    float
        The entropy score; 0 when no row has both *x* and *y* present.
    """
    if target_col is None or target_col not in df.columns:
        # If no target, compute regular 2D entropy
        hist, _, _ = np.histogram2d(df[x], df[y], bins=[bins_x, bins_y])
        prob = hist / hist.sum()
        prob = prob[prob > 0]  # Remove zero probabilities
        return -np.sum(prob * np.log2(prob + 1e-10))

    x_values = df[x].to_numpy(dtype=float)
    y_values = df[y].to_numpy(dtype=float)
    targets = df[target_col].to_numpy()

    # Rows with a missing coordinate cannot be assigned to a cell.
    valid = ~(np.isnan(x_values) | np.isnan(y_values))
    if not valid.any():
        return 0
    x_values, y_values, targets = x_values[valid], y_values[valid], targets[valid]

    # Create 2D bins
    x_bins = np.linspace(x_values.min(), x_values.max(), bins_x + 1)
    y_bins = np.linspace(y_values.min(), y_values.max(), bins_y + 1)

    # np.digitize places a value equal to the last edge one past the final
    # bin; clip so points at the column maximum land in the last cell instead
    # of being dropped from the computation.
    x_dig = np.clip(np.digitize(x_values, x_bins) - 1, 0, bins_x - 1)
    y_dig = np.clip(np.digitize(y_values, y_bins) - 1, 0, bins_y - 1)

    # Visit only the occupied cells instead of scanning the whole grid.
    cell_ids = x_dig * bins_y + y_dig
    total_entropy = 0.0
    for _, cell_targets in pd.Series(targets).groupby(cell_ids):
        target_counts = cell_targets.value_counts()
        if len(target_counts) > 1:
            probs = target_counts / len(cell_targets)
            cell_entropy = -np.sum(probs * np.log2(probs + 1e-10))
            total_entropy += cell_entropy * len(cell_targets)
        # A cell with a single target value contributes zero entropy.

    return total_entropy / len(targets)

def merge_categories_chaid(df, col1, col2, max_iterations=10):
    """Iteratively merge categories of *col1* in its crosstab with *col2*.

    Starting from ``pd.crosstab(df[col1], df[col2])``, each iteration tries
    every pair of rows, merges the pair whose merged table has the smallest
    chi-square statistic, and labels the merged row ``"<cat1>+<cat2>"``.
    Merging stops when two rows remain, when ``max_iterations`` merges have
    been made, or when no candidate table can be tested.

    NOTE(review): the statistic is computed on the whole table after the
    merge, not on the two rows being compared, and there is no significance
    threshold -- confirm this is the intended CHAID variant.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding both columns.
    col1 : str
        Column whose categories (crosstab rows) are merged.
    col2 : str
        Column providing the crosstab columns.
    max_iterations : int
        Maximum number of merges to perform.

    Returns
    -------
    pandas.DataFrame
        The (possibly merged) contingency table.
    """
    ct = pd.crosstab(df[col1], df[col2])

    for _ in range(max_iterations):
        if ct.shape[0] <= 2:  # Can't merge if only 2 or fewer categories
            break

        min_chi2 = float('inf')
        merge_pair = None

        # Try all possible pairs for merging
        for cat1, cat2 in combinations(ct.index, 2):
            # Create temporary merged crosstab
            temp_ct = ct.copy()
            temp_ct.loc[cat1] = temp_ct.loc[cat1] + temp_ct.loc[cat2]
            temp_ct = temp_ct.drop(cat2)

            try:
                chi2_val, _, _, _ = chi2_contingency(temp_ct)
            except ValueError:
                # chi2_contingency rejects tables it cannot test (e.g. a zero
                # expected frequency); skip that candidate.
                continue

            if chi2_val < min_chi2:
                min_chi2 = chi2_val
                merge_pair = (cat1, cat2)

        if merge_pair is None:
            break

        # Perform the merge and give the surviving row a combined label.
        cat1, cat2 = merge_pair
        ct.loc[cat1] = ct.loc[cat1] + ct.loc[cat2]
        ct = ct.drop(cat2)
        ct.index = [f"{cat1}+{cat2}" if idx == cat1 else idx for idx in ct.index]

    return ct

# --- Select columns ---
# Split the surviving columns by role: targets on one side, model inputs
# (roles 'in' and 'both') on the other.
target_cols = []
input_cols = []
for c in col_info:
    if c['role'] == 'target':
        target_cols.append(c['original_name'])
    elif c['role'] in ('in', 'both'):
        input_cols.append(c['original_name'])

# Measurement type of every column, keyed by header (first entry wins).
dtype_by_column = {}
for c in col_info:
    dtype_by_column.setdefault(c['original_name'], c['dtype'])

# Inputs of type 'others' end up in neither list.
numeric_inputs = [name for name in input_cols if dtype_by_column[name] == 'numeric']
categorical_inputs = [name for name in input_cols if dtype_by_column[name] in ('categorical', 'text')]

# Only the first target column is kept as `target`.
target = target_cols[0] if target_cols else None

print(f"Target column: {target}")
print(f"Numeric inputs: {len(numeric_inputs)}")
print(f"Categorical inputs: {len(categorical_inputs)}")

# --- Correlation analysis (no repeats) ---
# Pick up to three pairs of numeric inputs with the highest absolute
# correlation, never using the same variable in two pairs.
top_corr_pairs = []
if len(numeric_inputs) >= 2:
    corr_matrix = df_clean[numeric_inputs].corr().abs()
    # A constant column has an undefined (NaN) correlation.  NaN keys make
    # the sort order unreliable and could surface as a "top" pair, so such
    # pairs are left out.
    all_pairs = [
        (a, b, corr_matrix.loc[a, b])
        for a, b in combinations(numeric_inputs, 2)
        if not np.isnan(corr_matrix.loc[a, b])
    ]
    all_pairs.sort(key=lambda pair: pair[2], reverse=True)

    used_vars = set()
    for a, b, r in all_pairs:
        if a not in used_vars and b not in used_vars:
            top_corr_pairs.append((a, b, r))
            used_vars.update([a, b])
        if len(top_corr_pairs) == 3:
            break

# --- Entropy scatter plots using Random Forest feature importance ---
# A random forest ranks the numeric inputs; the (up to) six most important
# ones are paired up and the three pairs with the lowest 2D class entropy
# are kept.
top_entropy_pairs = []
if len(numeric_inputs) >= 2 and target:
    try:
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(df_encoded[numeric_inputs], df_encoded[target])
        importances = pd.Series(rf.feature_importances_, index=numeric_inputs).sort_values(ascending=False)
        top_features_rf = importances.head(6).index.tolist()
    except Exception as exc:
        # Best-effort ranking: report why the forest could not be fitted and
        # fall back to the first numeric inputs rather than failing silently.
        print(f"Random forest ranking failed ({exc}); using the first numeric inputs instead")
        top_features_rf = numeric_inputs[:6]
else:
    top_features_rf = numeric_inputs

if len(top_features_rf) >= 2:
    entropy_scores = {
        (x, y): compute_2d_entropy(df_encoded, x, y, target)
        for x, y in combinations(top_features_rf, 2)
    }

    # Sort by entropy (lowest first for most homogeneous)
    top_entropy_pairs = sorted(entropy_scores.items(), key=lambda item: item[1])[:3]

# --- ANOVA box plots ---
# Score every (categorical, numeric) input pair and keep the three with the
# largest F statistic.
# NOTE(review): f_regression is applied to the label-encoded category codes,
# which treats the codes as an ordered numeric response -- confirm this is
# intended rather than a one-way ANOVA across the categories.
anova_pairs = []
if categorical_inputs and numeric_inputs:
    anova_scores = []
    for cat in categorical_inputs:
        for num in numeric_inputs:
            try:
                f_stat, p_val = f_regression(df_encoded[[num]], df_encoded[cat])
            except (ValueError, TypeError):
                # Skip pairs the test cannot handle (e.g. non-numeric data).
                continue
            # An undefined statistic would make the sort order unreliable.
            if not np.isnan(f_stat[0]):
                anova_scores.append((cat, num, f_stat[0]))

    anova_pairs = sorted(anova_scores, key=lambda score: score[2], reverse=True)[:3]

# --- Chi-square heatmaps with CHAID merging ---
# For every pair of categorical inputs, merge categories of the first column
# with merge_categories_chaid, test the merged table, and keep the three
# pairs with the largest chi-square statistic.
chi2_pairs = []
if len(categorical_inputs) >= 2:
    chi2_results = []
    for c1, c2 in combinations(categorical_inputs, 2):
        try:
            merged_ct = merge_categories_chaid(df_clean, c1, c2)
            chi2_val, p, _, _ = chi2_contingency(merged_ct)
        except (ValueError, TypeError):
            # Skip tables that cannot be tested (e.g. a zero expected
            # frequency) instead of swallowing every possible error.
            continue
        chi2_results.append((c1, c2, chi2_val, p, merged_ct))

    chi2_pairs = sorted(chi2_results, key=lambda result: result[2], reverse=True)[:3]

# --- HTML Report Generation (Ultra-Fast Alternative) ---
def generate_html_report(top_corr_pairs, top_entropy_pairs, anova_pairs, chi2_pairs, df_data, df_encoded_data, target_col, max_points=1000):
    """Write an interactive Plotly HTML report of "smart" vs random variable pairs.

    Every selected ("smart") pair is rendered next to a randomly drawn pair of
    the same kind so the two can be compared side by side.

    Parameters
    ----------
    top_corr_pairs : list of (x, y, r)
        Numeric column pairs with their absolute correlation.
    top_entropy_pairs : list of ((x, y), entropy)
        Numeric column pairs with their 2D class entropy.
    anova_pairs : list of (categorical column, numeric column, F statistic)
    chi2_pairs : list of (col1, col2, chi2, p, contingency table)
    df_data : pandas.DataFrame
        Data that is plotted.
    df_encoded_data : pandas.DataFrame
        Encoded counterpart of ``df_data`` with the same row order; used for
        the statistics of the random pairs.
    target_col : str or None
        Column used to colour scatter points; ignored when missing.
    max_points : int
        Frames with more rows than this are randomly sampled down to it.

    Returns
    -------
    str
        Path of the written file (``smart_vs_random_analysis.html``).

    Notes
    -----
    Reads the module-level ``numeric_inputs`` and ``categorical_inputs`` lists
    and calls the module-level ``compute_2d_entropy``.  Correlation, entropy
    and ANOVA pairs are rendered only when a random comparison pair can be
    drawn.  The statistics of the smart pairs are the ones passed in, while
    those of the random pairs are computed on the (possibly sampled) frame.
    Column names and labels are escaped before being placed in the page.
    """
    import html
    import json

    # Sample data if too large
    if len(df_data) > max_points:
        sample_idx = np.random.choice(len(df_data), max_points, replace=False)
        df_sample = df_data.iloc[sample_idx]
        df_encoded_sample = df_encoded_data.iloc[sample_idx]
        print(f"Sampling {max_points} points from {len(df_data)} for faster plotting")
    else:
        df_sample = df_data
        df_encoded_sample = df_encoded_data

    has_target = bool(target_col) and target_col in df_sample.columns

    # One colour per target value, cycling through the palette.
    palette = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6', '#1abc9c', '#34495e', '#e67e22']
    target_colors = {}
    if has_target:
        target_colors = {
            value: palette[i % len(palette)]
            for i, value in enumerate(df_sample[target_col].unique())
        }

    plot_id = 1

    def esc(value):
        """Escape *value* for use as HTML text."""
        return html.escape(str(value))

    def js(value):
        """Serialize *value* as JSON that is safe inside an inline script."""
        # Escaping "</" keeps data containing "</script>" from closing the tag.
        return json.dumps(value).replace('</', '<\\/')

    def scatter_trace(xs, ys, name, color):
        return {
            'x': xs.tolist(),
            'y': ys.tolist(),
            'mode': 'markers',
            'type': 'scatter',
            'name': name,
            'marker': {'size': 8, 'opacity': 0.7, 'color': color}
        }

    def scatter_traces(x_col, y_col, fallback_color):
        """One scatter trace per target value, or a single trace without target."""
        if not has_target:
            return [scatter_trace(df_sample[x_col], df_sample[y_col], 'Data Points', fallback_color)]
        traces = []
        for target_val in df_sample[target_col].unique():
            mask = df_sample[target_col] == target_val
            traces.append(scatter_trace(
                df_sample.loc[mask, x_col],
                df_sample.loc[mask, y_col],
                str(target_val),
                target_colors.get(target_val, '#3498db'),
            ))
        return traces

    def box_traces(cat_col, num_col):
        """One box trace of *num_col* per category of *cat_col*."""
        return [
            {
                'y': df_sample.loc[df_sample[cat_col] == cat, num_col].dropna().tolist(),
                'type': 'box',
                'name': str(cat)[:15],  # Truncate long category names
                'boxpoints': 'outliers'
            }
            for cat in df_sample[cat_col].unique()
        ]

    def heatmap_trace(table, colorscale):
        """Heatmap trace of a contingency table with truncated axis labels."""
        return {
            'z': table.values.tolist(),
            'x': [str(label)[:10] for label in table.columns],
            'y': [str(label)[:10] for label in table.index],
            'type': 'heatmap',
            'colorscale': colorscale,
            'showscale': True
        }

    def scatter_layout(x_col, y_col):
        return {'title': '', 'xaxis': {'title': str(x_col)}, 'yaxis': {'title': str(y_col)}, 'showlegend': True}

    def random_numeric_pair(x_col, y_col):
        """Draw two numeric inputs other than the given pair, or return None."""
        if len(numeric_inputs) < 4:
            return None
        available = [col for col in numeric_inputs if col not in [x_col, y_col]]
        if len(available) < 2:
            return None
        return random.sample(available, 2)

    def plot_pair(heading, smart_title, random_title, smart_traces, random_traces, smart_layout, random_layout):
        """Return the HTML and script for one smart/random pair of plots.

        The title arguments must already be HTML-escaped (they may contain
        ``<br>``); traces and layouts are plain Python structures.
        """
        nonlocal plot_id
        smart_id = f"plot{plot_id}"
        random_id = f"plot{plot_id + 1}"
        plot_id += 2
        return f"""
            <div class="plot-pair">
                <h3>{heading}</h3>
                <div class="plot-container">
                    <div class="plot" id="{smart_id}">
                        <div class="plot-title">{smart_title}</div>
                    </div>
                    <div class="plot" id="{random_id}">
                        <div class="plot-title">{random_title}</div>
                    </div>
                </div>
            </div>
            <script>
                Plotly.newPlot('{smart_id}', {js(smart_traces)}, {js(smart_layout)});
                Plotly.newPlot('{random_id}', {js(random_traces)}, {js(random_layout)});
            </script>
            """

    # The page is collected as a list of fragments and joined once at the end.
    parts = ["""
    <!DOCTYPE html>
    <html>
    <head>
        <title>Smart vs Random Analysis</title>
        <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
        <style>
            body {
                font-family: Arial, sans-serif;
                margin: 20px;
                background-color: #f5f5f5;
            }
            .container {
                max-width: 1200px;
                margin: 0 auto;
                background-color: white;
                padding: 20px;
                border-radius: 10px;
                box-shadow: 0 2px 10px rgba(0,0,0,0.1);
            }
            .plot-section {
                margin-bottom: 40px;
                border-bottom: 1px solid #eee;
                padding-bottom: 20px;
            }
            .plot-container {
                display: flex;
                margin-bottom: 30px;
                gap: 20px;
            }
            .plot {
                width: 48%;
                min-height: 400px;
                border: 1px solid #ddd;
                border-radius: 5px;
            }
            .section-header {
                color: #2c3e50;
                border-bottom: 3px solid #3498db;
                padding-bottom: 10px;
                margin-bottom: 20px;
            }
            .plot-pair {
                background-color: #fafafa;
                padding: 15px;
                border-radius: 8px;
                margin-bottom: 20px;
            }
            .plot-title {
                text-align: center;
                font-weight: bold;
                color: #34495e;
                margin-bottom: 10px;
            }
        </style>
    </head>
    <body>
        <div class="container">
            <h1 style="text-align: center; color: #2c3e50;">Smart vs Random Data Analysis Report</h1>
    """]

    # 1. Correlation Analysis
    if top_corr_pairs:
        parts.append('<div class="plot-section"><h2 class="section-header">1. Correlation Analysis (Smart vs Random)</h2>')

        for i, (x, y, corr_val) in enumerate(top_corr_pairs):
            random_pair = random_numeric_pair(x, y)
            if random_pair is None:
                continue
            rx, ry = random_pair
            random_corr = abs(df_sample[[rx, ry]].corr().iloc[0, 1])

            parts.append(plot_pair(
                f"Correlation Pair {i+1}",
                f"Smart: {esc(x)} vs {esc(y)} (r={corr_val:.3f})",
                f"Random: {esc(rx)} vs {esc(ry)} (r={random_corr:.3f})",
                scatter_traces(x, y, '#3498db'),
                scatter_traces(rx, ry, '#e74c3c'),
                scatter_layout(x, y),
                scatter_layout(rx, ry),
            ))

        parts.append('</div>')

    # 2. Entropy Analysis
    if top_entropy_pairs:
        parts.append('<div class="plot-section"><h2 class="section-header">2. Entropy Analysis (Smart vs Random)</h2>')

        for i, ((x, y), entropy_val) in enumerate(top_entropy_pairs):
            random_pair = random_numeric_pair(x, y)
            if random_pair is None:
                continue
            rx, ry = random_pair
            corr_val = abs(df_sample[[x, y]].corr().iloc[0, 1])
            random_entropy = compute_2d_entropy(df_encoded_sample, rx, ry, target_col)

            parts.append(plot_pair(
                f"Entropy Pair {i+1}",
                f"Smart: {esc(x)} vs {esc(y)}<br>Entropy={entropy_val:.3f}, r={corr_val:.3f}",
                f"Random: {esc(rx)} vs {esc(ry)}<br>Entropy={random_entropy:.3f}",
                scatter_traces(x, y, '#2ecc71'),
                scatter_traces(rx, ry, '#f39c12'),
                scatter_layout(x, y),
                scatter_layout(rx, ry),
            ))

        parts.append('</div>')

    # 3. ANOVA Analysis (Box plots)
    if anova_pairs:
        parts.append('<div class="plot-section"><h2 class="section-header">3. ANOVA Analysis (Smart vs Random)</h2>')

        for i, (cat_col, num_col, f_stat) in enumerate(anova_pairs):
            available_cat = [col for col in categorical_inputs if col != cat_col]
            available_num = [col for col in numeric_inputs if col != num_col]
            if not (available_cat and available_num):
                continue

            rcat = random.choice(available_cat)
            rnum = random.choice(available_num)
            try:
                random_f, _ = f_regression(df_encoded_sample[[rnum]], df_encoded_sample[rcat])
            except (ValueError, TypeError):
                # The statistic could not be computed: show a placeholder box
                # and "N/A" instead of reading an undefined result.
                random_f_text = 'N/A'
                random_traces = [{'y': [0], 'type': 'box', 'name': 'Error'}]
            else:
                random_f_text = f"{random_f[0]:.2f}"
                random_traces = box_traces(rcat, rnum)

            if not random_traces:
                continue

            parts.append(plot_pair(
                f"ANOVA Pair {i+1}",
                f"Smart: {esc(cat_col)} vs {esc(num_col)}<br>F-statistic={f_stat:.2f}",
                f"Random: {esc(rcat)} vs {esc(rnum)}<br>F-statistic={random_f_text}",
                box_traces(cat_col, num_col),
                random_traces,
                {'title': '', 'yaxis': {'title': str(num_col)}, 'showlegend': False},
                {'title': '', 'yaxis': {'title': str(rnum)}, 'showlegend': False},
            ))

        parts.append('</div>')

    # 4. Chi-Square Analysis (Heatmaps)
    if chi2_pairs:
        parts.append('<div class="plot-section"><h2 class="section-header">4. Chi-Square Analysis (Smart vs Random)</h2>')

        for i, (col1, col2, chi2_val, p_val, merged_ct) in enumerate(chi2_pairs):
            # Placeholder shown when no random pair is available or testable.
            random_heatmap = {'z': [[0]], 'x': ['Error'], 'y': ['Error'], 'type': 'heatmap'}
            random_chi2_val = 0
            rc1 = rc2 = 'N/A'

            if len(categorical_inputs) >= 4:
                available = [col for col in categorical_inputs if col not in [col1, col2]]
                if len(available) >= 2:
                    rc1, rc2 = random.sample(available, 2)
                    try:
                        random_ct = pd.crosstab(df_sample[rc1], df_sample[rc2])
                        random_chi2_val, _, _, _ = chi2_contingency(random_ct)
                        random_heatmap = heatmap_trace(random_ct, 'Plasma')
                    except ValueError:
                        # Table cannot be tested; keep the placeholder.
                        pass

            parts.append(plot_pair(
                f"Chi-Square Pair {i+1}",
                f"Smart: {esc(col1)} vs {esc(col2)} (CHAID merged)<br>Chi-Square={chi2_val:.2f}",
                f"Random: {esc(rc1)} vs {esc(rc2)}<br>Chi-Square={random_chi2_val:.2f}",
                [heatmap_trace(merged_ct, 'Viridis')],
                [random_heatmap],
                {'title': '', 'xaxis': {'title': str(col2)}, 'yaxis': {'title': str(col1)}},
                {'title': '', 'xaxis': {'title': str(rc2)}, 'yaxis': {'title': str(rc1)}},
            ))

        parts.append('</div>')

    # Add summary statistics
    parts.append(f"""
        <div class="plot-section">
            <h2 class="section-header">Summary Statistics</h2>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px;">
                <div style="background-color: #ecf0f1; padding: 15px; border-radius: 5px;">
                    <h4>Dataset Info</h4>
                    <p><strong>Total Records:</strong> {len(df_data):,}</p>
                    <p><strong>Plotted Points:</strong> {len(df_sample):,}</p>
                    <p><strong>Target Variable:</strong> {esc(target_col) if target_col else 'None'}</p>
                </div>
                <div style="background-color: #ecf0f1; padding: 15px; border-radius: 5px;">
                    <h4>Variable Counts</h4>
                    <p><strong>Numeric Inputs:</strong> {len(numeric_inputs)}</p>
                    <p><strong>Categorical Inputs:</strong> {len(categorical_inputs)}</p>
                    <p><strong>Total Variables:</strong> {len(df_data.columns)}</p>
                </div>
                <div style="background-color: #ecf0f1; padding: 15px; border-radius: 5px;">
                    <h4>Analysis Results</h4>
                    <p><strong>Correlation Pairs:</strong> {len(top_corr_pairs)}</p>
                    <p><strong>Entropy Pairs:</strong> {len(top_entropy_pairs)}</p>
                    <p><strong>ANOVA Pairs:</strong> {len(anova_pairs)}</p>
                    <p><strong>Chi-Square Pairs:</strong> {len(chi2_pairs)}</p>
                </div>
            </div>
        </div>

        </div>
    </body>
    </html>
    """)

    # Save HTML file
    html_path = "smart_vs_random_analysis.html"
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))

    return html_path

# --- Fast plotting function with data sampling ---
def generate_fast_plots(pairs_list, plot_type, df_data, df_encoded_data, target_col, pdf, max_points=1000):
    """Write one two-panel "smart vs. random" page per pair into ``pdf``.

    The left panel shows the pair selected by the analysis; the right panel
    shows a randomly drawn pair of other input columns for comparison.

    Parameters
    ----------
    pairs_list : iterable
        Records whose layout depends on ``plot_type``:
        ``"correlation"`` -> ``(x, y, corr_val)``;
        ``"entropy"`` -> ``((x, y), entropy_val)``;
        ``"anova"`` -> ``(cat_col, num_col, f_stat)``;
        ``"chi2"`` -> ``(col1, col2, chi2_val, p_val, merged_ct)``.
    plot_type : str
        One of the four names above. Any other value yields a page with two
        empty axes.
    df_data : pandas.DataFrame
        Frame the plotted values are read from.
    df_encoded_data : pandas.DataFrame
        Frame passed to ``f_regression`` and ``compute_2d_entropy``. It is
        sampled with the same positional indices as ``df_data``.
        NOTE(review): this assumes both frames have the same number of rows
        in the same order; confirm against the caller.
    target_col : str or None
        Column used to colour scatter points; ignored when falsy or absent.
    pdf : object exposing ``savefig(fig, ...)``
        Destination for the pages (the caller passes a ``PdfPages``).
    max_points : int, default 1000
        Row count above which the frames are randomly down-sampled.

    Notes
    -----
    Reads the module-level names ``numeric_inputs``, ``categorical_inputs``
    and ``compute_2d_entropy``. No random seed is set in this function, so
    the sampled rows and the random pairs can differ between runs.
    """

    # Sample data if too large
    if len(df_data) > max_points:
        sample_idx = np.random.choice(len(df_data), max_points, replace=False)
        df_sample = df_data.iloc[sample_idx]
        df_encoded_sample = df_encoded_data.iloc[sample_idx]
        print(f"Sampling {max_points} points from {len(df_data)} for faster plotting")
    else:
        df_sample = df_data
        df_encoded_sample = df_encoded_data

    for page_no, pair_data in enumerate(pairs_list, start=1):
        fig, axes = plt.subplots(1, 2, figsize=(12, 5))  # Smaller figure size
        try:
            if plot_type == "correlation":
                _draw_correlation_page(axes, page_no, pair_data, df_sample, target_col)
            elif plot_type == "entropy":
                _draw_entropy_page(axes, page_no, pair_data, df_sample,
                                   df_encoded_sample, target_col)
            elif plot_type == "anova":
                _draw_anova_page(axes, page_no, pair_data, df_sample, df_encoded_sample)
            elif plot_type == "chi2":
                _draw_chi2_page(axes, page_no, pair_data, df_sample)

            fig.tight_layout()
            pdf.savefig(fig, dpi=100, bbox_inches='tight')  # Lower DPI for faster saving
        finally:
            # Always release the figure, even if drawing or saving raised.
            plt.close(fig)


def _show_axis_note(ax, message):
    """Write ``message`` in the middle of ``ax``, whatever its data limits are."""
    ax.text(0.5, 0.5, message, ha='center', va='center', transform=ax.transAxes)


def _sample_other_pair(columns, used):
    """Return two random entries of ``columns`` not in ``used``, or ``None``.

    ``None`` is returned when ``columns`` has fewer than four entries or
    fewer than two remain after excluding ``used``.
    """
    if len(columns) < 4:
        return None
    available = [col for col in columns if col not in used]
    if len(available) < 2:
        return None
    return random.sample(available, 2)


def _scatter_by_target(ax, df, x_col, y_col, target_col, show_legend=False):
    """Scatter ``x_col`` against ``y_col``, one colour per ``target_col`` value."""
    if target_col and target_col in df.columns:
        unique_targets = df[target_col].unique()
        colors = plt.cm.tab10(np.linspace(0, 1, len(unique_targets)))
        for color, target_val in zip(colors, unique_targets):
            mask = df[target_col] == target_val
            ax.scatter(df.loc[mask, x_col], df.loc[mask, y_col],
                       c=[color], alpha=0.6, s=20, label=str(target_val))
        if show_legend:
            ax.legend(fontsize=8)
    else:
        ax.scatter(df[x_col], df[y_col], alpha=0.6, s=20)

    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)


def _boxplot_by_category(ax, df, cat_col, num_col):
    """Draw one box of ``num_col`` values per distinct ``cat_col`` value."""
    categories = df[cat_col].unique()
    box_data = [df[df[cat_col] == cat][num_col].dropna() for cat in categories]
    ax.boxplot(box_data)
    # Label the ticks after drawing instead of through boxplot's ``labels``
    # keyword, which Matplotlib 3.9 deprecated in favour of ``tick_labels``.
    ax.set_xticklabels([str(cat)[:10] for cat in categories])  # Truncate long labels
    ax.set_xlabel(cat_col)
    ax.set_ylabel(num_col)
    ax.tick_params(axis='x', rotation=45, labelsize=8)


def _annotated_heatmap(ax, table):
    """Render a contingency table as a heatmap with the count in every cell."""
    ax.imshow(table.values, cmap='YlGnBu', aspect='auto')
    ax.set_xticks(range(len(table.columns)))
    ax.set_yticks(range(len(table.index)))
    ax.set_xticklabels([str(label)[:10] for label in table.columns], rotation=45, fontsize=8)
    ax.set_yticklabels([str(label)[:10] for label in table.index], fontsize=8)

    for row_idx in range(len(table.index)):
        for col_idx in range(len(table.columns)):
            ax.text(col_idx, row_idx, str(table.iloc[row_idx, col_idx]),
                    ha='center', va='center', fontsize=8)


def _draw_correlation_page(axes, page_no, pair_data, df_sample, target_col):
    """Left: the selected correlated pair. Right: a random numeric pair."""
    x, y, corr_val = pair_data

    _scatter_by_target(axes[0], df_sample, x, y, target_col, show_legend=True)
    axes[0].set_title(f"Smart Corr {page_no}\nr={corr_val:.3f}", fontsize=10)

    random_pair = _sample_other_pair(numeric_inputs, (x, y))
    if random_pair is None:
        _show_axis_note(axes[1], 'No random pair available')
        return

    rx, ry = random_pair
    random_corr = abs(df_sample[[rx, ry]].corr().iloc[0, 1])
    _scatter_by_target(axes[1], df_sample, rx, ry, target_col)
    axes[1].set_title(f"Random Corr {page_no}\nr={random_corr:.3f}", fontsize=10)


def _draw_entropy_page(axes, page_no, pair_data, df_sample, df_encoded_sample, target_col):
    """Left: the selected entropy pair. Right: a random numeric pair."""
    (x, y), entropy_val = pair_data

    _scatter_by_target(axes[0], df_sample, x, y, target_col, show_legend=True)
    corr_val = abs(df_sample[[x, y]].corr().iloc[0, 1])
    axes[0].set_title(f"Smart Entropy {page_no}\nr={corr_val:.3f}, E={entropy_val:.3f}", fontsize=10)

    random_pair = _sample_other_pair(numeric_inputs, (x, y))
    if random_pair is None:
        _show_axis_note(axes[1], 'No random pair available')
        return

    rx, ry = random_pair
    random_corr = abs(df_sample[[rx, ry]].corr().iloc[0, 1])
    random_entropy = compute_2d_entropy(df_encoded_sample, rx, ry, target_col)
    _scatter_by_target(axes[1], df_sample, rx, ry, target_col)
    axes[1].set_title(f"Random Entropy {page_no}\nr={random_corr:.3f}, E={random_entropy:.3f}", fontsize=10)


def _draw_anova_page(axes, page_no, pair_data, df_sample, df_encoded_sample):
    """Left: the selected categorical/numeric pair. Right: a random such pair."""
    cat_col, num_col, f_stat = pair_data

    _boxplot_by_category(axes[0], df_sample, cat_col, num_col)
    axes[0].set_title(f"Smart ANOVA {page_no}\nF={f_stat:.2f}", fontsize=10)

    available_cat = [col for col in categorical_inputs if col != cat_col]
    available_num = [col for col in numeric_inputs if col != num_col]
    if not (available_cat and available_num):
        _show_axis_note(axes[1], 'No random pair available')
        return

    rcat = random.choice(available_cat)
    rnum = random.choice(available_num)
    try:
        random_f, _ = f_regression(df_encoded_sample[[rnum]], df_encoded_sample[rcat])
        _boxplot_by_category(axes[1], df_sample, rcat, rnum)
        axes[1].set_title(f"Random ANOVA {page_no}\nF={random_f[0]:.2f}", fontsize=10)
    except Exception as exc:
        # Best effort: the comparison panel must not abort the report, but
        # interrupts still propagate and the reason is reported.
        print(f"Random ANOVA plot failed ({rcat} vs {rnum}): {exc}")
        axes[1].clear()  # drop any partially drawn artists
        _show_axis_note(axes[1], 'Random plot failed')


def _draw_chi2_page(axes, page_no, pair_data, df_sample):
    """Left: the selected contingency table. Right: a random categorical pair."""
    col1, col2, chi2_val, _p_val, merged_ct = pair_data

    _annotated_heatmap(axes[0], merged_ct)
    axes[0].set_title(f"Smart Chi² {page_no}\nχ²={chi2_val:.2f}", fontsize=10)

    random_pair = _sample_other_pair(categorical_inputs, (col1, col2))
    if random_pair is None:
        _show_axis_note(axes[1], 'No random pair available')
        return

    rc1, rc2 = random_pair
    try:
        random_ct = pd.crosstab(df_sample[rc1], df_sample[rc2])
        random_chi2, _, _, _ = chi2_contingency(random_ct)
        _annotated_heatmap(axes[1], random_ct)
        axes[1].set_title(f"Random Chi² {page_no}\nχ²={random_chi2:.2f}", fontsize=10)
    except Exception as exc:
        # Best effort, as for the ANOVA comparison panel.
        print(f"Random Chi-square plot failed ({rc1} vs {rc2}): {exc}")
        axes[1].clear()  # drop any partially drawn artists
        _show_axis_note(axes[1], 'Random plot failed')

# --- Choose output format and generate report ---
divider = "=" * 50

print("\n" + divider)
print("REPORT GENERATION")
print(divider)

# Large datasets get an HTML recommendation; smaller ones suit either format.
dataset_size = len(df_clean)
use_html = dataset_size > 5000
if use_html:
    print(f"Dataset has {dataset_size:,} rows - Recommending HTML format for speed")
else:
    print(f"Dataset has {dataset_size:,} rows - Both formats suitable")

# The format is fixed here: set to "html", "pdf", or "both".
output_format = "html"
wants_html = output_format in ("html", "both")
wants_pdf = output_format in ("pdf", "both")

print(f"Generating {output_format.upper()} report...")

if wants_html:
    # Interactive HTML report
    print("Creating interactive HTML report...")
    html_path = generate_html_report(
        top_corr_pairs, top_entropy_pairs, anova_pairs, chi2_pairs,
        df_clean, df_encoded, target,
    )
    print(f"✅ HTML report generated: {html_path}")
    files.download(html_path)

if wants_pdf:
    # Static PDF report, one page per analysed pair
    print("Creating PDF report...")
    pdf_path = "smart_vs_random_analysis.pdf"

    # Silence the open-figure warning and disable axis offset notation.
    plt.rcParams.update({
        'figure.max_open_warning': 0,
        'axes.formatter.useoffset': False,
    })

    # (pairs, plot type, progress message) in the order the pages are written.
    pdf_sections = [
        (top_corr_pairs, "correlation", "  - Generating correlation plots..."),
        (top_entropy_pairs, "entropy", "  - Generating entropy plots..."),
        (anova_pairs, "anova", "  - Generating ANOVA plots..."),
        (chi2_pairs, "chi2", "  - Generating Chi-square plots..."),
    ]

    with PdfPages(pdf_path) as pdf:
        for section_pairs, section_kind, progress_message in pdf_sections:
            if section_pairs:
                print(progress_message)
                generate_fast_plots(section_pairs, section_kind, df_clean, df_encoded, target, pdf)

    print(f"✅ PDF report generated: {pdf_path}")
    files.download(pdf_path)

print("Analysis complete! 🎉")
print("\n" + divider)
print("ANALYSIS SUMMARY")
print(divider)

summary_lines = [
    f"📊 Dataset: {len(df_clean):,} rows, {len(df_clean.columns)} columns",
    f"🎯 Target variable: {target if target else 'None detected'}",
    f"🔢 Numeric inputs: {len(numeric_inputs)}",
    f"📝 Categorical inputs: {len(categorical_inputs)}",
    f"📈 Correlation pairs analyzed: {len(top_corr_pairs)}",
    f"🌀 Entropy pairs analyzed: {len(top_entropy_pairs)}",
    f"📊 ANOVA pairs analyzed: {len(anova_pairs)}",
    f"🔥 Chi-square pairs analyzed: {len(chi2_pairs)}",
]
for summary_line in summary_lines:
    print(summary_line)
print(divider)

Saving iris.csv to iris.csv
Metadata Table:


Unnamed: 0,attribute,measurement_type,role,missing_count_pct,missing_replacement
0,sepal length,numeric,in,0 (0.00%),
1,sepal width,numeric,in,0 (0.00%),
2,petal length,numeric,in,0 (0.00%),
3,petal width,numeric,in,0 (0.00%),
4,class (cat) (target),categorical,target,0 (0.00%),


Target column: class (cat) (target)
Numeric inputs: 4
Categorical inputs: 0

REPORT GENERATION
Dataset has 150 rows - Both formats suitable
Generating HTML report...
Creating interactive HTML report...
✅ HTML report generated: smart_vs_random_analysis.html


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Analysis complete! 🎉

ANALYSIS SUMMARY
📊 Dataset: 150 rows, 5 columns
🎯 Target variable: class (cat) (target)
🔢 Numeric inputs: 4
📝 Categorical inputs: 0
📈 Correlation pairs analyzed: 2
🌀 Entropy pairs analyzed: 3
📊 ANOVA pairs analyzed: 0
🔥 Chi-square pairs analyzed: 0
