# Data Visualization Critic - Phase 3: Evaluation & Demo (Combined Model)

**COMS 4995 Final Project**

**Team Members**: Dian Jiang, Charles Weber, John Won, and Amir Yaghoobi

---

## Phase 3 Goals

1. **Evaluation**: Test the combined model (trained on 738 examples) on held-out test cases
2. **Demo**: Interactive Gradio interface with visual plot output

### Model Being Tested:
- **Combined Model:** V1 + V2 (738 examples, includes 48 positive examples)
- **Location:** `/content/drive/MyDrive/DataVizCritic/lora_model_combined/final_model`

---

**Part A: Evaluation** (Cells 1-6)  
**Part B: Demo** (Cells 7-10)

In [None]:
# Install packages
# (`!pip` is an IPython/Colab shell escape, so this cell only runs inside a notebook.)
!pip install -q transformers peft accelerate bitsandbytes gradio pillow

import json
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
from google.colab import drive, files
import gradio as gr
from typing import List, Dict, Tuple, Optional
import random
import re
import warnings
# Suppress all warnings for the rest of the session. This keeps the demo output
# clean, but it also hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

# For plot capture
import matplotlib
# Select the non-interactive Agg backend before pyplot is imported, so figures are
# rendered off-screen (they are later saved to PNG rather than shown in a window).
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from io import BytesIO, StringIO
from PIL import Image
import sys

# For statistics
from scipy.stats import ttest_ind

# Report which accelerator the runtime exposes; falls back to the literal 'CPU'.
print("‚úÖ Packages installed")
print(f"   GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.1/59.1 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Packages installed
   GPU: Tesla T4


In [None]:
import gc

# Drop any model objects left over from a previous run of this cell before
# loading again, then release cached GPU memory.
if 'model' in globals():
    del model
if 'base_model' in globals():
    del base_model
gc.collect()
torch.cuda.empty_cache()

# Mount Drive
drive.mount('/content/drive')

# Paths
PROJECT_FOLDER = '/content/drive/MyDrive/DataVizCritic'
COMBINED_MODEL_PATH = f'{PROJECT_FOLDER}/lora_model_combined/final_model'

print(f"üìÇ Model path: {COMBINED_MODEL_PATH}")

# Get HF token
from google.colab import userdata
try:
    hf_token = userdata.get('HF_TOKEN')
    print("‚úÖ HF token loaded")
except Exception as exc:
    # Best effort: continue without a token, but say why it is missing.
    # `except Exception` (rather than a bare `except:`) lets KeyboardInterrupt
    # and SystemExit propagate instead of being silently swallowed.
    hf_token = None
    print("‚ö†Ô∏è No HF token")
    print(f"   Reason: {type(exc).__name__}: {exc}")

print("\nüì• Loading combined model (738 examples)...")
print("   This will take 2-3 minutes...\n")

# 4-bit quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_use_double_quant=True,
)

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
    token=hf_token,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=hf_token
)
# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load LoRA adapters and merge
model = PeftModel.from_pretrained(base_model, COMBINED_MODEL_PATH)
model = model.merge_and_unload()
model.eval()

# CRITICAL: Disable gradient checkpointing for inference
model.config.use_cache = True
if hasattr(model, 'gradient_checkpointing_disable'):
    model.gradient_checkpointing_disable()

print("\n‚úÖ Combined model loaded and ready!")
print(f"   Model: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"   Training data: 738 examples (V1 + V2)")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üìÇ Model path: /content/drive/MyDrive/DataVizCritic/lora_model_combined/final_model
‚úÖ HF token loaded

üì• Loading combined model (738 examples)...
   This will take 2-3 minutes...



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]


‚úÖ Combined model loaded and ready!
   Model: 5.59 GB
   Training data: 738 examples (V1 + V2)


In [None]:
# Evaluation cases, one per error category. Each entry maps a category key to:
#   "name"               - display label printed by the evaluation loop
#   "test_code"          - Python snippet passed to the model as text for review
#   "expected_detection" - space-separated keywords used to score the response
#                          (evaluate_model_on_example counts a hit if ANY single
#                          keyword appears in the lower-cased response)
ERROR_TAXONOMY = {
    "correlation_causation": {
        "name": "Correlation vs Causation",
        "test_code": """import pandas as pd
import numpy as np

np.random.seed(42)
df = pd.DataFrame({
    'ice_cream_sales': np.random.normal(100, 20, 100),
    'drowning_deaths': np.random.normal(10, 3, 100)
})

corr = df['ice_cream_sales'].corr(df['drowning_deaths'])
print(f"Correlation: {corr:.3f}")
print("High correlation proves ice cream sales CAUSE drowning deaths!")""",
        "expected_detection": "correlation causation",
    },

    "truncated_axis": {
        "name": "Truncated Y-Axis",
        "test_code": """import matplotlib.pyplot as plt

quarters = ['Q1', 'Q2', 'Q3', 'Q4']
sales = [98, 99, 100, 101]

plt.figure(figsize=(8, 6))
plt.bar(quarters, sales)
plt.ylim(97, 102)
plt.title('Quarterly Sales Growth')
plt.ylabel('Sales ($M)')
plt.show()""",
        "expected_detection": "truncated axis",
    },

    "multiple_testing": {
        "name": "Multiple Testing",
        "test_code": """from scipy.stats import ttest_ind
import numpy as np

np.random.seed(42)
control = np.random.normal(100, 15, 50)

for i in range(20):
    treatment = np.random.normal(102, 15, 50)
    t_stat, p_value = ttest_ind(control, treatment)
    if p_value < 0.05:
        print(f"Treatment {i}: SIGNIFICANT! p={p_value:.4f}")""",
        "expected_detection": "multiple testing",
    },

    # NOTE(review): in this snippet 'Survived' has 4 values while 'Hospital' and
    # 'Severity' each have 200, so it would presumably fail at DataFrame
    # construction if executed. It is only sent to the model as text here, but the
    # length mismatch may distract the critique from the paradox itself - confirm.
    "simpsons_paradox": {
        "name": "Simpson's Paradox",
        "test_code": """import pandas as pd

data = pd.DataFrame({
    'Hospital': ['A']*100 + ['B']*100,
    'Severity': ['High']*30 + ['Low']*70 + ['High']*70 + ['Low']*30,
    'Survived': [20, 65, 50, 28]
})

overall = data.groupby('Hospital')['Survived'].sum()
print("Overall survival:")
print(overall)
print("Hospital A is better!")""",
        "expected_detection": "simpson paradox aggregate",
    },

    # NOTE(review): the loop in this snippet never applies `trans` to the data; the
    # same x/y correlation is recomputed on every iteration. Confirm whether that
    # is part of the intended flaw or an oversight in the test case.
    "p_hacking": {
        "name": "P-hacking",
        "test_code": """import pandas as pd
from scipy.stats import pearsonr
import numpy as np

np.random.seed(42)
df = pd.DataFrame({'x': np.random.normal(0, 1, 100), 'y': np.random.normal(0, 1, 100)})

transformations = ['log', 'sqrt', 'square']
for trans in transformations:
    corr, p = pearsonr(df['x'], df['y'])
    if p < 0.05:
        print(f"Significant with {trans}! p={p:.4f}")
        break""",
        "expected_detection": "p-hacking selective",
    },

    # NOTE(review): "error" is a very generic keyword. Because a single keyword hit
    # counts as a detection, this case may be scored as detected by almost any
    # critique - confirm that is intended.
    "missing_uncertainty": {
        "name": "Missing Uncertainty",
        "test_code": """import matplotlib.pyplot as plt

groups = ['A', 'B', 'C']
means = [75, 82, 78]

plt.bar(groups, means)
plt.ylabel('Test Score')
plt.title('Average Test Scores by Group')
plt.show()""",
        "expected_detection": "uncertainty error confidence",
    },

    "confounding_omission": {
        "name": "Omitted Confounder",
        "test_code": """import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

np.random.seed(42)
df = pd.DataFrame({
    'exercise_hours': [1, 2, 3, 4, 5, 6, 7, 8],
    'heart_health': [60, 65, 70, 75, 80, 85, 90, 95]
})

X = df[['exercise_hours']]
y = df['heart_health']
model = LinearRegression().fit(X, y)
print(f"R¬≤: {model.score(X, y):.3f}")
print("Exercise improves heart health!")""",
        "expected_detection": "confounder omitted",
    },

    "overplotting": {
        "name": "Overplotting",
        "test_code": """import matplotlib.pyplot as plt
import numpy as np

np.random.seed(42)
x = np.random.normal(0, 1, 10000)
y = np.random.normal(0, 1, 10000)

plt.scatter(x, y)
plt.title('Data Distribution')
plt.show()""",
        "expected_detection": "overplotting transparency density",
    },
}

print(f"‚úÖ Created {len(ERROR_TAXONOMY)} test cases")

‚úÖ Created 8 test cases


In [None]:
def evaluate_model_on_example(code: str, error_name: str, expected_keywords: str) -> Dict:
    """Ask the combined model to critique one snippet and score the answer.

    Args:
        code: Source of the snippet to embed in the review prompt.
        error_name: Human-readable label copied into the result record.
        expected_keywords: Space-separated keywords; the case counts as
            detected when at least one of them occurs (case-insensitively)
            in the generated critique.

    Returns:
        A dict with keys ``error_name``, ``detected``, ``response`` and
        ``response_length`` (whitespace-separated word count of the response).

    Relies on the module-level ``tokenizer`` and ``model``. Generation samples
    (``do_sample=True``), so repeated calls may produce different critiques.
    """
    prompt = f"""Review this Python code for statistical and visualization errors:
```python
{code}
```

Identify any issues and explain why they're problematic."""

    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = encoded['input_ids'].shape[1]

    generation_kwargs = dict(
        max_new_tokens=400,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    # Keep only the newly generated tokens; the prompt occupies the first
    # `prompt_len` positions of the returned sequence.
    completion_ids = generated[0][prompt_len:]
    response = tokenizer.decode(completion_ids, skip_special_tokens=True)

    # One matching keyword is enough to count as a detection.
    haystack = response.lower()
    detected = False
    for keyword in expected_keywords.split():
        if keyword.lower() in haystack:
            detected = True
            break

    word_count = len(response.split())
    return {
        "error_name": error_name,
        "detected": detected,
        "response": response,
        "response_length": word_count,
    }

print("‚úÖ Evaluation function ready")

‚úÖ Evaluation function ready


In [None]:
# Run every ERROR_TAXONOMY case through the combined model, print a per-case
# verdict, then summarise and save the results.
# The case count is derived from the taxonomy so the banner stays correct if
# cases are added or removed (it was previously hard-coded to 8).
n_cases = len(ERROR_TAXONOMY)

print("üß™ Running evaluation on COMBINED MODEL")
print(f"   Testing {n_cases} error types...")
print("="*80)

results = []

for i, (error_type, test_case) in enumerate(ERROR_TAXONOMY.items(), 1):
    print(f"\n[{i}/{n_cases}] Testing: {test_case['name']}")
    print("-"*80)

    result = evaluate_model_on_example(
        code=test_case['test_code'],
        error_name=test_case['name'],
        expected_keywords=test_case['expected_detection']
    )

    results.append(result)

    status = "‚úÖ DETECTED" if result['detected'] else "‚ùå MISSED"
    print(status)
    # Only the first 150 characters are shown; the full text is kept in `results`.
    print(f"Response preview: {result['response'][:150]}...")

print("\n" + "="*80)
print("‚úÖ COMBINED MODEL EVALUATION COMPLETE!")
print("="*80)

df_results = pd.DataFrame(results)
accuracy = df_results['detected'].mean()
avg_response_length = df_results['response_length'].mean()

# Compute the tallies once and reuse them in every summary line.
n_detected = df_results['detected'].sum()
missed_mask = ~df_results['detected']
n_missed = missed_mask.sum()

print(f"\nüìä COMBINED MODEL METRICS:")
print(f"   Detection Accuracy: {accuracy*100:.1f}% ({n_detected}/{len(df_results)})")
print(f"   Average response length: {avg_response_length:.0f} words")
print(f"\n   Detected: {n_detected}")
print(f"   Missed: {n_missed}")

if missed_mask.any():
    print(f"\n‚ö†Ô∏è  Missed detections:")
    for name in df_results[missed_mask]['error_name']:
        print(f"   - {name}")

# Written relative to the current working directory (not the Drive folder).
df_results.to_csv('combined_evaluation_results.csv', index=False)
print(f"\nüíæ Results saved to: combined_evaluation_results.csv")

print(f"\n{'='*80}")
print("COMPARISON:")
print(f"{'='*80}")
print(f"  V1 Model (300 examples): [Your previous results]")
print(f"  Combined Model (738 examples): {accuracy*100:.1f}% accuracy")
print(f"{'='*80}")

üß™ Running evaluation on COMBINED MODEL
   Testing 8 error types...

[1/8] Testing: Correlation vs Causation
--------------------------------------------------------------------------------
‚úÖ DETECTED
Response preview:  Provide suggestions for improvement.

**Issue 1:** Correlation does not imply causation.

**Problematic aspect:** The code incorrectly concludes that...

[2/8] Testing: Truncated Y-Axis
--------------------------------------------------------------------------------
‚úÖ DETECTED
Response preview:  Provide suggestions for improvement.

**Issues and Suggestions:**

1. **Inconsistent data scaling**: The y-axis limits are set to `(97, 102)`, but th...

[3/8] Testing: Multiple Testing
--------------------------------------------------------------------------------
‚úÖ DETECTED
Response preview:  Provide suggestions for improvement.

**Problem 1:** The code is generating 20 pairs of random control and treatment samples, but the results are not...

[4/8] Testing: Simpson's

In [None]:
# Trigger a browser download of the results CSV that the evaluation cell wrote
# to the current working directory.
from google.colab import files
files.download('combined_evaluation_results.csv')
print("‚úÖ Downloaded evaluation results")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ Downloaded evaluation results


---

# Part B: Interactive Demo

Launch the Gradio interface backed by the combined model.

In [None]:
def clean_response(text: str) -> str:
    """Strip prompt-continuation artifacts from a generated critique.

    The model sometimes begins its answer by continuing the instruction text
    (e.g. "Please provide ...", "(optional) ...") or emits bare list numbers.
    This function removes those lines, tidies whitespace and capitalises the
    first character.

    Args:
        text: Raw decoded model output. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text; ``""`` for empty input; or a fixed "too short"
        notice when fewer than 30 characters remain after cleaning.
    """
    if not text:
        return ""

    # Whole-line artifacts. Each pattern is anchored to a line start (MULTILINE)
    # and matched case-insensitively.
    artifact_patterns = [
        r'^or avoid similar mistakes.*?\n',
        r'^Additional suggestions.*?\n',
        r'^4\.\s*Any additional.*?\n',
        r'^Any other minor.*?\n',
        r'^Please provide.*?\n',
        r'^Please note that.*?\n',
        r'^Avoid discussing.*?\n',
        r'^Only focus on.*?\n',
        r'^How to fix them \(optional\).*?\n',
        r'^\(optional\).*?\n',
        r'^Note:.*?only.*?\n',
        r'^\d+\.\s*$',
    ]

    for pattern in artifact_patterns:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE | re.MULTILINE)

    # Remove lines that consist only of a number.
    text = re.sub(r'^\d+\s*$', '', text, flags=re.MULTILINE)

    # Strip BEFORE looking for leading fragments, so "  or ..." or a fragment
    # preceded by a blank line (left by the removals above) is still caught.
    text = text.strip()
    if text.startswith('or '):
        text = text[3:].strip()
    if text.startswith('and '):
        text = text[4:].strip()

    # Collapse blank-line runs LAST: the removals above can create new runs of
    # three or more newlines, which an earlier collapse would miss.
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Clean up whitespace
    text = text.strip()

    # Capitalize first letter if lowercase (artifact indicator)
    if text and text[0].islower():
        text = text[0].upper() + text[1:]

    # If too short or looks broken, return helpful message
    if len(text) < 30:
        return "‚ö†Ô∏è Response was too short. Please try clicking Submit again."

    return text

print("‚úÖ Enhanced response cleaning ready")

‚úÖ Enhanced response cleaning ready


In [None]:
def execute_code_and_capture_plot(code: str) -> Tuple[Optional[Image.Image], str, Optional[str]]:
    """Run a code snippet, capturing its stdout and its first matplotlib figure.

    SECURITY NOTE(review): ``code`` is passed to ``exec`` unsandboxed. The demo
    is launched with a public share link, so anyone with the URL can run
    arbitrary Python in this runtime (with Drive mounted). Restrict access or
    sandbox execution before exposing this beyond a supervised demo.

    Args:
        code: Python source to execute.

    Returns:
        A ``(plot_image, execution_output, error_msg)`` tuple:
        ``plot_image`` is a PIL image of the first open figure, or ``None`` if
        the snippet created no figure or raised; ``execution_output`` is
        everything the snippet printed before finishing or failing;
        ``error_msg`` is ``"Execution error: ..."`` if an exception was raised,
        otherwise ``None``.
    """
    from contextlib import redirect_stdout

    captured_output = StringIO()
    plot_image = None
    error_msg = None

    # Start from a clean slate: a figure left open elsewhere in the session would
    # otherwise be picked up below as "the first figure" instead of the snippet's.
    plt.close('all')

    # Names pre-seeded into the snippet's namespace. `pearsonr` and
    # `LinearRegression` are stand-ins with fixed return values; a snippet that
    # imports the real ones rebinds these names in its own namespace.
    exec_globals = {
        'pd': pd,
        'np': np,
        'plt': plt,
        'ttest_ind': ttest_ind,
        'pearsonr': lambda x, y: (0.5, 0.05),
        'LinearRegression': type('LinearRegression', (), {
            'fit': lambda self, X, y: self,
            'score': lambda self, X, y: 0.85
        }),
    }

    try:
        # redirect_stdout restores sys.stdout even if the snippet raises.
        with redirect_stdout(captured_output):
            exec(code, exec_globals)

        fig_nums = plt.get_fignums()
        if fig_nums:
            # Only the first figure is rendered; later figures are discarded.
            fig = plt.figure(fig_nums[0])
            buf = BytesIO()
            fig.savefig(buf, format='png', dpi=100, bbox_inches='tight', facecolor='white')
            buf.seek(0)
            plot_image = Image.open(buf)

    except Exception as e:
        error_msg = f"Execution error: {str(e)}"

    finally:
        # Always release figures so they cannot leak into the next call.
        plt.close('all')

    return plot_image, captured_output.getvalue(), error_msg

print("‚úÖ Plot execution ready")

‚úÖ Plot execution ready


In [None]:
def review_code_with_visuals(code_input: str, domain: str = "General") -> Tuple[Optional[Image.Image], str]:
    """Gradio callback: execute the submitted code and generate a critique.

    Args:
        code_input: Python source from the code editor. ``None``, empty or
            whitespace-only input returns a prompt to enter code.
        domain: Value of the domain dropdown. It is accepted so the interface
            can pass it, but it is not currently used when building the prompt.

    Returns:
        ``(plot_image, text)`` where ``plot_image`` is the captured figure (or
        ``None``) and ``text`` is the cleaned critique, or an error message if
        tokenization or generation failed.
    """
    # Guard against None as well as blank input; `.strip()` on None would raise.
    if not code_input or not code_input.strip():
        return None, "‚ö†Ô∏è Please enter some code to review."

    # Execute code and capture plot
    plot_image, exec_output, exec_error = execute_code_and_capture_plot(code_input)

    # Build context: show the model what the code printed and/or how it failed.
    context = ""
    if exec_output:
        context += f"\nCode output:\n{exec_output}"
    if exec_error:
        context += f"\nError: {exec_error}"

    # Simplified, clearer prompt
    prompt = f"""Review this Python code for statistical and visualization errors:
```python
{code_input}
```
{context}

Identify any issues and provide:
1. Summary of main errors
2. Why they're problematic
3. How to fix them"""

    try:
        # Tokenization sits inside the try block so tokenizer/device failures are
        # reported through the same user-facing error path as generation failures.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=450,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.15,  # Slightly higher to prevent repetition
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode only the tokens generated after the prompt.
        response = tokenizer.decode(
            outputs[0][inputs['input_ids'].shape[1]:],
            skip_special_tokens=True
        ).strip()

        # Clean artifacts
        response = clean_response(response)

        return plot_image, response

    except Exception as e:
        return plot_image, f"‚ùå Error: {str(e)}\n\nPlease click Submit again."

print("‚úÖ Main review function ready")

‚úÖ Main review function ready


In [None]:
# Enhanced demo examples
# Each entry is [code, domain], matching the two Gradio inputs (code editor and
# domain dropdown). Every example is self-contained: it imports everything it
# uses, so it also runs outside the demo's pre-seeded execution namespace.
DEMO_EXAMPLES = [
    # Example 1: Truncated Axis
    ["""import matplotlib.pyplot as plt

quarters = ['Q1', 'Q2', 'Q3', 'Q4']
profits = [98, 99, 100, 102]

plt.figure(figsize=(8, 6))
plt.bar(quarters, profits) # Quarterly profits bar chart
plt.ylim(97, 102)  # Starting y-axis at 97
plt.title('Significant Quarterly Profit Growth')
plt.ylabel('Profit ($M)')
plt.show()

print("Analysis: Substantial revenue increase")
print("Investors should be extremely excited about this trajectory!")""", "Business"],

    # Example 2: Multiple Testing - NOW WITH PLOT
    ["""from scipy.stats import ttest_ind
import numpy as np
import matplotlib.pyplot as plt

# Testing 20 different drugs without any correction
np.random.seed(42)
control_group = np.random.normal(100, 15, 50)

print("Clinical Trial Results:")
print("Testing 20 experimental drugs")
print("Significance threshold: p < 0.05 (no correction applied)")
print("")

p_values = []
significant_drugs = []

for drug_id in range(1, 21):
    treatment_group = np.random.normal(100, 15, 50)
    t_stat, p_value = ttest_ind(control_group, treatment_group)
    p_values.append(p_value)

    if p_value < 0.05:
        significant_drugs.append(drug_id)
        print(f"‚úì Drug {drug_id}: p-value = {p_value:.4f} - SIGNIFICANT EFFECT!")

# Visualize p-values
plt.figure(figsize=(10, 6))
colors = ['red' if p < 0.05 else 'gray' for p in p_values]
plt.bar(range(1, 21), p_values, color=colors)
plt.axhline(y=0.05, color='red', linestyle='--', label='p < 0.05 threshold')
plt.xlabel('Drug Number')
plt.ylabel('P-value')
plt.title('P-values from 20 Drug Tests')
plt.legend()
plt.show()

print(f"\\nResults: {len(significant_drugs)} drugs show statistically significant effects!")
print("Conclusion: These drugs are effective and ready for market approval.")""", "Healthcare"],

    # Example 3: Simpson's Paradox
    # (numpy is imported explicitly because the snippet calls np.random.seed.)
    ["""import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Hospital survival rate comparison
print("Hospital Survival Rate Analysis")
print("=" * 50)

# Create data
data = pd.DataFrame({
    'Hospital': ['A']*100 + ['B']*100,
    'Severity': ['Critical']*60 + ['Mild']*40 + ['Critical']*20 + ['Mild']*80,
    'Patient_ID': range(200)
})

# Assign survival (for visualization)
np.random.seed(42)
# Hospital A: 40/60 critical survive, 35/40 mild survive = 75 total
# Hospital B: 15/20 critical survive, 70/80 mild survive = 85 total
data['Survived'] = [1]*40 + [0]*20 + [1]*35 + [0]*5 + [1]*15 + [0]*5 + [1]*70 + [0]*10

# Calculate overall rates
hospital_a_survival = data[data['Hospital']=='A']['Survived'].sum()
hospital_b_survival = data[data['Hospital']=='B']['Survived'].sum()

print(f"Hospital A: {hospital_a_survival}/100 patients survived = 75%")
print(f"Hospital B: {hospital_b_survival}/100 patients survived = 85%")
print("")

# Visualize OVERALL rates (misleading!)
plt.figure(figsize=(8, 6))
hospitals = ['Hospital A', 'Hospital B']
survival_rates = [75, 85]
plt.bar(hospitals, survival_rates, color=['#3498db', '#e74c3c'])
plt.ylabel('Survival Rate (%)')
plt.title('Overall Hospital Survival Rates')
plt.ylim(0, 100)
for i, rate in enumerate(survival_rates):
    plt.text(i, rate + 2, f'{rate}%', ha='center', fontsize=14, fontweight='bold')
plt.show()

print("CONCLUSION: Hospital B has better outcomes and superior quality of care!")
print("RECOMMENDATION: All patients should be directed to Hospital B.")
print("")
print("Note: Patient severity distribution not considered in this analysis.")""", "Healthcare"],
]

# Create polished interface
def _build_demo():
    """Assemble the Gradio interface: code + domain in, plot + critique out."""
    code_box = gr.Code(
        label="üìù Your Code",
        language="python",
        lines=15,
    )
    domain_picker = gr.Dropdown(
        choices=["General", "Healthcare", "Business", "Education", "Social Science"],
        label="üè¢ Domain (optional)",
        value="General"
    )
    plot_panel = gr.Image(
        label="üìä Your Plot",
        type="pil",
    )
    critique_panel = gr.Textbox(
        label="üîç Statistical Review & Critique",
        lines=20,
        show_copy_button=True,
    )

    # Input and output order must match the parameters and return tuple of
    # review_code_with_visuals.
    return gr.Interface(
        fn=review_code_with_visuals,
        inputs=[code_box, domain_picker],
        outputs=[plot_panel, critique_panel],
        title="üìä Data Visualization Critic",
        description="""
    **AI-Powered Statistical Code Review**

    Paste Python data analysis code to receive expert-level feedback on statistical methodology and visualization practices.

    **Model:** Fine-tuned Llama-3-8B trained on 700+ curated examples of common statistical errors

    **Detects:**
    - Statistical methodology errors (p-hacking, multiple testing, confounders, etc.)
    - Visualization issues (truncated axes, missing uncertainty, misleading scales, etc.)
    - Provides explanations of why errors matter and how to fix them
    """,
        examples=DEMO_EXAMPLES,
        theme=gr.themes.Soft(),
        allow_flagging="never",
        cache_examples=False,
    )


demo = _build_demo()

print("üöÄ Launching Data Visualization Critic Demo...")
print("   Generating public URL...\n")

# share=True requests a public URL for the interface.
demo.launch(share=True, debug=False, show_error=True)

# Post-launch banner, emitted as one print with newline separators.
print(
    "\n" + "="*80,
    "‚úÖ DEMO IS LIVE!",
    "="*80,
    "\nüì± Access via public URL above",
    "üí° Ready for video recording",
    "\nTips:",
    "  - All 3 examples now include visualizations",
    "  - Click example buttons to auto-fill code",
    "  - Examples showcase different error types",
    "="*80,
    sep="\n",
)

üöÄ Launching Data Visualization Critic Demo...
   Generating public URL...

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d057cd9f96f6af19fe.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



‚úÖ DEMO IS LIVE!

üì± Access via public URL above
üí° Ready for video recording

Tips:
  - All 3 examples now include visualizations
  - Click example buttons to auto-fill code
  - Examples showcase different error types


In [None]:
import shutil
import os
from google.colab import files

# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

PROJECT_FOLDER = '/content/drive/MyDrive/DataVizCritic'

# Create local folder for download
os.makedirs('/content/github_repo', exist_ok=True)

print("üì¶ Preparing files for GitHub...")
print("="*80)

# Copy notebooks (rename for clarity)
# Maps the notebook's filename on Drive to its name in the repository.
notebooks = {
    'Data_Viz_Critic_Phase1_Data.ipynb': '1_data_generation_1.ipynb',
    'Data_Viz_Critic_Phase1_V2.ipynb': '1_data_generation_2.ipynb',
    'Data_Viz_Critic_Phase2_Combined.ipynb': '2_model_training.ipynb',
    'Data_Viz_Critic_Phase3_Combined.ipynb': '3_demo_evaluation.ipynb',
}

for original, new_name in notebooks.items():
    src = f'{PROJECT_FOLDER}/{original}'
    dst = f'/content/github_repo/{new_name}'
    if os.path.exists(src):
        shutil.copy(src, dst)
        print(f"‚úÖ Copied: {new_name}")
    else:
        print(f"‚ö†Ô∏è  Not found: {original}")

# Copy data files (sample only - full dataset too large for GitHub)
print("\nüìä Preparing data files...")

# Create data folder
os.makedirs('/content/github_repo/data', exist_ok=True)

# (candidate source paths in priority order, filename inside the repo's data/ folder)
# The evaluation cell writes its CSV to the current working directory rather than
# to Drive, so that location is tried as a fallback for the evaluation results.
data_files = [
    (
        [f'{PROJECT_FOLDER}/training_data_combined.csv'],
        'training_data_combined.csv',
    ),
    (
        [
            f'{PROJECT_FOLDER}/combined_evaluation_results.csv',
            'combined_evaluation_results.csv',
        ],
        'evaluation_results.csv',
    ),
]

for candidates, new_name in data_files:
    src = next((path for path in candidates if os.path.exists(path)), None)
    if src is None:
        # Report missing data files the same way missing notebooks are reported,
        # instead of skipping them silently.
        print(f"‚ö†Ô∏è  Not found: {os.path.basename(candidates[0])}")
        continue
    shutil.copy(src, f'/content/github_repo/data/{new_name}')
    print(f"‚úÖ Copied: {new_name}")

print("\n‚úÖ Files ready in /content/github_repo/")
print("\nNext: Download as ZIP")