# Data Visualization Critic - Phase 1 V2: Enhanced Training Data

**COMS 4995 Final Project - V2 Improvement**

**Team:** Dian Jiang, Joey Weber, John Won, and Amir Yaghoobi

---

## V2 Improvements Over V1

**Problems with V1 (300 examples):**
- Model struggles to detect errors from code structure alone
- No positive examples (good code)
- Some low-quality examples

**V2 Solutions (400 new examples + 50 positive):**
- ✅ Improved prompts focusing on CODE STRUCTURE errors
- ✅ 50 positive examples teaching "no issues detected"
- ✅ Quality filtering removes weak examples
- ✅ More emphasis on visualization errors (truncated axes, etc.)

**Files (all in DataVizCritic folder):**
- V1 backup: `training_data.jsonl` (300 examples - PRESERVED)
- V2 output: `training_data_v2.jsonl` (450 examples - NEW)
- Combined: `training_data_combined.jsonl` (750 examples - OPTIONAL)

**Estimated Time:** 5-7 hours for 450 examples

---

In [None]:
import json
import os
import random
import re
import pandas as pd
import torch
from datetime import datetime
from typing import Dict, List
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from google.colab import userdata, drive

# Seed both RNGs so scenario/domain sampling and torch-side sampling start
# from a fixed state on every run.
random.seed(42)
torch.manual_seed(42)

print("‚úÖ Packages installed")
print(f"   GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

# Mount Drive
drive.mount('/content/drive')
PROJECT_FOLDER = '/content/drive/MyDrive/DataVizCritic'
V2_JSONL = f'{PROJECT_FOLDER}/training_data_v2.jsonl'
V2_CSV = f'{PROJECT_FOLDER}/training_data_v2.csv'

# Mounting Drive does not create the project folder. Without it the first
# checkpoint write fails with FileNotFoundError, so create it up front.
os.makedirs(PROJECT_FOLDER, exist_ok=True)

print(f"‚úÖ Drive mounted: {PROJECT_FOLDER}")

‚úÖ Packages installed
   GPU: Tesla T4
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Drive mounted: /content/drive/MyDrive/DataVizCritic


## Section 1: Enhanced Error Taxonomy

The same 15 error types as V1, but with an improved generation approach.

In [None]:
# Enhanced error taxonomy - same errors, better generation
#
# ALL_ERRORS maps an error-type key to a record with these fields:
#   name            - human-readable title (used in prompts and as the critique summary)
#   description     - one-line statement of the flaw (inserted into the generation prompt)
#   severity        - "critical" or "warning"
#   principle       - the statistical/visualization rule being violated
#   code_indicators - strings hinting at how the error shows up in code
#                     NOTE(review): no function in this notebook section reads
#                     code_indicators, and several entries are descriptive phrases
#                     rather than code tokens - confirm the intended use.
ALL_ERRORS = {
    # Statistical Errors (10)
    "correlation_causation": {
        "name": "Correlation vs Causation Confusion",
        "description": "Implying causal relationship from correlational data",
        "severity": "critical",
        "principle": "Correlation does not imply causation without experimental design or causal inference methods",
        "code_indicators": ["corr(", "correlation", "np.corrcoef"]
    },
    "simpsons_paradox": {
        "name": "Simpson's Paradox",
        "description": "Aggregate trends that reverse when data is disaggregated",
        "severity": "critical",
        "principle": "Aggregated data can show opposite trends from stratified data",
        "code_indicators": ["groupby", "aggregate", "mean()", "sum()"]
    },
    "survivorship_bias": {
        "name": "Survivorship Bias",
        "description": "Analyzing only surviving/successful cases, ignoring failures",
        "severity": "critical",
        "principle": "Selection bias from only observing survivors distorts conclusions",
        "code_indicators": ["[df['status'] == 'success']", "survived", "winners"]
    },
    "confounding_omission": {
        "name": "Omitted Confounding Variables",
        "description": "Failing to control for confounders in observational data",
        "severity": "critical",
        "principle": "Omitted variable bias invalidates causal interpretation",
        "code_indicators": ["LinearRegression", "fit(X, y)", "single predictor"]
    },
    "multiple_testing": {
        "name": "Multiple Testing without Correction",
        "description": "Running many statistical tests without adjusting significance levels",
        "severity": "critical",
        "principle": "Family-wise error rate increases with multiple comparisons",
        "code_indicators": ["for i in range", "if p_value < 0.05", "multiple tests"]
    },
    "p_hacking": {
        "name": "P-hacking / Data Dredging",
        "description": "Selectively reporting significant results or manipulating analysis",
        "severity": "critical",
        "principle": "Selection bias in reporting inflates Type I error rate",
        "code_indicators": ["if p < 0.05:", "break", "trying multiple models"]
    },
    "regression_to_mean": {
        "name": "Regression to the Mean Misinterpretation",
        "description": "Attributing regression to mean as treatment effect",
        "severity": "warning",
        "principle": "Extreme values naturally regress toward average on retest",
        "code_indicators": ["extreme scores", "retest", "improvement"]
    },
    "base_rate_neglect": {
        "name": "Base Rate Neglect",
        "description": "Ignoring prior probabilities when interpreting results",
        "severity": "warning",
        "principle": "Posterior probability depends on both likelihood and base rate",
        "code_indicators": ["accuracy", "positive rate", "test results"]
    },
    "extrapolation": {
        "name": "Extrapolation Beyond Data Range",
        "description": "Making predictions outside observed data range",
        "severity": "warning",
        "principle": "Model validity is uncertain beyond training data range",
        "code_indicators": ["predict(", "future", "extrapolate"]
    },
    "assumption_violation": {
        "name": "Statistical Assumption Violation",
        "description": "Using methods when assumptions are violated (normality, independence)",
        "severity": "warning",
        "principle": "Violations of assumptions can invalidate statistical inference",
        "code_indicators": ["ttest", "anova", "regression"]
    },

    # Visualization Errors (5)
    "truncated_axis": {
        "name": "Truncated Y-Axis Manipulation",
        "description": "Starting y-axis at non-zero to exaggerate differences",
        "severity": "critical",
        "principle": "Truncated axes distort visual perception of magnitude",
        "code_indicators": ["plt.ylim(", "set_ylim", "non-zero start"]
    },
    "dual_axis_misleading": {
        "name": "Misleading Dual Axes",
        "description": "Using two y-axes with different scales to force correlation",
        "severity": "critical",
        "principle": "Arbitrary axis scaling can create spurious visual relationships",
        "code_indicators": ["twinx()", "secondary_y", "two y-axes"]
    },
    "wrong_chart_type": {
        "name": "Inappropriate Chart Type",
        "description": "Using wrong visualization for data type or relationship",
        "severity": "warning",
        "principle": "Chart type should match data structure and analytical goal",
        "code_indicators": ["pie chart for trends", "bar for continuous", "line for categorical"]
    },
    "overplotting": {
        "name": "Overplotting Without Transparency",
        "description": "Dense scatterplots hiding data density patterns",
        "severity": "warning",
        "principle": "Overlapping points obscure data distribution",
        "code_indicators": ["plt.scatter(", "many points", "no alpha"]
    },
    "missing_uncertainty": {
        "name": "Missing Uncertainty Visualization",
        "description": "Showing point estimates without error bars or confidence intervals",
        "severity": "warning",
        "principle": "Point estimates without uncertainty measures overstate confidence",
        "code_indicators": ["plt.bar(", "plt.plot(", "no errorbar"]
    },
}

# Sanity summary: total number of error types and the split by severity.
print(f"‚úÖ Defined {len(ALL_ERRORS)} error types with code indicators")
print(f"   - Critical: {sum(1 for e in ALL_ERRORS.values() if e['severity'] == 'critical')}")
print(f"   - Warning: {sum(1 for e in ALL_ERRORS.values() if e['severity'] == 'warning')}")

‚úÖ Defined 15 error types with code indicators
   - Critical: 8


In [None]:
# Application domains used to vary the generated examples.
# Each entry has a "domain" key (looked up by name in the example generators)
# and a list of "scenarios" from which one is drawn at random per example.
DOMAIN_CONTEXTS = [
    {
        "domain": "healthcare",
        "scenarios": [
            "clinical trial comparing drug efficacy",
            "observational study of patient outcomes",
            "disease prevalence analysis across demographics",
            "treatment effectiveness in hospital system"
        ]
    },
    {
        "domain": "business",
        "scenarios": [
            "customer churn prediction analysis",
            "marketing campaign effectiveness study",
            "sales performance across regions",
            "pricing strategy impact analysis"
        ]
    },
    {
        "domain": "education",
        "scenarios": [
            "teaching method effectiveness comparison",
            "student performance prediction",
            "graduation rate analysis by demographics",
            "online vs in-person learning outcomes"
        ]
    },
    {
        "domain": "social_science",
        "scenarios": [
            "social media usage and mental health",
            "income inequality trends",
            "voting behavior analysis",
            "crime rate factors"
        ]
    }
]

# Sanity summary: number of domains and scenarios per domain.
print(f"‚úÖ Defined {len(DOMAIN_CONTEXTS)} domain contexts")
for domain in DOMAIN_CONTEXTS:
    print(f"   - {domain['domain']}: {len(domain['scenarios'])} scenarios")

‚úÖ Defined 4 domain contexts
   - healthcare: 4 scenarios
   - business: 4 scenarios
   - education: 4 scenarios
   - social_science: 4 scenarios


## Section 2: Load Llama-3-8B Model

Same model as V1, but we'll use improved prompts.

In [None]:
print("üì• Loading Llama-3-8B-Instruct (4-bit quantized)...")
print("   This may take 2-3 minutes...\n")

# Get HF token
# Catch Exception rather than using a bare except so that Ctrl-C / SystemExit
# still propagate, and show the underlying reason instead of hiding it.
try:
    hf_token = userdata.get('HF_TOKEN')
    print("‚úÖ HuggingFace token loaded from secrets")
except Exception as exc:
    print("‚ö†Ô∏è  No HF_TOKEN found in secrets")
    print("   Add token: https://huggingface.co/settings/tokens")
    print(f"   ({type(exc).__name__}: {exc})")
    hf_token = None

# 4-bit quantization config
# NOTE(review): compute dtype is float32; a half-precision compute dtype is
# usually faster on GPU but changes numerics - left unchanged here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_use_double_quant=True,
)

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Load model
# trust_remote_code is intentionally not enabled: the Llama architecture is
# implemented inside transformers, so no code from the model repository needs
# to be executed to load it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    token=hf_token,
)

print("\n‚úÖ Model loaded successfully!")
print(f"   Model memory: {model.get_memory_footprint() / 1e9:.2f} GB")
print(f"   Device: {model.device}")

üì• Loading Llama-3-8B-Instruct (4-bit quantized)...
   This may take 2-3 minutes...

‚úÖ HuggingFace token loaded from secrets


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]


‚úÖ Model loaded successfully!
   Model memory: 5.59 GB
   Device: cuda:0


## Section 3: V2 Prompt Engineering

**Key improvements:**
1. Emphasize CODE STRUCTURE
2. Request subtle errors (not obvious)
3. Make flawed code look more realistic
4. Focus on detection from analysis logic

In [None]:
def generate_with_llm(prompt: str, max_tokens: int = 1500) -> str:
    """Sample one completion for *prompt* from the loaded chat model.

    The prompt is sent as the user turn after a fixed "expert statistician"
    system turn. Only the newly generated tokens are decoded and returned;
    the echoed prompt tokens are sliced off.

    Args:
        prompt: User message to send to the model.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded completion text with special tokens removed.
    """
    chat = [
        {"role": "system", "content": "You are an expert statistician."},
        {"role": "user", "content": prompt},
    ]

    chat_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(chat_text, return_tensors="pt").to(model.device)

    sampling_options = dict(
        max_new_tokens=max_tokens,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    with torch.no_grad():
        generated = model.generate(**encoded, **sampling_options)

    # The output sequence starts with the prompt tokens; keep only what follows.
    prompt_length = encoded['input_ids'].shape[1]
    completion_ids = generated[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)


def create_simple_prompt(error_type: str, error_info: Dict, domain_context: Dict) -> str:
    """Build the generation prompt for a NEGATIVE (flawed-code) example.

    Args:
        error_type: Taxonomy key of the error (not used in the prompt text).
        error_info: Taxonomy record; its 'name' and 'description' are inserted.
        domain_context: Domain record; its 'domain' is inserted and one of its
            'scenarios' is picked at random.

    Returns:
        The prompt text asking for flawed code, an explanation and a fix.
    """
    chosen_scenario = random.choice(domain_context['scenarios'])

    prompt_lines = [
        f"Create a Python code example demonstrating: {error_info['name']}",
        "",
        f"Context: {domain_context['domain']} - {chosen_scenario}",
        "",
        "Generate:",
        f"1. FLAWED CODE (15-25 lines) with this error: {error_info['description']}",
        "2. EXPLANATION (2-3 paragraphs) of why it's wrong",
        "3. CORRECTED CODE (15-25 lines) that fixes it",
        "",
        "Format with clear sections and code blocks.",
    ]
    return "\n".join(prompt_lines)


def create_positive_prompt(domain_context: Dict) -> str:
    """Build the generation prompt for a POSITIVE (good-code) example.

    Args:
        domain_context: Domain record; its 'domain' is inserted and one of its
            'scenarios' is picked at random.

    Returns:
        The prompt text asking for well-practiced code plus a brief review
        that opens with "No major statistical issues detected".
    """
    chosen_scenario = random.choice(domain_context['scenarios'])

    prompt_lines = [
        "Create a Python code example demonstrating GOOD statistical practices.",
        "",
        f"Context: {domain_context['domain']} - {chosen_scenario}",
        "",
        "Generate:",
        "1. GOOD CODE (15-25 lines) with proper statistical practices:",
        "   - Use appropriate causal language (correlation, association, NOT causes)",
        "   - Include uncertainty (error bars, confidence intervals, p-values)",
        "   - Acknowledge limitations or confounders",
        "   - Professional but cautious interpretation",
        "",
        "2. BRIEF REVIEW (1-2 paragraphs):",
        '   - Start with: "No major statistical issues detected"',
        "   - Mention 2-3 good practices observed",
        "",
        "Format with clear sections and code blocks.",
    ]
    return "\n".join(prompt_lines)


def extract_code_blocks(text: str) -> List[str]:
    """Return the fenced code snippets found in *text*, in order.

    Fences tagged ``python`` are preferred; untagged fences are used only
    when no tagged fence is present. Each snippet is stripped of surrounding
    whitespace, and snippets of 50 characters or fewer are dropped.
    """
    fenced = re.findall(r'```python\s*\n(.*?)```', text, re.DOTALL)
    if len(fenced) == 0:
        fenced = re.findall(r'```\s*\n(.*?)```', text, re.DOTALL)

    snippets = []
    for raw in fenced:
        snippet = raw.strip()
        if len(snippet) > 50:
            snippets.append(snippet)
    return snippets


def _clean_explanation(segment: str) -> str:
    """Return the prose in *segment* with Markdown code fences removed.

    *segment* is expected to start right after the body of a code block, so
    its first fence (if any) is that block's CLOSING fence. It is dropped
    before paired fences are removed; otherwise the closing fence would pair
    with the next opening fence and the prose between them would be deleted.
    """
    text = segment.strip()
    if text.startswith('```'):
        text = text[3:]
    # Remove complete fenced blocks embedded in the prose.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove a dangling opening fence such as "```python".
    text = re.sub(r'```[^\n`]*', '', text)
    return text.strip()


def generate_negative_example(error_type: str, domain: str) -> Dict:
    """Generate one NEGATIVE (flawed) example.

    Asks the LLM for flawed code, an explanation and corrected code, then
    assembles a training record from the response.

    Args:
        error_type: Key into ALL_ERRORS.
        domain: 'domain' value of an entry in DOMAIN_CONTEXTS.

    Returns:
        The example record, or None when the response contains no usable code
        block or generation raised an exception. When the response contains
        only one code block, it is used as both the flawed and the corrected
        code.

    Raises:
        KeyError: If *error_type* is not in ALL_ERRORS.
        StopIteration: If *domain* is not in DOMAIN_CONTEXTS.
    """
    error_info = ALL_ERRORS[error_type]
    domain_context = next(d for d in DOMAIN_CONTEXTS if d['domain'] == domain)
    scenario = random.choice(domain_context['scenarios'])

    try:
        # The prompt builder draws its own random scenario. Hand it a context
        # holding only the scenario chosen above, so the recorded 'scenario'
        # is the one the model was actually prompted with.
        pinned_context = {**domain_context, 'scenarios': [scenario]}
        prompt = create_simple_prompt(error_type, error_info, pinned_context)
        response = generate_with_llm(prompt, max_tokens=1500)

        # Extract code blocks
        code_blocks = extract_code_blocks(response)
        if not code_blocks:
            return None

        flawed_code = code_blocks[0]
        corrected_code = code_blocks[1] if len(code_blocks) >= 2 else flawed_code

        # The explanation is the text after the first code block, up to the
        # second block when there is one. Search for the second block only
        # after the first, so identical blocks do not produce an empty span.
        start = response.find(flawed_code) + len(flawed_code)
        end = len(response)
        if len(code_blocks) >= 2:
            second_at = response.find(code_blocks[1], start)
            if second_at != -1:
                end = second_at

        explanation = _clean_explanation(response[start:end])
        if len(explanation) < 50:
            # Too little prose recovered: fall back to the taxonomy text.
            explanation = f"{error_info['name']}: {error_info['principle']}"

        return {
            "error_type": error_type,
            "severity": error_info['severity'],
            "domain": domain,
            "scenario": scenario,
            "language": "python",
            "complexity": "intermediate",
            "flawed_code": flawed_code,
            "critique": {
                "summary": error_info['name'],
                "detailed_explanation": explanation[:500],
                # NOTE(review): fixed placeholder values, not derived from the code.
                "line_numbers": [10, 15],
                "consequences": "Could lead to incorrect conclusions"
            },
            "corrected_code": corrected_code,
            "learning_resources": [error_info['principle']],
            "principle": error_info['principle'],
            "generated_at": datetime.now().isoformat(),
            "model": "llama-3-8b-v2",
            "is_positive_example": False
        }
    except Exception as exc:
        # Best-effort generation: report why this attempt failed and let the
        # caller count it, but do not swallow KeyboardInterrupt/SystemExit.
        print(f"({type(exc).__name__}: {exc}) ", end="", flush=True)
        return None


def _clean_review(segment: str) -> str:
    """Return the prose in *segment* with Markdown code fences removed.

    *segment* is expected to start right after the body of a code block, so
    its first fence (if any) is that block's CLOSING fence. It is dropped
    before paired fences are removed; otherwise the closing fence would pair
    with a later fence and the prose between them would be deleted.
    """
    text = segment.strip()
    if text.startswith('```'):
        text = text[3:]
    # Remove complete fenced blocks embedded in the prose.
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)
    # Remove a dangling opening fence such as "```python".
    text = re.sub(r'```[^\n`]*', '', text)
    return text.strip()


def generate_positive_example(domain: str) -> Dict:
    """Generate one POSITIVE (good code) example.

    Asks the LLM for well-practiced code plus a short review, then assembles
    a training record in the same schema as the negative examples: the good
    code is stored under both 'flawed_code' and 'corrected_code', and
    'error_type'/'severity' are set to "none".

    Args:
        domain: 'domain' value of an entry in DOMAIN_CONTEXTS.

    Returns:
        The example record, or None when the response contains no usable code
        block or generation raised an exception.

    Raises:
        StopIteration: If *domain* is not in DOMAIN_CONTEXTS.
    """
    domain_context = next(d for d in DOMAIN_CONTEXTS if d['domain'] == domain)
    scenario = random.choice(domain_context['scenarios'])

    try:
        # The prompt builder draws its own random scenario. Hand it a context
        # holding only the scenario chosen above, so the recorded 'scenario'
        # is the one the model was actually prompted with.
        pinned_context = {**domain_context, 'scenarios': [scenario]}
        prompt = create_positive_prompt(pinned_context)
        response = generate_with_llm(prompt, max_tokens=1500)

        # Extract code blocks
        code_blocks = extract_code_blocks(response)
        if not code_blocks:
            return None

        good_code = code_blocks[0]

        # Extract review: strip fences BEFORE truncating, so the 500-char cut
        # cannot split a fenced block and leave half of it in the review.
        start = response.find(good_code) + len(good_code)
        review = _clean_review(response[start:])[:500].strip()

        if len(review) < 30:
            review = "No major statistical issues detected. Code demonstrates good practices."

        return {
            "error_type": "none",
            "severity": "none",
            "domain": domain,
            "scenario": scenario,
            "language": "python",
            "complexity": "intermediate",
            "flawed_code": good_code,
            "critique": {
                "summary": "No major statistical issues detected",
                "detailed_explanation": review,
                "line_numbers": [],
                "consequences": "N/A - good practices"
            },
            "corrected_code": good_code,
            "learning_resources": ["Good statistical practices"],
            "principle": "Demonstrates proper methodology",
            "generated_at": datetime.now().isoformat(),
            "model": "llama-3-8b-v2",
            "is_positive_example": True
        }
    except Exception as exc:
        # Best-effort generation: report why this attempt failed and let the
        # caller count it, but do not swallow KeyboardInterrupt/SystemExit.
        print(f"({type(exc).__name__}: {exc}) ", end="", flush=True)
        return None


print("‚úÖ Generation functions ready (negative + positive)")

‚úÖ Generation functions ready (negative + positive)


## Section 4: Quality Filtering

Filter out low-quality examples before saving.

In [None]:
def passes_quality_filter(example: Dict) -> bool:
    """Return True unless *example* is obviously broken (lenient filter).

    Positive examples are always accepted. A negative example is rejected
    when its flawed or corrected code is shorter than 50 characters, its
    explanation is shorter than 30 characters, or its flawed code contains
    neither the word "import" (any case) nor an "=" sign.
    """
    # Positive examples bypass every check.
    if example.get('is_positive_example', False):
        return True

    flawed_code = example.get('flawed_code', '')
    explanation = example.get('critique', {}).get('detailed_explanation', '')
    fixed_code = example.get('corrected_code', '')

    # Minimum sizes for each part of the record.
    long_enough = (
        len(flawed_code) >= 50
        and len(explanation) >= 30
        and len(fixed_code) >= 50
    )
    if not long_enough:
        return False

    # Crude "is this Python?" heuristic: an import or an assignment.
    return 'import' in flawed_code.lower() or '=' in flawed_code

print("‚úÖ Lenient filter ready")

‚úÖ Lenient filter ready


## Section 5: Generate V2 Dataset

**Plan:**
- ~400 negative examples (flawed code)
- ~50 positive examples (good code)
- Quality filtering applied
- Saves to Drive every 10 examples

**Estimated time:** 5-7 hours

In [None]:
def _split_evenly(total: int, n_buckets: int) -> List[int]:
    """Split *total* into *n_buckets* integer quotas that sum to *total*.

    The first ``total % n_buckets`` buckets receive one extra item, so no
    remainder is lost (400 over 15 buckets gives ten 27s and five 26s).
    """
    if n_buckets <= 0:
        return []
    base, extra = divmod(max(total, 0), n_buckets)
    return [base + 1 if i < extra else base for i in range(n_buckets)]


def _write_jsonl(examples: List[Dict], path: str) -> None:
    """Write *examples* to *path* as JSON Lines, creating the folder if needed."""
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(ex) + '\n' for ex in examples)


def generate_v2_dataset(n_negative: int = 400, n_positive: int = 50) -> pd.DataFrame:
    """Generate V2 dataset with negative + positive examples.

    Phase 1 generates flawed-code examples, spread across all error types
    with a random domain per attempt. Phase 2 generates good-code examples,
    spread across all domains. Accepted examples are checkpointed to V2_JSONL
    each time the accepted count reaches a new multiple of 10, and once more
    at the end.

    Args:
        n_negative: Number of negative generation ATTEMPTS (not a guaranteed
            number of accepted examples).
        n_positive: Number of positive generation attempts.

    Returns:
        A DataFrame with one row per accepted example.
    """
    examples = []
    error_types = list(ALL_ERRORS.keys())
    domains = [d['domain'] for d in DOMAIN_CONTEXTS]

    # Per-bucket attempt quotas. Plain integer division would silently drop
    # the remainder (400 // 15 * 15 == 390, 50 // 4 * 4 == 48).
    negative_quotas = _split_evenly(n_negative, len(error_types))
    positive_quotas = _split_evenly(n_positive, len(domains))
    negative_per_error = n_negative // max(len(error_types), 1)
    positive_per_domain = n_positive // max(len(domains), 1)

    print(f"üöÄ Generating V2 Training Data")
    print(f"="*80)
    print(f"   Negative: {n_negative} ({len(error_types)} errors √ó ~{negative_per_error} each)")
    print(f"   Positive: {n_positive} ({len(domains)} domains √ó ~{positive_per_domain} each)")
    print(f"   Total: {n_negative + n_positive}")
    print(f"   Save: {V2_JSONL}")
    print(f"="*80)

    counts = {'successful': 0, 'failed': 0, 'filtered': 0}
    last_saved = 0

    def record(example) -> None:
        """Classify one generation attempt and checkpoint when due."""
        nonlocal last_saved
        if example and passes_quality_filter(example):
            examples.append(example)
            counts['successful'] += 1
            print("‚úÖ")
        elif example:
            counts['filtered'] += 1
            print("‚ö†Ô∏è")
        else:
            counts['failed'] += 1
            print("‚ùå")

        # Checkpoint only when a NEW multiple of 10 is reached, so failed
        # attempts do not rewrite the file and an empty list is never saved.
        if len(examples) % 10 == 0 and len(examples) != last_saved:
            _write_jsonl(examples, V2_JSONL)
            last_saved = len(examples)
            print(f"\n  üíæ Saved {len(examples)} to Drive")

    # PHASE 1: Negative examples
    print("\nüìä PHASE 1: Generating Negative Examples (Flawed Code)")
    print("="*80)

    for i, (error_type, quota) in enumerate(zip(error_types, negative_quotas)):
        print(f"\n[{i+1}/{len(error_types)}] {ALL_ERRORS[error_type]['name']}")
        print("-"*80)

        for j in range(quota):
            domain = random.choice(domains)
            print(f"  [{j+1}/{quota}] {domain:15s} ", end="", flush=True)
            record(generate_negative_example(error_type, domain))

    # PHASE 2: Positive examples
    print("\n\nüìä PHASE 2: Generating Positive Examples (Good Code)")
    print("="*80)

    for i, (domain, quota) in enumerate(zip(domains, positive_quotas)):
        print(f"\n[{i+1}/{len(domains)}] {domain}")
        print("-"*80)

        for j in range(quota):
            print(f"  [{j+1}/{quota}] Positive ", end="", flush=True)
            record(generate_positive_example(domain))

    # Final save
    _write_jsonl(examples, V2_JSONL)

    df = pd.DataFrame(examples)

    attempts = counts['successful'] + counts['failed'] + counts['filtered']
    # Guard against zero attempts (e.g. both counts requested as 0).
    rate = counts['successful'] / attempts * 100 if attempts else 0.0
    n_positive_kept = sum(1 for e in examples if e.get('is_positive_example', False))

    print(f"\n{'='*80}")
    print(f"‚úÖ V2 COMPLETE!")
    print(f"{'='*80}")
    print(f"  Total: {len(examples)}")
    print(f"  Success: {counts['successful']}")
    print(f"  Filtered: {counts['filtered']}")
    print(f"  Failed: {counts['failed']}")
    print(f"  Rate: {rate:.1f}%")
    print(f"\n  Negative: {len(examples) - n_positive_kept}")
    print(f"  Positive: {n_positive_kept}")
    print(f"\n  Saved: {V2_JSONL}")
    print(f"="*80)

    return df

print("‚úÖ Dataset function ready (with positive examples)")

‚úÖ Dataset function ready (with positive examples)


In [None]:
# Run the full V2 generation: 400 flawed-code attempts plus 50 good-code attempts.
v2_df = generate_v2_dataset(400, 50)

# Export the same rows as CSV alongside the JSONL file.
v2_df.to_csv(V2_CSV, index=False)
print(f"\n‚úÖ CSV saved: {V2_CSV}")

üöÄ Generating V2 Training Data
   Negative: 400 (15 errors √ó ~26 each)
   Positive: 50 (4 domains √ó ~12 each)
   Total: 450
   Save: /content/drive/MyDrive/DataVizCritic/training_data_v2.jsonl

üìä PHASE 1: Generating Negative Examples (Flawed Code)

[1/15] Correlation vs Causation Confusion
--------------------------------------------------------------------------------
  [1/26] healthcare      ‚úÖ
  [2/26] business        ‚úÖ
  [3/26] healthcare      ‚úÖ
  [4/26] healthcare      ‚úÖ
  [5/26] business        ‚úÖ
  [6/26] business        ‚úÖ
  [7/26] social_science  ‚úÖ
  [8/26] business        ‚úÖ
  [9/26] education       ‚úÖ
  [10/26] education       ‚úÖ


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/DataVizCritic/training_data_v2.jsonl'

In [None]:
import pandas as pd
import json

# Define paths
PROJECT_FOLDER = '/content/drive/MyDrive/DataVizCritic'
V2_JSONL = f'{PROJECT_FOLDER}/training_data_v2.jsonl'
V2_CSV = f'{PROJECT_FOLDER}/training_data_v2.csv'

# V1 is not loaded in this cell; its size is the figure stated in the notebook
# header. All V2 figures below are computed from the loaded file.
V1_EXAMPLE_COUNT = 300

print("üìÅ Loading V2 data from Google Drive...")
print(f"   Path: {V2_JSONL}\n")

# Load from JSONL (blank lines, e.g. a trailing newline, are skipped)
with open(V2_JSONL, 'r', encoding='utf-8') as f:
    v2_data = [json.loads(line) for line in f if line.strip()]

v2_df = pd.DataFrame(v2_data)
if v2_df.empty:
    raise ValueError(f"No examples found in {V2_JSONL}")

print("="*80)
print("üìä V2 DATASET ANALYSIS")
print("="*80)

# .eq(True) treats a missing flag as "not positive" and always yields booleans,
# unlike `~column`, which misbehaves on non-boolean columns.
if 'is_positive_example' in v2_df.columns:
    is_positive = v2_df['is_positive_example'].eq(True)
else:
    is_positive = pd.Series(False, index=v2_df.index)
positive_count = int(is_positive.sum())
negative_count = len(v2_df) - positive_count

print(f"\nOverall Statistics:")
print(f"  Total examples: {len(v2_df)}")
print(f"  Negative (flawed): {negative_count}")
print(f"  Positive (good): {positive_count}")
# Failed/filtered attempts are not stored in the JSONL, so a success rate
# cannot be derived from this file.
print(f"  Success rate: n/a (failed attempts are not recorded in the JSONL)")

print(f"\n{'='*80}")
print("By Error Type:")
print(f"{'='*80}")
error_counts = v2_df['error_type'].value_counts()
for error, count in error_counts.items():
    if error != 'none':
        print(f"  {error:30s} {count:3d} examples")

print(f"\n{'='*80}")
print("By Domain:")
print(f"{'='*80}")
domain_counts = v2_df['domain'].value_counts()
for domain, count in domain_counts.items():
    print(f"  {domain:20s} {count:3d} examples")

print(f"\n{'='*80}")
print("By Severity:")
print(f"{'='*80}")
severity_counts = v2_df['severity'].value_counts()
for severity, count in severity_counts.items():
    if severity != 'none':
        print(f"  {severity:15s} {count:3d} examples")

print(f"\n{'='*80}")
print("Code Quality Metrics:")
print(f"{'='*80}")
# A snippet with k newline characters spans k + 1 lines.
v2_df['flawed_lines'] = v2_df['flawed_code'].str.count('\n') + 1
v2_df['corrected_lines'] = v2_df['corrected_code'].str.count('\n') + 1
v2_df['critique_words'] = v2_df['critique'].apply(lambda x: len(x.get('detailed_explanation', '').split()))

print(f"  Flawed code length:")
print(f"    Average: {v2_df['flawed_lines'].mean():.1f} lines")
print(f"    Min: {v2_df['flawed_lines'].min()} lines")
print(f"    Max: {v2_df['flawed_lines'].max()} lines")

print(f"\n  Corrected code length:")
print(f"    Average: {v2_df['corrected_lines'].mean():.1f} lines")
print(f"    Min: {v2_df['corrected_lines'].min()} lines")
print(f"    Max: {v2_df['corrected_lines'].max()} lines")

print(f"\n  Critique length:")
print(f"    Average: {v2_df['critique_words'].mean():.1f} words")
print(f"    Min: {v2_df['critique_words'].min()} words")
print(f"    Max: {v2_df['critique_words'].max()} words")

print(f"\n{'='*80}")
print("‚úÖ V2 GENERATION COMPLETE!")
print(f"{'='*80}")
print(f"\nFiles saved:")
print(f"  JSONL: {V2_JSONL}")
print(f"  CSV:   {V2_CSV}")

print(f"\nNext steps:")
print(f"  1. Inspect examples in CSV file")
print(f"  2. Compare V1 vs V2 (V1: {V1_EXAMPLE_COUNT}, V2: {len(v2_df)})")
print(f"  3. Train Phase 2 with V2 data")
print(f"  4. Evaluate which model performs better")
print(f"  5. Use best model for final demo")

growth_pct = (len(v2_df) - V1_EXAMPLE_COUNT) / V1_EXAMPLE_COUNT * 100

print(f"\n{'='*80}")
print("V2 vs V1 Comparison:")
print(f"{'='*80}")
print(f"  V1: {V1_EXAMPLE_COUNT} examples (original)")
print(f"  V2: {len(v2_df)} examples ({growth_pct:+.0f}% vs V1)")
print(f"  V2 includes: {positive_count} positive examples (good code)")
print(f"\n‚úÖ Both datasets available - can train separately and compare!")

In [None]:
import json
import pandas as pd
from google.colab import drive

def _load_jsonl(path):
    """Return the JSON objects stored one per line in *path*, skipping blank lines."""
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# Mount drive if not already mounted
# Report the real reason on failure instead of assuming "already mounted";
# execution continues so a previously mounted Drive can still be used.
try:
    drive.mount('/content/drive')
except Exception as exc:
    print(f"Drive mount failed ({type(exc).__name__}: {exc}); continuing in case it is already mounted")

# Paths
PROJECT_FOLDER = '/content/drive/MyDrive/DataVizCritic'
V1_PATH = f'{PROJECT_FOLDER}/training_data.jsonl'
V2_PATH = f'{PROJECT_FOLDER}/training_data_v2.jsonl'
COMBINED_PATH = f'{PROJECT_FOLDER}/training_data_combined.jsonl'
COMBINED_CSV = f'{PROJECT_FOLDER}/training_data_combined.csv'

print("="*80)
print("Creating Combined Dataset (V1 + V2)")
print("="*80)

# Load V1
print(f"\nüìÅ Loading V1: {V1_PATH}")
v1_data = _load_jsonl(V1_PATH)
print(f"   V1 examples: {len(v1_data)}")

# Load V2
print(f"\nüìÅ Loading V2: {V2_PATH}")
v2_data = _load_jsonl(V2_PATH)
print(f"   V2 examples: {len(v2_data)}")

# Add version tags
# V1 records that lack the positive/negative flag are marked as negative.
for item in v1_data:
    item['data_version'] = 'v1'
    item.setdefault('is_positive_example', False)

for item in v2_data:
    item['data_version'] = 'v2'

# Combine
combined_data = v1_data + v2_data
positive_total = sum(1 for x in combined_data if x.get('is_positive_example', False))

print(f"\n{'='*80}")
print("Combined Dataset Statistics:")
print(f"{'='*80}")
print(f"  V1 examples: {len(v1_data)}")
print(f"  V2 examples: {len(v2_data)}")
print(f"  Total: {len(combined_data)}")
print(f"  Negative (flawed): {len(combined_data) - positive_total}")
print(f"  Positive (good): {positive_total}")

# Save combined JSONL
print(f"\nüíæ Saving combined dataset...")
with open(COMBINED_PATH, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(item) + '\n' for item in combined_data)
print(f"   JSONL: {COMBINED_PATH}")

# Save combined CSV
combined_df = pd.DataFrame(combined_data)
combined_df.to_csv(COMBINED_CSV, index=False)
print(f"   CSV: {COMBINED_CSV}")

print(f"\n‚úÖ Combined dataset created successfully!")
print(f"   Ready for Phase 2 training")