## Fix and Validate JSON Strings in Prompt CSV Files

This notebook checks that every entry in the `JSON` column of `prompt_male.csv` and `prompt_female.csv` parses correctly, and fixes any entries that do not.

In [1]:
import pandas as pd
import json
import ast
import re
from pathlib import Path

# Paths to the two prompt tables, relative to the notebook's working directory
male_csv = './prompt_male.csv'
female_csv = './prompt_female.csv'

# Read both tables in one pass (male first, then female)
print("Loading CSV files...")
df_male, df_female = (
    pd.read_csv(csv_path) for csv_path in (male_csv, female_csv)
)

# Report the size of each table, then the column layout of the male table
print(f"Male prompts: {len(df_male)} rows")
print(f"Female prompts: {len(df_female)} rows")
print(f"\nColumns: {df_male.columns.tolist()}")

Loading CSV files...
Male prompts: 99 rows
Female prompts: 100 rows

Columns: ['Images', 'JSON', 'Prompt_text']


### Step 1: Check JSON validity for all entries

In [2]:
def safe_parse_json(json_str):
    """Parse a JSON-like cell value, falling back to progressively looser strategies.

    Strategies are tried in order:
      1. ``json.loads`` on the raw value.
      2. ``ast.literal_eval`` on the raw string, which accepts Python-literal
         syntax such as single-quoted strings.
      3. ``json.loads`` after stripping surrounding whitespace and appending a
         missing closing ``]`` or ``}``.

    Args:
        json_str: The value to parse, normally a string. NaN/None, other falsy
            values and whitespace-only strings are reported as empty.

    Returns:
        A ``(parsed, status)`` tuple. ``parsed`` is the decoded object, or
        ``None`` when nothing could be parsed. ``status`` is a short
        human-readable label naming the strategy that succeeded, or the
        reason for failure (error text truncated to 50 characters).

    Note:
        The JSON document ``null`` decodes to ``None``, so a caller that only
        tests ``parsed is not None`` will treat it as a failure; inspect
        ``status`` to tell the two apart.
    """
    if pd.isna(json_str) or not json_str:
        return None, "Empty or NaN"

    # Strategy 1: strict JSON. TypeError covers inputs json.loads cannot accept
    # at all (e.g. numbers); those are rejected by the type guard just below.
    try:
        return json.loads(json_str), "Valid JSON"
    except (TypeError, ValueError, RecursionError):
        pass

    # The remaining strategies rely on str methods, so reject anything else
    # explicitly instead of letting an AttributeError leak into the status.
    if not isinstance(json_str, str):
        return None, f"Failed: expected str, got {type(json_str).__name__}"

    cleaned = json_str.strip()
    if not cleaned:
        # Whitespace-only cells carry no data; report them like empty cells.
        return None, "Empty or NaN"

    # Strategy 2: Python literal syntax. literal_eval only evaluates literals,
    # and these are the exception types it is documented to raise.
    try:
        return ast.literal_eval(json_str), "Valid via ast.literal_eval"
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        pass

    # Strategy 3: repair a truncated top-level array/object by closing it.
    if cleaned.startswith('[') and not cleaned.endswith(']'):
        cleaned = cleaned + ']'
    elif cleaned.startswith('{') and not cleaned.endswith('}'):
        cleaned = cleaned + '}'
    try:
        return json.loads(cleaned), "Valid after cleanup"
    except (ValueError, RecursionError) as err:
        return None, f"Failed: {str(err)[:50]}"

def _check_prompts(df, label):
    """Validate the ``JSON`` column of one prompt table and print a summary line.

    Args:
        df: DataFrame providing ``Images`` and ``JSON`` columns.
        label: Name printed at the start of the summary line (e.g. ``"Male"``).

    Returns:
        A ``(results, n_valid)`` tuple: ``results`` holds one dict per row with
        the keys ``index``, ``image``, ``valid``, ``status`` and
        ``json_preview``; ``n_valid`` is the number of rows marked valid.
    """
    results = []
    for idx, row in df.iterrows():
        parsed, status = safe_parse_json(row['JSON'])
        results.append({
            'index': idx,
            'image': row['Images'],
            # A row counts as valid only when parsing produced a non-None object
            'valid': parsed is not None,
            'status': status,
            # Keep only the first 100 characters so reports stay readable
            'json_preview': str(row['JSON'])[:100] if pd.notna(row['JSON']) else 'N/A'
        })

    n_valid = sum(1 for r in results if r['valid'])
    # Guard the percentage so an empty table reports 0.0% instead of raising
    # ZeroDivisionError.
    pct_valid = n_valid / len(results) * 100 if results else 0.0
    print(f"{label}: {n_valid}/{len(results)} valid ({pct_valid:.1f}%)")
    return results, n_valid


# Check male prompts
print("Checking male prompts...")
male_results, male_valid = _check_prompts(df_male, "Male")

# Check female prompts
print("\nChecking female prompts...")
female_results, female_valid = _check_prompts(df_female, "Female")

Checking male prompts...
Male: 99/99 valid (100.0%)

Checking female prompts...
Female: 100/100 valid (100.0%)


### Step 2: Display invalid entries

In [3]:
# Collect the rows that failed validation in each table
invalid_male = [entry for entry in male_results if not entry['valid']]
invalid_female = [entry for entry in female_results if not entry['valid']]

# Print up to five failures per table; a table with no failures prints nothing
for heading, entries in (
    (f"Invalid male entries ({len(invalid_male)}):", invalid_male),
    (f"\n\nInvalid female entries ({len(invalid_female)}):", invalid_female),
):
    if not entries:
        continue
    print(heading)
    for entry in entries[:5]:
        print(f"\n  Index {entry['index']} - {entry['image']}")
        print(f"  Status: {entry['status']}")
        print(f"  Preview: {entry['json_preview']}")