# Multilingual Car Problem Dataset Generator

This notebook generates a realistic car problem dataset in multiple languages (English, French, Spanish, Japanese, Chinese Simplified, Greek, and Hebrew) with intentional grammatical variations to simulate real user input.

In [11]:
import random
import uuid
from typing import Dict, List, Optional, Tuple

import pandas as pd

# Car catalogue: brand name -> the two models included for that brand
car_data = dict(
    Toyota=["Corolla", "Camry"],
    Honda=["Civic", "Accord"],
    Ford=["Focus", "Fusion"],
    Volkswagen=["Golf", "Passat"],
    Nissan=["Altima", "Sentra"],
)

# The three problem keys covered by the dataset
problems = ["engine_overheating", "brake_noise", "battery_drain"]

In [3]:
# Define translations with intentional variations to simulate real user input
#
# Layout: translations[problem_key][kind][lang] -> list of 4 alternative phrasings
#   problem_key: one of the entries in `problems`
#   kind:        "fault" (how the symptom is described) or "fix" (suggested remedy)
#   lang:        "en", "fr", "es", "ja", "zh", "el", "he"
# The variants deliberately mix capitalization, drop accents (e.g. "Verifier",
# "bateria") and use terse wording; these imperfections are the point of the
# dataset, so do not "correct" them.
translations = {
    "engine_overheating": {
        "fault": {
            "en": ["Engine overheating", "engine is overheating", "Engine gets too hot", "overheating problem"],
            "fr": ["Moteur surchauffe", "le moteur chauffe trop", "Problème de surchauffe moteur", "moteur trop chaud"],
            "es": ["Motor sobrecalentado", "el motor se calienta mucho", "Problema de sobrecalentamiento", "motor muy caliente"],
            "ja": ["エンジンオーバーヒート", "エンジンが熱い", "エンジン過熱問題", "エンジンが熱くなる"],
            "zh": ["发动机过热", "引擎太热了", "发动机温度过高", "引擎过热问题"],
            "el": ["Υπερθέρμανση κινητήρα", "ο κινητήρας υπερθερμαίνεται", "Πρόβλημα υπερθέρμανσης", "κινητήρας πολύ ζεστός"],
            "he": ["התחממות יתר של המנוע", "המנוע מתחמם", "בעיית חימום יתר", "מנוע חם מדי"]
        },
        "fix": {
            "en": ["Check coolant level and radiator", "refill coolant and check radiator", "Add coolant, inspect radiator for leaks", "coolant low - add more"],
            "fr": ["Vérifier niveau liquide refroidissement", "ajouter du liquide de refroidissement", "Verifier radiateur et liquide", "remplir liquide refroidissement"],
            "es": ["Revisar nivel de refrigerante", "añadir refrigerante y revisar radiador", "Verificar radiador y liquido", "poner mas refrigerante"],
            "ja": ["冷却水レベルチェック", "冷却液を補充する", "ラジエーター確認", "クーラント追加"],
            "zh": ["检查冷却液水平", "添加冷却液", "检查散热器", "加冷却液"],
            "el": ["Έλεγχος ψυκτικού υγρού", "προσθήκη ψυκτικού", "Έλεγχος ψυγείου", "βάλε ψυκτικό υγρό"],
            "he": ["בדוק רמת נוזל קירור", "הוסף נוזל קירור", "בדיקת רדיאטור", "מלא נוזל קירור"]
        }
    },
    "brake_noise": {
        "fault": {
            "en": ["Brake making noise", "brakes squeak", "Squeaking brakes when stopping", "brake noise problem"],
            "fr": ["Bruit de frein", "les freins grincent", "Freins qui font du bruit", "probleme bruit freins"],
            "es": ["Ruido en frenos", "frenos hacen ruido", "Frenos chirrian", "ruido al frenar"],
            "ja": ["ブレーキ音がする", "ブレーキがキーキー鳴る", "ブレーキノイズ", "ブレーキの音"],
            "zh": ["刹车有噪音", "刹车声音大", "制动器噪音", "刹车响"],
            "el": ["Θόρυβος φρένων", "τα φρένα κάνουν θόρυβο", "Φρένα τρίζουν", "θόρυβος στα φρένα"],
            "he": ["רעש בבלמים", "בלמים מרעישים", "צריחת בלמים", "רעש בזמן בלימה"]
        },
        "fix": {
            "en": ["Replace brake pads", "change brake pads", "New brake pads needed", "brake pads worn - replace"],
            "fr": ["Remplacer plaquettes de frein", "changer les plaquettes", "Nouvelles plaquettes necessaires", "plaquettes usées"],
            "es": ["Cambiar pastillas de freno", "reemplazar pastillas", "Pastillas nuevas necesarias", "cambiar las pastillas"],
            "ja": ["ブレーキパッド交換", "パッド交換必要", "新しいブレーキパッド", "パッド替える"],
            "zh": ["更换刹车片", "换新刹车片", "需要新刹车片", "刹车片要换"],
            "el": ["Αλλαγή τακάκια", "αντικατάσταση τακάκια φρένων", "Νέα τακάκια", "άλλαξε τακάκια"],
            "he": ["החלף רפידות בלמים", "רפידות חדשות", "צריך רפידות בלם חדשות", "להחליף רפידות"]
        }
    },
    "battery_drain": {
        "fault": {
            "en": ["Battery draining fast", "battery dies quickly", "Battery won't hold charge", "battery drain issue"],
            "fr": ["Batterie se vide vite", "batterie se décharge", "Batterie tient pas la charge", "probleme batterie"],
            "es": ["Batería se descarga rápido", "bateria no dura", "Batería no mantiene carga", "bateria se agota"],
            "ja": ["バッテリーが早く減る", "バッテリーすぐ切れる", "充電持たない", "バッテリー問題"],
            "zh": ["电池耗电快", "电池不耐用", "电池充不进电", "电池问题"],
            "el": ["Μπαταρία αδειάζει γρήγορα", "η μπαταρία δεν κρατάει", "Πρόβλημα μπαταρίας", "μπαταρία αδειάζει"],
            "he": ["סוללה מתרוקנת מהר", "הסוללה לא מחזיקה", "בעיית סוללה", "סוללה נגמרת מהר"]
        },
        "fix": {
            "en": ["Test alternator and replace battery", "check alternator", "Replace battery or alternator", "new battery needed"],
            "fr": ["Tester alternateur et remplacer batterie", "verifier alternateur", "Changer batterie", "nouvelle batterie"],
            "es": ["Probar alternador y cambiar batería", "revisar alternador", "Cambiar batería", "bateria nueva"],
            "ja": ["オルタネーター確認", "バッテリー交換", "新しいバッテリー", "バッテリー替える"],
            "zh": ["检查发电机和电池", "更换电池", "换新电池", "需要新电池"],
            "el": ["Έλεγχος δυναμό και μπαταρία", "αλλαγή μπαταρίας", "Νέα μπαταρία", "άλλαξε μπαταρία"],
            "he": ["בדוק אלטרנטור והחלף סוללה", "החלף סוללה", "סוללה חדשה", "צריך סוללה חדשה"]
        }
    }
}

In [None]:
# Generate the dataset
def generate_dataset(seed: Optional[int] = None) -> List[Dict[str, str]]:
    """Build the shuffled list of multilingual car-problem records.

    For every (brand, model) pair in ``car_data`` and every key in
    ``problems``, two records are produced in two different languages. The
    fault and fix wording for each record is drawn at random from
    ``translations``.

    Languages are taken from a per-problem rotation that advances with each
    model, so every language in the rotation is used across the dataset
    (slicing the first two entries for every model would only ever emit two
    languages per problem).

    Args:
        seed: Optional seed. When given, phrasing choices, record order and
            ids all come from a private ``random.Random(seed)``, so the same
            seed yields the same dataset. When ``None``, the module-level
            ``random`` generator and ``uuid.uuid4()`` are used.

    Returns:
        A list of dicts with the string fields ``Id``, ``Brand``, ``Model``,
        ``Fault`` and ``Fix``.
    """
    # Private generator when seeded; otherwise fall back to the global one so
    # callers who seed `random` themselves keep their existing behaviour.
    rng = random.Random(seed) if seed is not None else random

    # Number of languages emitted per problem for each model
    langs_per_model = 2

    # Language rotation for each problem to ensure all languages appear
    language_assignments = {
        "engine_overheating": ["en", "fr", "es", "ja", "zh", "el", "he", "en", "fr", "es"],
        "brake_noise": ["ja", "zh", "el", "he", "en", "fr", "es", "ja", "zh", "el"],
        "battery_drain": ["he", "en", "fr", "es", "ja", "zh", "el", "he", "en", "fr"]
    }

    dataset = []
    model_index = 0  # running position of the current model across all brands

    for brand, models in car_data.items():
        for model in models:
            for problem in problems:
                rotation = language_assignments[problem]
                # Walk the rotation (wrapping around) instead of always
                # taking its first entries, so later languages get used too.
                first = model_index * langs_per_model
                languages = [
                    rotation[(first + offset) % len(rotation)]
                    for offset in range(langs_per_model)
                ]

                for lang in languages:
                    # Pick random variations for fault and fix
                    fault_text = rng.choice(translations[problem]["fault"][lang])
                    fix_text = rng.choice(translations[problem]["fix"][lang])

                    if seed is None:
                        record_id = uuid.uuid4()
                    else:
                        # Derive the id from the seeded generator so that ids
                        # are reproducible as well.
                        record_id = uuid.UUID(int=rng.getrandbits(128), version=4)

                    dataset.append({
                        # Stored as text: matches the declared return type and
                        # keeps the value writable by spreadsheet exporters.
                        "Id": str(record_id),
                        "Brand": brand,
                        "Model": model,
                        "Fault": fault_text,
                        "Fix": fix_text
                    })
            model_index += 1

    # Shuffle to make it more realistic
    rng.shuffle(dataset)
    return dataset

# Generate the data (fixed seed so Restart & Run All reproduces the same dataset)
data = generate_dataset(seed=42)
print(f"Generated {len(data)} records")

In [None]:
# Turn the generated records into a DataFrame, then preview the first rows
# followed by a few headline counts.
df = pd.DataFrame(data)

print("Sample of generated data:")
print(df.head(10))
print(
    f"\nTotal records: {len(df)}",
    f"Unique brands: {df['Brand'].nunique()}",
    f"Unique models: {df['Model'].nunique()}",
    sep="\n",
)

In [None]:
# Save to Excel file
output_file = "car_problems_multilingual.xlsx"
# openpyxl only accepts plain cell values (numbers, text, dates, booleans); an
# Id column holding uuid.UUID objects makes the write fail. Export a copy with
# Id cast to text; df itself is left untouched.
df.astype({"Id": str}).to_excel(output_file, index=False, engine='openpyxl')
print(f"\nData saved to {output_file}")

# Display distribution statistics
print("\nDistribution by Brand:")
print(df['Brand'].value_counts())
print("\nSample entries showing language variety:")
# Cap the sample size: DataFrame.sample raises when n exceeds the row count.
print(df.sample(n=min(15, len(df))))

## Dataset Summary

The generated dataset includes:
- **5 car brands**: Toyota, Honda, Ford, Volkswagen, Nissan
- **2 models per brand**: 10 models total
- **3 common problems**: Engine overheating, Brake noise, Battery drain
- **7 languages**: English, French, Spanish, Japanese, Chinese (Simplified), Greek, Hebrew
- **Realistic variations**: Multiple ways to describe the same problem/solution with varying grammar quality

Each problem appears in multiple languages across different cars, simulating real-world user input with natural language variations and occasional grammatical imperfections.