In [4]:
import csv
import random
import re

# Each row is (label, *forms): index 0 names the category and the remaining
# entries are the word forms available for a template's {sex_word} slot.
# The "unknown" row has only empty forms, i.e. the sex is simply not mentioned.
sex_words = [
    (label, *forms)
    for label, forms in (
        ("male", ("man", "boy", "he", "him")),
        ("female", ("woman", "girl", "she", "her")),
        ("unknown", ("", "", "", "")),
    )
]
# Candidate ages in years: every integer from 1 to 89 inclusive.
age_ranges = [*range(1, 90)]

# Varied urgent symptom descriptions. One entry is drawn at random to fill the
# {symptom} placeholder of a question template; each is a lowercase noun/verb
# phrase without trailing punctuation.
symptoms_urgent = [
    "sudden severe chest pain and shortness of breath",
    "loss of consciousness after a head injury",
    "heavy bleeding from a deep cut in the leg",
    "severe abdominal pain with vomiting",
    "difficulty speaking and weakness on one side of the body",
    "seizures with high fever",
    "vomiting blood and feeling faint",
    "blood in stool with strong abdominal pain",
    "very rapid irregular heartbeat and dizziness",
    "severe persistent headache and stiff neck",
    "major burns with skin peeling off",
    "sudden vision loss in one eye",
    "not able to pass urine and distended abdomen",
    "severe sudden back pain and weakness in legs",
    "loss of movement in both arms after electric shock",
    "breathing difficulty and wheezing after bee sting",
    "intense abdominal pain after eating questionable food",
    "unable to speak and face drooping on one side",
    "vomiting and severe diarrhea in a small child",
    "mental confusion and agitation in elderly",
    "choking and cannot breathe",
    "high fever and purple skin spots",
    "sudden chest tightness after intense exercise",
    "swallowing household product (poison ingestion)",
    "severe allergic reaction with tongue swelling",
    "broken bone visible through skin",
    "severe dehydration after prolonged vomiting",
    "new epileptic seizures for the first time",
    "excruciating testicular pain and swelling",
    "acute eye pain and sudden vision blurring",
    "uncontrolled bleeding after dental procedure"
]

# Circumstance phrases for the {context} placeholder. Every entry is a
# self-contained phrase that already starts with its own connector ("since",
# "after", "while", "with", "when", ...), so it can follow a symptom directly.
contexts = [
    "since this morning", "after a cycling accident", "for the past hour",
    "during the night", "while exercising", "with sweating and nausea",
    "with mental confusion", "after eating unknown food", "with blue lips",
    "with very high fever", "following a fall down the stairs", "after a car accident",
    "while at work", "after being stung by an insect", "following heavy exertion",
    "just after taking my medication", "when I woke up", "after a heated argument",
    "during a meal", "in the middle of the night", "while shopping",
    "after returning from traveling abroad", "after fainting at home",
    "when I tried to stand up", "after an episode of choking",
    "following a seizure", "after a dental procedure", "after a fever for three days",
    "as soon as I started running", "after intense abdominal cramps"
]

# Question templates, grouped by voice, used to vary the generated phrasing.
# Placeholders: {age}, {sex_word}, {symptom}, {context}, {relative}.
# {context} is always a complete phrase that brings its own connector
# ("since this morning", "after a car accident", ...), so a template must never
# put another "since" in front of it.
question_templates = [
    # First person
    "I am {age} years old {sex_word}. I have been having {symptom} {context}. Should I go to the emergency room?",
    "I'm {age}, {sex_word}, feeling {symptom} {context}. Is this urgent?",
    "I suddenly have {symptom} and I am {age} years old {sex_word}. I'm really scared, what do I do?",
    "Can you help? I'm a {age} year old {sex_word} and I just had {symptom} {context}.",
    "Do I need to worry about {symptom}? I'm {age}, {sex_word}, it started {context}.",
    "I have {symptom} {context}. I am {age} years old. Emergency?",
    # Third person: a relative or a witness speaking
    "My {relative} ({age} years old) just developed {symptom} {context}. Should we call an ambulance?",
    "My {relative}, aged {age}, suddenly got {symptom} {context}. Is this life-threatening?",
    "Rushed my {relative} ({age}, {sex_word}) to hospital because of {symptom}. Was this the right thing?",
    "I live with a {age} year old {sex_word}, now they are experiencing {symptom} {context}. Do I need to act fast?",
    # Third party, hospital / clinical style
    "Patient: {age} years, {sex_word}. Presents: {symptom} {context}. Admission needed?",
    "Observing {symptom} in {age} year old {sex_word} {context}. Should we go to ER?",
    "In what cases should someone with {symptom} {context} (age {age}) seek emergency care?",
    # Narrative / terse / anxious
    "{age} years old, {sex_word} with {symptom} {context}. Really scared, advice please!",
    "Emergency? {symptom} started {context} in a {age} year old {sex_word}.",
    "{symptom} {context}, happened to a {age} year old. Is this time critical?",
    "What should I do right now about {symptom} {context}? Age {age}, {sex_word}.",
    "How urgent is {symptom} {context} for someone aged {age}?",
    # Informal register / short question
    "ER or wait? {symptom} {context}, age {age}.",
    "Is {symptom} {context} dangerous? Happened to my {relative}, {age} years old.",
    "Worried! {symptom} started in my {relative}, aged {age}, {context}."
]

# People the asker may be writing about; one entry fills the {relative}
# placeholder of a template.
# NOTE(review): the entries are not uniform. Most are bare nouns ("father",
# "cousin"), but some already carry their own determiner ("my child",
# "our housekeeper", "someone I know"), so any code or template that puts "my"
# in front of {relative} has to account for both shapes.
relatives = [
    "father", "mother", "friend", "daughter", "son", "sister", "brother",
    "wife", "husband", "grandmother", "grandfather",
    "uncle", "aunt", "cousin", "nephew", "niece",
    "roommate", "colleague", "neighbor", "partner", "boyfriend", "girlfriend",
    "my child", "my parent", "my spouse", "my boss",
    "my classmate", "my teacher", "my teammate", "my student",
    "stepfather", "stepmother", "stepson", "stepdaughter",
    "someone I know", "our housekeeper", "my friend’s parent",
    "baby brother", "baby sister", "my twin",
    "my patient", "my sibling"
]


# Pronoun forms listed in ``sex_words``; every {sex_word} slot in the templates
# expects a noun ("a 34 year old ___"), so these must not be substituted there.
_PRONOUN_FORMS = frozenset({"he", "him", "she", "her"})
# Relatives that already begin with their own determiner ("my child",
# "our housekeeper", "someone I know") and must not receive a second "my".
_HAS_DETERMINER = re.compile(r"(?:my|our|someone)\b", re.IGNORECASE)


def _relative_phrase(relative, capitalize=False):
    """Return ``relative`` preceded by exactly one determiner ("my" unless it brings its own)."""
    phrase = relative if _HAS_DETERMINER.match(relative) else "my " + relative
    return phrase[:1].upper() + phrase[1:] if capitalize else phrase


def _tidy(text):
    """Remove the gaps and orphan punctuation that an empty placeholder leaves behind."""
    text = re.sub(r"\s+", " ", text).strip()       # "old  with" -> "old with"
    text = re.sub(r"\s+([,.!?)])", r"\1", text)    # "old ."      -> "old."
    text = re.sub(r",(?:\s*,)+", ",", text)        # "45,, x"     -> "45, x"
    return re.sub(r",\s*([.!?)])", r"\1", text)    # "(45,)"      -> "(45)"


def gen_question():
    """Build one random urgent-triage question.

    Draws an age, a sex word, a symptom, a context, a relative and a template
    from the module-level vocabularies and fills the template.

    Returns:
        list: ``[question, "urgent"]`` - the question text and its triage label,
        ready to be written as one CSV row.
    """
    age = random.choice(age_ranges)
    # Index 0 of a sex_words row is the category label, not a usable word;
    # pronoun forms are filtered out because the templates need a noun.
    noun_forms = [
        form for form in random.choice(sex_words)[1:] if form not in _PRONOUN_FORMS
    ]
    sex_word = random.choice(noun_forms) if noun_forms else ""
    symptom = random.choice(symptoms_urgent)
    context = random.choice(contexts)
    template = random.choice(question_templates)
    relative = random.choice(relatives)

    # Templates hard-code "My {relative}" / "my {relative}". Route those through
    # dedicated placeholders so that relatives which already carry a determiner
    # do not come out as "My my child" or "my someone I know".
    template = (
        template
        .replace("My {relative}", "{relative_cap}")
        .replace("my {relative}", "{relative_low}")
    )
    question = template.format(
        age=age,
        sex_word=sex_word,
        symptom=symptom,
        context=context,
        relative=relative,
        relative_cap=_relative_phrase(relative, capitalize=True),
        relative_low=_relative_phrase(relative),
    )
    # An empty sex_word (the "unknown" row) would otherwise leave artefacts such
    # as "45, , feeling", "(45, )" or "old ." in the generated text.
    return [_tidy(question), "urgent"]

# Generation settings, named so a reader can tune them in one place.
SEED = 42                                      # fixed seed: a fresh kernel run rebuilds the same file
N_QUESTIONS = 4000                             # number of generated rows (header excluded)
OUTPUT_CSV = "gen_4000_urgent_questions.csv"   # written relative to the working directory

# Seed the module-level RNG used by gen_question() so the dataset is reproducible.
random.seed(SEED)

# newline='' lets the csv module control line endings itself.
with open(OUTPUT_CSV, "w", newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(["question", "triage"])
    writer.writerows(gen_question() for _ in range(N_QUESTIONS))


In [6]:
import pandas as pd

# Read the two inputs: the existing triage dataset and the generated questions.
# NOTE(review): both paths are absolute and machine-specific, whereas the
# generated CSV is written by bare file name - confirm they refer to the same
# directory before relying on this cell.
df1 = pd.read_csv('/Users/ines/NLP/emergency_chatbot/data/triage_dataset.csv')
df2 = pd.read_csv('/Users/ines/NLP/emergency_chatbot/data/gen_4000_urgent_questions.csv')

# Stacking is only meaningful when both frames expose the same columns in the same order.
assert df1.columns.tolist() == df2.columns.tolist(), "Colonnes différentes !"

# Stack the frames under a fresh 0..n-1 index, then keep a single row per
# distinct question text (the first occurrence is the one retained).
df_full = (
    pd.concat([df1, df2], ignore_index=True)
    .drop_duplicates(subset=['question'])
)

# Persist the merged dataset; the index is not written to the file.
df_full.to_csv('dataset_concatene.csv', index=False)
