# Generating Synthetic Users

Let's use `Faker` to generate synthetic learner profiles, then have an LLM write a backstory for each one.

In [49]:
import instructor
import openai
from faker import Faker
import random
from pydantic import BaseModel, Field
from typing import List
from tqdm.asyncio import tqdm_asyncio as asyncio
from asyncio import run
import pandas as pd

# `fake` supplies the random profile fields; `client` is the async OpenAI client
# wrapped by instructor so completions can be requested with a `response_model`.
fake = Faker()
client = instructor.from_openai(openai.AsyncOpenAI())

In [51]:

# Backstory prompt template. `{profile}` is the only placeholder; `generate_backstory`
# fills it with the learner's JSON dump.
prompt = """
You are a creative writer tasked with generating a detailed backstory for a language learner based on their profile. Use the provided information to create a compelling narrative about their background, motivation for learning the language, and their journey so far. Be sure to incorporate all the given details while expanding on them in a realistic and engaging way.

Here's the learner's profile:

{profile}

Please write a backstory of approximately 300-400 words that covers the following points:

1. The learner's background, including their upbringing and how it relates to their native language.
2. Their decision to learn the target language, incorporating their occupation and learning goal.
3. The journey of their language learning so far, mentioning their current level and learning duration.
4. Their study habits and preferred methods, including how these fit into their daily life.
5. Challenges they've faced and how they've worked to overcome them.
6. How their interests have influenced their language learning process.
7. Their future aspirations related to language learning.

Make sure the backstory feels personal and authentic, with specific anecdotes or experiences that bring the learner's profile to life. The narrative should be coherent and logically connect all aspects of the learner's profile.
"""


# Pools that the randomly populated fields of LearnerProfile draw from.
_INTEREST_POOL = [
    "Reading",
    "Cooking",
    "Gardening",
    "Photography",
    "Hiking",
    "Painting",
    "Playing guitar",
    "Yoga",
    "Traveling",
    "Chess",
    "Knitting",
    "Birdwatching",
    "Woodworking",
    "Cycling",
    "Meditation",
]
_RESOURCE_POOL = [
    "Language learning apps",
    "Online news articles",
    "Podcasts",
    "YouTube videos",
    "Language exchange websites",
    "Grammar books",
    "Online courses",
]
_LEVEL_POOL = ["A1", "A2", "B1"]


def _random_age() -> int:
    """Pick an age between 18 and 60 inclusive."""
    return random.randint(18, 60)


def _random_native_language() -> str:
    """Pick a language name from Faker."""
    return fake.language_name()


def _random_learning_duration() -> str:
    """Pick a duration of 1 to 24 months, rendered as text."""
    months = random.randint(1, 24)
    return f"{months} months"


def _random_study_frequency() -> str:
    """Pick days per week (3-7) and minutes per day (30-120), rendered as text."""
    days = random.randint(3, 7)
    minutes = random.randint(30, 120)
    return f"{days} days per week, {minutes} minutes per day"


def _random_interests() -> List[str]:
    """Sample three distinct interests."""
    return random.sample(_INTEREST_POOL, 3)


def _random_resources() -> List[str]:
    """Sample three distinct learning resources."""
    return random.sample(_RESOURCE_POOL, 3)


def _random_level() -> str:
    """Pick one of the level codes A1, A2 or B1."""
    return random.choice(_LEVEL_POOL)


class LearnerProfile(BaseModel):
    """A randomly generated language-learner profile.

    Every field except `target_language` (fixed to "French") has a default
    factory, so `LearnerProfile()` yields a fresh random learner on each call.
    """

    name: str = Field(default_factory=fake.name)
    age: int = Field(default_factory=_random_age)
    native_language: str = Field(default_factory=_random_native_language)
    target_language: str = "French"
    occupation: str = Field(default_factory=fake.job)
    learning_duration: str = Field(default_factory=_random_learning_duration)
    country_of_origin: str = Field(default_factory=fake.country)
    study_frequency: str = Field(default_factory=_random_study_frequency)
    interests: List[str] = Field(default_factory=_random_interests)
    preferred_resources: List[str] = Field(default_factory=_random_resources)
    current_level: str = Field(default_factory=_random_level)

In [52]:
class GeneratedBackstory(BaseModel):
    """
    Represents a generated backstory for a language learner.

    This class encapsulates the backstory text generated based on a learner's profile.
    The backstory is a detailed narrative (300-400 words) that covers various aspects
    of the learner's language learning journey, including:

    1. Personal background and native language context
    2. Motivation for learning the target language
    3. Progress in language learning so far
    4. Study habits and preferred learning methods
    5. Challenges faced and overcome
    6. Influence of personal interests on language learning
    7. Future language learning aspirations

    The backstory aims to provide a compelling and authentic narrative that brings
    the learner's profile to life with specific anecdotes and experiences.

    Attributes:
        user_backstory (str): The generated backstory text.
    """

    user_backstory: str


async def generate_backstory(user: LearnerProfile):
    """Request one backstory for `user` from the chat model.

    The learner is serialised to JSON and substituted into the module-level
    `prompt` template (looked up at call time), which is sent as the single
    system message. The reply is requested as a `GeneratedBackstory` via
    `response_model`.
    """
    system_message = {
        "role": "system",
        "content": prompt.format(profile=user.model_dump_json()),
    }
    return await client.chat.completions.create(
        messages=[system_message],
        model="gpt-4o-mini",
        response_model=GeneratedBackstory,
    )


async def generate_multiple_backstories(num_backstories: int = 4):
    """Create random learners, generate a backstory for each, and save them to CSV.

    Writes `backstories.csv` (no index column) with one row per learner: the
    profile as a JSON string under "User Information" and the generated text
    under "backstory". Returns nothing.
    """
    learners = []
    for _ in range(num_backstories):
        learners.append(LearnerProfile())

    # `asyncio` here is tqdm's `tqdm_asyncio` (see the imports), not the stdlib module.
    results = await asyncio.gather(*map(generate_backstory, learners))

    rows = []
    for learner, result in zip(learners, results):
        rows.append(
            {
                "User Information": learner.model_dump_json(),
                "backstory": result.user_backstory,
            }
        )

    pd.DataFrame(rows).to_csv("backstories.csv", index=False)


# Generate two learners and their backstories; the rows are written to backstories.csv.
await generate_multiple_backstories(2)

100%|██████████| 2/2 [00:06<00:00,  3.21s/it]


# Synthetic Users

Let's see what our synthetic users look like by printing their generated backstories.


In [58]:
from textwrap import wrap

# Reload the saved backstories from disk.
df = pd.read_csv("./backstories.csv")
backstories = df["backstory"]

# Print each backstory in full, wrapped to 180 columns, followed by a separator line.
for story in backstories:
    wrapped_story = "\n".join(wrap(story, 180))
    print(wrapped_story)
    print("====")




Shannon Rios, a warm-hearted therapist with a passion for horticulture, was born in the picturesque landscapes of Austria. Growing up, she was surrounded by lush gardens where her
family, who immigrated from Indonesia, cultivated unique plants that spoke to her native language. Her parents often shared stories of their homeland, weaving a rich tapestry of
Indonesian culture into her upbringing. This connection to her native language created a deep-rooted appreciation for the transformative power of communication.  At age 42, after
years of helping others heal through therapy, Shannon decided to embark on a new journey: learning French. The motivation stemmed from her desire to enhance her career. She
believed that being bilingual would not only broaden her professional reach but also allow her to connect with a diverse clientele in her therapeutic practice. Additionally, the
melodic tones of the French language captured her heart, prompting her to start this language-learning endeavor 

## Generating Language Errors

We now take a language learner's profile and use it to simulate a short conversation (with errors).

In [48]:
from pydantic import BaseModel, field_validator, Field, model_validator
from typing import Literal,Union

# The two speaker roles a conversation message may carry.
roles = Literal["assistant", "user"]

# A single conversation turn: who spoke (`role`) and what was said (`content`).
class Message(BaseModel):
    role: roles
    content: str

# Structured record of a verb/tense mistake. `tense` is restricted to the French
# tense names listed in the Literal; the remaining fields are free text.
class GrammaticalError(BaseModel):
    root_verb:str
    tense: Literal["présent", "passé composé", "imparfait", "plus-que-parfait", "futur simple", "futur antérieur", "conditionnel présent", "conditionnel passé", "subjonctif présent", "subjonctif passé", "impératif"]
    explanation_of_error:str
    incorrect_usage_example: str
    correct_example:str

# Structured record of a vocabulary mistake. Note that `usage_examples` is a list of
# strings, whereas GrammaticalError.correct_example is a single string.
class Vocabulary(BaseModel):
    vocabulary_term:str
    explanation_of_error:str
    incorrect_vocabulary_example: str
    usage_examples:list[str]
    

# Pairs one mistake (grammar or vocabulary) with the text it is cited from.
# SimulatedConversation checks that `citation` is a substring of some message's content.
class Mistake(BaseModel):
    mistake: Union[GrammaticalError, Vocabulary]
    citation: str = Field(description="Original message where the mistake occurs in the conversation")

# A simulated learner/tutor exchange plus a breakdown of the learner's mistakes.
# Validation enforces a user-first alternating conversation, at least one mistake,
# and that every mistake's citation appears verbatim in some message.
class SimulatedConversation(BaseModel):
    conversation:list[Message]
    mistake_breakdown: list[Mistake] = Field(description="The breakdown of the mistakes the user made in the conversation")

    @field_validator("conversation")
    @classmethod
    def validate_valid_structure(cls, v:list[Message]):
        """Require a user-first, strictly alternating conversation of at least two messages.

        Raises:
            ValueError: if there are fewer than 2 messages, the first message is
                from the assistant, or a message does not have the expected
                alternating role.
        """
        # The error text states the same minimum the condition enforces.
        if len(v) < 2:
            raise ValueError("Conversation must have at least 2 messages where assistant and user roles alternate")

        # Also covered by the loop below, but this gives a more specific message.
        if v[0].role == "assistant":
            raise ValueError("Conversation must start with a user message")

        expected_role:roles = "user"
        for message in v:
            if message.role != expected_role:
                raise ValueError("Messages must alternate between assistant and user roles")
            expected_role = "assistant" if expected_role == "user" else "user"

        return v

    @model_validator(mode='after')
    def validate_mistake_breakdown(self) -> 'SimulatedConversation':
        """Require at least one mistake, each cited verbatim from the conversation.

        Raises:
            ValueError: if `mistake_breakdown` is empty, or a mistake's `citation`
                is not a substring of any message's content.
        """
        if len(self.mistake_breakdown) == 0:
            raise ValueError("Mistake breakdown must have at least 1 mistake")

        for mistake in self.mistake_breakdown:
            # Generator form stops at the first matching message.
            if not any(mistake.citation in message.content for message in self.conversation):
                raise ValueError("Mistake citation must be present in the conversation")

        return self


In [13]:
from instructor import from_openai
from openai import OpenAI

# Use the async OpenAI client: the cells below `await` the result of
# `client.chat.completions.create(...)`, which a synchronous client's plain
# return value does not support.
client = from_openai(openai.AsyncOpenAI())

In [16]:
# Use the first backstory in `df` (row 0 of the "backstory" column) as the learner profile text.
user_profile = df["backstory"][0]
user_profile

'Meredith Gilbert grew up in a tranquil Estonian village surrounded by lush forests and majestic cliffs that seemed to cradle her childhood dreams. Surrounded by the lyrical sounds of her native language, Estonian, she developed a profound appreciation for her roots. Her parents, both educators, imbued her with a love for learning, often recounting tales of their travels to far-off lands, which lit a spark of wanderlust in her heart that remains unquenchable to this day.\n\nAt 55, Meredith is a seasoned quarry manager, overseeing operations in the picturesque landscape that reminded her of her childhood. However, the mining industry has become more globalized, and interactions with French partners became more frequent. Recognizing the importance of communicating effectively with colleagues across borders, Meredith decided to learn French. She set her sights on achieving a conversational level, envisioning herself confidently discussing plans and negotiating contracts.\n\nFor the past 4

In [77]:
# Conversation-simulation template with `{number_of_mistakes}`, `{mistake_type}` and
# `{learner_profile}` placeholders, filled in by `simulate_conversation`.
# NOTE: this rebinds the module-level name `prompt`, which `generate_backstory` also
# formats (with `profile=`); calling `generate_backstory` after this cell has run
# would format this template instead and fail on the missing placeholders.
prompt = """
# French Conversation Simulation Prompt

You are an AI assistant tasked with simulating an entire conversation between a French language learner and their tutor. You will alternate between these two roles, creating a realistic language learning scenario. Try to make at least {number_of_mistakes} of {mistake_type} in the conversation with a maximum of 4 mistakes in this conversation.

## French Learner Role

When playing the role of the French learner:

1. Use an intermediate level of French, forming complete sentences but making occasional mistakes.
2. Incorporate the following types of mistakes:
   a. Improper tense: Use incorrect verb conjugations or tenses.
   b. Wrong word usage: Use incorrect French words that are similar in sound or meaning, or direct translations from English.
   c. Vocabulary gaps: Use English words when you don't know the French equivalent.
3. Show enthusiasm for learning and ask questions about language and culture.
4. Occasionally ask for clarification or repetition.

Here is some information about the french learner that you are simulating

{learner_profile}

## Tutor Role

When playing the role of the French tutor:

1. Use fluent, correct French.
2. Be patient, encouraging, and supportive.
3. Correct the learner's mistakes gently, providing explanations when necessary.
4. Ask questions to encourage the learner to practice and expand their skills.
5. Introduce new vocabulary or grammar concepts as appropriate.

## Guidelines for the Conversation

- Alternate between the learner and tutor roles with each message.
- Maintain a natural flow of conversation while incorporating learning elements.
- Ensure the learner's mistakes are noticeable but don't impede overall understanding.
- Not every learner's sentence needs to contain a mistake. Mix correct and incorrect usage.
- The tutor should not correct every single mistake, focusing on the most important ones.

Examples of Mistake Types
1. Improper Tense
Tutor: Qu'est-ce que tu as fait hier? (What did you do yesterday?)
User: Hier, je vais au parc et je lis un livre. J'ai aussi rencontré un ami pour un café.
(Incorrect use of present tense "vais" instead of past tense "suis allé(e)")
Tutor: Depuis combien de temps apprenez-vous le français? (How long have you been learning French?)
User: J'apprendre le français depuis deux ans maintenant. C'est une expérience difficile mais enrichissante.
(Incorrect infinitive "apprendre" instead of conjugated "apprends")

2. Wrong Word Usage
Tutor: Pouvez-vous me recommander un bon restaurant? (Can you recommend a good restaurant?)
User: Oui, je peux penser à un excellent restaurant au centre-ville qui sert de délicieux fruits de mer. Ils ont des plats de poisson et de homard incroyables.
(Incorrect use of "penser à" (to think of) instead of "recommander" (to recommend) or "suggérer" (to suggest))

Tutor: Quelle est votre opinion sur le changement climatique? (What's your opinion on climate change?)
User: Le changement climatique est un sérieux mouchoir qui affecte notre planète entière. Nous devons prendre des mesures pour réduire notre empreinte carbone.
(Incorrect use of "mouchoir" (handkerchief) instead of "problème" or "enjeu")

3. Vocabulary Gaps
Tutor: Quelle est votre saison préférée? (What's your favorite season?)
User: Ma saison préférée est... Je ne suis pas sûr du mot en français. En anglais, c'est "autumn". J'aime les feuilles colorées et l'air frais pendant cette période.
(Using English "autumn" instead of French "automne")

Tutor: Avez-vous des passe-temps? (Do you have any hobbies?)
User: Oui, j'ai plusieurs passe-temps. J'aime lire, écouter de la musique, et... comment dit-on "to ride a bicycle" en français? C'est quand on roule sur un véhicule à deux roues avec des pédales.
(Using English phrase "to ride a bicycle" instead of French "faire du vélo")
"""

In [79]:
import random


def simulate_conversation(user_profile: str):
    """Ask the model for a simulated learner/tutor conversation containing mistakes.

    Args:
        user_profile: Free-text description of the learner, inserted into the
            module-level `prompt` template as `learner_profile`.

    Returns:
        Whatever `client.chat.completions.create` returns for
        `response_model=SimulatedConversation`. The call is not awaited here,
        so with an async client the caller must await the result.
    """
    mistake_types = ["improper tense", "wrong word usage", "vocabulary gaps"]
    number_of_mistakes = random.randint(1, 3)
    system_prompt = prompt.format(
        learner_profile=user_profile,
        number_of_mistakes=number_of_mistakes,
        # Join into plain text: formatting the list object directly would put
        # its Python repr (brackets and quotes) into the prompt.
        mistake_type=", ".join(mistake_types),
    )
    return client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            }
        ],
        model="gpt-4o-mini",
        max_retries=4,
        response_model=SimulatedConversation,
    )


# `simulate_conversation` returns the un-awaited result of the client call, so this
# top-level `await` only works while `client` is an async client.
conversation = await simulate_conversation(user_profile)
print(conversation)


conversation=[Message(role='user', content="Bonjour! Je suis très excité de pratiquer le français aujourd'hui."), Message(role='assistant', content="Bonjour Meredith! Je suis ravi de vous voir aussi enthousiaste! De quoi aimeriez-vous parler aujourd'hui?"), Message(role='user', content="Je veux parler de mes hobbies. J'aime faire de la randonnée et photographie des paysages."), Message(role='assistant', content='C\'est formidable! Mais attention, vous avez utilisé une petite erreur. On dit "photographier" au lieu de "photographie." Donc, vous diriez "j\'aime faire de la randonnée et photographier des paysages." Quels types de paysages aimez-vous photographier?'), Message(role='user', content="J'aime photographier les montagnes et les forêts. Parfois, je vais à la plage pour voir les oiseaux et prendre des photos aussi."), Message(role='assistant', content='C\'est magnifique! Juste une petite correction: vous devriez dire "aller à la plage" au lieu de "aller à le plage." Les noms en fra

  conversation = await simulate_conversation(user_profile)


In [80]:
# Print each turn as "<role> <content>", one message per print call.
for message in conversation.conversation:
    print(message.role, message.content)

user Bonjour! Je suis très excité de pratiquer le français aujourd'hui.
assistant Bonjour Meredith! Je suis ravi de vous voir aussi enthousiaste! De quoi aimeriez-vous parler aujourd'hui?
user Je veux parler de mes hobbies. J'aime faire de la randonnée et photographie des paysages.
assistant C'est formidable! Mais attention, vous avez utilisé une petite erreur. On dit "photographier" au lieu de "photographie." Donc, vous diriez "j'aime faire de la randonnée et photographier des paysages." Quels types de paysages aimez-vous photographier?
user J'aime photographier les montagnes et les forêts. Parfois, je vais à la plage pour voir les oiseaux et prendre des photos aussi.
assistant C'est magnifique! Juste une petite correction: vous devriez dire "aller à la plage" au lieu de "aller à le plage." Les noms en français, comme "plage," ont des articles qui changent. Quelles sont vos endroits préférés pour aller en randonnée?
user Je préfère aller aux collines près de ma maison, c'est tranquill

In [81]:
# Pull out the list of reported mistakes and inspect the first one.
errors = conversation.mistake_breakdown
errors[0]

Mistake(mistake=Vocabulary(vocabulary_term='photographie', explanation_of_error='L\'apprenant a utilisé le terme incorrect "photographie" au lieu du verbe "photographier."', incorrect_vocabulary_example='"J\'aime faire de la randonnée et photographie des paysages."', usage_examples=["J'aime photographier des oiseaux pendant mes promenades.", 'Elle adore photographier les éléments de la nature.', 'Nous avons photographié les aurores boréales.']), citation="Je veux parler de mes hobbies. J'aime faire de la randonnée et photographie des paysages.")

# Generating Positive examples

Let's now generate positive examples: conversations that demonstrate the correct usage for the specific mistakes we identified previously.

In [59]:
# A conversation demonstrating correct usage. It must start with a user message
# and alternate user/assistant roles.
class GoodConversationExample(BaseModel):
    conversation:list[Message]

    @field_validator("conversation")
    @classmethod
    def validate_valid_structure(cls, v:list[Message]):
        """Require a user-first, strictly alternating conversation of at least two messages.

        Raises:
            ValueError: if there are fewer than 2 messages, the first message is
                from the assistant, or a message does not have the expected
                alternating role.
        """
        # The error text states the same minimum the condition enforces.
        if len(v) < 2:
            raise ValueError("Conversation must have at least 2 messages where assistant and user roles alternate")

        # Also covered by the loop below, but this gives a more specific message.
        if v[0].role == "assistant":
            raise ValueError("Conversation must start with a user message")

        expected_role:roles = "user"
        for message in v:
            if message.role != expected_role:
                raise ValueError("Messages must alternate between assistant and user roles")
            expected_role = "assistant" if expected_role == "user" else "user"

        return v

In [87]:
from typing import Union

async def generate_good_conversation(error: Union[GrammaticalError, Vocabulary]) -> GoodConversationExample:
    """Generate a conversation that demonstrates correct usage for a recorded mistake.

    Args:
        error: The mistake to build a positive example for. For a
            `GrammaticalError` the prompt targets `root_verb`; for a
            `Vocabulary` error it targets `vocabulary_term`.

    Returns:
        The model's reply, requested as a `GoodConversationExample` via
        `response_model`.

    Raises:
        ValueError: If `error` is neither a `GrammaticalError` nor a `Vocabulary`.
    """
    import random

    # Optional scene-setting details; one of each is picked at random for the prompt.
    time_of_day = ["morning", "night", "afternoon", "dawn", "dusk", "midday", "evening", "sunrise", "sunset", "noon"]
    drink_of_choice = ["coffee", "tea", "chai", "vodka", "sprite", "gin and tonic", "water", "juice", "hot chocolate", "lemonade"]
    situation = ["Buying something", "Selling Something", "Running late", "Going to a restaurant", "Going to a bar", "Going to a cafe", "Meeting a friend", "Asking for directions", "Making a reservation", "Ordering food"]

    if isinstance(error, GrammaticalError):
        verb = error.root_verb
        negative_example = error.incorrect_usage_example
        positive_example = error.correct_example
    elif isinstance(error, Vocabulary):
        verb = error.vocabulary_term
        negative_example = error.incorrect_vocabulary_example
        # usage_examples is a list; join it so the prompt shows the sentences
        # themselves rather than the list's Python repr.
        positive_example = " / ".join(error.usage_examples)
    else:
        raise ValueError("Unsupported error type")

    prompt = f"""Simulate a natural conversation between a French language learner and a native speaker that demonstrates the correct usage of the verb '{verb}'. 
    You may optionally incorporate some or all of these elements: time of day ({random.choice(time_of_day)}), a drink of choice ({random.choice(drink_of_choice)}), or a situation ({random.choice(situation)}). 
    The conversation should mimic something that might happen in real life. 
    As a negative example, avoid using the incorrect phrase '{negative_example}' 
    Instead, show the correct usage: '{positive_example}' 
    Provide positive examples that showcase proper conjugation and idiomatic expressions with '{verb}'. 
    The conversation should feel authentic and cover everyday situations where '{verb}' is commonly used."""

    # Awaited directly, so `client` must be an async client when this runs.
    good_convo = await client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": prompt
            }
        ],
        model="gpt-4o-mini",
        response_model=GoodConversationExample
    )

    return good_convo


In [86]:
# `.mistake` is the GrammaticalError/Vocabulary object that generate_good_conversation expects.
await generate_good_conversation(errors[0].mistake)

GoodConversationExample(conversation=[Message(role='user', content='Salut ! Quelque chose de prévu pour ce soir ?'), Message(role='assistant', content="Salut ! Pas grand-chose, je pensais m'installer avec un bon café et peut-être photographier le coucher de soleil au bar près de la rivière."), Message(role='user', content="Ça a l'air super ! J'aime photographier les oiseaux pendant mes promenades."), Message(role='assistant', content="Ah oui ? Quel genre d'oiseaux aimes-tu photographier ?"), Message(role='user', content="Surtout les canards et les hirondelles. J'ai toujours voulu essayer de photographier des animaux sauvages."), Message(role='assistant', content="C'est une bonne idée ! Elle adore photographier les éléments de la nature pendant ses voyages."), Message(role='user', content="C'est vrai, la nature offre de si belles occasions. Peut-être qu'on pourrait y aller ensemble un jour."), Message(role='assistant', content="Oui, avec plaisir ! Nous avons photographié les aurores bor

In [88]:
# NOTE(review): `errors[1]` exists only when the model reported at least two mistakes;
# SimulatedConversation's validator only guarantees one, so this can raise IndexError on a re-run.
await generate_good_conversation(errors[1].mistake)

GoodConversationExample(conversation=[Message(role='user', content='Salut! Que fais-tu ce soir?'), Message(role='assistant', content='Salut! J’ retourne à la maison après le travail. Et toi, que vas-tu faire?'), Message(role='user', content='Je vais à la plage pour voir le coucher de soleil.'), Message(role='assistant', content="Ah, ça a l'air génial! Tu veux prendre un café avant d’y aller?"), Message(role='user', content='Oui, je vais prendre un café. Est-ce que tu veux venir avec moi?'), Message(role='assistant', content='Bien sûr! Je vais au café en face de la plage.'), Message(role='user', content='Parfait! On y va ensemble alors.'), Message(role='assistant', content="D'accord, allons-y! Après, nous pourrons acheter quelque chose à manger sur le chemin."), Message(role='user', content='Bonne idée! Je vais prendre un croissant.'), Message(role='assistant', content='Moi aussi, je vais prendre un croissant.'), Message(role='user', content='Super! Allons-y!')])