In [1]:
from datetime import datetime, timedelta
import requests 
import time
import json
import os
from google import genai
from google.genai import types
import os
from dotenv import load_dotenv
import random 

load_dotenv()

import boto3

client = genai.Client(api_key=os.environ["GEMINI_KEY"])

In [2]:
demographic_parameters = {
    "Age Range": [
        "18-24",
        "25-34",
        "35-44",
        "45-54",
        "55-64",
        "65+"
    ],
    "Gender": [
        "Male",
        "Female",
        "Non-binary",
    ],
    "Location": [
        "Urban",
        "Suburban",
        "Rural",
        "International/Global"
    ],
    "Education Level": [
        "High School",
        "Some College",
        "Bachelor's Degree",
        "Master's Degree",
        "Doctorate/Professional",
        "Vocational/Technical"
    ],
    "Occupation": [
        "Student",
        "Freelancer",
        "Office Worker",
        "Manager",
        "Executive",
        "Entrepreneur",
        "Skilled Trades",
        "Retired",
        "Unemployed",
        "Technical/IT",
        "Healthcare Professional",
        "Creative Professional",
        "Sales/Marketing",
        "Customer Service",
        "Education/Teaching",
        "Government/Public Service",
        "Hospitality/Service Industry",
        "Finance/Accounting",
        "Legal Professional",
        "Research",
        "Scientist",
        "Logistics/Transportation",
        "Manufacturing",
    ],
    "Income Bracket": [
        "< 25k",
        "25k-50k",
        "50k-75k",
        "75k-100k",
        "100k-150k",
        "150k+"
    ],
    "Marital Status": [
        "Single",
        "Married",
        "Domestic Partnership",
        "Divorced",
        "Widowed"
    ],
    "Household Size": [
        "1",
        "2",
        "3",
        "4+"
    ],
    "Homeownership": [
        "Own Home",
        "Renting",
        "Living with Family",
        "Other"
    ],
    "Ethnicity": [
        "Caucasian",
        "African American",
        "Hispanic",
        "Asian",
        "Mixed",
    ],
    "Religion": [
        "Christianity",
        "Islam",
        "Hinduism",
        "Buddhism",
        "Judaism",
        "Secular/Atheist",
    ]
}


In [3]:
# Different personas of potential customers that the salesman will be talking to.
# Each persona will have a different background and will be looking for different things in the product.
people_personas = [
    {
        "name": "John",
        "age": 30,
        "gender": "male",
        "occupation": "software engineer",
        "education": "bachelor's degree in computer science",
        "interests": "technology, gaming, sports",
    }
    ,{
        "name": "Jane",
        "age": 25,
        "gender": "female",
        "occupation": "marketing manager",
        "education": "master's degree in marketing",
        "interests": "simple lifestyle, crunchy, frugality",
    }
]




In [4]:
# Create 10 randomized personas for the customer
personas_test = []

for i in range(10):
    persona = {
        "Age Range": random.choice(demographic_parameters["Age Range"]),
        "Gender": random.choice(demographic_parameters["Gender"]),
        "Location": random.choice(demographic_parameters["Location"]),
        "Education Level": random.choice(demographic_parameters["Education Level"]),
        "Occupation": random.choice(demographic_parameters["Occupation"]),
        "Income Bracket": random.choice(demographic_parameters["Income Bracket"]),
        "Marital Status": random.choice(demographic_parameters["Marital Status"]),
        "Household Size": random.choice(demographic_parameters["Household Size"]),
        "Homeownership": random.choice(demographic_parameters["Homeownership"]),
        "Ethnicity": random.choice(demographic_parameters["Ethnicity"]),
        "Religion": random.choice(demographic_parameters["Religion"])
    }
    
    personas_test.append(persona)

In [5]:
personas_test

[{'Age Range': '18-24',
  'Gender': 'Non-binary',
  'Location': 'International/Global',
  'Education Level': "Bachelor's Degree",
  'Occupation': 'Manufacturing',
  'Income Bracket': '50k-75k',
  'Marital Status': 'Single',
  'Household Size': '2',
  'Homeownership': 'Renting',
  'Ethnicity': 'Caucasian',
  'Religion': 'Hinduism'},
 {'Age Range': '25-34',
  'Gender': 'Non-binary',
  'Location': 'Urban',
  'Education Level': 'Some College',
  'Occupation': 'Logistics/Transportation',
  'Income Bracket': '50k-75k',
  'Marital Status': 'Domestic Partnership',
  'Household Size': '4+',
  'Homeownership': 'Other',
  'Ethnicity': 'Mixed',
  'Religion': 'Judaism'},
 {'Age Range': '35-44',
  'Gender': 'Female',
  'Location': 'Rural',
  'Education Level': "Master's Degree",
  'Occupation': 'Logistics/Transportation',
  'Income Bracket': '75k-100k',
  'Marital Status': 'Single',
  'Household Size': '2',
  'Homeownership': 'Living with Family',
  'Ethnicity': 'Asian',
  'Religion': 'Judaism'},
 {

In [6]:

# chat = client.chats.create(model='gemini-2.0-flash', config = seller_config)

# response = chat.send_message(
#     message = f"[INFORMATION]: The product is a new type of phone that has a foldable screen and a long battery life. Customer: {people_personas[0]}",
# )
# print(response.text)
def run_conversation_with_persona(product_text, persona):
    seller_config = types.GenerateContentConfig(
        system_instruction = """You are a salesman selling a product for your client. 
        You will be talking to a potential customer and will be trying to convince them to buy the product. 
        You will be given a description of the product and the customer, and you will generate a response to the customer. 
        You will generate a response that is persuasive and convincing, and that will make the customer want to buy the product.
        The conversation is finished once the customer asks to purchase the product or states that they are not interested in the product.
        Do not hallucinate or make up information about the product, only use the information that is given to you.
        If the buyer says "finished", output "finished" and do not generate any more responses. 
        .""",
        max_output_tokens = 1000,
        temperature = 0.1,
        top_p = 0.5,
        top_k = 64,
    )

    local_buyer_config = types.GenerateContentConfig(
        system_instruction = f"""You are a potential customer where a salesman will try to convince you to buy a product.
        This is your persona: {persona}.
        Generate responses as if you are this persona.
        Once you clearly state that you want to buy the product, or that you are not interested in the product, the conversation is finished.
        When the conversation is finished, output "finished" and do not generate any more responses.
        """,
        temperature=0.1,
        top_p=0.5,
        top_k=64,
        max_output_tokens=1000,
    )
    seller_chat = client.chats.create(model='gemini-2.0-flash', config=seller_config)
    buyer_chat = client.chats.create(model='gemini-2.0-flash', config=local_buyer_config)

    response = seller_chat.send_message(
        message=f"[INFORMATION]: Product to sell: {product_text}. Customer: {persona}."
    )
    local_chat_history = [{"role": "seller", "parts": [response.text]}]

    for _ in range(10):
        response = buyer_chat.send_message(message=response.text)
        local_chat_history.append({"role": "buyer", "parts": [response.text]})
        if response.text.strip() == "finished":
            break

        response = seller_chat.send_message(message=response.text)
        local_chat_history.append({"role": "seller", "parts": [response.text]})
        if response.text.strip() == "finished":
            break
        
    # print("Chat Finished")

    return {"persona": persona, "chat_history": local_chat_history}

def run_all_personas_in_parallel(personas, product_text, product_image = None):
    import concurrent.futures
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_persona = {
            executor.submit(run_conversation_with_persona, product_text, persona): persona for persona in personas
        }
        for future in concurrent.futures.as_completed(future_to_persona):
            results.append(future.result())
    return results

def run_product_chatting(product_text, product_image = None):
    return run_all_personas_in_parallel(personas_test, product_text, product_image)

# Create FastAPI endpoint to run the product chatting
from fastapi import FastAPI

app = FastAPI()

@app.get("/run_product_chatting")
def run_product_chatting_endpoint(product_text: str, product_image: str = None):
    return run_product_chatting(product_text, product_image)

In [7]:
results = run_product_chatting("This is a new type of phone that has a foldable screen and a long battery life.")

for result in results:
    print(result)



{'persona': {'Age Range': '55-64', 'Gender': 'Male', 'Location': 'Suburban', 'Education Level': "Bachelor's Degree", 'Occupation': 'Government/Public Service', 'Income Bracket': '25k-50k', 'Marital Status': 'Domestic Partnership', 'Household Size': '4+', 'Homeownership': 'Own Home', 'Ethnicity': 'Caucasian', 'Religion': 'Hinduism'}, 'chat_history': [{'role': 'seller', 'parts': ["Hello sir, I understand you're in the market for a new phone. I'd like to introduce you to our latest model, a revolutionary device with a foldable screen and an exceptionally long battery life.\n\nGiven your busy life in public service and managing a household, I imagine you value efficiency and reliability. The foldable screen offers you the convenience of a larger display for reading documents, browsing the web, or even video conferencing with family, all while remaining compact enough to easily carry in your pocket.\n\nAnd with its extended battery life, you won't have to worry about constantly searching fo

In [8]:
# reponse = chat.send_message(
#     message = "I'm not sure. I already own a phone, and it wasn't cheap to buy. Why should I buy this new phone? What makes it better than my current one?",
# )

# for result in results:
#     print(result)

In [9]:
results[0]['persona']

{'Age Range': '55-64',
 'Gender': 'Male',
 'Location': 'Suburban',
 'Education Level': "Bachelor's Degree",
 'Occupation': 'Government/Public Service',
 'Income Bracket': '25k-50k',
 'Marital Status': 'Domestic Partnership',
 'Household Size': '4+',
 'Homeownership': 'Own Home',
 'Ethnicity': 'Caucasian',
 'Religion': 'Hinduism'}

In [10]:
results[0]['chat_history']

[{'role': 'seller',
  'parts': ["Hello sir, I understand you're in the market for a new phone. I'd like to introduce you to our latest model, a revolutionary device with a foldable screen and an exceptionally long battery life.\n\nGiven your busy life in public service and managing a household, I imagine you value efficiency and reliability. The foldable screen offers you the convenience of a larger display for reading documents, browsing the web, or even video conferencing with family, all while remaining compact enough to easily carry in your pocket.\n\nAnd with its extended battery life, you won't have to worry about constantly searching for an outlet. You can stay connected and productive throughout your day, whether you're at work, running errands, or spending quality time with your loved ones.\n\nWould you be interested in learning more about how this phone can simplify your daily life?\n"]},
 {'role': 'buyer',
  'parts': ["Well, a foldable screen sounds interesting, I'll admit. 

In [29]:
from pydub import AudioSegment
from io import BytesIO
from tqdm import tqdm

In [64]:
polly_client = boto3.Session(
                aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],                 
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY'],
    region_name='us-east-1').client('polly')

audio = AudioSegment.silent(duration=0)

buyer_gender = results[0]['persona']['Gender']

speech_switch = [0]

for turn in tqdm(results[0]['chat_history']):
    # print(chat['parts'][0])
    role = turn['role']
    chat = turn['parts'][0]
    
    voice = None
    if role == "seller":
        voice = "Matthew"
    else:
        if buyer_gender == "Male":
            voice = "Stephen"
        else:
            voice = "Ruth"
         
    reponse = polly_client.synthesize_speech(VoiceId=voice,
                    OutputFormat='mp3', 
                    Text = chat,
                    Engine = 'generative')
    
    with open(f"audio_files/audio.mp3", "wb") as file:
        file.write(reponse['AudioStream'].read())
        
    next_audio = AudioSegment.from_mp3(f"audio_files/audio.mp3")
    # audio.export(f"audio_files/{chat['role']}.wav", format="wav")
    
    audio = audio + next_audio + AudioSegment.silent(duration=200)
    speech_switch.append(len(audio))
    
# Save the final audio
audio.export(f"audio_files/final_audio.wav", format="wav")

# Delete the temporary audio files
os.remove(f"audio_files/audio.mp3")
print(speech_switch)

100%|██████████| 7/7 [00:37<00:00,  5.41s/it]

[0, 43016, 70120, 119448, 135632, 192112, 209208, 210151]





In [44]:
str(results[0])

'{\'persona\': {\'Age Range\': \'55-64\', \'Gender\': \'Male\', \'Location\': \'Suburban\', \'Education Level\': "Bachelor\'s Degree", \'Occupation\': \'Government/Public Service\', \'Income Bracket\': \'25k-50k\', \'Marital Status\': \'Domestic Partnership\', \'Household Size\': \'4+\', \'Homeownership\': \'Own Home\', \'Ethnicity\': \'Caucasian\', \'Religion\': \'Hinduism\'}, \'chat_history\': [{\'role\': \'seller\', \'parts\': ["Hello sir, I understand you\'re in the market for a new phone. I\'d like to introduce you to our latest model, a revolutionary device with a foldable screen and an exceptionally long battery life.\\n\\nGiven your busy life in public service and managing a household, I imagine you value efficiency and reliability. The foldable screen offers you the convenience of a larger display for reading documents, browsing the web, or even video conferencing with family, all while remaining compact enough to easily carry in your pocket.\\n\\nAnd with its extended battery

In [47]:
simple = {
    "persona": {
        "Gender": "Male"
    },
    "chat_history": [
        {
            "role": "seller",
            "parts": [
                "Hello, how can I help you today?"
            ]
        }
    ]
    
}

str(simple)

"{'persona': {'Gender': 'Male'}, 'chat_history': [{'role': 'seller', 'parts': ['Hello, how can I help you today?']}]}"

In [58]:
results[0]

{'persona': {'Age Range': '55-64',
  'Gender': 'Male',
  'Location': 'Suburban',
  'Education Level': "Bachelor's Degree",
  'Occupation': 'Government/Public Service',
  'Income Bracket': '25k-50k',
  'Marital Status': 'Domestic Partnership',
  'Household Size': '4+',
  'Homeownership': 'Own Home',
  'Ethnicity': 'Caucasian',
  'Religion': 'Hinduism'},
 'chat_history': [{'role': 'seller',
   'parts': ["Hello sir, I understand you're in the market for a new phone. I'd like to introduce you to our latest model, a revolutionary device with a foldable screen and an exceptionally long battery life.\n\nGiven your busy life in public service and managing a household, I imagine you value efficiency and reliability. The foldable screen offers you the convenience of a larger display for reading documents, browsing the web, or even video conferencing with family, all while remaining compact enough to easily carry in your pocket.\n\nAnd with its extended battery life, you won't have to worry about

In [60]:
json_response = json.dumps(results[0])

In [61]:
json_response

'{"persona": {"Age Range": "55-64", "Gender": "Male", "Location": "Suburban", "Education Level": "Bachelor\'s Degree", "Occupation": "Government/Public Service", "Income Bracket": "25k-50k", "Marital Status": "Domestic Partnership", "Household Size": "4+", "Homeownership": "Own Home", "Ethnicity": "Caucasian", "Religion": "Hinduism"}, "chat_history": [{"role": "seller", "parts": ["Hello sir, I understand you\'re in the market for a new phone. I\'d like to introduce you to our latest model, a revolutionary device with a foldable screen and an exceptionally long battery life.\\n\\nGiven your busy life in public service and managing a household, I imagine you value efficiency and reliability. The foldable screen offers you the convenience of a larger display for reading documents, browsing the web, or even video conferencing with family, all while remaining compact enough to easily carry in your pocket.\\n\\nAnd with its extended battery life, you won\'t have to worry about constantly sea

In [83]:
import json
import requests

simple_json = json.dumps(simple)

url = "http://localhost:8000/conversation/convert"
headers = {
    "Content-Type": "application/json"
}
response = requests.post(url, headers=headers, data=json_response)

print(response.status_code)
if response.status_code == 200:
    with open("test_audio.wav", "wb") as file:
        file.write(response.content)
        
    # Print response headers
    print(response.headers["speech_switch"])
else:
    print("Error:", response.status_code, response.text)

200
[43016, 70120, 119448, 135632, 192112, 209208, 210151]


In [None]:
from fastapi.responses import FileResponse



@app.post("/generate_audio")
def generate_audio_endpoint(result: dict):
    # Start with an empty silent audio segment
    combined_audio = AudioSegment.silent(duration=0)

    # Determine buyer gender
    buyer_gender = result['persona']['Gender']

    # Generate audio clips for each turn in the chat history
    for turn in tqdm(result['chat_history']):
        role = turn['role']
        chat_text = turn['parts'][0]

        # Determine which Polly voice to use
        if role == "seller":
            voice = "Matthew"
        else:
            if buyer_gender == "Male":
                voice = "Stephen"
            else:
                voice = "Ruth"

        # Call Polly to synthesize speech
        response = polly_client.synthesize_speech(
            VoiceId=voice,
            OutputFormat='mp3', 
            Text=chat_text,
            Engine='generative'
        )

        # Save the temporary MP3 file
        with open("audio_files/audio.mp3", "wb") as file:
            file.write(response['AudioStream'].read())

        # Load the MP3 and append it with a short pause
        clip = AudioSegment.from_mp3("audio_files/audio.mp3")
        combined_audio += clip + AudioSegment.silent(duration=200)

    # Export the final WAV file
    output_path = "audio_files/final_audio.wav"
    combined_audio.export(output_path, format="wav")

    # Clean up the temporary file
    os.remove("audio_files/audio.mp3")

    # Return the final WAV file as a response
    return FileResponse(output_path, media_type="audio/wav")

In [17]:
audio = AudioSegment.from_mp3("speech.mp3")

In [18]:
audio = audio.reverse()

audio.export("speech.wav", format="wav")

<_io.BufferedRandom name='speech.wav'>