In [2]:
# Install dependencies
# pip install openai

In [3]:
# File: /Users/olenapleshan/Desktop/tse_takehome_dataset.csv

In [4]:
import pandas as pd
from openai import OpenAI


In [5]:
# Machine-specific absolute path to the client's CSV; change this when running on another machine.
INPUT_FILE = "/Users/olenapleshan/Desktop/tse_takehome_dataset_attempt3.csv"

In [6]:
# Load the client's CSV and list its columns to understand the data they are working with.
# `df` is reused further down: for inspecting a column and for the JSON export used by file search.
df = pd.read_csv(INPUT_FILE)
print(df.columns)

Index(['date', 'name', 'company_name', 'description_of_company',
       'favourite_memory', 'favourite_city_and_why', 'favourite_food_and_why',
       'occupation', 'description_of_job', 'experience_relevant_to_job',
       'growth_plan'],
      dtype='object')


In [7]:
# Show up to 200 characters per cell so more of each free-text answer is visible.
pd.set_option('display.max_colwidth', 200)
# Inspect the column that the questions to the assistant are about.
df['favourite_city_and_why']

0     London, for its historical landmarks and diverse cultural scene. for its historical landmarks and diverse cultural scene. Additionally, London has hosted the Summer Olympics three times: in 1908, ...
1     Paris, for its beautiful architecture for its beautiful architecture Additionally, The Eiffel Tower was supposed to be a temporary installation, intended to stand for 20 years after being construc...
2            Tokyo, for its unique blend of traditional and modern for its unique blend of traditional and modern Additionally, It's considered one of the world's most important and powerful global cities.
3                                New York, because of its vibrant city life and diversity. because of its vibrant city life and diversity. Additionally, It's home to the largest metropolitan zoo in the US.
4     Paris, for its beautiful architecture for its beautiful architecture Additionally, Paris is known as the 'City of Light', originally because of its leading role during th

Additional Recommendation 1: Clean up the data — the column contains a lot of repeated text that may impede model performance (LINK)

In [15]:

# Do not hardcode the API key in the notebook: it leaks through version control and shared copies.
# With no `api_key` argument the OpenAI client reads the OPENAI_API_KEY environment variable.
client = OpenAI()


In [None]:
# https://platform.openai.com/docs/assistants/quickstart
# https://platform.openai.com/docs/assistants/tools/code-interpreter

# Upload the CSV for use by the assistant. The context manager closes the local
# file handle as soon as the upload call returns instead of leaking it.
with open(INPUT_FILE, "rb") as csv_handle:
    file = client.files.create(
        file=csv_handle,
        purpose='assistants'
    )
print("Submitted file to OpenAI assitant", file.id)

# Create the assistant with the code-interpreter tool and attach the uploaded file to that tool.
assistant = client.beta.assistants.create(
    instructions="You are a analysing data on correlation between jobs some people occupy and their personal interests. You are tasked with writing a summary of the data and providing insights on the correlation between jobs and personal interests.",
    model="gpt-4o",
    tools=[{"type": "code_interpreter"}],
    tool_resources={
        "code_interpreter": {
            "file_ids": [file.id]
        }
    }
)

Submitted file to OpenAI assitant file-FAYTCfAC6TkkKhhekYk1rM


In [23]:
# Open a conversation thread; the following cells post messages to it and run the assistant on it.
thread = client.beta.threads.create()

In [24]:
def create_message_and_run(client, thread_id, assistant_id, user_prompt, instructions):
    """Post a user message to a thread, then start a run on that thread via `create_and_poll`.

    Args:
        client: OpenAI client used for both API calls.
        thread_id: ID of the thread that receives the message and the run.
        assistant_id: ID of the assistant that executes the run.
        user_prompt: Text stored as the content of the new "user" message.
        instructions: Instructions passed to this particular run.
            NOTE(review): per the Assistants API docs, run-level instructions
            override the assistant-level ones — confirm that is intended.

    Returns:
        A `(message, run)` tuple holding the two API responses.
    """
    threads_api = client.beta.threads

    # The message must exist in the thread before the run starts.
    user_message = threads_api.messages.create(
        thread_id=thread_id, role="user", content=user_prompt
    )
    polled_run = threads_api.runs.create_and_poll(
        thread_id=thread_id, assistant_id=assistant_id, instructions=instructions
    )
    return user_message, polled_run

In [25]:
# Baseline: the plain question with the plain "use the file" run instructions.
message, run = create_message_and_run(
    client=client,
    thread_id=thread.id,
    assistant_id=assistant.id,
    user_prompt="What is Tina Escobar favourite city and why?",
    instructions=f"Please use the file {file.id} to answer the question.",
)

In [26]:
# Verify whether the issue is reproducible by printing the messages in the thread.
# My first assumption was that the context window was exceeded. For counting tokens, see https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them

def print_and_count_tokens(run, thread_id, api_client=None):
    """Print every message of a thread and a rough size estimate of the conversation.

    Args:
        run: Run object as returned by `create_message_and_run`. Messages are only
            fetched when `run.status == 'completed'`; otherwise the status is printed.
        thread_id: ID of the thread whose messages are listed.
        api_client: Optional OpenAI client used to list the messages. When omitted,
            the notebook-level `client` is used, so two-argument calls keep working.

    The "token" figure is the number of space-separated chunks across all text
    blocks, i.e. a crude word count, not the output of a real tokenizer.
    """
    context_length = 0
    if run.status != 'completed':
        print(run.status)
    else:
        # Fall back to the global client only when no client was passed explicitly.
        api = client if api_client is None else api_client
        messages = api.beta.threads.messages.list(thread_id=thread_id)
        for message in messages:
            for content in message.content:
                text_part = getattr(content, "text", None)
                if text_part is None:
                    # Non-text blocks (e.g. an image generated by a tool) have no `.text`;
                    # report them instead of failing with AttributeError.
                    block_type = getattr(content, "type", "unknown")
                    print("[", message.role, "]", ": ", "<non-text content: {}>".format(block_type))
                    continue
                print("[", message.role, "]", ": ", text_part.value)
                context_length += len(text_part.value.split(" "))
    print("approx number of tokens: ", context_length)

In [27]:
print_and_count_tokens(run, thread.id)

[ assistant ] :  Tina Escobar's favorite city is New York. She likes it because of its vibrant city life and diverse cultural experiences.
[ assistant ] :  The content of the file is in a CSV format, not JSON. Let's proceed to load it as a CSV file, then search for Tina Escobar's favorite city and the explanation behind her preference.
[ assistant ] :  It seems there was an issue with reading the file as JSON. Let's try another approach to determine its structure and contents. I'll examine the raw file contents to understand its format.
[ user ] :  What is Tina Escobar favourite city and why?
approx number of tokens:  94


### Verifying whether adjusting prompt and/or instructions would help achieve the desired outcome

In [None]:
# Only the prompt changes here. The run instructions are the same and the same thread is reused.
message, run = create_message_and_run(
    client=client,
    thread_id=thread.id,
    assistant_id=assistant.id,
    user_prompt="Why are you omitting everything after the word 'additional' in the column 'favourite_city_and_why'?",
    instructions=f"Please use the file {file.id} to answer the question.",
)

print_and_count_tokens(run, thread.id)

[ assistant ] :  Tina Escobar's favorite city is New York because of its vibrant city life and diversity. Additionally, she appreciates that it is home to the largest metropolitan zoo in the U.S.
[ assistant ] :  The file's content revealed that each entry in the "favourite_city_and_why" column might have multiple reasons detailed across multiple sentences. Let's extract and display the entire entry related to Tina Escobar's favorite city to ensure nothing is omitted.
[ user ] :  Why are you only summarizing the first sentence in the column 'favourite_city_and_why' when answering my question?
[ assistant ] :  Tina Escobar's favorite city is New York. She likes it because of its vibrant city life and diverse cultural experiences.
[ assistant ] :  The content of the file is in a CSV format, not JSON. Let's proceed to load it as a CSV file, then search for Tina Escobar's favorite city and the explanation behind her preference.
[ assistant ] :  It seems there was an issue with reading the 

In [29]:
# Only the prompt changes here. The run instructions are the same and the same thread is reused.
message, run = create_message_and_run(
    client=client,
    thread_id=thread.id,
    assistant_id=assistant.id,
    user_prompt="So, why did your summary only include the first sentence of the data, if your assumption was based on each sentence representing a different reason?",
    instructions=f"Please use the file {file.id} to answer the question.",
)

print_and_count_tokens(run, thread.id)

[ assistant ] :  I apologize for the oversight. In summarization, I mistakenly focused primarily on the initial sentence, thinking it might encapsulate the main reason concisely. It's important to recognize that even within a single explanation, repeated phrases can appear (as seen here with the repetition of "because of its vibrant city life and diversity"). Some segments may contain cumulative details or additional points that contribute to the overall reasoning.

Thank you for pointing it out, and I will ensure to consider the full explanation in similar tasks going forward.
[ user ] :  So, why did your summary only include the first sentence of the data, if your assumption was based on each sentence representing a different reason?
[ assistant ] :  Tina Escobar's favorite city is New York because of its vibrant city life and diversity. Additionally, she appreciates that it is home to the largest metropolitan zoo in the U.S.
[ assistant ] :  The file's content revealed that each ent

In [30]:
# Only the run instructions change here. The prompt is the original question and the same thread is reused.
message, run = create_message_and_run(
    client=client,
    thread_id=thread.id,
    assistant_id=assistant.id,
    user_prompt="What is Tina Escobar favourite city and why?",
    instructions=f"Please use the file {file.id} to answer the question. In your summary of the response, please incorporate the entirety of the response (all reasons) provided.",
)

print_and_count_tokens(run, thread.id)

[ assistant ] :  Tina Escobar's favorite city is New York because of its vibrant city life and diversity. Additionally, she appreciates that it is home to the largest metropolitan zoo in the U.S.
[ user ] :  What is Tina Escobar favourite city and why?
[ assistant ] :  I apologize for the oversight. In summarization, I mistakenly focused primarily on the initial sentence, thinking it might encapsulate the main reason concisely. It's important to recognize that even within a single explanation, repeated phrases can appear (as seen here with the repetition of "because of its vibrant city life and diversity"). Some segments may contain cumulative details or additional points that contribute to the overall reasoning.

Thank you for pointing it out, and I will ensure to consider the full explanation in similar tasks going forward.
[ user ] :  So, why did your summary only include the first sentence of the data, if your assumption was based on each sentence representing a different reason?
[

In [31]:
# Same instructions as the previous run, but a new prompt about a different person from the file.
user_prompt_sarah = "What is Sarah King's favourite city and why?"

# Pass the variable defined above instead of repeating the literal, so the two cannot drift apart.
message, run = create_message_and_run(
    client=client,
    thread_id=thread.id,
    assistant_id=assistant.id,
    user_prompt=user_prompt_sarah,
    instructions=f"Please use the file {file.id} to answer the question. In your summary of the response, please incorporate the entirety of the response (all reasons) provided.",
)

print_and_count_tokens(run, thread.id)

[ assistant ] :  Sarah King's favorite city is Tokyo because of its unique blend of traditional and modern elements. Additionally, she finds it interesting that Tokyo was once known as Edo.
[ user ] :  What is Sarah King's favourite city and why?
[ assistant ] :  Tina Escobar's favorite city is New York because of its vibrant city life and diversity. Additionally, she appreciates that it is home to the largest metropolitan zoo in the U.S.
[ user ] :  What is Tina Escobar favourite city and why?
[ assistant ] :  I apologize for the oversight. In summarization, I mistakenly focused primarily on the initial sentence, thinking it might encapsulate the main reason concisely. It's important to recognize that even within a single explanation, repeated phrases can appear (as seen here with the repetition of "because of its vibrant city life and diversity"). Some segments may contain cumulative details or additional points that contribute to the overall reasoning.

Thank you for pointing it out

Observation: When using instruction1 along with a prompt tailored for a new user, the behavior adapts and the desired outcome is achieved. However, this effect is probably limited to the context of the current thread. Therefore, if the instructions are not exposed to the end user and are modified only by the Assistant's developer, the desired specificity of the outcome should be embedded directly into the instructions when each run is initialized.

### Verifying how to better engineer the prompt
Creating a new thread to ensure the results from previous thread do not get mixed in

In [105]:
# Fresh thread, so messages from the earlier thread are not part of this conversation.
thread2 = client.beta.threads.create()

In [106]:
# Prompt that explicitly asks for all reasons; the run instructions stay at the baseline.
user_prompt3 = "What is Tina Escobar favourite city? Please provide all reasons she mentioned."

# Pass the variable defined above instead of repeating the literal, so the two cannot drift apart.
message, run = create_message_and_run(
    client=client,
    thread_id=thread2.id,
    assistant_id=assistant.id,
    user_prompt=user_prompt3,
    instructions=f"Please use the file {file.id} to answer the question.",
)

print_and_count_tokens(run, thread2.id)

Message:  Tina Escobar's favorite city is New York. The reasons she mentioned for her preference are:

1. Its vibrant city life.
2. Its diversity.
3. It's home to the largest metropolitan zoo in the US.
Message:  The file contains several columns including "name" and "favourite_city_and_why". I will filter the data to find Tina Escobar's entry and extract her favorite city along with the reasons she mentioned.
Message:  The file appears to be in CSV format, with columns related to personal preferences and other information, including "favourite_city_and_why." I will specifically search for information related to Tina Escobar's favorite city and the reasons mentioned for her preference. Let's find her entry in the file and extract the needed details.
Message:  To determine Tina Escobar's favorite city and the reasons she mentioned, I will need to analyze the content of the uploaded file. I'll read through the file to find the relevant information. Let's proceed with that.
Message:  What

Observation: Previous prompt changes seemed to mostly expand on the content, so the user could use that. However, asking to "provide all reasons XXX mentioned" seems to force the model to scan through and summarize the column's full content rather than extract only the first sentence.

### Be more specific with the instructions, as these do not depend on the wording of an individual user interacting with the Assistant

In [107]:
# Fresh thread, so this run only sees the single message posted to it below.
thread3 = client.beta.threads.create()

In [108]:
# Plain question about another person; the "all reasons" requirement lives in the run instructions.
message, run = create_message_and_run(
    client=client,
    thread_id=thread3.id,
    assistant_id=assistant.id,
    user_prompt="What is Paul Vega's favourite city and why?",
    instructions=f"Please use the file {file.id} to answer the question. When summarising your response, please provide all reasons mentioned in the user's response.",
)

print_and_count_tokens(run, thread3.id)

Message:  Paul Vega's favorite city is London. He appreciates it for its historical landmarks and diverse cultural scene. Additionally, he notes that London has hosted the Summer Olympics three times: in 1908, 1948, and 2012.
Message:  It seems there is an issue with reading the file as a JSON. The file might be in a different format. Let me inspect the file to determine its format and extract the necessary information.
Message:  What is Paul Vega's favourite city and why?
approx number of tokens:  77


## Quick experiment with file search
This may be more appropriate to recommend to the User

In [57]:
# Do not hardcode the API key in the notebook: it leaks through version control and shared copies.
# With no `api_key` argument the OpenAI client reads the OPENAI_API_KEY environment variable.
client = OpenAI()

In [67]:
# Separate assistant that uses the file-search tool.
# The name now matches the HR role given in the instructions; the previous
# "Financial Analyst Assistant" label contradicted them.
assistantFS = client.beta.assistants.create(
    name="HR Assistant",
    instructions="You are an HR assistant, helping identify how people's intrests are related to the companies they are employed with and positions they are applying for.",
    model="gpt-4o",
    tools=[{"type": "file_search"}],
)

In [64]:
# Strangely, CSV files are not supported for file search, so convert the data to JSON first.
json_file_path = 'test-file-search.json'
json_data = df.to_json(orient='records')
with open(json_file_path, 'w') as json_file:
    json_file.write(json_data)

vector_store = client.beta.vector_stores.create(name="Employee Test Data")

# Do not name this handle `file`: that name holds the uploaded OpenAI file object
# whose `file.id` the code-interpreter cells use, and rebinding it breaks them on re-run.
with open(json_file_path, "rb") as json_upload:
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=[json_upload]
    )

print(file_batch.status)
print(file_batch.file_counts)

completed
FileCounts(cancelled=0, completed=1, failed=0, in_progress=0, total=1)


In [68]:
# Attach the vector store to the file-search assistant.
# The ID must be `assistantFS.id`: passing `assistant.id` would modify the
# code-interpreter assistant instead and rebind `assistantFS` to it.
assistantFS = client.beta.assistants.update(
    assistant_id=assistantFS.id,
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

In [66]:
# Dedicated thread for the file-search experiment.
thread5 = client.beta.threads.create()

In [69]:
# Same baseline question, this time run with the `assistantFS` assistant.
message, run = create_message_and_run(
    client=client,
    thread_id=thread5.id,
    assistant_id=assistantFS.id,
    user_prompt="What is Tina Escobar favourite city and why?",
    instructions="Use the file from Vector store to provide the most exact answer to the question",
)

print_and_count_tokens(run, thread5.id)

Tina Escobar's favorite city is New York because of its vibrant city life and diversity. Additionally, it is home to the largest metropolitan zoo in the US【4:0†test-file-search.json】.
What is Tina Escobar favourite city and why?
approx number of tokens:  43


Summary: If the client's goal is to retrieve the most exact wording from the file, file search may be a better option, as it aims to retrieve the exact answers.