In [14]:
import os
import pandas as pd
import json

In [2]:
import sys
# Add the directory two levels up to the module search path before importing the
# local helper package (presumably where openai_data_tools lives — confirm).
sys.path.append('../..')
import openai_data_tools as dt

In [42]:
# Load the labelled examples. Every column is read as text, and pandas' default
# NA markers are disabled so empty cells stay as '' and literal strings such as
# 'NA' are kept as text rather than becoming NaN.
data = pd.read_csv(
    'data.csv',
    keep_default_na=False,
    dtype=str,
)

In [43]:
# Preview only the first rows so the cell output stays small even if data.csv grows.
data.head()

Unnamed: 0,item,name,industry,role
0,My name is Aaron Jones. I work for a financial...,Aaron Jones,financial services,senior account manager
1,"Hi, I am Laetitia Chen. I am a sales associate...",Laetitia Chen,retail,sales associate
2,"My name is Manuel Garcia, you can call me Mann...",Manuel Garcia,insurance,help desk technician
3,"Hi, I'm Angela, Angela Chekov. My job is in a ...",Angela Chekov,restaurant,bookkeeping
4,"Darren Gabor here, pleased to meet you. I work...",Darren Gabor,bank,lawyer


In [29]:
# Configure the processor: the API key is read from the OPENAI_API_KEY environment
# variable (None if unset), and the instructions ask the model for a JSON object
# with 'name', 'industry' and 'role' keys, using 'NA' for anything missing.
processor = dt.DataProcessor(
    instructions=(
        "You will be provided with passages where someone introduces themselves. "
        "For each passage, return a JSON object with the key 'name' for the person's name, "
        "'industry' for what industry they work in, and 'role' for their job role. "
        "If any of these is missing, return 'NA' for that value."
    ),
    model='gpt-3.5-turbo',
    api_key=os.getenv("OPENAI_API_KEY"),
)

In [44]:
# Run every passage in the 'item' column through the processor.
# NOTE(review): presumably this calls the OpenAI API for each passage — confirm
# the cost before re-running this cell.
output = processor.process(data['item'])

Progress: 100%


In [45]:
# Inspect the raw responses. In the run shown below they are indented, multi-line
# JSON strings whose value casing varies from row to row.
output

['{\n  "name": "Aaron Jones",\n  "industry": "Financial Services",\n  "role": "Senior Account Manager"\n}',
 '{\n  "name": "Laetitia Chen",\n  "industry": "large retail chain",\n  "role": "sales associate"\n}',
 '{\n  "name": "Manuel Garcia",\n  "industry": "Insurance",\n  "role": "Help Desk Technician"\n}',
 '{\n  "name": "Angela Chekov",\n  "industry": "Restaurant",\n  "role": "Bookkeeping"\n}',
 '{\n  "name": "Darren Gabor",\n  "industry": "Banking",\n  "role": "Legal Advisor"\n}']

Parse each output string into a Python object and then serialize it back to a string, so that every response uses the same compact JSON formatting (the raw responses contain newlines and indentation). We then convert everything to lowercase to ensure that the model isn't scored as incorrect due to case mismatches.

In [32]:
# Normalize each response: parse the JSON text, re-serialize it with json.dumps'
# default compact formatting, then lowercase the whole string.
clean_output = [
    serialized.lower()
    for serialized in map(json.dumps, map(json.loads, output))
]

In [33]:
# Inspect the normalized (single-line, lowercase) response strings.
clean_output

['{"name": "aaron jones", "industry": "financial services", "role": "senior account manager"}',
 '{"name": "laetitia chen", "industry": "retail", "role": "sales associate"}',
 '{"name": "manuel garcia", "industry": "insurance", "role": "help desk technician"}']

Convert the target fields into stringified JSON that can be compared with the model output, using the same key order ('name', 'industry', 'role') that the instructions request. We again convert everything to lowercase so that case mismatches don't affect the scoring.

In [34]:
# Build the expected string for each row: a JSON object holding the labelled
# 'name', 'industry' and 'role' values (in that key order), lowercased.
targets = [
    json.dumps({field: record[field] for field in ("name", "industry", "role")}).lower()
    for _, record in data.iterrows()
]

In [35]:
# Inspect the expected strings built from the labelled columns.
targets

['{"name": "aaron jones", "industry": "financial services", "role": "senior account manager"}',
 '{"name": "laetitia chen", "industry": "retail", "role": "sales associate"}',
 '{"name": "manuel garcia", "industry": "insurance", "role": "help desk technician"}']

In [36]:
# Pair the normalized model responses with the expected strings for scoring.
# NOTE(review): Scorer's matching rule is not visible here; the notebook text
# implies a direct string comparison — confirm in openai_data_tools.
scorer = dt.Scorer(clean_output, targets)

In [37]:
# Report the accuracy computed by the scorer (presumably the fraction of
# responses that match their target — confirm in openai_data_tools).
# NOTE(review): the execution counts in this notebook are out of order (the
# scoring cells are In [32]-[37] while the model was run at In [44]), and the
# displayed clean_output/targets list 3 rows while the raw output lists 5.
# The value shown below may therefore be stale — restart the kernel and run
# all cells before relying on it.
scorer.accuracy()

1.0