In [36]:
import os

# The OpenAI client looks up its key under the exact name OPENAI_API_KEY.
# Environment variable names are case-sensitive on Linux/macOS, so the previous
# spelling ('OpenAI_API_KEY') was never picked up there.
# setdefault keeps a real key that is already exported in the environment
# instead of overwriting it with this placeholder value.
os.environ.setdefault('OPENAI_API_KEY', 'xxxxx')

In [2]:
import json
import asyncio
import weave
from weave.flow.scorer import MultiTaskBinaryClassificationF1
import openai
import pandas as pd

# Evaluation dataset

In [24]:
# Free-text fruit descriptions; entry i is paired with labels[i] below.
sentences = [
    (
        "There are many fruits that were found on the recently discovered planet Goocrux. "
        "There are neoskizzles that grow there, which are purple and taste like candy."
    ),
    "Pounits are a bright green color and are more savory than sweet.",
    (
        "Finally, there are fruits called glowls, which have a very sour and bitter taste "
        "which is acidic and caustic, and a pale orange tinge to them."
    ),
]

# Expected structured fields for each sentence, in the same order.
labels = [
    dict(fruit='neoskizzles', color='purple', flavor='candy'),
    dict(fruit='pounits', color='bright green', flavor='savory'),
    dict(fruit='glowls', color='pale orange', flavor='sour and bitter'),
]

# One row per example: the raw sentence and its target dict.
df = pd.DataFrame(dict(sentence=sentences, target=labels))
df.head()

Unnamed: 0,sentence,target
0,There are many fruits that were found on the r...,"{'fruit': 'neoskizzles', 'color': 'purple', 'f..."
1,Pounits are a bright green color and are more ...,"{'fruit': 'pounits', 'color': 'bright green', ..."
2,"Finally, there are fruits called glowls, which...","{'fruit': 'glowls', 'color': 'pale orange', 'f..."


In [25]:
# Flatten the frame into one plain dict per row, exposing the row index as an 'id' field.
examples = (
    df.rename_axis('id')
      .reset_index()
      .to_dict('records')
)
examples

[{'id': 0,
  'sentence': 'There are many fruits that were found on the recently discovered planet Goocrux. There are neoskizzles that grow there, which are purple and taste like candy.',
  'target': {'fruit': 'neoskizzles', 'color': 'purple', 'flavor': 'candy'}},
 {'id': 1,
  'sentence': 'Pounits are a bright green color and are more savory than sweet.',
  'target': {'fruit': 'pounits', 'color': 'bright green', 'flavor': 'savory'}},
 {'id': 2,
  'sentence': 'Finally, there are fruits called glowls, which have a very sour and bitter taste which is acidic and caustic, and a pale orange tinge to them.',
  'target': {'fruit': 'glowls',
   'color': 'pale orange',
   'flavor': 'sour and bitter'}}]

In [26]:
class ExtractFruitsModel(weave.Model):
    """Weave model that asks an OpenAI chat model to extract fields from a sentence as JSON."""

    # Identifier passed as `model` to the chat-completions request.
    model_name: str
    # Prompt text; `predict` formats it with `sentence=<input sentence>`.
    prompt_template: str

    @weave.op()
    async def predict(self, sentence: str) -> dict:
        """Send one sentence to the chat model and return its parsed JSON reply.

        Args:
            sentence: Text substituted into `prompt_template` via `str.format`.

        Returns:
            The content of the first choice, decoded with `json.loads`.

        Raises:
            ValueError: If the first choice carries no message content.
            json.JSONDecodeError: If the content is not valid JSON.
        """
        user_message = {
            "role": "user",
            "content": self.prompt_template.format(sentence=sentence),
        }

        # A client is created for every call; `async with` closes it (and its
        # underlying HTTP connections) once the request is done, instead of
        # leaving one open client behind per prediction.
        async with openai.AsyncClient() as client:
            response = await client.chat.completions.create(
                model=self.model_name,
                messages=[user_message],
                # Request a JSON object so the reply can be parsed directly.
                response_format={"type": "json_object"},
            )

        result = response.choices[0].message.content
        if result is None:
            raise ValueError("No response from model")
        return json.loads(result)


In [34]:
# Initialise Weave under the 'intro-example' project name.
weave.init('intro-example')

# Instruction template; `{sentence}` is filled in for each input.
prompt = (
    'Extract fields ("fruit": <str>, "color": <str>, "flavor": <str>) '
    'from the following text, as json: {sentence}'
)

# Build the model around this template (ExtractFruitsModel sends it with role "user").
model = ExtractFruitsModel(
    name='gpt4',
    model_name='gpt-4-0125-preview',
    prompt_template=prompt,
)



In [35]:
r = await model.predict('strawberry is a delicious healthy red option that tastes delicious')
r

🍩 https://wandb.ai/lucasribeiroabreu-/intro-example/r/call/01923684-3cd6-74c3-ab8b-f452e5b00776


{'fruit': 'strawberry', 'color': 'red', 'flavor': 'delicious'}

In [29]:
# We define a scoring function to compare our model predictions with a ground truth label.

@weave.op()
def fruit_name_score(target: dict, model_output: dict) -> dict:
    """Score whether the predicted fruit name matches the labeled one.

    String values are compared ignoring letter case and surrounding whitespace,
    so a prediction of 'Pounits' counts as a match for the label 'pounits'.
    A prediction without a 'fruit' entry is scored as incorrect rather than
    raising, so one malformed model reply does not abort the scoring.

    Args:
        target: Ground-truth fields; must contain a 'fruit' key.
        model_output: Fields returned by the model; 'fruit' may be missing.

    Returns:
        A dict of the form {'correct': bool}.
    """
    if not isinstance(model_output, dict) or 'fruit' not in model_output:
        return {'correct': False}

    expected = target['fruit']
    predicted = model_output['fruit']
    if isinstance(expected, str) and isinstance(predicted, str):
        # Normalise only when both sides are text; other values keep plain equality.
        return {'correct': expected.strip().casefold() == predicted.strip().casefold()}
    return {'correct': expected == predicted}

In [33]:
# Score the single prediction `r` against the label of the first example.
fruit_name_score(target=examples[0]['target'], model_output=r)

🍩 https://wandb.ai/lucasribeiroabreu-/intro-example/r/call/0192352b-754b-7f01-8c7b-3737764162fc


{'correct': False}

In [30]:
# Finally, we run an evaluation of this model.
# This will generate a prediction for each input example, and then score it with each scoring function.
evaluation = weave.Evaluation(
    name='fruit_eval',
    dataset=examples, scorers=[MultiTaskBinaryClassificationF1(class_names=["fruit", "color", "flavor"]), fruit_name_score],
)

await evaluation.evaluate(model)

🍩 https://wandb.ai/lucasribeiroabreu-/intro-example/r/call/0192352a-741b-7521-b845-7d13cf0461db


{'MultiTaskBinaryClassificationF1': {'fruit': {'f1': 0.8,
   'precision': 0.6666666666666666,
   'recall': 1.0},
  'color': {'f1': 1.0, 'precision': 1.0, 'recall': 1.0},
  'flavor': {'f1': 0.8, 'precision': 0.6666666666666666, 'recall': 1.0}},
 'fruit_name_score': {'correct': {'true_count': 2,
   'true_fraction': 0.6666666666666666}},
 'model_latency': {'mean': 1.8662765820821126}}