In [1]:
! pip install -qqq datasets evaluate python-dotenv litellm --progress-bar off

In [2]:
import json 
from dotenv import load_dotenv

from datasets import (
    load_dataset,
)
import litellm
from litellm import completion
from litellm.caching import Cache
from evaluate import load
from statistics import mean

In [3]:
# Install a litellm cache so that the completion(..., caching=True) calls made
# later in this notebook can reuse earlier responses instead of re-querying.
# NOTE(review): Cache() without arguments is presumably litellm's default
# in-memory cache, which would not survive a kernel restart -- confirm.
litellm.cache = Cache()

In [4]:
# Load environment variables from the local ".env" file. override=True is
# passed so that values from the file win over variables that are already set.
# NOTE(review): presumably this provides the API key for the model provider
# used by the completion calls -- confirm.
load_dotenv(".env", override=True)

True

In [5]:
# Model string handed to litellm's completion() in the generation cell
# (a Groq-hosted Llama 3 8B model, judging by the identifier).
base_model = "groq/llama3-8b-8192"

In [6]:
# Read the evaluation examples from a local JSON file and shuffle them with a
# fixed seed so the example order is reproducible between runs.
# A bare JSON file is exposed by the loader under the "train" split, which is
# why split="train" is requested even though this is the test data.
test_ds_path = "test.json"
test_dataset = load_dataset(
    "json",
    data_files=test_ds_path,
    split="train",
).shuffle(seed=42)
test_dataset

Dataset({
    features: ['instruction', 'output'],
    num_rows: 100
})

In [7]:
# Prompt templates for the extraction task.
# PREFIX is the system message; its {shot} placeholder is filled through
# str.format with either an empty string (zero-shot) or SHOT (one-shot).
PREFIX = """You are a helpful scientific assistant. Your task is to extract information about organic reactions. {shot}"""
# SUFFIX is the user message: {sample} is replaced by the instruction text of
# one test example, surrounded by blank lines.
SUFFIX = """\n\n{sample}\n\n"""
# SHOT is a single worked example: a procedure paragraph followed by its
# "ORD JSON" answer. It is a regular (non-raw) string, so Python interprets the
# escape sequences in it (backslash-n, backslash-quote, backslash-u00b0), while
# the doubled backslash before u00b0 inside the JSON stays a literal backslash.
# The literal braces of the JSON are harmless because, in the generation loop,
# SHOT is passed as a format argument and is not itself used as a template.
SHOT = """
One example is provided to you to show how to perform the task:

### Procedure:\nA suspension of 8 g of the product of Example 7 and 0.4 g of DABCO in 90 ml of xylenes were heated under N2 at 130\u00b0-135\u00b0 C. while 1.8 ml of phosgene was added portionwise at a rate to maintain a reflux temperature of about 130\u00b0-135\u00b0 C. The mixture was refluxed an additional two hours, cooled under N2 to room temperature, filtered, and the filtrate was concentrated in vacuo to yield 6.9 g of the subject compound as a crude oil.\n\n
### ORD JSON:\n{\"inputs\": {\"m1_m2_m4\": {\"components\": [{\"identifiers\": [{\"type\": \"NAME\", \"value\": \"product\"}], \"amount\": {\"mass\": {\"value\": 8.0, \"units\": \"GRAM\"}}, \"reaction_role\": \"REACTANT\"}, {\"identifiers\": [{\"type\": \"NAME\", \"value\": \"DABCO\"}], \"amount\": {\"mass\": {\"value\": 0.4, \"units\": \"GRAM\"}}, \"reaction_role\": \"REACTANT\"}, {\"identifiers\": [{\"type\": \"NAME\", \"value\": \"xylenes\"}], \"amount\": {\"volume\": {\"value\": 90.0, \"units\": \"MILLILITER\"}}, \"reaction_role\": \"SOLVENT\"}]}, \"m3\": {\"components\": [{\"identifiers\": [{\"type\": \"NAME\", \"value\": \"phosgene\"}], \"amount\": {\"volume\": {\"value\": 1.8, \"units\": \"MILLILITER\"}}, \"reaction_role\": \"REACTANT\"}]}}, \"conditions\": {\"temperature\": {\"control\": {\"type\": \"AMBIENT\"}}, \"conditions_are_dynamic\": true}, \"workups\": [{\"type\": \"ADDITION\", \"details\": \"was added portionwise at a rate\"}, {\"type\": \"TEMPERATURE\", \"details\": \"to maintain a reflux temperature of about 130\\u00b0-135\\u00b0 C\"}, {\"type\": \"TEMPERATURE\", \"details\": \"The mixture was refluxed an additional two hours\", \"duration\": {\"value\": 2.0, \"units\": \"HOUR\"}}, {\"type\": \"FILTRATION\", \"details\": \"filtered\"}, {\"type\": \"CONCENTRATION\", \"details\": \"the filtrate was concentrated in vacuo\"}], \"outcomes\": [{\"products\": [{\"identifiers\": [{\"type\": \"NAME\", \"value\": \"subject compound\"}], \"measurements\": [{\"type\": \"AMOUNT\", \"details\": \"MASS\", \"amount\": {\"mass\": {\"value\": 6.9, \"units\": \"GRAM\"}}}], \"reaction_role\": \"PRODUCT\"}]}]}
\n
"""

In [8]:
# Load the "bertscore" metric via evaluate.load; its compute() method is called
# in the scoring cell to compare model predictions with the reference outputs.
bertscore = load("bertscore")

In [9]:
# Generate model outputs for two prompt settings: 0-shot (i == 0, no example)
# and 1-shot (i == 1, SHOT appended to the system prompt).
# Only generation happens here; metrics are computed in a separate cell so the
# API calls do not have to be repeated when only the scoring changes.
# `results` maps "0-shot" / "1-shot" to the predictions and references lists.
results = {}
for i in range(2):
    # The system prompt depends only on the setting, not on the example, so it
    # is built once per setting instead of once per test example.
    shot = '' if i == 0 else SHOT
    system = PREFIX.format(shot=shot)

    predictions = []
    references = []

    for t in test_dataset:
        instruction = t['instruction']
        output = t['output']
        user = SUFFIX.format(sample=instruction)
        prompt = [
            {"role": "system", "content": system},
            {"role": "user", "content": user}
        ]
        # caching=True lets litellm serve a repeated request from litellm.cache;
        # temperature=0 asks for the least random decoding.
        pred = completion(
            model=base_model,
            messages=prompt,
            caching=True,
            temperature=0,
        ).choices[0].message.content

        references.append(output)
        predictions.append(pred)

    results[f'{i}-shot'] = {
        'predictions': predictions,
        'references': references,
    }

In [10]:
# Score each prompt setting with BERTScore and store the averages of the
# per-example precision, recall and F1 next to the raw predictions.
# The key 'f1_scores' is kept as-is even though it holds a single mean value,
# because it ends up in the saved JSON file.
for i in range(2):
    run = results[f'{i}-shot']

    results_ = bertscore.compute(
        predictions=run['predictions'],
        references=run['references'],
        model_type="distilbert-base-uncased",
    )

    run.update(
        precision=mean(results_['precision']),
        recall=mean(results_['recall']),
        f1_scores=mean(results_['f1']),
    )



In [11]:
# Persist predictions, references and aggregated metrics for both settings
# as pretty-printed JSON in the working directory.
with open('Llama_results.json', 'w') as results_file:
    json.dump(results, results_file, indent=4)