In [1]:
!pip install -qqq -U torch transformers datasets evaluate langchain tensorboard --progress-bar off

In [2]:
import json

import torch
from datasets import (
    load_dataset,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)
from evaluate import load
import langchain
from langchain.cache import SQLiteCache
from statistics import mean

In [3]:
# Identifier of the fine-tuned checkpoint; passed to `from_pretrained` for both
# the model and the tokenizer further down.
sft_model_path = "TunedLlama-3-8B"

In [4]:
# Register a SQLite-backed cache (stored in `.langchain.db`) as LangChain's global LLM cache.
# NOTE(review): the generation loop further down calls the Hugging Face pipeline
# directly rather than through a LangChain LLM wrapper, so this cache does not
# appear to be used for those generations — confirm it is still needed.

langchain.llm_cache = SQLiteCache(database_path=".langchain.db")

Note that the prompt templates below use the same special tokens as the fine-tuning data.

In [5]:
# Prompt building blocks.
# - PREFIX: the system turn; `{shot}` is filled with either an empty string or SHOT.
# - SUFFIX: the user turn carrying `{sample}`, followed by an opened assistant header
#   so that generation continues as the assistant's reply.
# - SHOT: one worked example (procedure text plus its ORD JSON record). It is passed
#   to `str.format` as an argument value, so the literal braces of its JSON are not
#   interpreted as format fields.
PREFIX = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful scientific assistant. Your task is to extract information about organic reactions. {shot}<|eot_id|>"""
SUFFIX = """<|start_header_id|>user<|end_header_id|>\n\n{sample}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""
SHOT = """
One example is provided to you to show how to perform the task:

### Procedure:\nA suspension of 8 g of the product of Example 7 and 0.4 g of DABCO in 90 ml of xylenes were heated under N2 at 130\u00b0-135\u00b0 C. while 1.8 ml of phosgene was added portionwise at a rate to maintain a reflux temperature of about 130\u00b0-135\u00b0 C. The mixture was refluxed an additional two hours, cooled under N2 to room temperature, filtered, and the filtrate was concentrated in vacuo to yield 6.9 g of the subject compound as a crude oil.\n\n
### ORD JSON:\n{\"inputs\": {\"m1_m2_m4\": {\"components\": [{\"identifiers\": [{\"type\": \"NAME\", \"value\": \"product\"}], \"amount\": {\"mass\": {\"value\": 8.0, \"units\": \"GRAM\"}}, \"reaction_role\": \"REACTANT\"}, {\"identifiers\": [{\"type\": \"NAME\", \"value\": \"DABCO\"}], \"amount\": {\"mass\": {\"value\": 0.4, \"units\": \"GRAM\"}}, \"reaction_role\": \"REACTANT\"}, {\"identifiers\": [{\"type\": \"NAME\", \"value\": \"xylenes\"}], \"amount\": {\"volume\": {\"value\": 90.0, \"units\": \"MILLILITER\"}}, \"reaction_role\": \"SOLVENT\"}]}, \"m3\": {\"components\": [{\"identifiers\": [{\"type\": \"NAME\", \"value\": \"phosgene\"}], \"amount\": {\"volume\": {\"value\": 1.8, \"units\": \"MILLILITER\"}}, \"reaction_role\": \"REACTANT\"}]}}, \"conditions\": {\"temperature\": {\"control\": {\"type\": \"AMBIENT\"}}, \"conditions_are_dynamic\": true}, \"workups\": [{\"type\": \"ADDITION\", \"details\": \"was added portionwise at a rate\"}, {\"type\": \"TEMPERATURE\", \"details\": \"to maintain a reflux temperature of about 130\\u00b0-135\\u00b0 C\"}, {\"type\": \"TEMPERATURE\", \"details\": \"The mixture was refluxed an additional two hours\", \"duration\": {\"value\": 2.0, \"units\": \"HOUR\"}}, {\"type\": \"FILTRATION\", \"details\": \"filtered\"}, {\"type\": \"CONCENTRATION\", \"details\": \"the filtrate was concentrated in vacuo\"}], \"outcomes\": [{\"products\": [{\"identifiers\": [{\"type\": \"NAME\", \"value\": \"subject compound\"}], \"measurements\": [{\"type\": \"AMOUNT\", \"details\": \"MASS\", \"amount\": {\"mass\": {\"value\": 6.9, \"units\": \"GRAM\"}}}], \"reaction_role\": \"PRODUCT\"}]}]}
"""

In [6]:
# Build the evaluation subset: a fixed-seed shuffle of the test file followed by
# the first N_EVAL_SAMPLES rows.
N_EVAL_SAMPLES = 10  # number of test prompts generated for each shot setting

test_ds_path = "test.json"
# The rows of the JSON file given via `data_files` are requested as the "train" split.
test_dataset_full = load_dataset("json", data_files=test_ds_path, split="train")
# Cap the subset at the dataset size so a test file with fewer rows than
# N_EVAL_SAMPLES does not make `select` fail on out-of-range indices.
test_dataset = test_dataset_full.shuffle(seed=42).select(
    range(min(N_EVAL_SAMPLES, len(test_dataset_full)))
)
test_dataset

Dataset({
    features: ['instruction', 'output'],
    num_rows: 10
})

In [7]:
# Load the fine-tuned causal LM with its weights in bfloat16.
# NOTE(review): trust_remote_code=True permits running Python code shipped with the
# checkpoint; confirm this checkpoint actually requires it.
sft_model = AutoModelForCausalLM.from_pretrained(
    sft_model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(sft_model_path)

# Reuse the EOS token for padding and pad on the left.
# NOTE(review): presumably the tokenizer ships without a dedicated pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Load the BERTScore metric through `evaluate.load`; it scores the generations
# against the references further down.
bertscore = load("bertscore")

In [9]:
# Wrap the fine-tuned model and its tokenizer in a text-generation pipeline.
# Use the first CUDA device when one is available and fall back to the CPU (-1)
# otherwise, instead of failing on machines without a GPU.
sft_pipe = pipeline(
    "text-generation",
    model=sft_model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    framework="pt",
)

In [10]:
# Generate text for the 0-shot
results = {}
count = 0
for i in range(2):
    print(f"Working with the {i}-shot prompts")
    predictions_sft = []
    predictions_llama = []
    references = []

    for t in test_dataset:
        count += 1
        print(f"{count} prompt")
        instruction = t['instruction'][:-14]
        output = "### ORD JSON:\n" + t['output']
        if i == 0:
            shot = ''
        else:
            shot = SHOT
        system = PREFIX.format(shot=shot)
        user = SUFFIX.format(sample=instruction)
        prompt = system + user
        
        references.append(output)
        sequences_sft = sft_pipe(
            prompt,
            do_sample=True,
            temperature=0.01,
            num_return_sequences=1,
        )
        print(sequences_sft[0]['generated_text'])
        predictions_sft.append(sequences_sft[0]['generated_text'].replace(prompt, ''))
    results[f"{i}-shot"] = {
        "predictions": predictions_sft,
        "references": references,
    }

Working with the 0-shot prompts
1 prompt
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful scientific assistant. Your task is to extract information about organic reactions. <|eot_id|><|start_header_id|>user<|end_header_id|>

Below is a description of an organic reaction. Extract information from it to an ORD JSON record.

### Procedure:
Acetic acid anhydride (0.132 ml) was added to a solution of 1-(3-amino-4-methoxyphenyl)-3-{1-[2-(2-ethoxyphenoxy) ethyl]-4-piperidinyl}-1-propanone dihydrochloride (300 mg) obtained in Example 327 and triethylamine (0.195 ml) in tetrahydrofuran (10 ml) at room temperature, the mixture was stirred for 12 hours, and the reaction mixture was concentrated under reduced pressure. Water (30 ml) and ethyl acetate (40 ml) were added to the residue, and extracted with ethyl acetate. The organic layer was washed with brine, dried over anhydrous magnesium sulfate, and the solvent was evaporated under reduced pressure. The resulting re

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful scientific assistant. Your task is to extract information about organic reactions. <|eot_id|><|start_header_id|>user<|end_header_id|>

Below is a description of an organic reaction. Extract information from it to an ORD JSON record.

### Procedure:
To a flask charged with 2-(4-((2S,4R)-1-acetyl-4-((isopropoxycarbonyl)amino)-2-methyl-1,2,3,4-tetrahydroquinolin-6-yl)phenyl)acetic acid (for a preparation see Intermediate 50) (85 mg, 0.2 mmol) and HATU (91 mg, 0.240 mmol) was added DMF (2 mL) and the mixture treated with DIPEA (0.105 mL, 0.600 mmol) at room temperature. After 5 min, 1,1-dimethylethyl (2-aminoethyl)carbamate (0.047 mL, 0.300 mmol) was added, the mixture was stirred at room temperature for 3 h and then partitioned between EtOAc and water. The phases were separated and the aqueous phase was extracted with EtOAc. The combined organic phases were washed with brine, dried over MgSO4, filtered and conc

In [11]:
# Score each shot setting with BERTScore and store the mean precision, recall
# and F1 next to its predictions and references.
for n_shots in range(2):
    shot_results = results[f"{n_shots}-shot"]

    scores = bertscore.compute(
        predictions=shot_results["predictions"],
        references=shot_results["references"],
        model_type="distilbert-base-uncased",
    )

    # BERTScore returns one value per sample; keep only the averages.
    shot_results["precision"] = mean(scores["precision"])
    shot_results["recall"] = mean(scores["recall"])
    shot_results["f1_scores"] = mean(scores["f1"])



In [12]:
# Persist predictions, references and the averaged scores as indented JSON.
serialized_results = json.dumps(results, indent=4)
with open('sft_results.json', 'w') as out_file:
    out_file.write(serialized_results)