In [1]:
from typing import Union, List

from langchain.agents import tool
from langchain.agents.format_scratchpad import format_log_to_str
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema import AgentAction, AgentFinish
from langchain.tools import Tool
from langchain.tools.render import render_text_description

In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
import pandas as pd
import numpy as np

In [3]:
from rdkit import RDLogger
# Turn off RDKit's 'rdApp.*' log channels so that SMILES strings that fail to
# parse later in the notebook do not flood the cell output with messages.
RDLogger.DisableLog('rdApp.*')

In [41]:
# Deterministic (temperature 0) GPT-4o chat model used for the extraction step.
# The API key is deliberately NOT written into the notebook: when no key is
# passed, ChatOpenAI reads it from the OPENAI_API_KEY environment variable, so
# the secret never ends up in version control or in a shared .ipynb file.
model = ChatOpenAI(temperature=0, model_name='gpt-4o')

In [42]:
# Prompt for the normalisation step. The model is asked to flatten one
# reaction JSON into a single line of the form
#   "smiles, weight, mols; smiles, weight, mols; ...; yield"
# which the scoring cells later split on ";" and ",".
# `{extraction}` is the only template variable and receives the JSON text.
planner_template = '''You will be given some chemical reaction data in a JSON format. You have to fetch data from it, make it uniform, and report it as a list of the given format. Only output the list and nothing else. Do not use spaces, linebreaks, brackets or any other formatting. Do this for every reactant and product but not for spectators. Only list numerical values for weight, mols and yield WITHOUT UNITS but ensure their units are homogenous. Use None for any data field that contains "N/A".
The list format is as follows:
smiles1, weight1 (divide by 1000 if in mg), mols1 (in moles); smiles2, weight2 (divide by 1000 if in mg), mols2 (in moles);....; yield

Here is the data:
{extraction}'''

In [43]:
# Prompt template with a single slot, "extraction", filled with one reaction
# JSON per call.
prompt = PromptTemplate(input_variables=["extraction"], template=planner_template)

# String output parser placed at the end of the chain.
parser = StrOutputParser()

In [44]:
# Compose the pipeline: fill the prompt, call the model, parse the reply.
chain = (
    prompt
    | model
    | parser
)

In [45]:
# Load the extraction results; the "jsons" column is what gets sent to the chain.
file = pd.read_csv(filepath_or_buffer='prompt_tune_gpt3_5.csv')

In [46]:
# Run the chain once per stored JSON and collect the replies in order.
# `extend` consumes the generator item by item, so replies gathered before a
# failing call are still kept in the list.
full_argument_list = []
full_argument_list.extend(
    chain.invoke({"extraction": i}) for i in file["jsons"]
)

In [None]:
# Quick look at the yield field (the last ";"-separated segment) of each reply.
for reply in full_argument_list:
    segments = [segment.strip() for segment in reply.split(";")]
    yield_text = segments[-1]
    if yield_text == 'None':
        y = None
    else:
        y = float(yield_text)
    print(y)

In [47]:
def check(smiles, weight, mols, tolerance=0.4):
    """Rate how well a reported amount agrees with the molecule's SMILES.

    The reported ``mols`` is compared with ``weight / MolWt(smiles)``.

    Parameters
    ----------
    smiles : str or None
        SMILES string of the molecule.
    weight : float or None
        Reported mass. It is divided by RDKit's molecular weight, so it is
        presumably expected in grams -- confirm against the prompt.
    mols : float or None
        Reported amount of substance.
    tolerance : float, optional
        Largest accepted relative deviation between ``mols`` and the amount
        computed from ``weight``. Defaults to 0.4.

    Returns
    -------
    int
        3 -- ``smiles`` is None or RDKit cannot parse it;
        2 -- the SMILES parses but ``weight`` or ``mols`` is missing;
        1 -- ``mols`` agrees with ``weight / MolWt`` within ``tolerance``;
        0 -- the values disagree, or cannot be compared because ``mols`` or
             the molecular weight is not positive.
    """
    if smiles is None:
        return 3
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return 3
    if weight is None or mols is None:
        return 2

    mol_wt = Descriptors.MolWt(molecule)
    # A non-positive amount or molecular weight would divide by zero (or flip
    # the sign of the deviation), so such records can never count as verified.
    if mol_wt <= 0 or mols <= 0:
        return 0

    deviation = abs(mols - weight / mol_wt) / mols
    return 1 if deviation <= tolerance else 0

In [None]:
def _parse_number(text):
    """Return ``text`` as a float; 'None' or unparsable text becomes None."""
    if text == 'None':
        return None
    try:
        return float(text)
    except ValueError:
        # Malformed model output is reported and then treated as missing
        # instead of aborting the whole scoring run.
        print('ValueError issue in example')
        return None


def _parse_molecule(entry):
    """Split one 'smiles, weight, mols' entry into (smiles, weight, mols)."""
    fields = [part.strip() for part in entry.split(",")]
    # Pad truncated entries so that absent fields read as missing values.
    fields += ['None'] * (3 - len(fields))
    smiles = fields[0] if fields[0] != 'None' else None
    return smiles, _parse_number(fields[1]), _parse_number(fields[2])


def _reactant_moles(smiles, weight, mols, status):
    """Best available amount of a reactant, or None if none can be trusted.

    ``status`` is the value returned by ``check`` for the same arguments.
    """
    if status == 1:
        # Reported amount confirmed against the reported weight.
        return mols
    if status == 2 and mols is None and weight is not None:
        # NOTE(review): the original tested for a missing *weight* and then
        # divided the weight by MolWt, which cannot work. Reading it as "fall
        # back to weight / MolWt when only the weight is reported" -- confirm.
        mol_wt = Descriptors.MolWt(Chem.MolFromSmiles(smiles))
        return weight / mol_wt if mol_wt > 0 else None
    return None


def _score_response(response):
    """Score one model reply as [smiles_score, mols_score, yield_score].

    The reply is expected to look like
    'smiles, weight, mols; ...; smiles, weight, mols; yield', where the last
    molecule is treated as the product and all earlier ones as reactants.

    * smiles_score -- fraction of molecules for which ``check`` returned < 3.
    * mols_score   -- fraction of molecules for which ``check`` returned 1.
    * yield_score  -- 1 when the product amount is verified and the reported
      yield does not exceed product moles / smallest reactant moles (a 1:1
      ratio is assumed, as in the original comparison); 0 otherwise.
    """
    entries = [part.strip() for part in response.split(";")]
    *molecule_entries, yield_entry = entries
    n_molecules = len(molecule_entries)
    if n_molecules == 0:
        # Nothing to score; avoids dividing by zero below.
        return [0, 0, 0]

    reported_yield = _parse_number(yield_entry)

    valid_smiles = 0
    verified_mols = 0
    limiting_mols = None   # smallest trusted reactant amount seen so far
    product_mols = None    # set only when the product amount is verified

    product_position = n_molecules - 1
    for position, entry in enumerate(molecule_entries):
        smiles, weight, mols = _parse_molecule(entry)
        status = check(smiles, weight, mols)
        if status < 3:
            valid_smiles += 1
        if status == 1:
            verified_mols += 1

        if position == product_position:
            if status == 1:
                product_mols = mols
            continue

        amount = _reactant_moles(smiles, weight, mols, status)
        if amount is not None and amount > 0:
            limiting_mols = amount if limiting_mols is None else min(limiting_mols, amount)

    yield_score = 0
    if product_mols is not None:
        # Without a reactant reference or a reported yield there is nothing
        # that could contradict the product amount, so the score stays 1
        # (this mirrors the original's "no reference" branch).
        # NOTE(review): the original's `else` was attached to the outer `if`,
        # leaving a consistent yield at 0; assumed unintended -- confirm.
        exceeds_limit = (
            limiting_mols is not None
            and reported_yield is not None
            and reported_yield / 100 > product_mols / limiting_mols
        )
        yield_score = 0 if exceeds_limit else 1

    smiles_score = valid_smiles / n_molecules
    mols_score = verified_mols / n_molecules
    return [smiles_score, mols_score, yield_score]


# One [smiles_score, mols_score, yield_score] row per model reply, in order.
final_list = [_score_response(response) for response in full_argument_list]

In [199]:
# Empty score table. The three scores are numeric, so the columns are created
# as floats rather than strings.
p = pd.DataFrame(columns=["SMILES_score", "Mols_score", "Yield_score"], dtype=float)

In [None]:
# Collect one record per scored reply and build the table in a single step.
# Growing a DataFrame row by row copies it on every iteration and relied on
# the private `DataFrame._append`, which is not part of the public pandas API.
records = []
for a, b, c in final_list:
    records.append({'SMILES_score': a, 'Mols_score': b, 'Yield_score': c})
p = pd.DataFrame(
    records,
    columns=["SMILES_score", "Mols_score", "Yield_score"],
    dtype=float,
)

In [203]:
# Persist the per-reply score table (row index included) next to the notebook.
p.to_csv(path_or_buf='scores.csv')

In [None]:
# Column totals of the three scores over all scored replies.
metric_a = sum(row[0] for row in final_list)
metric_b = sum(row[1] for row in final_list)
metric_c = sum(row[2] for row in final_list)

# Report the mean of each score. The totals computed above are what must be
# averaged -- not whatever `a`, `b`, `c` happen to hold from an earlier cell.
n_scored = len(final_list)
if n_scored:
    print(metric_a / n_scored)
    print(metric_b / n_scored)
    print(metric_c / n_scored)
else:
    print('No scored examples')