In [1]:
from openai import OpenAI
import pandas as pd

In [2]:
# Build the API client without embedding a secret in the notebook: when no
# api_key argument is given, the OpenAI client reads the OPENAI_API_KEY
# environment variable. Export that variable before starting the kernel.
client = OpenAI()

In [None]:
# Upload the JSONL training set for fine-tuning. The context manager makes
# sure the file handle is closed as soon as the upload call returns, even if
# the request raises.
with open("ft_gpt_834.jsonl", "rb") as training_fh:
    uploaded_file = client.files.create(
        file=training_fh,
        purpose="fine-tune",
    )

# Leave the returned file object as the cell output so its id can be read off
# and used as the training_file when creating the fine-tuning job.
uploaded_file

In [None]:
# Submit a fine-tuning job on top of gpt-3.5-turbo.
# NOTE(review): training_file is an empty placeholder in this copy of the
# notebook; it presumably has to be the id of an uploaded training file
# before this cell can succeed.
client.fine_tuning.jobs.create(model="gpt-3.5-turbo", training_file='')

In [4]:
# System prompt used for every chat request: it tells the model to turn a
# free-text reaction procedure plus partially structured molecule data into a
# single JSON object, and sketches the expected JSON layout.
#
# NOTE(review): the doubled braces ({{ and }}) look like str.format escapes,
# but this string is passed to the API as-is (no .format call is applied to
# it in the visible cells), so the model receives literal double braces.
# NOTE(review): the schema sketch is not valid JSON -- commas are missing
# after the "yield", "description" and "total_time" entries, the
# "product information" list is never closed, and spectators use the key
# "mol" where reactants/products use "mols". The closing sentence is also
# duplicated. The text is left untouched because the fine-tuned model was
# presumably trained with this exact prompt -- TODO confirm against the
# training JSONL before editing the wording.
template = '''You are a chemical reaction data formatter. You will be provided unstructured data about the procedure for performing a chemical reaction and partially structured data about the molecules involved to convert into a single formatted JSON. I will give you the format of the required JSON and short descriptions of what is required in these fields. Here is the format of the JSON file, with a description of the required information:

 {{
    "reactants": [
        {{
            "name": "Reactant name",
            "amount": "Reactant amount",
            "mols": "Reactant moles", 
            "smiles": "Reactant SMILES"
        }},
        ...
    ],
    "spectators": [
        {{
            "name": "Spectator/solvent name",
            "amount": "Spectator/solvent amount",
            "mol": "Spectator/solvent moles", 
            "smiles": "Solvent SMILES"
        }},
        ...
    ],
    "products": [
        {{
            "name": "Product name",
            "amount": "Product amount",
            "mols": "Product moles", 
            "smiles": "Product SMILES"
        }},
        ...
    ],
    "yield": "reaction yield"
    "procedures": [
        {{
            "procedure": "Procedure type (Choose from preparation, reaction, workup, purification only. Workup refers to the addition of chemicals for quenching/neutralizing. Purification refers to subsequent isolation like filtration. Most reactions do contain both workup and purification, do not miss any of those, even within sentences. Make sure each sentence of the procedure is analysed for this)",
            "chemicals involved": "Chemicals involved in the reaction. For a workup or purification, only mention the chemicals used for the workup/purification",
            "description": "Description"
            "temperature": "temperature",
            "time": "time taken",
        }},
        ...
    ],
    "total_time": "Total amount of time for reaction"
    "product information": [
        {{
            "type": "Type of information (qualitative or quantitative or N/A only. There can be multiple parts of characterization informations separated by "; " or ". " characters. Separate these and include them one by one)",
            "descriptor": "Could be the color, physical state etc for qualitative, could be full details 1H NMR, 13C NMR, LCMS, MS or HPLC for quantitative, separated by ";" or ". ". Separate these and include them one by one"
        }},
        ...
}}

Few more guidelines:
Please interpolate between various parts of the given data, and feel free to use reasoning about chemicals, but do not add any extra information in this cleaned JSON. If any information is missing, use "N/A". If there seems to be an information mismatch between the structured and unstructured parts, inform accordingly. Most importantly, only output the formatted JSON and nothing else. Most importantly, only output the formatted JSON and nothing else.'''

In [5]:
# Load the evaluation table (presumably the held-out test split, going by the
# filename). The inference loop reads its 'text', 'molecules_reactants' and
# 'molecules_products' columns.
file = pd.read_csv('combined_finetuning_data_test.csv')

In [6]:
# Query the chat model once per row of `file` and collect the raw text of
# each reply, in row order, in `final`.
final = []

# User-message skeleton; it is the same for every row, so define it once.
user_prompt = '''Procedure: {procedure}
Reactants: {reactants}
Products: {products}'''

for _, row in file.iterrows():
    user_message = user_prompt.format(
        procedure=row['text'],
        reactants=row['molecules_reactants'],
        products=row['molecules_products'],
    )
    messages = [
        {'role': 'system', 'content': template},
        {'role': 'user', 'content': user_message},
    ]
    # NOTE(review): the model id is blank in this copy of the notebook;
    # presumably the fine-tuned model's id belongs here.
    response = client.chat.completions.create(model="", messages=messages)
    final.append(response.choices[0].message.content)
    

In [7]:
# Persist the collected model replies as a one-column CSV ("jsons"), one
# reply per row, with the default 0..n-1 index written as the first column.
# The frame is built in a single constructor call instead of being enlarged
# one row at a time through .loc, which is slow and needs a manual counter.
a = pd.DataFrame({"jsons": [str(reply) for reply in final]}, dtype='string')
a.to_csv('finetuning_gpt_3_5_responses.csv')