In [1]:
from openai import OpenAI
import openai
import json
from dotenv import load_dotenv
import os
import re

load_dotenv()

True

In [18]:
def get_system_content():
    """Build the system prompt that asks the model to normalise records to one schema.

    The target schema is embedded in the prompt as pretty-printed JSON so the
    model sees the exact key names and the expected value type of each field.

    Returns:
        str: The complete system prompt with the schema substituted in.
    """
    # Type names follow JSON Schema vocabulary ("string", "integer", "number")
    # so that the schema serialises to plain JSON instead of Python reprs such
    # as "<class 'str'>".
    json_schema = {
        "perovskite_composition": {"type": "string"},
        "electron_transport_layer": {"type": "string"},
        "hole_transport_layer": {"type": "string"},
        "structure_pin_nip": {"type": "string"},
        "test_1": {
            "stability_type": {"type": "string"},
            "passivating_molecule": {"type": "string"},
            "humidity": {"type": "integer"},
            "temperature": {"type": "integer"},
            "time": {"type": "integer"},
            "control_pce": {"type": "number"},
            "treated_pce": {"type": "number"},
            "control_voc": {"type": "number"},
            "treated_voc": {"type": "number"},
            "retained_proportion_cont": {"type": "number"},
            "retained_proportion_tret": {"type": "number"},
        },
    }
    ensure_format_system = """
    You are a helpful data quality assistant.
    Your task is to make sure that each value of the keys in the JSON object provided follows the following schema:

    {json_schema}

    **Instructions**
    - Note that there may be multiple tests, where each test has the key (test_1, test_2, etc.)
    The JSON provided to you may not follow this exact format. Rename the keys if necessary to ensure that it follows the schema previously outlined.
    - The JSON provided may be missing some keys. In the case a key is missing, put it in the JSON with its value as null.
    - Return the original JSON object but where each key's value is the JSON object you formatted.
    - Never drop a key value pair from the original JSON object.

    Begin formatting!
    """
    # str.format does not re-scan the substituted value, so the braces inside
    # the serialised schema are safe here.
    return ensure_format_system.format(json_schema=json.dumps(json_schema, indent=4))

In [22]:
def ensure_json_format(json_path, batch_size=50):
    """Normalise every record of a JSON file to the shared schema via the OpenAI chat API.

    The file at ``json_path`` must hold a JSON object. Its top-level entries are
    sent to the model in batches of ``batch_size`` together with the system
    prompt from ``get_system_content()``. The merged result is written next to
    the input file as ``<name>_formatted<extension>``.

    No input record is lost: a batch whose reply contains no parseable JSON
    object is copied through unformatted, and any key the model leaves out of
    its reply is restored from the input.

    Args:
        json_path: Path of the JSON file to format.
        batch_size: Number of top-level entries sent per request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the batching loop
            could never advance).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    api_key = os.getenv("OPENAI_API_KEY")
    client = OpenAI(api_key=api_key)
    system_content = get_system_content()
    with open(json_path, 'r', encoding="utf-8") as json_file:
        data = json.load(json_file)

    keys = list(data.keys())
    formatted_objects = {}
    for start in range(0, len(keys), batch_size):
        curr_keys = keys[start:start + batch_size]
        print("processing keys: ", curr_keys)
        curr_batch = {key: data[key] for key in curr_keys}
        messages = [
            {
                "role": "system",
                "content": system_content
            },
            {
                "role": "user",
                # Send real JSON (double quotes, null) rather than a Python
                # repr, which the model would have to guess how to convert.
                "content": json.dumps(curr_batch, ensure_ascii=False)
            }
        ]
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            stream=False,
            max_tokens=16384
        )
        # The content can be None (for example on a refusal); treat it as an
        # empty reply so the fallback below handles it.
        output = response.choices[0].message.content or ""

        # Grab the outermost {...} span; the model may wrap it in prose or
        # Markdown code fences.
        json_match = re.search(r"\{.*}", output, re.DOTALL)
        output_json = None
        if json_match:
            result = json_match.group(0).strip()
            print(result)
            try:
                output_json = json.loads(result)
            except json.JSONDecodeError as err:
                # A truncated or malformed reply must not abort the whole run
                # and discard the batches that already succeeded.
                print(f"model reply is not valid JSON: {err}")

        if output_json is None:
            print(f"could not format keys {curr_keys[0]} to {curr_keys[-1]}")
            formatted_objects.update(curr_batch)
            continue

        # Restore any record the model dropped despite the instructions.
        missing_keys = [key for key in curr_keys if key not in output_json]
        if missing_keys:
            print("keys missing from model reply, keeping originals: ", missing_keys)
            for key in missing_keys:
                output_json[key] = curr_batch[key]
        formatted_objects.update(output_json)

    file_name, file_extension = os.path.splitext(json_path)
    formatted_file_path = file_name + "_formatted" + file_extension
    with open(formatted_file_path, "w", encoding="utf-8") as f:
        json.dump(formatted_objects, f)

            


In [23]:
# Format the fine-tuned DeepSeek extraction results with the default batch size.
ensure_json_format(json_path="data/deepseek_finetuned.json")

processing keys:  ['115', '112', '72', '40', '91', '129', '47', '96', '35', '127', '49', '120', '32', '131', '23', '24', '89', '58', '144', '87', '138', '56', '80', '51', '143', '18', '63', '4', '103', '16', '3', '121', '33', '126', '48', '99', '128', '97', '41', '90', '74', '73', '113', '114', '105', '17', '2', '5', '19', '81']
{
    "115": {
        "perovskite_composition": null,
        "electron_transport_layer": "Spiro-OMeTAD/PTAA",
        "hole_transport_layer": "Spiro-OMeTAD",
        "structure_pin_nip": "NIP",
        "test_1": {
            "stability_type": "ISOS-D",
            "passivating_molecule": "anisole",
            "humidity": null,
            "temperature": 85,
            "time": 1000,
            "control_pce": 23.15,
            "treated_pce": 23.00,
            "control_voc": 1.155,
            "treated_voc": 1.147,
            "retained_proportion_cont": 0.85,
            "retained_proportion_tret": 0.92
        },
        "test_2": {
            "stabilit

In [15]:
# Pad the formatted results so every id "0".."149" has an entry; ids that have
# no record are stored as null. The padded copy goes to a separate "_2" file.
with open("data/deepseek_finetuned_formatted.json", 'r') as json_file:
    data = json.load(json_file)

absent_ids = [str(number) for number in range(150) if str(number) not in data]
data.update(dict.fromkeys(absent_ids))

with open("data/deepseek_finetuned_formatted_2.json", 'w') as json_file:
    json.dump(data, json_file)

In [24]:
import pandas as pd

# Load the formatted records, flip the frame so each record is a row, and
# order the rows by index before summarising the columns.
df = pd.read_json('data/deepseek_finetuned_formatted.json')
data = df.transpose().sort_index()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 126 entries, 0 to 149
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   perovskite_composition    113 non-null    object
 1   electron_transport_layer  117 non-null    object
 2   hole_transport_layer      116 non-null    object
 3   structure_pin_nip         106 non-null    object
 4   test_1                    126 non-null    object
 5   test_2                    16 non-null     object
 6   test_3                    4 non-null      object
 7   passivating_molecule      3 non-null      object
dtypes: object(8)
memory usage: 8.9+ KB


In [10]:
# Same summary for the unformatted input file, for comparison with the
# formatted one: flip the frame so each record is a row, ordered by index.
df = pd.read_json('data/deepseek_finetuned.json')
data = df.transpose().sort_index()
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 126 entries, 0 to 149
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   perovskite_composition    113 non-null    object
 1   electron_transport_layer  117 non-null    object
 2   pin_nip_structure         92 non-null     object
 3   hole_transport_layer      116 non-null    object
 4   test_1                    117 non-null    object
 5   test_2                    13 non-null     object
 6   structure_pin_nip         3 non-null      object
 7   passivating_molecule      9 non-null      object
 8   device_structure          3 non-null      object
 9   stability_test            0 non-null      object
 10  test_3                    3 non-null      object
 11  structure                 8 non-null      object
 12  stability_tests           5 non-null      object
 13  device_info               1 non-null      object
 14  stability_test_1          2 non

In [9]:
# Show the current `data` frame as the cell's rich output.
# NOTE(review): this cell's execution count (In [9]) precedes the cells that
# assign `data` above it, so the captured output may come from an earlier
# `data` -- confirm with Restart Kernel -> Run All.
data

Unnamed: 0,perovskite_composition,electron_transport_layer,hole_transport_layer,pin_nip_structure,test_1,test_2,test_3,structure_pin_nip,humidity,temperature,...,treated_pce,control_voc,treated_voc,retained_proportion_cont,retained_proportion_tret,stability_tests,stability_test_2,stability_test_3,certified_pce,passivating_molecule
0,FAPbI3,Spiro-OMeTAD,C60,,"{'stability_type': 'ISOS-L-3', 'passivating_mo...",,,PIN,,,...,,,,,,,,,,
1,FA1-x MAx PbI3 (FA: HC(NH2)2; MA: CH3NH3),SnO2,spiro-OMeTAD,,"{'stability_type': 'ISOS-L', 'passivating_mole...",,,PIN,,,...,,,,,,,,,,
2,FA0.85Cs0.15PbI2.85I3,SnO2,Spiro-OMeTAD,PIN,"{'stability_type': 'humidity', 'passivating_mo...",,,,,,...,,,,,,,,,,
3,Cs0.05(MA0.10FA0.85)Pb(I0.90Br0.10)3,SnO2,Spiro-OMeTAD,n-i-p,"{'stability_type': 'ISOS-L-1', 'passivating_mo...",,,,,,...,,,,,,,,,,
4,,,,,"{'stability_type': 'ISOS-L', 'passivating_mole...",,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,Cs0.05(MA0.17FA0.83)0.95Pb(I0.83Br0.17)3,SnO2,Spiro-OMeTAD,,"{'stability_type': 'humidity', 'passivating_mo...",,,PIN,,,...,,,,,,,,,,
146,Cs0.05(FA0.05MA0.95)0.95Pb(I0.95Br0.05)3,C60,"2,2',7,7'-tetrakis(N,N-di(4-methoxyphenylamino...",,"{'stability_type': 'ISOS-L', 'passivating_mole...",,,PIN,,,...,,,,,,,,,,
147,FAPbI3,Spiro-OMeTAD,P3HT,,"{'stability_type': 'ISOS-L', 'passivating_mole...",,,NIP,,,...,,,,,,,,,,
148,Cs0.05 FA0.85 MA0.10 Pb(I0.97 Br0.03)3 with 5%...,TiO2,Spiro-OMeTAD,,"{'stability_type': 'ISOS-L', 'passivating_mole...",,,PIN,,,...,,,,,,,,,,
