In [1]:
from openai import OpenAI
import openai
import json
from dotenv import load_dotenv
import os
import re

load_dotenv()

True

In [14]:
def get_system_content():
    json_schema = {
    "perovskite_composition": {"type:": str},
    "electron_transport_layer": {"type": str},
    "hole_transport_layer": {"type": str},
    "structure_pin_nip": {"type": str},
    "passivating_molecule": {"type": str},
    "control_pce": {"type": float},
    "treated_pce": {"type": float},
    "control_voc": {"type": float},
    "treated_voc": {"type": float},
    "test_1": {
        "stability_type": {"type": str},
        "humidity": {"type": int},
        "temperature": {"type": int},
        "time": {"type": int},
        "retained_proportion_cont": {"type": float},
        "retained_proportion_tret": {"type": float}
        }
    }
    ensure_format_system = """
    You are a helpful data quality assistant.
    Your task is to make sure that each value of the keys in the JSON object provided follows the following schema:

    {json_schema}

    **Instructions**
    - Note that there may be multiple tests, where each test has the key (test_1, test_2, etc.)
    The JSON provided to you may not follow this exact format. Rename the keys if necessary to ensure that it follows the schema previously outlined.
    - The JSON provided may have values that cannot be parsed into JSON (e.g. "temperature": 85 C). For these values, simply put the value as a string.
    - For the following variables, make sure the units are correct. Make conversions if necessary. There are some situations where the value is an equation, (e.g. 18 * 24). In these cases, solve the equation and return the value as a number.
        - "time": hours (as an int)
        - "humidity": percent (as an int)
        - "temperature": celsius (as an int)
    - Once the values are converted to the proper unit, drop the units from the value, so that it can be a number.
    - The JSON provided may be missing some keys. In the case a key is missing, put it in the JSON with its value as null.
    - Return the original JSON object but where each key's value is the JSON object you formatted.
    - Never drop a key value pair from the original JSON object.
    - If any value is a list, return the first value in the list.
    - Make sure to format every JSON object given to you.
    - Ensure that the value you return can be parsed using json.loads().

    Begin formatting!
    """
    return ensure_format_system.format(json_schema=json_schema)

In [15]:
def ensure_json_format(json_path, batch_size=50):
    api_key = os.getenv("OPENAI_API_KEY")
    client = OpenAI(api_key=api_key)
    system_content = get_system_content()
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)
    keys = list(data.keys())
    num_objects = len(keys)
    l = 0
    formatted_objects = {}
    while l < num_objects:
        r = l + batch_size
        if r > num_objects:
            r = num_objects
        curr_keys = keys[l:r]
        print("processing keys: ", curr_keys)
        curr_batch = {key: data[key] for key in curr_keys if key in data}
        messages = [
            {
                "role": "system",
                "content": system_content
            },
            {
                "role": "user",
                "content": str(curr_batch) 
            }
        ]
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=messages,
            stream=False,
            max_tokens=16384
        )
        output = response.choices[0].message.content
        json_match = re.search(r"\{.*}", output, re.DOTALL)
        if not json_match:
            print(f"could not format keys {curr_keys[l]} to {curr_keys[r]}")
            formatted_objects.update(curr_batch)
        else:
            result = json_match.group(0).strip()
            print(result)
            output_json = json.loads(result)
            formatted_objects.update(output_json)
        l = r
    file_name, file_extension = os.path.splitext(json_path)
    formatted_file_path = file_name + "_formatted" + file_extension
    with open(formatted_file_path, "w") as f:
        json.dump(formatted_objects, f)

            


In [16]:
ensure_json_format("data/final_extraction/to_format_2.json")

processing keys:  ['563', '585', '596', '598', '645', '667', '678', '707', '719', '723', '791', '803', '847', '849', '888', '898', '906', '917', '929', '933', '964', '977', '999', '1006', '1007', '1044', '1102', '1153', '1171', '1184', '1198', '1216', '1219', '1257']
{
    "563": {
        "perovskite_composition": "Cu2ZnSn(S,Se)4 (C:SSe)",
        "electron_transport_layer": "Mo",
        "hole_transport_layer": "ZnO",
        "structure_pin_nip": "PIN",
        "passivating_molecule": "Na",
        "control_pce": 8.07,
        "treated_pce": 10.69,
        "control_voc": 418.89,
        "treated_voc": 455.71,
        "test_1": {
            "stability_type": "ISOS-L",
            "temperature": 25,
            "time": 2000,
            "humidity": null,
            "retained_proportion_cont": 100.0,
            "retained_proportion_tret": 100.0
        }
    },
    "585": {
        "perovskite_composition": "FAPbI3 0.95 (MAPbBr3 0.05)",
        "electron_transport_layer": "SnO2",
   

In [17]:
import pandas as pd
df = pd.read_json('data/final_extraction/to_format_2.json')
data = df.T.sort_index()
data.info()

ValueError: If using all scalar values, you must pass an index

In [18]:
with open("data/final_extraction/to_format_2.json", 'r') as json_file:
    data = json.load(json_file)

In [25]:
data["563"]

'{"perovskite_composition": "Cu2ZnSn(S,Se)4 (C":SSe)", "electron_transport_layer" "Mo", "pin_nip_structure" "PIN", "hole_transport_layer" "ZnO", "passivating_molecule" "Na", "control_pce" 8.07, "control_voc" 418.89, "treated_pce" 10.69, "treated_voc" 455.71, "test_1" {"test_name" "ISOS-L", "temperature" 25, "time" 2000, "humidity" null, "retained_percentage_cont" 100, "retained_percentage_tret": 100}}'