In [9]:
from openai import OpenAI
import json
from dotenv import load_dotenv
import os
import re

# Load variables from a .env file into the process environment;
# DEEPSEEK_API_KEY is read later with os.getenv().
# NOTE(review): the recorded output of this cell is False, which suggests no
# variables were loaded from a .env file -- confirm the key is set another way.
load_dotenv()

False

In [10]:
def get_system_content():
    """Build the system prompt that tells the model how to normalize records.

    The expected schema is rendered as indented JSON and substituted into the
    prompt template.

    Returns:
        str: The complete system prompt, with the schema embedded.
    """
    # Type names are plain strings so the rendered prompt reads
    # `"type": "string"` instead of the Python repr `<class 'str'>` that
    # interpolating the str/int/float classes themselves would produce.
    json_schema = {
        "perovskite_composition": {"type": "string"},
        "electron_transport_layer": {"type": "string"},
        "hole_transport_layer": {"type": "string"},
        "structure_pin_nip": {"type": "string"},
        "test_1": {
            "stability_type": {"type": "string"},
            "passivating_molecule": {"type": "string"},
            "humidity": {"type": "integer"},
            "temperature": {"type": "integer"},
            "time": {"type": "integer"},
            "control_pce": {"type": "number"},
            "treated_pce": {"type": "number"},
            "control_voc": {"type": "number"},
            "treated_voc": {"type": "number"},
            "retained_proportion_cont": {"type": "number"},
            "retained_proportion_tret": {"type": "number"},
        },
    }
    ensure_format_system = """
    You are a helpful data quality assistant.
    Your task is to make sure that each value of the keys in the JSON object provided follows the following schema:

    {json_schema}

    **Instructions**
    - Note that there may be multiple tests, where each test has the key (test_1, test_2, etc.)
    The JSON provided to you may not follow this exact format. Rename the keys if necessary to ensure that it follows the schema previously outlined.
    - The JSON provided may be missing some keys. In the case a key is missing, put it in the JSON with its value as null.
    - Return the original JSON object but where each key's value is the JSON object you formatted.

    Begin formatting!
    """
    # Indent continuation lines so the schema lines up with the placeholder's
    # own four-space indentation inside the template.
    schema_text = json.dumps(json_schema, indent=2).replace("\n", "\n    ")
    return ensure_format_system.format(json_schema=schema_text)

In [11]:
def _extract_json_object(text):
    """Return the JSON object embedded in ``text``, or ``None`` if there is none.

    The reply may contain text around the JSON (prose, a code fence), so the
    span from the first ``{`` to the last ``}`` is extracted before parsing.
    """
    if not text:
        return None
    json_match = re.search(r"\{.*\}", text, re.DOTALL)
    if not json_match:
        return None
    try:
        parsed = json.loads(json_match.group(0))
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


def ensure_json_format(json_path, batch_size=100):
    """Normalize every record of a JSON file to the expected schema via the model.

    The file at ``json_path`` is read as one JSON object. Its entries are sent
    to the chat model in batches of ``batch_size`` together with the system
    prompt from ``get_system_content()``. The formatted entries are collected
    and written to ``json_path + "_formatted"``.

    A batch whose reply contains no parseable JSON object is copied through
    unchanged, as are individual entries the model leaves out of its reply, so
    one bad response does not lose data or abort the run.

    Args:
        json_path: Path to the input JSON file (top level must be an object).
        batch_size: Number of top-level entries sent per request.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    # Validate first: a non-positive batch size could never make progress.
    if not isinstance(batch_size, int) or batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    with open(json_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    client = OpenAI(
        api_key=os.getenv("DEEPSEEK_API_KEY"),
        base_url="https://api.deepseek.com"
    )
    system_content = get_system_content()

    keys = list(data.keys())
    num_objects = len(keys)
    formatted_objects = {}
    # range() advances the window itself, so the loop always terminates.
    for start in range(0, num_objects, batch_size):
        curr_keys = keys[start:start + batch_size]
        curr_batch = {key: data[key] for key in curr_keys}
        print(f"formatting entries {start + 1}-{start + len(curr_keys)} of {num_objects}")
        messages = [
            {
                "role": "system",
                "content": system_content
            },
            {
                "role": "user",
                # Send real JSON, not the Python repr (single quotes, None/True).
                "content": json.dumps(curr_batch, ensure_ascii=False)
            }
        ]
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=messages,
            stream=False,
            max_tokens=4096
        )
        output_json = _extract_json_object(response.choices[0].message.content)
        if output_json is None:
            print(f"could not format keys {curr_keys[0]} to {curr_keys[-1]}")
            formatted_objects.update(curr_batch)
            continue

        # Keep the original entry for any key missing from the reply.
        missing_keys = [key for key in curr_keys if key not in output_json]
        if missing_keys:
            print(f"model omitted {len(missing_keys)} key(s); keeping originals: {missing_keys}")
            formatted_objects.update({key: curr_batch[key] for key in missing_keys})
        formatted_objects.update(output_json)

    # NOTE(review): this yields e.g. "data.json_formatted"; kept as-is so the
    # output location does not change.
    formatted_file_path = json_path + "_formatted"
    with open(formatted_file_path, "w", encoding="utf-8") as f:
        json.dump(formatted_objects, f)

            


In [None]:
# Run the formatting pass over data/deepseek_finetuned.json; the function
# sends the entries to the DeepSeek chat API and writes its result to the
# input path with "_formatted" appended.
ensure_json_format("data/deepseek_finetuned.json")