In [21]:
from litellm import completion
from dotenv import load_dotenv
import os
import json

In [7]:
load_dotenv()

True

In [8]:
base_model = "groq/llama3-8b-8192"

In [24]:
PREFIX = """
You are a helpful scientific assistant. Your task is to extract relevant scientific data from the provided text about perovskite solar cells and passivating molecules. If the data is not available in the text, return null for the respective fields. Output the information in JSON format with the following fields:
- `control`: Data for the control group.
  - `PCE`: Power conversion efficiency (numeric).
  - `VOC`: Open-circuit voltage (numeric).
- `treatment`: An array of treatments, where each treatment includes:
  - `PCE`: Power conversion efficiency (numeric).
  - `VOC`: Open-circuit voltage (numeric).
  - `passivating_molecule`: Name of the passivating molecule tested.
- `perovskite_composition`: Chemical formula of the perovskite (string).
- `electron_transport_layer`: Material used as the electron transport layer (string).
- `hole_transport_layer`: Material used as the hole transport layer (string).
- `stability_tests`: An array of stability tests, where each test includes:
  - `test_name`: Name of the stability test (string).
  - `temperature`: Test temperature in degrees Celsius (numeric).
  - `time`: Test duration in hours (numeric).
  - `humidity`: Test humidity in percentage (numeric).
  - `control_efficiency`: Control PCE after the test (numeric).
  - `treatment_efficiency`: Treatment PCE after the test (array of numerics if multiple treatments).

The JSON structure must follow this exact format:
{
  "control": {
    "PCE": null,
    "VOC": null
  },
  "treatment": [
    {
      "PCE": null,
      "VOC": null,
      "passivating_molecule": null
    }
  ],
  "perovskite_composition": null,
  "electron_transport_layer": null,
  "hole_transport_layer": null,
  "stability_tests": [
    {
      "test_name": null,
      "temperature": null,
      "time": null,
      "humidity": null,
      "control_efficiency": null,
      "treatment_efficiency": null
    }
  ]
}
Be concise and accurate. Include only information explicitly present in the text.
"""
SUFFIX = """\n\n{sample}\n\n"""

In [13]:
def split_text_into_chunks(text, max_chunk_size):
    """
    Splits the input text into manageable chunks of max_chunk_size.
    """
    return [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]

In [30]:
txt_dir = "../../data/txts"
shots = ["0-shot", "1-shot"]
max_chunk_size = 8000
for s in shots:
    for filename in os.listdir(txt_dir):
        if filename.endswith(".txt") == False:
            continue
        if filename != "5.txt":
            continue
        filepath = os.path.join(txt_dir, filename)

        # Open and read the file
        with open(filepath, "r") as file:
            print(f"Processing file: {filename}")
            text = file.read()

            # Split the text into chunks
            chunks = split_text_into_chunks(text, max_chunk_size)

            # Initialize context for sequential chunk processing
            extracted_data = []  # Store outputs from each chunk
            
            for idx, chunk in enumerate(chunks):
                print(f"Processing chunk {idx + 1}/{len(chunks)} for {filename}")

                # Define the shot-specific logic
                if s == "0-shot":
                    shot = ""
                else:
                    # For "1-shot", define additional logic if needed
                    break  # Placeholder for "1-shot" implementation

                # Create the prompt
                system = PREFIX + shot
                user = SUFFIX.format(sample=chunk)

                # Assemble the message for the model
                prompt = [
                    {"role": "system", "content": system},
                    {"role": "user", "content": user}
                ]

                # Perform the model completion
                pred = (
                    completion(
                        model=base_model,
                        api_key=os.getenv("GROQ_API_KEY"),
                        messages=prompt,
                        caching=True,
                        max_tokens=8192,
                        temperature=0,
                    )
                    .choices[0]
                    .message.content
                )
                print(pred)
                # Parse and append the extracted data
                try:
                    json_data = json.loads(pred)
                    extracted_data.append(json_data)
                except json.JSONDecodeError:
                    print(f"Error parsing JSON for chunk {idx + 1}")

            # Combine results from all chunks into a single JSON object
            final_result = {
                "control": extracted_data[0].get("control", None) if extracted_data else None,
                "treatment": [],
                "perovskite_composition": None,
                "electron_transport_layer": None,
                "hole_transport_layer": None,
                "stability_tests": []
            }

            for data in extracted_data:
                if "treatment" in data and data["treatment"]:
                    final_result["treatment"].extend(data["treatment"])
                if "perovskite_composition" in data and data["perovskite_composition"]:
                    final_result["perovskite_composition"] = data["perovskite_composition"]
                if "electron_transport_layer" in data and data["electron_transport_layer"]:
                    final_result["electron_transport_layer"] = data["electron_transport_layer"]
                if "hole_transport_layer" in data and data["hole_transport_layer"]:
                    final_result["hole_transport_layer"] = data["hole_transport_layer"]
                if "stability_tests" in data and data["stability_tests"]:
                    final_result["stability_tests"].extend(data["stability_tests"])
            print(extracted_data)
            print(final_result)
            break

Processing file: 5.txt
Processing chunk 1/6 for 5.txt
Here is the extracted data in JSON format:

```
{
  "control": {
    "PCE": null,
    "VOC": null
  },
  "treatment": [
    {
      "PCE": null,
      "VOC": null,
      "passivating_molecule": "iso-BAI"
    }
  ],
  "perovskite_composition": "FA(MA)PbI3",
  "electron_transport_layer": null,
  "hole_transport_layer": null,
  "stability_tests": [
    {
      "test_name": null,
      "temperature": null,
      "time": null,
      "humidity": null,
      "control_efficiency": null,
      "treatment_efficiency": null
    }
  ]
}
```

Note that some fields are null because the text does not provide the necessary information.
Error parsing JSON for chunk 1
Processing chunk 2/6 for 5.txt
Here is the extracted data in JSON format:

```
{
  "control": {
    "PCE": null,
    "VOC": null
  },
  "treatment": [
    {
      "PCE": null,
      "VOC": null,
      "passivating_molecule": "iso-BAI"
    }
  ],
  "perovskite_composition": "FA(MA)PbI3",