In [2]:
import asyncio
import getpass
import json
import os
from collections import Counter

import aiohttp
import requests

In [3]:
base_url = 'https://nomad-lab.eu/prod/v1/oasis/api/v1'
# base_url = 'http://localhost:8000/nomad-oasis/api/v1'

# Credentials are taken from the environment when set; otherwise the user is
# prompted (the password prompt does not echo).
username = os.getenv('NOMAD_USERNAME') or input('Username: ')
password = os.getenv('NOMAD_PASSWORD') or getpass.getpass('Password: ')

response = requests.post(
    f'{base_url}/auth/token',
    data={'username': username, 'password': password},
)
# Fail here with the HTTP status (e.g. wrong credentials) instead of with an
# opaque KeyError on 'access_token' below.
response.raise_for_status()
token = response.json()['access_token']

In [4]:
upload_id = 'aRQ18gDpSzKjo753wdIEZQ'
# upload_id = 'QZ8J2EwGQeC5GGMA6aVnkQ'
# upload_id = 'w96QRdvqQ9qQF0FwI1OKjA'
url = f'{base_url}/entries/query'
# Only the entry IDs are requested, restricted to entries of this upload whose
# processing errors include 'could not normalize section'.
json_body = {
    'owner': 'visible',
    'pagination': {
        'page_size': 100,
    },
    'required': {
        'include': ['entry_id'],
    },
    'query': {
        'and': [
            {'upload_id:any': [upload_id]},
            {'processing_errors:all': ['could not normalize section']},
        ]
    },
}
headers = {'Authorization': f'Bearer {token}'}
page_after_value = None
entries = set()

# Follow the pagination cursor until the API stops returning one.
while True:
    response = requests.post(url, json=json_body, headers=headers)
    # Without this check an error response (no 'data', no 'pagination') would
    # end the loop silently with an incomplete or empty result.
    response.raise_for_status()
    response_json = response.json()
    for data in response_json.get('data', []):
        entries.add(data['entry_id'])
    page_after_value = response_json.get('pagination', {}).get('next_page_after_value')
    if not page_after_value:
        break
    json_body['pagination']['page_after_value'] = page_after_value

In [5]:
# Show how many matching entries were collected, followed by their IDs as
# pretty-printed JSON.
print(
    f'{len(entries) = }',
    json.dumps(list(entries), indent=2),
    sep='\n',
)

len(entries) = 117
[
  "Lf2EeV48zIZL8iRuw_JunUkXhcID",
  "fsAcmcwuzAlIY8wwsoIu3APnC2Ay",
  "CzofYyKtd20M21SiPwERkPhiA3yR",
  "cOCmlGTJqyIQhOnvKIh61ydlNgmP",
  "axfMqO0j5_44Z8CmdqTF4Dj0hUBy",
  "LSiPZZUkzrR6MaFyENVosPLIgH5z",
  "il_vbA1dNxU3AYtWJH4I_FGhrdx-",
  "k05T7l6HlIKY3d-uGEDXFxsQUeEa",
  "U0vN8jK8oOgB4O-KGXNdZIdJ1PQV",
  "ouTvafyif4jFfjbO7pmI2M2oSESe",
  "ilxVQocdP-5moUmaZk_K9Qkic3AB",
  "thEhR-RlHdNbiPA0WLlrm90jyQjk",
  "Rm6mNRxsqOwwCuyXAp5h1IIWhvwc",
  "8wtOPBovo-a7XngaUEHhX-XLK7L2",
  "83PPZMhiLhYiwtk3hGL83c1C2kZO",
  "k2AfuSHRsz_-t8al_LyUPbxl9XHg",
  "lkqFpUvWYMKyW8YWksKmzmhe9oAN",
  "lR5of-l4ZOooWtJSP6d8AnpI5d4e",
  "H1etKgYEZ6qO_c9sLplLdD6zl7R5",
  "yf-ckRTyjIy4tHcwcroxlbbje_2v",
  "VRtuwzqawfx35ExSDZ3CXio0O-du",
  "rj9eE21I_11I5d_Jj-Ip5VtDLGa6",
  "3-Aw8Nzy7oN5NCBSlZiLA0fNHhGY",
  "wqg8vn5xrUUJHDnmYyqlrX-brTDt",
  "bEc_AjYfbPC3r2zuhfHFwPHunm8B",
  "gVCmqN2JphWPktQEbkacqbpwbbsj",
  "ZIIyivR_G3nFcWB-hmI40cS5yDhV",
  "0I8mNRcNmCAanGbeijlTjKFv-GU7",
  "YJBMhqiU6JBF5ksqo4mO0FAu

In [6]:
# Request only the processing logs of each entry's archive.
json_body = {"required": {"processing_logs": "*"}}
# Maps entry_id -> message of an ERROR log record; when an entry has several
# ERROR records the last one wins.
log_messages = {}
for entry_id in entries:
    response = requests.post(
        f"{base_url}/entries/{entry_id}/archive/query",
        json=json_body,
        headers=headers,
    )
    # Report a failed request by its HTTP status rather than by a KeyError on
    # the missing 'data' key below.
    response.raise_for_status()
    for log in response.json()['data']['archive']['processing_logs']:
        if log['level'] == 'ERROR':
            print(f'Error in entry {entry_id}')
            # An ERROR record logged without a traceback has no 'exception'
            # key; fall back instead of aborting the whole loop.
            # NOTE(review): 'event' is assumed to hold the log message - confirm.
            log_messages[entry_id] = log.get(
                'exception', log.get('event', '<no exception recorded>')
            )

Error in entry Lf2EeV48zIZL8iRuw_JunUkXhcID
Error in entry fsAcmcwuzAlIY8wwsoIu3APnC2Ay
Error in entry CzofYyKtd20M21SiPwERkPhiA3yR
Error in entry cOCmlGTJqyIQhOnvKIh61ydlNgmP
Error in entry axfMqO0j5_44Z8CmdqTF4Dj0hUBy
Error in entry LSiPZZUkzrR6MaFyENVosPLIgH5z
Error in entry il_vbA1dNxU3AYtWJH4I_FGhrdx-
Error in entry k05T7l6HlIKY3d-uGEDXFxsQUeEa
Error in entry U0vN8jK8oOgB4O-KGXNdZIdJ1PQV
Error in entry ouTvafyif4jFfjbO7pmI2M2oSESe
Error in entry ilxVQocdP-5moUmaZk_K9Qkic3AB
Error in entry thEhR-RlHdNbiPA0WLlrm90jyQjk
Error in entry Rm6mNRxsqOwwCuyXAp5h1IIWhvwc
Error in entry 8wtOPBovo-a7XngaUEHhX-XLK7L2
Error in entry 83PPZMhiLhYiwtk3hGL83c1C2kZO
Error in entry k2AfuSHRsz_-t8al_LyUPbxl9XHg
Error in entry lkqFpUvWYMKyW8YWksKmzmhe9oAN
Error in entry lR5of-l4ZOooWtJSP6d8AnpI5d4e
Error in entry H1etKgYEZ6qO_c9sLplLdD6zl7R5
Error in entry yf-ckRTyjIy4tHcwcroxlbbje_2v
Error in entry VRtuwzqawfx35ExSDZ3CXio0O-du
Error in entry rj9eE21I_11I5d_Jj-Ip5VtDLGa6
Error in entry 3-Aw8Nzy7oN5NCBSl

In [14]:
counter = Counter(log_messages.values())

# Remember the first entry seen for every distinct message in a single pass,
# instead of rescanning all entries once per message.
example_entry_by_message = {}
for entry_id, log_message in log_messages.items():
    example_entry_by_message.setdefault(log_message, entry_id)

# The GUI lives next to the API: strip the trailing 'api/v1' from the API base
# URL (this keeps the slash in front of it).
gui_base_url = base_url[: -len('api/v1')]

print('----------------')
for message, count in counter.most_common():
    print(f'{count:4d} instances of:')
    print(message)
    # Print link to one of the entries with this message
    entry_id = example_entry_by_message[message]
    print(f'  Example entry: {gui_base_url}gui/user/uploads/upload/id/{upload_id}/entry/id/{entry_id}')
    print('----------------')

----------------
  61 instances of:
Traceback (most recent call last):
  File "/opt/venv/lib/python3.12/site-packages/nomad/normalizing/metainfo.py", line 37, in normalize_section
    normalize(archive, logger)
  File "/opt/venv/lib/python3.12/site-packages/perovskite_solar_cell_database/schema_sections/ref.py", line 174, in normalize
    first_name=author['given'],
               ~~~~~~^^^^^^^^^
KeyError: 'given'
  Example entry: https://nomad-lab.eu/prod/v1/oasis/gui/user/uploads/upload/id/aRQ18gDpSzKjo753wdIEZQ/entry/id/CzofYyKtd20M21SiPwERkPhiA3yR
----------------
  33 instances of:
Traceback (most recent call last):
  File "/opt/venv/lib/python3.12/site-packages/requests/models.py", line 971, in json
    return complexjson.loads(self.text, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/jso

In [28]:
import os
import shutil

# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path = '/home/hampusnasstrom/repositories/nomad-distro-dev/.volumes/fs/staging/uU/uUUnwrnvSFyhlDZRsjznDg/raw'
files = os.listdir(path)
print(f'{len(files) = }')

classic_suffix = '_classic.archive.json'
archive_suffix = '.archive.json'

# Split the archive files into '<name>_classic.archive.json' and plain
# '<name>.archive.json', keeping only <name>. The suffix is sliced off the end
# so a name that happens to contain the suffix text earlier is not truncated.
llm = []
classic = []
for file in files:
    if file.endswith(classic_suffix):
        classic.append(file[: -len(classic_suffix)])
    elif file.endswith(archive_suffix):
        llm.append(file[: -len(archive_suffix)])
print(f'{len(classic) = }')
print(f'{len(llm) = }')

# Names that have a plain archive but no '_classic' counterpart.
classic_names = set(classic)
missing = [item for item in llm if item not in classic_names]
print(f'{len(missing) = }')
print(missing)

# Copy the unmatched archives into ./missing for inspection.
missing_dir = 'missing'
os.makedirs(missing_dir, exist_ok=True)
for fname in missing:
    src = os.path.join(path, f'{fname}.archive.json')
    dst = os.path.join(missing_dir, f'{fname}.archive.json')
    if os.path.exists(src):
        # shutil.copy avoids passing file names through a shell (quoting and
        # injection problems) and also works where `cp` is unavailable.
        shutil.copy(src, dst)

len(files) = 22
len(classic) = 10
len(llm) = 12
len(missing) = 2
['10.1039--d3tc02414h-cell-1', '10.1039--d3tc03238h-cell-0']


In [None]:
upload_id = 'QZ8J2EwGQeC5GGMA6aVnkQ'
# upload_id = 'w96QRdvqQ9qQF0FwI1OKjA'
url = f'{base_url}/entries/query'
# Same query as for the first upload: entry IDs of this upload whose processing
# errors include 'could not normalize section'. Only the first page (up to 100
# entries) is fetched here.
json_body = {
    'owner': 'visible',
    'pagination': {
        'page_size': 100,
    },
    'required': {
        'include': ['entry_id'],
    },
    'query': {
        'and': [
            {'upload_id:any': [upload_id]},
            {'processing_errors:all': ['could not normalize section']},
        ]
    },
}
headers = {'Authorization': f'Bearer {token}'}
response = requests.post(url, json=json_body, headers=headers)
# Surface HTTP errors (expired token, bad query) instead of storing an error
# payload in response_json.
response.raise_for_status()
response_json = response.json()

In [26]:
def smart_split(s):
    """Split ``s`` on top-level commas.

    A comma acts as a separator only when it is outside every parenthesised
    group and is not directly between two digits, so ``'C(C,D)'`` and
    ``'1,4-E'`` each stay in one piece.

    Empty fields in the middle or at the start are kept (``'A,,B'`` gives
    ``['A', '', 'B']``); an empty trailing field is dropped (``'A,'`` gives
    ``['A']``) and the empty string gives ``[]``.

    Args:
        s: The string to split.

    Returns:
        The list of fields, with parentheses and protected commas preserved.
    """
    parts = []
    current = []  # characters of the field being built
    depth = 0
    last_index = len(s) - 1
    for i, c in enumerate(s):
        if c == '(':
            depth += 1
        elif c == ')':
            # Never go below zero: a stray ')' must not disable splitting for
            # the remainder of the string.
            depth = max(depth - 1, 0)
        elif c == ',' and depth == 0:
            between_digits = (
                0 < i < last_index and s[i - 1].isdigit() and s[i + 1].isdigit()
            )
            if not between_digits:
                parts.append(''.join(current))
                current = []
                continue
        current.append(c)
    if current:
        parts.append(''.join(current))
    return parts

In [27]:
# Quick check of smart_split: the comma inside the parentheses and the comma
# between the digits 1 and 4 must not split.
s = 'A,B,C(C,D),1,4-E'
split_parts = smart_split(s)
print(split_parts)

['A', 'B', 'C(C,D)', '1,4-E']


In [None]:
from perovskite_solar_cell_database.schema_packages.llm_extraction_schema import LLMExtractedPerovskiteSolarCell
import json

# Dump the section definition of the schema class as pretty-printed JSON.
schema_definition = LLMExtractedPerovskiteSolarCell.m_def.m_to_dict()
print(json.dumps(schema_definition, indent=2))

{
  "m_def": "nomad.metainfo.metainfo.Section",
  "m_parent_index": 12,
  "m_parent_sub_section": "section_definitions",
  "label": "LLM Extracted Perovskite Solar Cell",
  "name": "LLMExtractedPerovskiteSolarCell",
  "base_sections": [
    "nomad.datamodel.metainfo.basesections.v1.PublicationReference",
    "/section_definitions/0",
    "nomad.datamodel.data.EntryData"
  ],
  "quantities": [
    {
      "m_def": "nomad.metainfo.metainfo.Quantity",
      "m_parent_index": 0,
      "m_parent_sub_section": "quantities",
      "m_annotations": {
        "eln": [
          {
            "component": "URLEditQuantity",
            "label": "DOI Number"
          }
        ]
      },
      "description": "DOI number of the publication",
      "name": "DOI_number",
      "shape": [],
      "type": {
        "type_kind": "python",
        "type_data": "str"
      }
    },
    {
      "m_def": "nomad.metainfo.metainfo.Quantity",
      "m_parent_index": 1,
      "m_parent_sub_section": "quantiti

In [None]:
import json
from nomad.units import ureg
from perovskite_solar_cell_database.schema_packages.llm_extraction_schema import LLMExtractedPerovskiteSolarCell


# Read the extraction result. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open('extracted.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Keys of the extracted JSON that are renamed before loading; any key not
# listed here is kept unchanged.
RENAME_MAP = {
    'a_ions': 'ions_a_site',
    'b_ions': 'ions_b_site',
    'x_ions': 'ions_x_site',
    'bandgap': 'band_gap',
    'PCE_at_the_start_of_the_experiment': 'PCE_at_start',
    'PCE_at_the_end_of_experiment': 'PCE_at_end',
}


def rename_quantities(original: dict) -> dict:
    """Return a copy of ``original`` with keys renamed according to RENAME_MAP.

    Nested dicts are processed recursively, as are dicts that are direct
    elements of a list value. All other values are carried over untouched.
    The input is not modified.
    """

    def _convert(node):
        # Only dicts carry keys that may need renaming.
        return rename_quantities(node) if isinstance(node, dict) else node

    result = {}
    for old_name, payload in original.items():
        target = RENAME_MAP.get(old_name, old_name)
        if isinstance(payload, list):
            result[target] = [_convert(element) for element in payload]
        else:
            result[target] = _convert(payload)
    return result

# TODO figure out what's up with impurities and additives

def replace_values_with_pint(original: dict) -> dict:
    """Collapse ``{'value': ..., 'unit': ...}`` dicts into plain values or quantities.

    For every key of ``original``:

    * a dict without a ``'value'`` key is processed recursively;
    * a dict with ``'value'`` but no ``'unit'`` becomes the bare value;
    * a dict whose unit is ``'%'`` becomes ``float(value) / 100``;
    * for the key ``'concentration'`` the bare value is kept and the unit is
      stored next to it under ``'concentration_unit'``;
    * any other value/unit dict becomes ``ureg.Quantity(value, unit)``;
    * dicts that are direct elements of a list are processed recursively;
    * everything else is copied unchanged.

    When a conversion fails, the error is printed and the original
    value/unit dict is kept for that key.

    Args:
        original: The (possibly nested) dict to convert. It is not modified.

    Returns:
        A new dict with the replacements applied.
    """
    replaced = {}
    for key, value in original.items():
        if isinstance(value, list):
            replaced[key] = [
                replace_values_with_pint(item) if isinstance(item, dict) else item
                for item in value
            ]
        elif not isinstance(value, dict):
            replaced[key] = value
        elif 'value' not in value:
            replaced[key] = replace_values_with_pint(value)
        elif 'unit' not in value:
            replaced[key] = value['value']
        elif value['unit'] == '%':
            try:
                replaced[key] = float(value['value']) / 100.0
            except Exception as e:
                print(f"Error processing {key} with % unit: {e}")
                replaced[key] = value
        elif key == 'concentration':
            # Keep value and unit as two separate entries. Previously this
            # branch fell through to the pint conversion below, which
            # overwrote the raw value that had just been stored.
            replaced[key] = value['value']
            replaced['concentration_unit'] = value['unit']
        else:
            try:
                replaced[key] = ureg.Quantity(value['value'], value['unit'])
            except Exception as e:
                print(f"Error processing {key}: {e}")
                replaced[key] = value
    return replaced

# Convert every extracted cell: rename keys, collapse value/unit dicts, then
# try to load the result into the schema. On success the resulting entry is
# printed; on any failure the error and the converted input are printed.
for cell in data['cells']:
    cell_renamed = rename_quantities(cell)
    cell_with_pint = replace_values_with_pint(cell_renamed)
    try:
        entry = LLMExtractedPerovskiteSolarCell.m_from_dict(cell_with_pint)
        rendered_entry = json.dumps(entry.m_to_dict(), indent=2, default=str)
        print(rendered_entry)
    except Exception as e:
        print(f"Error creating LLMExtractedPerovskiteSolarCell: {e}")
        rendered_input = json.dumps(cell_with_pint, indent=2, default=str)
        print(rendered_input)


{
  "device_architecture": "pin",
  "ff": 84.06,
  "jsc": 19.97,
  "number_devices": 1,
  "pce": 0.2046,
  "voc": 1.22,
  "layers": [
    {
      "functionality": "Hole-transport",
      "name": "NiOx",
      "thickness": 30.0
    },
    {
      "functionality": "Absorber",
      "name": "Perovskite (Cs0.2FA0.8Pb(I0.77Br0.23)3)",
      "thickness": 400.0
    },
    {
      "functionality": "Electron-transport",
      "name": "C60",
      "thickness": 30.0
    },
    {
      "functionality": "Other",
      "name": "BCP",
      "thickness": 7.0
    },
    {
      "functionality": "Contact",
      "name": "Ag",
      "thickness": 130.0
    }
  ],
  "perovskite_composition": {
    "band_gap": 1.68,
    "dimensionality": "3D",
    "formula": "Cs0.2FA0.8Pb(I0.77Br0.23)3",
    "sample_type": "Polycrystalline film",
    "ions_a_site": [
      {
        "common_name": "Cesium",
        "molecular_formula": "Cs+",
        "abbreviation": "Cs",
        "coefficient": "0.2"
      },
      {
      

In [16]:
from nomad.metainfo import MSection, Quantity
from nomad.units import ureg

# Minimal section with one float quantity declared in metres, used to check
# what m_from_dict does with a pint quantity given in another unit.
class A(MSection):
    quantity = Quantity(type=float, unit='m')


# The value is supplied in nanometres.
data = dict(quantity=ureg.Quantity(1.0, 'nm'))

a = A.m_from_dict(data)
serialized = a.m_to_dict()
print(json.dumps(serialized, indent=2))

{
  "quantity": 1e-09
}


In [2]:
import json

from nomad.units import ureg
from perovskite_solar_cell_database.llm_extraction_schema import LLMExtractedPerovskiteSolarCell
from perovscribe.export import convert_to_extraction_to_nomad_entries
from perovscribe.pydantic_model_reduced import PerovskiteSolarCells

doi = 'some_doi'
# The encoding is given explicitly so reading the JSON does not depend on the
# platform's default text encoding.
with open('extracted.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
pydantic_model = PerovskiteSolarCells(**data)
schema = LLMExtractedPerovskiteSolarCell

# Build the NOMAD entries from the validated extraction.
entries = convert_to_extraction_to_nomad_entries(
    pydantic_model=pydantic_model,
    doi=doi,
    nomad_schema=schema,
    ureg=ureg,
)

In [3]:
# Display the first item returned by convert_to_extraction_to_nomad_entries.
entries[0]

{'data': {'m_def': 'perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell',
  'DOI_number': 'https://www.doi.org/some_doi',
  'device_architecture': 'pin',
  'ff': 84.06,
  'jsc': 19.97,
  'layer_order': 'NiOx,Perovskite (Cs0.2FA0.8Pb(I0.77Br0.23)3),C60,BCP,Ag',
  'number_devices': 1,
  'pce': 20.46,
  'voc': 1.22,
  'layers': [{'functionality': 'Hole-transport',
    'name': 'NiOx',
    'thickness': 30.0},
   {'functionality': 'Absorber',
    'name': 'Perovskite (Cs0.2FA0.8Pb(I0.77Br0.23)3)',
    'thickness': 400.0},
   {'functionality': 'Electron-transport', 'name': 'C60', 'thickness': 30.0},
   {'functionality': 'Other', 'name': 'BCP', 'thickness': 7.0},
   {'functionality': 'Contact', 'name': 'Ag', 'thickness': 130.0}],
  'perovskite_composition': {'band_gap': 1.68,
   'dimensionality': '3D',
   'formula': 'Cs0.2FA0.8Pb(I0.77Br0.23)3',
   'sample_type': 'Polycrystalline film',
   'ions_a_site': [{'common_name': 'Cesium',
     'molecular_formula': 'Cs+'

In [10]:
import litellm
import json

# Fetch litellm's list of supported models, then show how many there are
# followed by the complete list as pretty-printed JSON.
models = litellm.model_list
model_count = len(models)
print(model_count)
print(json.dumps(models, indent=2))

1278
[
  "gemini-1.5-pro-preview-0215",
  "cloudflare/@cf/mistral/mistral-7b-instruct-v0.1",
  "gpt-4o-mini",
  "dashscope/qwen-turbo-2024-11-01",
  "palm/text-bison-safety-recitation-off",
  "azure_ai/Cohere-embed-v3-english",
  "gemini-2.0-flash",
  "medium/1024-x-1536/gpt-image-1-mini",
  "high/1536-x-1024/gpt-image-1",
  "vercel_ai_gateway/deepseek/deepseek-v3",
  "dashscope/qwen3-coder-plus-2025-07-22",
  "gemini-1.5-pro-preview-0514",
  "snowflake/llama3.2-3b",
  "deepgram/base-general",
  "gpt-4o-mini-transcribe",
  "deepinfra/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
  "xai/grok-3-mini-fast-latest",
  "moonshot/moonshot-v1-128k-vision-preview",
  "gemini/gemini-pro-vision",
  "medium/1536-x-1024/gpt-image-1",
  "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
  "azure/gpt-4o-realtime-preview-2024-10-01",
  "deepinfra/nvidia/Llama-3.3-Nemotron-Super-49B-v1.5",
  "deepinfra/Qwen/Qwen2.5-VL-32B-Instruct",
  "rerank-v3.5",
  "amazon.titan-embed-image-v1",
  "gemini/veo-3.0-fast-

In [None]:
from openai import OpenAI
from pydantic import BaseModel, Field

# Pydantic model with three required fields describing a battery; the units
# are stated in each field's description.
# NOTE(review): this model is not referenced by the chat request later in this
# cell - presumably intended for structured output; confirm.
class BatteryInfo(BaseModel):
    capacity: float = Field(..., description="Capacity in mAh")
    voltage: float = Field(..., description="Voltage in V")
    chemistry: str = Field(..., description="Chemistry type of the battery")

# NOTE(review): test_prompt is defined here but not sent with the request below.
test_prompt = """
You are a helpful assistant that extracts battery information from text.
You should fill the supplied JSON schema.
"""

# Client for an OpenAI-compatible endpoint; the API key is a placeholder value.
client = OpenAI(
    base_url="https://llm1-compute.cms.hu-berlin.de/v1/",
    api_key="required-but-not-used",
)
models = client.models.list()

# Smoke test: send one user message to the first model the server lists.
messages = [
    {
        "role": "user",
        "content": "Say this is a test",
    }
]
chat_completion = client.chat.completions.create(
    messages=messages,
    model=models.data[0].id,
)
print(chat_completion)

ChatCompletion(id='chatcmpl-0542a7e5cf0cdeb6f05f31557d2274fb', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=" Alright, let's consider this a test. How can I assist you further with this test? You can ask me questions or give me tasks to complete.", refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[], reasoning_content=None), stop_reason=None)], created=1760338886, model='llm1', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=34, prompt_tokens=8, total_tokens=42, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None, kv_transfer_params=None)
