## This file compares the TeamTat annotation (JSON) with the extraction performed by a fine-tuned Llama model (JSON)

In [11]:
# Reference annotation that the extraction is scored against.
# Note that every measurement is stored as a string, and the stability
# measurements are nested under the 'ISOSD1' key.
label = {
    'passivating_molecule': 'phenethylammonium iodide',
    'perovskite_composition': 'MAPbI3',
    'ISOSD1': {
        'time': '240',
        'treated_pce': '15.3',
        'control_pce': '16.69',
        'temperature': '25',
        'humidity': '90',
        'control_voc': '1.03',
        'treated_voc': '1.06',
    },
    'electron_transport_layer': 'TiO2',
    'hole_transport_layer': 'Spiro-OMeTAD',
}

In [12]:
# Extraction result to be scored. Measurements are flat top-level keys holding
# numbers (or None), and every ISOS protocol entry is None.
extraction = {
    "control_pce": None,
    "control_voc": None,
    "treated_pce": 15.3,
    "treated_voc": 1.06,
    "passivating_molecule": "Phenylethylammonium (PEA)",
    "perovskite_composition": "[C8H9NH3]2[(CH3NH3)2PbI3 – (n=60)]",
    "electron_transport_layer": "TiO2",
    "hole_transport_layer": "spiro-OMeTAD",
    # dict.fromkeys maps each protocol name to None, preserving this order.
    **dict.fromkeys(
        (
            "ISOS-L-1",
            "ISOS-L-2",
            "ISOS-T-1",
            "ISOS-T-2",
            "ISOS-LC",
            "ISOS-D-1",
            "ISOS-D-2",
        )
    ),
}

In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score
from difflib import SequenceMatcher
import numpy as np

def _as_number(value):
    """Return *value* as a finite float, or None if it is not numeric.

    Ints, floats and numeric strings such as ``"15.3"`` are accepted.
    Booleans are rejected even though ``bool`` subclasses ``int``, and NaN or
    infinite values are rejected because a relative tolerance is meaningless
    for them.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float, str)):
        return None
    try:
        number = float(value)
    except (ValueError, OverflowError):
        # Non-numeric text, or an int too large to represent as a float.
        return None
    return number if np.isfinite(number) else None


def compare_data(labeled_data, extracted_data, numerical_tolerance=0.05):
    """Compare extracted fields against labeled fields, key by key.

    Only the top-level keys of ``labeled_data`` are scored; nested
    dictionaries are compared by plain equality and are not descended into.
    Each key is classified by the first rule that applies:

    1. Key absent from ``extracted_data`` -> "Missing in Extracted Data".
    2. Values compare equal with ``==`` -> "Match".
    3. Both values are numeric (numbers or numeric strings) -> "Numerical
       Match" if they differ by at most ``numerical_tolerance`` relative to
       the labeled value, otherwise "Numerical Mismatch".
    4. Both values are strings -> case-insensitive ``SequenceMatcher`` ratio;
       "Fuzzy Match (r)" if the ratio is above 0.8, else "Mismatch (r)".
    5. Anything else -> "Mismatch".

    Args:
        labeled_data: Mapping of field name to reference value.
        extracted_data: Mapping of field name to extracted value.
        numerical_tolerance: Maximum allowed relative difference for a
            numerical match. Because the tolerance is relative, a labeled
            value of 0 only matches an extracted value of exactly 0.

    Returns:
        A dict with keys:
            "results": per-key classification string.
            "accuracy": matched fields divided by labeled fields, or 0.0
                when ``labeled_data`` is empty.
            "mean_absolute_error": mean absolute difference over the fields
                classified as "Numerical Match" (exact matches and numerical
                mismatches are not included), or None if there are none.
            "matched_fields": number of fields counted as matched.
            "total_fields": number of keys in ``labeled_data``.
    """
    results = {}
    total_fields = len(labeled_data)
    matched_fields = 0
    numerical_differences = []

    for key, value in labeled_data.items():
        if key not in extracted_data:
            results[key] = "Missing in Extracted Data"
            continue

        extracted_value = extracted_data[key]

        # Exact match
        if value == extracted_value:
            matched_fields += 1
            results[key] = "Match"
            continue

        # Numerical comparison. Numeric strings are coerced first so that
        # "15.3" vs 15.3 is compared as numbers, and "100" vs "1000" is not
        # accepted merely because the two strings look alike.
        labeled_number = _as_number(value)
        extracted_number = _as_number(extracted_value)
        if labeled_number is not None and extracted_number is not None:
            difference = abs(labeled_number - extracted_number)
            if difference <= numerical_tolerance * abs(labeled_number):
                matched_fields += 1
                numerical_differences.append(difference)
                results[key] = "Numerical Match"
            else:
                results[key] = "Numerical Mismatch"
            continue

        # Fuzzy string match
        if isinstance(value, str) and isinstance(extracted_value, str):
            similarity = SequenceMatcher(
                None, value.lower(), extracted_value.lower()
            ).ratio()
            if similarity > 0.8:  # Threshold for similarity
                matched_fields += 1
                results[key] = f"Fuzzy Match ({similarity:.2f})"
            else:
                results[key] = f"Mismatch ({similarity:.2f})"
            continue

        results[key] = "Mismatch"

    # Guard against ZeroDivisionError when there is nothing to score.
    accuracy = matched_fields / total_fields if total_fields else 0.0
    mean_absolute_error = np.mean(numerical_differences) if numerical_differences else None

    return {
        "results": results,
        "accuracy": accuracy,
        "mean_absolute_error": mean_absolute_error,
        "matched_fields": matched_fields,
        "total_fields": total_fields
    }


In [14]:
# Score the extraction against the label with the default 5% numerical
# tolerance and print the full report dictionary.
comparison_result = compare_data(labeled_data=label, extracted_data=extraction)
print(comparison_result)


{'results': {'passivating_molecule': 'Mismatch (0.78)', 'perovskite_composition': 'Mismatch (0.20)', 'ISOSD1': 'Missing in Extracted Data', 'electron_transport_layer': 'Match', 'hole_transport_layer': 'Fuzzy Match (1.00)'}, 'accuracy': 0.4, 'mean_absolute_error': None, 'matched_fields': 2, 'total_fields': 5}
