## An example of automatic DP workflow with GPT-4o-2024-11-20

In [None]:
import pandas as pd
from joblib import Parallel, delayed
import sys
sys.path.append('../src')
from evaluation import Grader, convert_annotation
from decomposition import main
import copy

### Convert annotation results to Pydantic objects

In [2]:
# Load the annotation spreadsheet; the first row holds the column names.
annotation_df = pd.read_excel("../data/Annotation.xlsx", engine="openpyxl", header=0)

# Take the criterion text and its inclusion/exclusion flag from the SAME rows.
# Dropping NaNs from each column independently would silently shift one list
# against the other whenever a row has only one of the two cells filled in,
# and the later zip(criteria, in_or_exs) would then pair the wrong values.
# "Inculsion/Exclusion" (sic) is the column name exactly as the code reads it
# from the sheet, so it must not be "corrected" here.
criterion_rows = annotation_df.dropna(subset=["Original Content", "Inculsion/Exclusion"])
criteria = criterion_rows["Original Content"].tolist()
in_or_exs = criterion_rows["Inculsion/Exclusion"].tolist()

# convert_annotation returns a mapping whose values are iterables of
# annotation objects (presumably keyed by trial - confirm against
# evaluation.convert_annotation); flatten them into one list of labels.
annotation_obj = convert_annotation(annotation_df)
labels = [
    label
    for trial_annotation in annotation_obj.values()
    for label in trial_annotation
]

### Load API Key

In [None]:
import os

import dotenv

# Pull variables from a local .env file into the process environment, then
# look the key up; api_key ends up as None when API_KEY is not set anywhere.
dotenv.load_dotenv()
api_key = os.getenv("API_KEY")

### Automatic DP

In [None]:
# Build one deferred call to `main` per (criterion, inclusion/exclusion) pair
# and run them on 32 joblib workers. Parallel returns the results in input
# order, so predictions[i] belongs to criteria[i].
decomposition_tasks = (
    delayed(main)(
        criterion,
        in_or_ex,
        model='gpt-4o-2024-11-20',
        max_token=8192,
        api_key=api_key,
    )
    for criterion, in_or_ex in zip(criteria, in_or_exs)
)
predictions = Parallel(n_jobs=32)(decomposition_tasks)

In [None]:
# Persist the predictions to disk so the cells below can reload them
# without repeating the decomposition step.
import pickle

with open('../data/predictions_gpt_4o.pkl', mode='wb') as pkl_out:
    pickle.dump(predictions, pkl_out)

### Load Predictions

In [3]:
import pickle

# Reload the cached predictions. Unpickling can execute arbitrary code, so
# only point this at a file written by the save cell of this notebook.
with open('../data/predictions_gpt_4o.pkl', mode='rb') as pkl_in:
    predictions = pickle.load(pkl_in)

In [4]:
# A prediction of None is reported below as a schema-validation failure and
# is then removed from `predictions`.
# NOTE(review): after this step `predictions` can be shorter than `labels`;
# confirm that Grader does not pair the two lists by position.
valid_predictions = [item for item in predictions if item is not None]
n_failed = len(predictions) - len(valid_predictions)

summary_lines = (
    f"Number of annotated criteria: {len(labels)}",
    f"Number of predicted criteria: {len(predictions)}",
    f"{n_failed} criteria failed to pass schema validation",
)
print("\n".join(summary_lines))

predictions = valid_predictions

Number of annotated criteria: 81
Number of predicted criteria: 81
3 criteria failed to pass schema validation


In [5]:
# Hand the grader an independent deep copy, so `predictions` in this notebook
# is unaffected by anything the grader does to its input.
predictions_for_grading = copy.deepcopy(predictions)
grader = Grader(
    labels,
    predictions_for_grading,
    model_type="microsoft/deberta-xlarge-mnli",
    num_layers=40,
)

# Print the plain-text evaluation report.
report_text = grader.generate_report()
print(report_text)




        Evaluation Report:
        1. Trial Extraction Evaluation:
           - Precision: 0.78
           - Recall: 0.85
           - F1 Score: 0.81

        2. Logic Relation Evaluation:
            - Precision: 1.00
            - Recall: 0.96
            - F1 Score: 0.98

        3. Trait-Level Evaluation:

        BertScore:
           - Main Entity Precision: 0.87
           - Main Entity Recall: 0.86
           - Main Entity F1 Score: 0.87

           - Constraint Precision: 0.83
           - Constraint Recall: 0.85
           - Constraint F1 Score: 0.84

           - Modifier Precision: 0.35
           - Modifier Recall: 0.34
           - Modifier F1 Score: 0.34

        Classification:
           - Main Entity Type
                           precision    recall  f1-score   support

   condition       0.90      0.92      0.91       134
 demographic       0.86      1.00      0.92         6
      device       0.78      1.00      0.88         7
        drug       0.67      0.97   