## An example of an automatic DP workflow with Llama-3.3-70B-Instruct

In [None]:
import pandas as pd
from joblib import Parallel, delayed
import sys
sys.path.append('../src')
from evaluation import Grader, convert_annotation
from decomposition import main
import copy

### Convert annotation results to Pydantic objects

In [2]:
# Load the annotation spreadsheet; the first row is used as the column header.
annotation_df = pd.read_excel("../data/Annotation.xlsx", engine="openpyxl", header=0)

# Take each criterion text and its inclusion/exclusion flag from the SAME rows.
# Dropping NaNs from the two columns independently lets the lists drift out of
# step whenever one cell is blank and the other is not; the zip() over
# (criteria, in_or_exs) further down would then silently mis-pair or truncate.
# NOTE(review): "Inculsion/Exclusion" (sic) is a column label read at runtime,
# presumably mirroring the spreadsheet header -- do not "fix" the spelling here.
criterion_rows = annotation_df.dropna(subset=["Original Content"])
missing_flag = criterion_rows["Inculsion/Exclusion"].isna()
if missing_flag.any():
    # Fail loudly instead of feeding a NaN flag (or a shifted one) downstream.
    raise ValueError(
        "Rows with a criterion but no inclusion/exclusion flag: "
        f"{criterion_rows.index[missing_flag].tolist()}"
    )
criteria = criterion_rows["Original Content"].tolist()
in_or_exs = criterion_rows["Inculsion/Exclusion"].tolist()

# convert_annotation's result is consumed as a mapping whose values are
# iterables of labels; flatten them into one list, in mapping order.
annotation_obj = convert_annotation(annotation_df)
labels = [
    label
    for trial_annotation in annotation_obj.values()
    for label in trial_annotation
]

### Automatic DP

In [None]:
# Call `main` once per (criterion, inclusion/exclusion flag) pair, with at most
# 32 joblib jobs running concurrently. Results are collected into a list; a
# later cell treats a None entry as a criterion that failed schema validation.
predictions = Parallel(n_jobs=32)(
    delayed(main)(
        criterion_text,
        flag,
        model='meta-llama/Llama-3.3-70B-Instruct',
        max_token=8192,
    )
    for criterion_text, flag in zip(criteria, in_or_exs)
)

In [None]:
# Persist the raw predictions so the expensive decomposition cell above does
# not have to be re-run; the "Load Predictions" cell reads this same file.
import pickle

with open('../data/predictions_llama3.3.pkl', 'wb') as pkl_out:
    pickle.dump(predictions, pkl_out)

### Load Predictions

In [None]:
# Restore previously saved predictions from disk.
# SECURITY: pickle.load can execute arbitrary code while unpickling -- only
# load a file that was produced by a trusted run of this notebook.
import pickle

with open('../data/predictions_llama3.3.pkl', 'rb') as pkl_in:
    predictions = pickle.load(pkl_in)

In [4]:
# Report how many criteria were annotated and predicted, and how many
# predictions came back as None (reported as schema-validation failures).
print(
    f"Number of annotated criteria: {len(labels)}\n"
    f"Number of predicted criteria: {len(predictions)}\n"
    f"{sum(pred is None for pred in predictions)} criteria failed to pass schema validation"
)

# Keep only the predictions that are not None.
# NOTE(review): `labels` is not filtered in step with `predictions`; if Grader
# pairs the two lists by position, any None here would shift the pairing --
# confirm against Grader before relying on runs with failures.
predictions = list(filter(lambda pred: pred is not None, predictions))

Number of annotated criteria: 81
Number of predicted criteria: 81
0 criteria failed to pass schema validation


In [5]:
# Score the predictions against the annotated labels and print the text report.
# Grader receives a deep copy of `predictions`, so the list held by this
# notebook is a separate object (presumably because Grader mutates its input
# -- TODO confirm).
grader = Grader(
    labels,
    copy.deepcopy(predictions),
    model_type="microsoft/deberta-xlarge-mnli",
    num_layers=40,
)
print(grader.generate_report())




        Evaluation Report:
        1. Trial Extraction Evaluation:
           - Precision: 0.75
           - Recall: 0.82
           - F1 Score: 0.78

        2. Logic Relation Evaluation:
            - Precision: 0.99
            - Recall: 0.98
            - F1 Score: 0.99

        3. Trait-Level Evaluation:

        BertScore:
           - Main Entity Precision: 0.86
           - Main Entity Recall: 0.86
           - Main Entity F1 Score: 0.85

           - Constraint Precision: 0.84
           - Constraint Recall: 0.87
           - Constraint F1 Score: 0.85

           - Modifier Precision: 0.36
           - Modifier Recall: 0.36
           - Modifier F1 Score: 0.36

        Classification:
           - Main Entity Type
                           precision    recall  f1-score   support

   condition       0.83      0.96      0.89       126
 demographic       0.86      1.00      0.92         6
      device       0.00      0.00      0.00         0
        drug       0.64      0.93   