In [21]:
import os
import json
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
from scipy.stats import mannwhitneyu
import torch

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

load_dotenv()

True

In [22]:
if torch.backends.mps.is_available():  # For macOS devices
    device = torch.device("mps")
elif torch.cuda.is_available():  # For NVIDIA GPUs
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: mps


In [23]:
FIELDS_TO_CHECK = [
    "species",
    "strain",
    "gender",
    "treatment",
    "way_of_administration",
    "dosage",
    "age_at_start",
    "duration_unit",
    "median_treatment",
    "max_treatment",
    "median_control",
    "max_control",
    "p_value",
    "n_treatment",
    "n_control",
]

In [18]:
def create_reference_files(path_to_reference_data: str, folder_to_save: str):
    """
    Create reference files in the required format. path_to_reference_data files should have "doi" columns and FIELDS_TO_CHECK columns
    Args:
        path_to_reference_data (str): Path to the reference data Excel file.
    """
    os.makedirs(folder_to_save, exist_ok=True)
    reference_data = pd.read_excel(path_to_reference_data)
    reference_data = reference_data.rename(columns={"intervention": "treatment"})
    for doi in reference_data["doi"]:
        name = doi.split("/")[1]
        temp_df = reference_data[reference_data["doi"] == doi]
        output = {"groups": temp_df[FIELDS_TO_CHECK].to_dict(orient="records")}
        with open(f"{folder_to_save}/{name}.json", "w") as outfile:
            json.dump(output, outfile, indent=4)

In [None]:
def calculate_simmilarity_scores(
    reference_data_folder: str,
    pipeline_result_folder: str,
    model: SentenceTransformer,
    control=False,
):
    """
    Calculate simmilarity scores between the pipeline answer and reference answer
    Args:
        reference_data_folder (str): Path to the folder containing reference data.
        pipeline_result_folder (str): Path to the folder containing pipeline results.
        model: SentenceTransformer model for computing embeddings.
        control (bool): If True, use random file for control, otherwise use exact file.
    Returns:
        List[float]: List of similarity scores between the pipeline answer and reference answer.
    """
    full_scores = []
    # for each doi we calculate the simmilarity scores between the pipeline answer and reference answer
    for i in tqdm(os.listdir(reference_data_folder)):
        reference_path = os.path.join(reference_data_folder, i)
        # for control we use random file, but for not control we use exact file
        if control:
            file = random.choice(
                [x for x in os.listdir(pipeline_result_folder) if x != ".DS_Store"]
            )
        else:
            file = i
        answer_path = os.path.join(pipeline_result_folder, file)
        reference_answer = json.load(open(reference_path, "r"))
        answer_answer = json.load(open(answer_path, "r"))
        scores = []
        reference_len = len(reference_answer["groups"])
        answer_len = len(answer_answer["groups"])
        # group_similarity is a penalty for discrepancy in the number of allocated experimental groups
        group_similarity = 1 / (1 + abs(answer_len - reference_len) / reference_len)
        # computing simmilarity score for each experimental group
        for answer in reference_answer["groups"]:
            ref = " ".join([str(answer[fld]) for fld in FIELDS_TO_CHECK])
            ref_embed = model.encode(ref)
            groups = answer_answer["groups"]
            answers = []
            for res in groups:
                res_temp = [str(res[fld]) for fld in FIELDS_TO_CHECK]
                answers.append(" ".join(res_temp))
            res_embed = model.encode(answers)
            # as we dont know the exact pair that we need to compare we took maximal similarity score across all experimental groups
            test = max([cosine_similarity([ref_embed], [x])[0][0] for x in res_embed])
            scores.append(test)
        metric = np.mean(scores) * group_similarity
        print(metric)
        full_scores.append(metric)
    return full_scores

In [19]:
create_reference_files("CollidaData_2023.xlsx",
                       "pipeline_results/reference_data")

In [24]:
random.choice([x for x in os.listdir("pipeline_results/openai") if x!='.DS_Store'])

'acel.13088.json'

In [27]:
model = SentenceTransformer("all-mpnet-base-v2",device=device)

In [29]:
control_scores = calculate_simmilarity_scores(reference_data_folder="pipeline_results/reference_data",
                             pipeline_result_folder="pipeline_results/openai",
                             model=model,
                             control=True)

  2%|▏         | 1/59 [00:00<00:10,  5.33it/s]

0.055029794573783875


  3%|▎         | 2/59 [00:02<01:23,  1.47s/it]

0.18383451278616741


  5%|▌         | 3/59 [00:02<00:52,  1.06it/s]

0.3675459623336792
0.09365519881248474


  8%|▊         | 5/59 [00:03<00:28,  1.91it/s]

0.1967846155166626


 10%|█         | 6/59 [00:06<01:09,  1.31s/it]

0.24906693861402315


 12%|█▏        | 7/59 [00:08<01:14,  1.43s/it]

0.17373248934745789


 14%|█▎        | 8/59 [00:08<00:55,  1.09s/it]

0.4773087799549103


 15%|█▌        | 9/59 [00:10<01:01,  1.23s/it]

0.21605604109556778


 17%|█▋        | 10/59 [00:10<00:50,  1.02s/it]

0.1850414276123047


 19%|█▊        | 11/59 [00:10<00:38,  1.26it/s]

0.1484048068523407


 22%|██▏       | 13/59 [00:11<00:24,  1.88it/s]

0.37000399827957153
0.35891714692115784
0.20663610100746155


 25%|██▌       | 15/59 [00:11<00:16,  2.64it/s]

0.28984421491622925


 27%|██▋       | 16/59 [00:12<00:15,  2.84it/s]

0.4000508785247803


 29%|██▉       | 17/59 [00:12<00:14,  2.93it/s]

0.26112047831217444


 31%|███       | 18/59 [00:12<00:13,  2.99it/s]

0.20860298474629718


 32%|███▏      | 19/59 [00:13<00:20,  1.92it/s]

0.20710163224827158
0.39527857303619385


 37%|███▋      | 22/59 [00:15<00:19,  1.95it/s]

0.29979583248496056
0.2316506803035736
0.1898849904537201


 41%|████      | 24/59 [00:15<00:14,  2.44it/s]

0.19080202281475067


 42%|████▏     | 25/59 [00:16<00:13,  2.48it/s]

0.278023324906826
0.4822007417678833


 46%|████▌     | 27/59 [00:16<00:11,  2.77it/s]

0.4570615589618683


 47%|████▋     | 28/59 [00:17<00:10,  2.96it/s]

0.2314353883266449


 49%|████▉     | 29/59 [00:19<00:22,  1.36it/s]

0.20943369331031012


 51%|█████     | 30/59 [00:19<00:18,  1.60it/s]

0.1050527294476827


 56%|█████▌    | 33/59 [00:20<00:10,  2.38it/s]

0.34494752883911134
0.152915358543396
0.17989182472229004


 58%|█████▊    | 34/59 [00:22<00:18,  1.33it/s]

0.06698342646871296
0.12192466855049133


 64%|██████▍   | 38/59 [00:23<00:07,  2.69it/s]

0.22930824011564255
0.1249738410115242
0.21579134464263916


 66%|██████▌   | 39/59 [00:23<00:07,  2.67it/s]

0.12755504250526428


 68%|██████▊   | 40/59 [00:23<00:06,  2.90it/s]

0.25516915652487016


 71%|███████   | 42/59 [00:24<00:05,  3.21it/s]

0.19173847138881683
0.48433923721313477


 73%|███████▎  | 43/59 [00:24<00:05,  2.95it/s]

0.19748106598854065


 75%|███████▍  | 44/59 [00:25<00:05,  2.52it/s]

0.25571436882019044


 80%|███████▉  | 47/59 [00:25<00:03,  3.69it/s]

0.1904896438121796
0.18014876544475555
0.3557349741458893
0.3494058847427368


 86%|████████▋ | 51/59 [00:26<00:01,  4.17it/s]

0.29052096605300903
0.2770790457725525
0.08567075431346893


 90%|████████▉ | 53/59 [00:28<00:01,  3.03it/s]

0.14563830109203563
0.4887359142303467


 92%|█████████▏| 54/59 [00:29<00:03,  1.43it/s]

0.2795700481959752


 93%|█████████▎| 55/59 [00:32<00:04,  1.17s/it]

0.17841757759451868


 97%|█████████▋| 57/59 [00:35<00:02,  1.15s/it]

0.2041575452114673
0.23657187819480896
0.7486936450004578


100%|██████████| 59/59 [00:35<00:00,  1.68it/s]

0.16963714361190796





In [30]:
pipeline_scores = calculate_simmilarity_scores(reference_data_folder="pipeline_results/reference_data",
                             pipeline_result_folder="pipeline_results/openai",
                             model=model,
                             control=False)

  2%|▏         | 1/59 [00:00<00:20,  2.78it/s]

0.23264575004577637


  5%|▌         | 3/59 [00:02<00:47,  1.17it/s]

0.3291767468819251
0.5959968566894531
0.8674309849739075


  8%|▊         | 5/59 [00:02<00:22,  2.39it/s]

0.36509761214256287


 10%|█         | 6/59 [00:09<01:55,  2.18s/it]

0.3784541103327386


 14%|█▎        | 8/59 [00:11<01:13,  1.45s/it]

0.3147058343065196
0.35102351506551105


 15%|█▌        | 9/59 [00:12<01:09,  1.38s/it]

0.3079773343127707


 20%|██        | 12/59 [00:12<00:29,  1.62it/s]

0.5339346081018448
0.1897166570027669
0.618681788444519


 25%|██▌       | 15/59 [00:13<00:14,  3.11it/s]

0.4519452452659607
0.3925038278102875
0.423265278339386


 29%|██▉       | 17/59 [00:13<00:12,  3.30it/s]

0.4114214777946472
0.8606034517288208


 31%|███       | 18/59 [00:14<00:14,  2.84it/s]

0.2397751361131668


 32%|███▏      | 19/59 [00:14<00:13,  2.88it/s]

0.30700789690017705
0.31743723154067993


 36%|███▌      | 21/59 [00:15<00:19,  1.97it/s]

0.47437691688537603


 41%|████      | 24/59 [00:16<00:11,  3.02it/s]

0.6020895391702652
0.3158091604709625
0.9139817953109741


 46%|████▌     | 27/59 [00:17<00:08,  3.85it/s]

0.584263265132904
0.22691130638122559
0.35800567269325256
0.3454052805900574


 49%|████▉     | 29/59 [00:18<00:13,  2.22it/s]

0.22601535608028545
0.8844670653343201


 53%|█████▎    | 31/59 [00:19<00:09,  2.88it/s]

0.5751446485519409


 54%|█████▍    | 32/59 [00:19<00:09,  2.85it/s]

0.21318540970484415
0.3320433795452118


 58%|█████▊    | 34/59 [00:21<00:13,  1.80it/s]

0.24582736832754956
0.34369325637817383


 64%|██████▍   | 38/59 [00:21<00:06,  3.46it/s]

0.3969874620437623
0.24037539958953857
0.7273123860359192
0.5119397640228271


 69%|██████▉   | 41/59 [00:22<00:03,  4.57it/s]

0.39684752623240155
0.7343182563781738


 71%|███████   | 42/59 [00:22<00:04,  3.79it/s]

0.5393621325492859
0.34762006998062134


 76%|███████▋  | 45/59 [00:22<00:02,  5.10it/s]

0.6476533859968185
0.6123324036598206


 81%|████████▏ | 48/59 [00:23<00:01,  6.16it/s]

0.3806050717830658
0.23939935366312662
0.1850001960992813


 86%|████████▋ | 51/59 [00:23<00:00,  8.35it/s]

0.4080932289361954
0.13165946304798126
0.1473246415456136


 90%|████████▉ | 53/59 [00:24<00:01,  3.78it/s]

0.3825000454397763
0.49599965413411456


 92%|█████████▏| 54/59 [00:26<00:03,  1.60it/s]

0.2795700481959752


 93%|█████████▎| 55/59 [00:28<00:03,  1.07it/s]

0.29744039803016475


100%|██████████| 59/59 [00:31<00:00,  1.75it/s]

0.252846938685367
0.5052233537038167
0.42058566212654114
0.409000426530838


100%|██████████| 59/59 [00:31<00:00,  1.89it/s]


In [31]:
stat, p_value = mannwhitneyu(pipeline_scores, control_scores, alternative='two-sided')

print(f"U-statistic: {stat}")
print(f"P-value: {p_value}")

U-statistic: 2762.5
P-value: 3.841350845062791e-08
