In [None]:
import os
import pandas as pd

# --- Experiment configuration -------------------------------------------
# The first three values are only interpolated into `model_folder` below;
# `dataset` and `base_path` are also used by later cells to build file paths.
num_shot = 0                              # rendered as "<num_shot>shot"
model_size = "2-9b-it"                    # rendered as "google-gemma-<model_size>"
setting = "with"                          # rendered as "<setting>_Substances"
dataset = "VAC_Cancer_Vaccine_adjuvant"   # dataset name used in folder/file names
base_path = "Output"                      # root directory of all model outputs

# Name of the folder (directly under `base_path`) holding this configuration's outputs.
model_folder = (
    f'google-gemma-{model_size}_llama_prompt2_{setting}_Substances_{num_shot}shot'
)


In [None]:

# Merge the per-abstract output files of every run into one text file per run.
# Runs are numbered 1..num_runs and abstracts 1..num_abstracts in the file names.
num_runs = 3
num_abstracts = 290

# Directory that contains every run of this experiment. It is built from
# `model_folder` for both reading and writing, so the merged file always lands
# next to the run folders it was assembled from.
experiment_dir = (
    f"{base_path}/{model_folder}/"
    f"Single_Abstract_With_Substances_Only_{dataset}_T0.0001_16December"
)

for run in range(1, num_runs + 1):
    merged_lines = []
    missing_count = 0

    for fold in range(1, num_abstracts + 1):
        input_file_path = (
            f"{experiment_dir}/"
            f"{dataset}_T0.0001_Run{run}/{run}_Abstract_{fold}.txt"
        )

        # Open directly and handle the missing-file case instead of checking
        # for existence first; a missing abstract is reported and skipped.
        try:
            with open(input_file_path, "r", encoding="utf-8") as file:
                content = file.read().strip()
        except FileNotFoundError:
            print(f"Missing file: {input_file_path}")
            missing_count += 1
            continue

        # Files that are empty (or whitespace-only) contribute nothing.
        if content:
            merged_lines.append(content)

    output_file_path = f"{experiment_dir}/{dataset}_T0.0001_Run{run}_Merged.txt"

    # Create the output directory if needed
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    # Write merged content; abstracts are separated by one blank line.
    with open(output_file_path, "w", encoding="utf-8") as outfile:
        outfile.write("\n\n".join(merged_lines))

    print(f"Merged file written: {output_file_path}")
    print(f"  Run {run}: {len(merged_lines)} abstracts merged, {missing_count} files missing")
    if not merged_lines:
        # The file is still written (as before), but make the empty result visible.
        print(f"  Warning: no content found for run {run}; merged file is empty")


In [None]:
# Postprocessing Code for Merged File
import re


def _clean_merged_text(content):
    """Turn the raw merged text of one run into a list of cleaned rows.

    Steps:
      1. Remove the marker ``Done``.
      2. Make every ``PMID_<digits>`` token start on its own line.
      3. Strip each line; drop lines that are empty or hold only a PMID.
      4. Drop exact duplicate lines, keeping the first occurrence.

    Args:
        content: Full text of a ``*_Merged.txt`` file.

    Returns:
        List of cleaned, unique, non-empty lines in their original order.
    """
    # Step 1: Remove 'Done'. The lookahead leaves the text alone when it is the
    # start of a longer word (e.g. "Donepezil"), which a plain substring
    # replacement would truncate. NOTE(review): assumes the marker is never
    # immediately followed by a lowercase letter -- confirm against real output.
    content = re.sub(r"Done(?![a-z])", "", content)

    # Step 2: Ensure each PMID starts on a new line (unless already)
    content = re.sub(r"(?<!\n)(PMID_\d+)", r"\n\1", content)

    # Steps 3-4: filter and de-duplicate. Skipping empty stripped lines also
    # covers runs of blank lines and whitespace-only lines, which previously
    # could slip through as an empty row in the output.
    filtered_lines = []
    seen = set()
    for line in content.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        # Row with only a PMID and no adjuvant name (after stripping, a
        # trailing tab or spaces are already gone).
        if re.fullmatch(r"PMID_\d+", stripped):
            continue
        if stripped not in seen:
            seen.add(stripped)
            filtered_lines.append(stripped)
    return filtered_lines


for run in range(1, 4):
    # Common prefix of this run's merged and post-processed files.
    run_prefix = (
        f"{base_path}/{model_folder}/"
        f"Single_Abstract_With_Substances_Only_{dataset}_T0.0001_16December/"
        f"{dataset}_T0.0001_Run{run}"
    )
    merged_file_path = f"{run_prefix}_Merged.txt"
    postprocessed_path = f"{run_prefix}_Merged_Postprocessed.txt"

    if not os.path.exists(merged_file_path):
        print(f"Merged file not found: {merged_file_path}")
        continue

    with open(merged_file_path, "r", encoding="utf-8") as file:
        content = file.read()

    filtered_lines = _clean_merged_text(content)

    # Step 5: Save cleaned output
    with open(postprocessed_path, "w", encoding="utf-8") as file:
        file.write("\n".join(filtered_lines))

    print(f"Postprocessed file written: {postprocessed_path}")


In [None]:

# Gold standard file
# NOTE(review): this path is specific to the VAC_Cancer_Vaccine_adjuvant dataset
# and is not derived from `dataset`; update it by hand when switching datasets.
goldstandard_path = 'Dataset/VAC_Cancer_Vaccine_adjuvant/VAC_Vaccine_Adjuvant_with_Substances_with_Substances_only_Formatted_Column_updated_PMID_without_shots.csv'

# Runs info: (file-name prefix of the run, short run label) pairs.
# The prefix is derived from `dataset` so it always matches the
# "<dataset>_T0.0001_Run<k>_Merged_Postprocessed.txt" files produced by the
# postprocessing step, instead of repeating the dataset name by hand.
runs = [
    (f"{dataset}_T0.0001_Run{run_id}", f"Run{run_id}")
    for run_id in range(1, 4)
]

def merge_goldstandard_with_predicted(model_folder, run_folder, run_number):
    """Outer-join the gold standard with one run's post-processed predictions.

    The predictions are read from
    ``<base_path>/<model_folder>/Single_Abstract_With_Substances_Only_<dataset>_T0.0001_16December/<run_folder>_Merged_Postprocessed.txt``
    as a tab-separated file without header (columns are named ``PMID`` and
    ``Adjuvant``), and exact duplicate rows are dropped. The gold standard is
    read from the module-level ``goldstandard_path`` CSV. Both frames are merged
    on ``PMID`` with ``how="outer"`` and the result is saved next to the
    predictions as ``<run_folder>_Merged_with_GoldStandard.csv``.

    Relies on the module-level names ``base_path``, ``dataset`` and
    ``goldstandard_path``.

    Args:
        model_folder: Name of the model output folder under ``base_path``.
        run_folder: File-name prefix of the run.
        run_number: Unused; kept so existing callers keep working.

    Returns:
        The merged DataFrame, or ``None`` if reading, merging or writing
        failed (the error is printed rather than raised).
    """
    # Use the shared `base_path` (not a hard-coded root) so this reads from the
    # same tree the merge/postprocessing cells write to.
    run_dir = os.path.join(
        base_path, model_folder,
        f'Single_Abstract_With_Substances_Only_{dataset}_T0.0001_16December'
    )
    predicted_path = os.path.join(run_dir, f'{run_folder}_Merged_Postprocessed.txt')
    output_path = os.path.join(run_dir, f'{run_folder}_Merged_with_GoldStandard.csv')

    try:
        # Load predicted file (TSV with no header)
        predicted_df = pd.read_csv(
            predicted_path, sep="\t", header=None, names=["PMID", "Adjuvant"]
        ).drop_duplicates()

        # Load gold standard
        goldstandard_df = pd.read_csv(goldstandard_path)

        # Outer merge keeps PMIDs that appear in only one of the two inputs.
        merged_df = pd.merge(goldstandard_df, predicted_df, on="PMID", how="outer")

        # Save
        merged_df.to_csv(output_path, index=False)
        print(f"Merged gold standard with predictions: {output_path}")
        return merged_df

    except Exception as e:
        # Best-effort: one failing run must not stop the remaining runs.
        print(f"Failed merging {predicted_path}: {e}")
        return None

# Run merging for all
# Maps each run's file-name prefix (first element of the `runs` tuples) to the
# merged DataFrame returned for it; the value is None for a run whose merge
# failed, because merge_goldstandard_with_predicted returns None on error.
all_merged_dfs = {}
for run_folder, run_number in runs:
    merged_df = merge_goldstandard_with_predicted(model_folder, run_folder, run_number)
    all_merged_dfs[run_folder] = merged_df
