In [None]:
import os
import re

def add_header_to_txt_files_in_structure(base_directory):
    """Prepend the ``Sentence ID,PPI`` header to prediction files that lack it.

    Walks the fixed result layout
    ``<base_directory>/<model folder>/Prompt<i>/<corpus>/<corpus>_T0.0_Prompt<i>_Run<j>``
    for prompts 1-16, runs 1-10, both model folders and the three corpora
    (LLL, HPRD50, IEPA).  Every ``.txt`` file found in an existing run folder
    is rewritten with the header as its first line unless its first line
    already starts with that header.

    Args:
        base_directory: Root folder containing the per-model result folders.

    Returns:
        A status string: a success message, or ``"An error occurred: ..."``
        when any exception was raised (the exception is not propagated and
        processing stops at the first failure).
    """
    header_prefix = "Sentence ID,PPI"
    model_folders = (
        "gpt-4-0613_N_fold_PROTEIN_Prompts_0.0",
        "gpt-3.5-turbo-0613_N_fold_PROTEIN_Prompts_0.0",
    )
    corpora = ("LLL", "HPRD50", "IEPA")

    def candidate_run_dirs():
        # Yield run folders in prompt -> run -> model -> corpus order.
        for prompt_idx in range(1, 17):
            for run_idx in range(1, 11):
                for model_folder in model_folders:
                    for corpus in corpora:
                        yield os.path.join(
                            base_directory,
                            model_folder,
                            f"Prompt{prompt_idx}",
                            corpus,
                            f"{corpus}_T0.0_Prompt{prompt_idx}_Run{run_idx}",
                        )

    try:
        for run_dir in candidate_run_dirs():
            if not os.path.exists(run_dir):
                continue

            for entry in os.listdir(run_dir):
                if not entry.endswith('.txt'):
                    continue

                txt_path = os.path.join(run_dir, entry)
                with open(txt_path, 'r') as reader:
                    lines = reader.readlines()

                # Nothing to do when the file already starts with the header.
                if lines and lines[0].strip().startswith(header_prefix):
                    continue

                lines.insert(0, "Sentence ID,PPI\n")
                print("Header added to", txt_path)
                with open(txt_path, 'w') as writer:
                    writer.writelines(lines)
    except Exception as exc:
        return f"An error occurred: {exc}"

    return "Header added successfully where necessary."

# Example usage: patch every prediction file below the 'Output/' tree.
base_directory = 'Output/'  # root folder that holds the per-model result directories
# The function reports failures through its returned status string rather than
# by raising, so check the value this call produces.
add_header_to_txt_files_in_structure(base_directory)


In [None]:
def _parse_ppi_labels(labels):
    """Convert a column of predicted PPI labels into a boolean Series.

    ``Series.astype(bool)`` treats every non-empty string as ``True``, so a
    column that pandas parsed as text (e.g. because of stray whitespace)
    would turn the string ``"False"`` into ``True``.  Values are therefore
    normalised as text first; anything that is not a recognisable
    true/false spelling keeps its plain Python truthiness.
    """
    truthy = {"true", "1", "1.0"}
    falsy = {"false", "0", "0.0"}

    def to_bool(value):
        text = str(value).strip().lower()
        if text in truthy:
            return True
        if text in falsy:
            return False
        return bool(value)

    return labels.map(to_bool).astype(bool)


def evaluation_PROTEIN(data_path, dataset, output_path, run, n_folds=10):
    """Score per-fold PPI predictions against the gold standard and log the metrics.

    For each evaluated fold the prediction file ``run[k]`` (inside
    ``data_path``) is compared with the gold file ``fold<k+1>.csv`` under
    ``Datasets/PROTEIN_DATA/<dataset>/N_fold/N_fold_csv/``.  Precision,
    recall and F1 are printed and appended as one line to ``output_path``.

    Args:
        data_path: Directory that contains the prediction files listed in ``run``.
        dataset: One of ``"LLL"``, ``"HPRD50"`` or ``"IEPA"``.
        output_path: File the per-fold metric lines are appended to; its
            parent directory is created when missing.
        run: Prediction file names, ordered by fold.
        n_folds: How many leading folds to evaluate.  Defaults to 10, the
            value that used to be hard-coded.  NOTE(review): the datasets
            define 36/21/15 folds, so the default skips the remaining ones;
            pass ``None`` to evaluate every fold available in ``run``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported corpora.
    """
    fold_counts = {"LLL": 36, "HPRD50": 21, "IEPA": 15}
    if dataset not in fold_counts:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of {sorted(fold_counts)}"
        )
    run_standard = [f"fold{i}.csv" for i in range(1, fold_counts[dataset] + 1)]
    data_path_standard = f'Datasets/PROTEIN_DATA/{dataset}/N_fold/N_fold_csv/'

    available_folds = min(len(run), len(run_standard))
    fold_total = available_folds if n_folds is None else min(n_folds, available_folds)

    # Appending fails if the target directory has not been created yet.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for fold_index in range(fold_total):
        fold = fold_index + 1

        df_predicted_processed = pd.read_csv(
            os.path.join(data_path, run[fold_index]), sep=',', on_bad_lines='skip'
        )

        # Drop incomplete rows and the trailing "Done" marker row, then
        # normalise the column names.
        df_predicted_processed = df_predicted_processed.dropna()
        df_predicted_processed = df_predicted_processed[
            df_predicted_processed['Sentence ID'] != 'Done'
        ].copy()
        df_predicted_processed.columns = ['Sentence ID', 'PPI']

        # Check data type
        print(df_predicted_processed['PPI'].dtype)

        # Check unique values for hidden characters or formats
        print(df_predicted_processed['PPI'].unique())

        interaction_processed = pd.read_csv(
            os.path.join(data_path_standard, run_standard[fold_index]), sep=','
        )
        y = interaction_processed['isValid']

        y_pred = _parse_ppi_labels(df_predicted_processed['PPI'])

        precision = precision_score(y, y_pred)
        recall = recall_score(y, y_pred)
        f1 = f1_score(y, y_pred)

        print("Fold = ", fold)
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1 Score: {f1}")

        print(data_path)

        # The line layout is kept unchanged because it is parsed again later.
        with open(output_path, "a") as out_file:
            print(data_path, fold, ',', "{:.4f}".format(precision), ',', "{:.4f}".format(recall), ',', "{:.4f}".format(f1), file=out_file)

In [None]:
import os
section_no = "N_fold_PROTEIN_Prompts"
def output_path(Run_no, dataset, temperature, prompt_no, model_engine):
    """Build (and create) the output directory for one run.

    The directory has the form
    ``Output/<model_engine>_<section_no>_<temperature>/<prompt_no>/<dataset>/<dataset>_T<temperature>_<prompt_no>_Run<Run_no>/``.

    Args:
        Run_no: Run number used in the folder name.
        dataset: Corpus name, e.g. ``"LLL"``.
        temperature: Sampling temperature; formatted with ``str()``.
        prompt_no: Prompt label (e.g. ``"Prompt15"``).  Non-string values
            such as a plain integer are accepted and converted with ``str()``.
        model_engine: Model identifier that prefixes the top-level folder.

    Returns:
        The directory path as a string with a trailing ``/``.  The directory
        is created if it does not exist yet.
    """
    # Convert once so that both path components agree; previously only the
    # base path applied str(), which made integer prompt numbers fail.
    prompt_label = str(prompt_no)

    base = f"Output/{model_engine}_{section_no}_{temperature}/{prompt_label}/{dataset}/"
    extension_path = f"{dataset}_T{temperature}_{prompt_label}_Run{Run_no}/"

    # Join the base path and extension path
    Implementation_base_path_output = os.path.join(base, extension_path)
    os.makedirs(Implementation_base_path_output, exist_ok=True)

    return Implementation_base_path_output

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

temperature = 0.0
model_engines = ["gpt-3.5-turbo-0613", "gpt-4-0613"]
datasets = ["LLL", "HPRD50", "IEPA"]

# Number of fold files generated per corpus (one prediction file per fold).
fold_counts = {"LLL": 36, "HPRD50": 21, "IEPA": 15}

prompt_no = 15
query_no = f"Prompt{prompt_no}"  # constant for the whole sweep, so built once

for model_engine in model_engines:
    for dataset in datasets:
        for R in range(1, 11):
            # Prediction files of run R, ordered by fold.
            run = [f'{R}_fold{i}.txt' for i in range(1, fold_counts[dataset] + 1)]

            # Construct the output path
            # NOTE(review): results are written below 'Final_outputs_all/', while the
            # averaging cell reads from 'Output/...' -- confirm the files are meant to
            # be moved between the two steps.
            output_path_R = f'Final_outputs_all/{model_engine}_N_fold_{dataset}_Prompts_0.0/{query_no}/{dataset}/Evaluation_{dataset}_T{temperature}_{query_no}_Run{R}.csv'
            # The evaluation appends to this file, so its folder must exist first.
            os.makedirs(os.path.dirname(output_path_R), exist_ok=True)

            Implementation_base_path_output_Run = output_path(R, dataset, temperature, query_no, model_engine)
            evaluation_PROTEIN(Implementation_base_path_output_Run, dataset, output_path_R, run)


In [None]:
import pandas as pd
import os

model_engines = ["gpt-3.5-turbo-0613", "gpt-4-0613"]
datasets = ["LLL", "HPRD50", "IEPA"]

metric_columns = ["average precision", "average recall", "average f1 score"]
summary_filename = "Average_Score.csv"

for model_engine in model_engines:
    for dataset in datasets:
        for prompt_no in range(15, 16):  # Adjust the range as needed

            # Define the folder path containing the CSV files
            folder_path = f"Output/{model_engine}_N_fold_PROTEIN_Prompts_0.0/Prompt{prompt_no}/{dataset}/"

            # Collect one record per evaluation file and build the frame once;
            # DataFrame.append no longer exists in pandas 2.x.
            records = []
            for filename in sorted(os.listdir(folder_path)):
                # Skip non-CSV files and the summary written by a previous run of
                # this cell, which would otherwise be read as an evaluation file.
                if not filename.endswith(".csv") or filename == summary_filename:
                    continue

                # Read the CSV file into a dataframe
                df = pd.read_csv(os.path.join(folder_path, filename), header=None, names=["filename", "precision", "recall", "f1 score"])

                # Average precision, recall, and f1 score over the rows of the file
                records.append({
                    "filename": filename,
                    "average precision": df["precision"].mean(),
                    "average recall": df["recall"].mean(),
                    "average f1 score": df["f1 score"].mean(),
                })

            if not records:
                print(f"No evaluation CSV files found in {folder_path}; skipping.")
                continue

            results_df = pd.DataFrame(records, columns=["filename", *metric_columns])

            # Sort and process the dataframe
            results_df["T_value"] = results_df["filename"].str.extract(r"T(\d+\.\d+)", expand=False)
            shortened_df = results_df[["T_value", *metric_columns]].sort_values(by="T_value").copy()
            # Express the scores as percentages rounded to two decimals.
            shortened_df[metric_columns] = (shortened_df[metric_columns] * 100).round(2)

            # Calculate and print the average scores
            avg_precision = round(shortened_df['average precision'].mean(), 2)
            avg_recall = round(shortened_df['average recall'].mean(), 2)
            avg_f1 = round(shortened_df['average f1 score'].mean(), 2)
            print(f"Model Engine: {model_engine}, Dataset: {dataset}, Prompt: {prompt_no}")
            print("Average Precision:", avg_precision)
            print("Average Recall:", avg_recall)
            print("Average F1 Score:", avg_f1)

            # Append the new row to the DataFrame
            new_row = {'T_value': 'Average', 'average precision': avg_precision, 'average recall': avg_recall, 'average f1 score': avg_f1}
            shortened_df = pd.concat([shortened_df, pd.DataFrame([new_row])], ignore_index=True)

            # Save the results
            filepath = os.path.join(folder_path, summary_filename)
            shortened_df.to_csv(filepath, index=False)
