In [None]:
import obonet 
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os
import pandas as pd
import numpy as np
import re
import requests
from tqdm import tqdm
import ast
import pickle
import math
from sklearn.metrics import precision_recall_curve, auc
import numpy as np 
import requests
import urllib3
#import sleep 
from time import sleep
# Optional: Suppress warnings if SSL verification is disabled

In [None]:
# Parse the three held-out test FASTA files (function / process / component)
# into in-memory lists of records, in that order.
test_MFO_sequences, test_BPO_sequences, test_CCO_sequences = (
    list(SeqIO.parse(fasta_path, "fasta"))
    for fasta_path in (
        "processed_data_90_30/function_test.fasta",
        "processed_data_90_30/process_test.fasta",
        "processed_data_90_30/component_test.fasta",
    )
)



In [None]:
# Report how many records were loaded for each test split.
for _caption, _records in (
    ("Number of MFO test sequences: ", test_MFO_sequences),
    ("Number of BPO test sequences: ", test_BPO_sequences),
    ("Number of CCO test sequences: ", test_CCO_sequences),
):
    print(_caption, len(_records))

In [None]:
# --- Function (MFO): sequences plus annotation table, which must line up 1:1 ---
plasmodium_MFO_sequences = list(
    SeqIO.parse("processed_data_90_30/Plasmodium_MA_function.fasta", "fasta")
)
plasmodium_MFO_df = pd.read_csv(
    "processed_data_90_30/Plasmodium_MA_function.tsv", sep="\t"
)
assert len(plasmodium_MFO_sequences) == len(plasmodium_MFO_df)

# --- Process (BPO) ---
plasmodium_BPO_sequences = list(
    SeqIO.parse("processed_data_90_30/Plasmodium_MA_process.fasta", "fasta")
)
plasmodium_BPO_df = pd.read_csv(
    "processed_data_90_30/Plasmodium_MA_process.tsv", sep="\t"
)
assert len(plasmodium_BPO_sequences) == len(plasmodium_BPO_df)

# --- Component (CCO) ---
plasmodium_CCO_sequences = list(
    SeqIO.parse("processed_data_90_30/Plasmodium_MA_component.fasta", "fasta")
)
plasmodium_CCO_df = pd.read_csv(
    "processed_data_90_30/Plasmodium_MA_component.tsv", sep="\t"
)
assert len(plasmodium_CCO_sequences) == len(plasmodium_CCO_df)

# The 'Raw propagated GO terms' column is read from the TSV as text; parse each
# cell with ast.literal_eval so it becomes a Python object (the original comment
# says a list) instead of its string representation.
for _annotation_df in (plasmodium_MFO_df, plasmodium_BPO_df, plasmodium_CCO_df):
    _annotation_df['Raw propagated GO terms'] = (
        _annotation_df['Raw propagated GO terms'].apply(ast.literal_eval)
    )

In [None]:
# Silence urllib3's InsecureRequestWarning. process_batch (defined in this cell)
# posts with certificate verification turned off, which would otherwise emit
# this warning on every request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Endpoint used by process_batch: https://deepgo.cbrc.kaust.edu.sa/deepgo/api/create
def process_batch(batch_sequences, timeout=600, verify=False):
    """Send one batch of sequences to the DeepGOPlus web API and return its JSON reply.

    Parameters
    ----------
    batch_sequences : iterable
        Records exposing a ``.seq`` attribute; ``str(record.seq)`` of every
        record is joined with newlines and sent as the ``data`` field.
    timeout : float or tuple, optional
        Passed to ``requests.post``. Without a timeout a stalled connection
        would block the notebook indefinitely. The default is deliberately
        generous because the server computes all predictions before replying.
    verify : bool or str, optional
        Passed to ``requests.post``. Defaults to ``False`` (certificate
        verification disabled), matching the historical behaviour of this
        function. NOTE(review): prefer ``True`` once the endpoint's
        certificate chain is confirmed to validate.

    Returns
    -------
    dict
        The decoded JSON body on success. ``{"predictions": []}`` when the
        batch is empty, the request fails, or the body is not valid JSON, so
        callers can always read the ``"predictions"`` key.
    """
    batch_sequences = list(batch_sequences)
    if not batch_sequences:
        # Nothing to predict; avoid a pointless round trip with an empty body.
        return {"predictions": []}

    url = "https://deepgo.cbrc.kaust.edu.sa/deepgo/api/create"
    batch_data = "\n".join(str(seq.seq) for seq in batch_sequences)
    payload = {
        'version': '1.0.20',
        'data_format': 'enter',
        'data': batch_data,
        # 0.0 asks the service not to filter by score; thresholds are applied
        # later during evaluation.
        'threshold': 0.0
    }
    try:
        response = requests.post(url, json=payload, verify=verify, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        return response.json()

    # ValueError also covers a non-JSON body on requests versions where the
    # JSON decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"An error occurred: {e}")
        return {"predictions": []}  # Handle the error gracefully


# Test sequences

In [None]:
def _predict_test_set(sequences, label, batch_size=100):
    """Query DeepGOPlus for ``sequences`` in batches and pair results with sequence ids.

    Parameters
    ----------
    sequences : list
        Records exposing ``.id`` (and whatever ``process_batch`` needs).
    label : str
        Ontology name shown in the progress bar and in error messages.
    batch_size : int, optional
        Number of sequences sent per API request.

    Returns
    -------
    list of dict
        One ``{"sequence_id": ..., "prediction": ...}`` entry per input
        sequence, in input order.

    Raises
    ------
    RuntimeError
        If a batch comes back with a different number of predictions than
        sequences sent (e.g. after a failed request). Pairing with ``zip``
        would otherwise silently drop sequences.
    """
    collected = []
    for start in tqdm(range(0, len(sequences), batch_size),
                      desc=f"Processing {label} sequences"):
        batch = sequences[start:start + batch_size]
        response_data = process_batch(batch)

        # Predictions are reversed before pairing, as this cell has always done.
        # NOTE(review): this assumes the API returns results in reverse
        # submission order - confirm against the service.
        predictions_reversed = response_data.get('predictions', [])[::-1]

        if len(predictions_reversed) != len(batch):
            raise RuntimeError(
                f"{label}: batch starting at index {start} returned "
                f"{len(predictions_reversed)} predictions for {len(batch)} sequences"
            )

        collected.extend(
            {"sequence_id": seq.id, "prediction": prediction}
            for seq, prediction in zip(batch, predictions_reversed)
        )
    return collected


function_predictions = _predict_test_set(test_MFO_sequences, "function")
process_predictions = _predict_test_set(test_BPO_sequences, "process")
component_predictions = _predict_test_set(test_CCO_sequences, "component")

# One prediction per test sequence, for every ontology.
assert len(function_predictions) == len(test_MFO_sequences)
assert len(process_predictions) == len(test_BPO_sequences)
assert len(component_predictions) == len(test_CCO_sequences)

# Summarise instead of dumping the full prediction payload into the output.
print(
    f"Collected predictions - function: {len(function_predictions)}, "
    f"process: {len(process_predictions)}, component: {len(component_predictions)}"
)

In [None]:
# Keep, per sequence id, only the term list of the relevant ontology.
# NOTE(review): positions 1 / 2 / 0 in the response's 'functions' list are
# presumably the function / process / component entries - confirm against an
# actual API response.
function_results_filtered = {
    entry['sequence_id']: entry['prediction']['functions'][1]['functions']
    for entry in function_predictions
}

process_results_filtered = {
    entry['sequence_id']: entry['prediction']['functions'][2]['functions']
    for entry in process_predictions
}

component_results_filtered = {
    entry['sequence_id']: entry['prediction']['functions'][0]['functions']
    for entry in component_predictions
}

In [None]:
# Persist the raw test-set predictions, one pickle file per ontology.
for _pickle_path, _predictions in (
    ("processed_data_90_30/deepgoplus_disorder_function_predictions.pkl", function_predictions),
    ("processed_data_90_30/deepgoplus_disorder_process_predictions.pkl", process_predictions),
    ("processed_data_90_30/deepgoplus_disorder_component_predictions.pkl", component_predictions),
):
    with open(_pickle_path, "wb") as f:
        pickle.dump(_predictions, f)

# Plasmodium

In [None]:
# Sanity check on the test-set run: one prediction per MFO test sequence.
# This must execute before the next cell, which rebinds `function_predictions`
# to the Plasmodium results.
assert len(function_predictions) == len(test_MFO_sequences)

In [None]:
def _predict_plasmodium_set(sequences, label, batch_size=100):
    """Query DeepGOPlus for Plasmodium ``sequences`` in batches, keyed by sequence id.

    Parameters
    ----------
    sequences : list
        Records exposing ``.id`` (and whatever ``process_batch`` needs).
    label : str
        Ontology name shown in the progress bar and in error messages.
    batch_size : int, optional
        Number of sequences sent per API request.

    Returns
    -------
    list of dict
        One ``{"sequence_id": ..., "prediction": ...}`` entry per input
        sequence, in input order.

    Raises
    ------
    RuntimeError
        If a batch returns a different number of predictions than sequences
        sent; ``zip`` would otherwise silently truncate the pairing.
    """
    paired = []
    for offset in tqdm(range(0, len(sequences), batch_size),
                       desc=f"Processing {label} sequences"):
        batch = sequences[offset:offset + batch_size]
        response_data = process_batch(batch)

        # Reverse the predictions to match the order of sequences in the batch.
        # NOTE(review): relies on the API answering in reverse submission
        # order - confirm against the service.
        predictions_reversed = response_data.get('predictions', [])[::-1]

        if len(predictions_reversed) != len(batch):
            raise RuntimeError(
                f"{label}: batch starting at index {offset} returned "
                f"{len(predictions_reversed)} predictions for {len(batch)} sequences"
            )

        paired.extend(
            {"sequence_id": seq.id, "prediction": prediction}
            for seq, prediction in zip(batch, predictions_reversed)
        )
    return paired


# NOTE: these names are rebound here; from this cell on they hold the
# Plasmodium predictions rather than the test-set ones.
function_predictions = _predict_plasmodium_set(plasmodium_MFO_sequences, "function")
assert len(function_predictions) == len(plasmodium_MFO_sequences)

process_predictions = _predict_plasmodium_set(plasmodium_BPO_sequences, "process")
assert len(process_predictions) == len(plasmodium_BPO_sequences)

component_predictions = _predict_plasmodium_set(plasmodium_CCO_sequences, "component")
assert len(component_predictions) == len(plasmodium_CCO_sequences)



In [None]:
# Persist the raw Plasmodium predictions, one pickle file per ontology.
for _pickle_name, _predictions in (
    ("DeepGOPlus_function_predictions_threshold0.pkl", function_predictions),
    ("DeepGOPlus_process_predictions_threshold0.pkl", process_predictions),
    ("DeepGOPlus_component_predictions_threshold0.pkl", component_predictions),
):
    with open(_pickle_name, "wb") as output_handle:
        pickle.dump(_predictions, output_handle)

In [None]:
# Reduce each Plasmodium prediction to the term list of its own ontology,
# keyed by sequence id.
# NOTE(review): the hard-coded positions 1 / 2 / 0 in the 'functions' list are
# presumably the function / process / component entries of the API response -
# confirm before relying on them.
function_results_filtered = {
    record['sequence_id']: record['prediction']['functions'][1]['functions']
    for record in function_predictions
}

process_results_filtered = {
    record['sequence_id']: record['prediction']['functions'][2]['functions']
    for record in process_predictions
}

component_results_filtered = {
    record['sequence_id']: record['prediction']['functions'][0]['functions']
    for record in component_predictions
}

# Evaluation

In [None]:

# Load the headerless, tab-separated IA table (IA_all.tsv, resolved relative to
# the working directory) and label its two columns.
ia_df = pd.read_csv(r'IA_all.tsv', sep='\t', header=None)
ia_df.columns = ['GO', 'IA']

# Lookup from each value in the 'GO' column to its 'IA' value.
ic_dict = {go_id: ia_value for go_id, ia_value in zip(ia_df['GO'], ia_df['IA'])}

In [None]:
def df_to_annotation_dict_with_scores(df):
    """
    Build a nested ``{Entry: {GO term: score}}`` mapping from a long-format table.

    ``df`` must provide the columns 'Entry', 'GO terms' and 'Score'. Rows are
    grouped by 'Entry'; within a group each 'GO terms' value maps to the
    'Score' of its row, and a repeated term keeps the score of its last row.
    """
    return {
        entry: dict(zip(rows['GO terms'].tolist(), rows['Score'].tolist()))
        for entry, rows in df.groupby('Entry')
    }

# Convert the ground truth DataFrames to dictionaries (no scores needed)
# NOTE(review): `df_to_annotation_dict` and the FUNCTION/PROCESS/COMPONENT_GROUND_TRUTH
# frames are not defined in any cell visible in this notebook; on a fresh kernel
# these lines raise NameError unless they are provided elsewhere - confirm and
# add the defining cell(s).
function_ground_truth_dict = df_to_annotation_dict(FUNCTION_GROUND_TRUTH)
process_ground_truth_dict = df_to_annotation_dict(PROCESS_GROUND_TRUTH)
component_ground_truth_dict = df_to_annotation_dict(COMPONENT_GROUND_TRUTH)

# Convert the prediction DataFrames to dictionaries (with scores)
# NOTE(review): `function_df_out`, `process_df_out` and `component_df_out` are
# likewise not created in any visible cell (the `*_results_filtered` dicts built
# earlier are never turned into these frames here) - confirm where they come from.
# They must carry the 'Entry', 'GO terms' and 'Score' columns that
# df_to_annotation_dict_with_scores reads.
function_pred_dict = df_to_annotation_dict_with_scores(function_df_out)
process_pred_dict = df_to_annotation_dict_with_scores(process_df_out)
component_pred_dict = df_to_annotation_dict_with_scores(component_df_out)


In [None]:
from utils_corrected import evaluate_annotations, _calculate_metrics_at_threshold, threshold_performance_metrics

In [None]:
# Each ontology is evaluated over thresholds 0.00 .. 1.00 in steps of 0.01,
# followed by its micro-averaged AUPR.
# NOTE(review): `calculate_aupr_micro` is neither defined nor imported in any
# visible cell - confirm where it comes from.

# --- Function ontology (MFO) ---
(
    MFO_smin,
    MFO_fmax,
    MFO_best_threshold_s,
    MFO_best_threshold_f,
    MFO_s_at_fmax,
    MFO_results_df,
) = threshold_performance_metrics(
    ic_dict,
    function_ground_truth_dict,
    function_pred_dict,
    threshold_range=np.arange(0, 1.01, 0.01),
)
MFO_aupr_micro = calculate_aupr_micro(function_ground_truth_dict, function_pred_dict)
print(f"AUPR Micro: {MFO_aupr_micro}")

# --- Process ontology (BPO) ---
(
    BPO_smin,
    BPO_fmax,
    BPO_best_threshold_s,
    BPO_best_threshold_f,
    BPO_s_at_fmax,
    BPO_results_df,
) = threshold_performance_metrics(
    ic_dict,
    process_ground_truth_dict,
    process_pred_dict,
    threshold_range=np.arange(0, 1.01, 0.01),
)
BPO_aupr_micro = calculate_aupr_micro(process_ground_truth_dict, process_pred_dict)
print(f"AUPR Micro: {BPO_aupr_micro}")

# --- Component ontology (CCO) ---
(
    CCO_smin,
    CCO_fmax,
    CCO_best_threshold_s,
    CCO_best_threshold_f,
    CCO_s_at_fmax,
    CCO_results_df,
) = threshold_performance_metrics(
    ic_dict,
    component_ground_truth_dict,
    component_pred_dict,
    threshold_range=np.arange(0, 1.01, 0.01),
)
CCO_aupr_micro = calculate_aupr_micro(component_ground_truth_dict, component_pred_dict)
print(f"AUPR Micro: {CCO_aupr_micro}")



In [None]:
# Write the per-threshold metric tables to CSV, one file per ontology,
# without the index column.
for _results_table, _csv_name in (
    (MFO_results_df, "Plasmodium_DeepGO_Plus_MFO_results_df.csv"),
    (BPO_results_df, "Plasmodium_DeepGO_Plus_BPO_results_df.csv"),
    (CCO_results_df, "Plasmodium_DeepGO_Plus_CCO_results_df.csv"),
):
    _results_table.to_csv(_csv_name, index=False)

In [None]:
# For every ontology, report S at the F-max threshold together with the 'ru'
# and 'mi' columns of the results row whose 'n' equals that threshold.
# The row filter is computed once per ontology and reused for both columns.

# MFO
print(f"MFO: S-min @ F-max Threshold ({MFO_best_threshold_f}): {MFO_s_at_fmax}")
mfo_rows_at_fmax = MFO_results_df[MFO_results_df['n'] == MFO_best_threshold_f]
ru = mfo_rows_at_fmax['ru'].values[0]
mi = mfo_rows_at_fmax['mi'].values[0]
print(f"MFO: ru @ F-max Threshold: {ru}")
print(f"MFO: mi @ F-max Threshold: {mi}")

# BPO
print(f"BPO: S-min @ F-max Threshold ({BPO_best_threshold_f}): {BPO_s_at_fmax}")
bpo_rows_at_fmax = BPO_results_df[BPO_results_df['n'] == BPO_best_threshold_f]
ru = bpo_rows_at_fmax['ru'].values[0]
mi = bpo_rows_at_fmax['mi'].values[0]
print(f"BPO: ru @ F-max Threshold: {ru}")
print(f"BPO: mi @ F-max Threshold: {mi}")

# CCO
print(f"CCO: S-min @ F-max Threshold ({CCO_best_threshold_f}): {CCO_s_at_fmax}")
cco_rows_at_fmax = CCO_results_df[CCO_results_df['n'] == CCO_best_threshold_f]
ru = cco_rows_at_fmax['ru'].values[0]
mi = cco_rows_at_fmax['mi'].values[0]
print(f"CCO: ru @ F-max Threshold: {ru}")
print(f"CCO: mi @ F-max Threshold: {mi}")
