# Setup

In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import average_precision_score
from tqdm import tqdm
import matplotlib.pyplot as plt
import plotly.express as px

import sys
sys.path.append("../../..")
from src.models import training_utils, sage_ones

# Root of the processed graph data; the split datasets are read from
# "split_dataset/seed_<n>/" underneath it.
data_folder = "../../../data/processed/graph_data_nohubs/merged_types/"
# Folder holding the per-model hits reports; used as the default of load_hits_df.
reports_folder = "../../../reports/explore_predictions/"

# Load a pretrained model

In [15]:
seed = 0

# Edge type whose links the model predicts; reused for the model head below
# so the two can never drift apart.
prediction_edge_type = ("gene_protein","gda","disease")

# Everything that depends on the split (datasets, node map, node table) is
# read from the same seed folder.
split_folder = data_folder+f"split_dataset/seed_{seed}/"
datasets, node_map = training_utils.load_data(split_folder)
train_data, val_data = datasets

feature_type = "ones"
feature_dim = 10
train_data = training_utils.initialize_features(train_data, feature_type, feature_dim)
val_data = training_utils.initialize_features(val_data, feature_type, feature_dim)

# weights_path = "../../../data/experiments/merged_types_experiment/sage_ones_merged_experiment_13_06_23__15_59.pth"
weights_path = "../../../data/experiments/merged_types_experiment/sage_ones_first_negatives_exp_04_07_23__12_07.pth"
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
weights = torch.load(weights_path, map_location="cpu")
model = sage_ones.Model(train_data.metadata(),[prediction_edge_type])
model.load_state_dict(weights)
# Inference only: switch off training-time behaviour (dropout, batch-norm updates).
model.eval()

# NOTE(review): this used to read "split_dataset/seed_4/tensor_df.csv" while the
# datasets and node_map above come from seed_{seed}. It now follows `seed`;
# confirm that tensor_df.csv exists for every seed.
node_df = pd.read_csv(split_folder+"tensor_df.csv",index_col=0).set_index("node_index",drop=True)

# Get encodings

In [3]:
%%timeit
# Benchmark only: names assigned inside a %%timeit cell are not kept in the
# notebook namespace, which is why the same call is repeated in the next cell.
encodings_dict = training_utils.get_encodings(model,val_data)

36.6 ms ± 3.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# Encodings actually used downstream; Predictor indexes the result as
# encodings_dict[node_type][tensor_index].
encodings_dict = training_utils.get_encodings(model,val_data)

# Prioritization

In [5]:
class Predictor():
    """Scores and ranks gene_protein-disease pairs from precomputed node encodings.

    Attributes:
        df: node table indexed by "node_index"; the methods read its
            "node_type", "tensor_index" and "node_name" columns.
        encodings: mapping node type -> tensor of encodings, indexed by
            tensor_index along its first axis.
    """

    # The two ends of the prediction edge: given one node type, the other is the ranking target.
    _OPPOSITE_TYPE = {"disease": "gene_protein", "gene_protein": "disease"}

    def __init__(self,node_df, encodings_dict):
        assert node_df.index.name == "node_index", f"df index must be node_index, not {node_df.index.name}."

        self.df = node_df
        self.encodings = encodings_dict

    def _opposite_type(self, node_type):
        """Return the node type on the other end of the prediction edge.

        Raises:
            ValueError: if node_type is neither "disease" nor "gene_protein".
        """
        try:
            return self._OPPOSITE_TYPE[node_type]
        except KeyError:
            raise ValueError(
                f"Cannot prioritize for node type {node_type!r}; expected one of {sorted(self._OPPOSITE_TYPE)}."
            ) from None

    def inner_product_decoder(self,x_source,x_target,apply_sigmoid=True):
        """Dot product of source and target encodings along the last axis.

        The operands are broadcast against each other, so a single source
        vector can be scored against a whole matrix of targets.

        Returns:
            Tensor of scores, passed through a sigmoid when apply_sigmoid is True.
        """
        # Reducing over the last axis (instead of axis 1) gives the same result
        # for 2-D products and also supports scoring a single pair of 1-D vectors.
        pred = (x_source * x_target).sum(dim=-1)

        if apply_sigmoid:
            pred = torch.sigmoid(pred)

        return pred

    def prioritize_one_vs_all(self,node_index):
        """Rank every node of the opposite type against the given node.

        Args:
            node_index: label in the index of self.df of a "disease" or
                "gene_protein" node.

        Returns:
            DataFrame sorted by descending score, with columns "score"
            (rounded to 3 decimals), "tensor_index", "node_index" and
            "node_name"; its index is named "rank".

        Raises:
            ValueError: if the node is neither a disease nor a gene_protein.
        """
        source_type = self.df.loc[node_index,"node_type"]
        tensor_index = self.df.loc[node_index,"tensor_index"]
        target_type = self._opposite_type(source_type)

        source_vector = self.encodings[source_type][tensor_index]
        target_matrix = self.encodings[target_type]

        predicted_edges = self.inner_product_decoder(source_vector,target_matrix)
        ranked_scores, ranked_indices = torch.sort(predicted_edges,descending=True)
        # detach() so that encodings still attached to the autograd graph can be exported to numpy.
        results = pd.DataFrame({
            "score": ranked_scores.detach().cpu().numpy(),
            "tensor_index": ranked_indices.cpu().numpy(),
        })
        results["score"] = results["score"].round(3)

        # Translate tensor positions back to node_index / node_name; the inner
        # merge keeps the row order of `results`, i.e. the ranking.
        index_map = self.df.loc[self.df.node_type == target_type,["tensor_index","node_name"]].reset_index()
        ranked_predictions = pd.merge(results,index_map,on="tensor_index")
        ranked_predictions.index.name = "rank"

        return ranked_predictions

    def predict_supervision_edges(self,data, edge_type, return_dataframe=True):
        """Score the labelled (supervision) edges of `data` for one edge type.

        Args:
            data: object exposing `edge_label_index_dict` and `edge_label_dict`,
                both keyed by edge type.
            edge_type: (source_type, relation, target_type) tuple.
            return_dataframe: if True, return a DataFrame with the source and
                target tensor indices, the prediction scores and the labels.
                Otherwise return only the tensor of predicted scores.
        """
        src_type, trg_type = edge_type[0],edge_type[2]
        x_source = self.encodings[src_type]
        x_target = self.encodings[trg_type]

        edge_label_index = data.edge_label_index_dict[edge_type]
        source_index, target_index = edge_label_index[0], edge_label_index[1]

        emb_nodes_source = x_source[source_index]
        emb_nodes_target = x_target[target_index]

        pred = self.inner_product_decoder(emb_nodes_source, emb_nodes_target)
        if not return_dataframe:
            return pred

        # Column names follow the edge type, so a reversed or different edge
        # type is not mislabelled as gene_protein -> disease.
        src_col, trg_col = f"torch_{src_type}_index", f"torch_{trg_type}_index"
        if src_col == trg_col:
            # Same node type on both ends: disambiguate so neither column is lost.
            src_col, trg_col = f"torch_{src_type}_source_index", f"torch_{trg_type}_target_index"

        labels = data.edge_label_dict[edge_type]
        return pd.DataFrame({
            src_col: source_index.cpu().numpy(),
            trg_col: target_index.cpu().numpy(),
            "score": pred.detach().cpu().numpy(),
            "label": labels.detach().cpu().numpy(),
        })

    def hits_at_k(self,node_index,mapped_train,mapped_val,k_list=(5,10,50,100)):
        """Count known edges of a node recovered among its top-k predictions.

        Args:
            node_index: disease or gene_protein node to evaluate.
            mapped_train, mapped_val: DataFrames with "edge_type", "label" and
                one column per node type holding node indexes (the layout of
                MappedDataset(...).dataframe).
            k_list: cut-offs at which hits are counted.

        Returns:
            dict with "seen_edges" and "new_edges" (number of partners known
            from training / new in validation) plus, for every k, "<k>_seen"
            and "<k>_new" with the number of those partners found in the top k.
        """
        predictions = self.prioritize_one_vs_all(node_index)

        node_type = self.df.loc[node_index,"node_type"]
        y_type = self._opposite_type(node_type)

        is_new = (
            (mapped_val.edge_type == "supervision")
            & (mapped_val.label == 1)
            & (mapped_val[node_type] == node_index)
        )
        new_edges = set(mapped_val[is_new][y_type].values)

        # `label != 0` also keeps rows whose label is NaN, so edges without a
        # label are counted as seen together with the positive labelled ones.
        is_seen = (mapped_train.label != 0) & (mapped_train[node_type] == node_index)
        seen_edges = set(mapped_train[is_seen][y_type].values)

        results = {"seen_edges":len(seen_edges),"new_edges":len(new_edges)}

        ranked_nodes = predictions["node_index"].values
        for k in k_list:
            predicted_top = set(ranked_nodes[:k])

            results[f"{k}_seen"] = len(seen_edges.intersection(predicted_top))
            results[f"{k}_new"] = len(new_edges.intersection(predicted_top))

        return results

# Map datasets to index

In [6]:
class MappedDataset():
    """Edges of a heterogeneous graph expressed in original node indexes.

    On construction every edge type of `heterodata` is mapped back through
    `node_map` (stored in `edge_dict`), and the edges of `prediction_edge_type`
    are flattened into `dataframe`.
    """

    def __init__(self,heterodata,node_map,prediction_edge_type):
        self.prediction_edge_type = prediction_edge_type
        self.node_map = node_map
        self.edge_dict = self._reverse_map_heterodata(heterodata)
        self.dataframe = self._edge_dict_to_dataframe()

    def _reverse_map_tensor(self,tensor,edge_type):
        """Translate a 2-row edge tensor into original node indexes.

        Returns a dict of four equally long lists: the mapped source and target
        nodes, keyed by their node type, and the raw tensor indices, keyed by
        "torch_<type>_index".
        """
        src_type, dst_type = edge_type[0], edge_type[2]
        src_lookup = self.node_map[src_type]
        dst_lookup = self.node_map[dst_type]

        # Row 0 holds the sources, row 1 the targets.
        raw_sources = tensor[0].tolist()
        raw_targets = tensor[1].tolist()

        return {
            src_type: [src_lookup[position] for position in raw_sources],
            dst_type: [dst_lookup[position] for position in raw_targets],
            f"torch_{src_type}_index": raw_sources,
            f"torch_{dst_type}_index": raw_targets,
        }

    def _reverse_map_heterodata(self,data):
        """Map every edge type of `data` back to original node indexes.

        Each edge type gets a dict with "message_passing_edges" and, when the
        edge type carries an "edge_label_index", also "supervision_edges" and
        "supervision_labels".
        """
        mapped = {}
        for edge_type in data.edge_types:
            entry = {
                "message_passing_edges": self._reverse_map_tensor(data[edge_type]["edge_index"],edge_type)
            }

            if "edge_label_index" in data[edge_type].keys():
                entry["supervision_edges"] = self._reverse_map_tensor(data[edge_type]["edge_label_index"],edge_type)
                entry["supervision_labels"] = data[edge_type]["edge_label"].tolist()

            mapped[edge_type] = entry

        return mapped

    def _edge_dict_to_dataframe(self):
        """Stack the supervision and message-passing edges of the prediction edge type.

        Supervision rows come first and carry a "label"; every row is tagged in
        "edge_type" as "supervision" or "message_passing". Each part keeps its
        own 0-based index, so index labels repeat across the two parts.
        """
        per_type = self.edge_dict[self.prediction_edge_type]

        label_column = pd.DataFrame(per_type["supervision_labels"])
        supervision = pd.concat(
            [pd.DataFrame(per_type["supervision_edges"]), label_column], axis=1
        ).rename(columns={0:"label"})
        supervision["edge_type"] = "supervision"

        message_passing = pd.DataFrame(per_type["message_passing_edges"])
        message_passing["edge_type"] = "message_passing"

        return pd.concat([supervision, message_passing],axis=0)

# Express both splits in original node indexes; later cells read the
# resulting `.dataframe` attribute.
mapped_val = MappedDataset(val_data,node_map,prediction_edge_type)
mapped_train = MappedDataset(train_data,node_map,prediction_edge_type)

In [17]:
# Score every validation supervision edge, then attach the original node
# indexes of its two endpoints.
predictor = Predictor(node_df,encodings_dict)
pred = predictor.predict_supervision_edges(val_data, prediction_edge_type,return_dataframe=True)
val_supervision_edges = mapped_val.dataframe[mapped_val.dataframe.edge_type == "supervision"]
# The column-wise concat aligns rows by index label: both frames list the
# supervision edges in edge_label_index order under a 0-based index.
pred = pd.concat([pred,val_supervision_edges[["gene_protein","disease"]]], axis=1)
pred


Unnamed: 0,torch_gene_protein_index,torch_disease_index,score,label,gene_protein,disease
0,11174,5020,0.724898,1.0,6914,25553
1,1670,378,0.752792,1.0,10812,24180
2,3740,10341,0.406674,1.0,7801,31007
3,11693,73,0.508933,1.0,12316,31722
4,13379,24,0.298094,1.0,15478,22836
...,...,...,...,...,...,...
16797,4054,5347,0.477559,0.0,4242,29172
16798,2364,12740,0.096699,0.0,10473,25972
16799,14622,146,0.273133,0.0,11148,23488
16800,14170,14214,0.652375,0.0,10287,18688


In [18]:
# Spot check: labelled validation edges of one disease (node_index 24180),
# best score first.
idx = 24180
pred[pred.disease == idx].sort_values(by="score", ascending=False)[["label","score"]]

Unnamed: 0,label,score
4308,1.0,0.8508
5999,1.0,0.832559
688,1.0,0.830801
1,1.0,0.752792
2201,1.0,0.752717
6952,1.0,0.676001
1240,1.0,0.646692
5598,1.0,0.620009
7005,1.0,0.600299
6235,1.0,0.459423


In [19]:
def average_precision_at_k(ranked_labels, ranked_scores, k):
    """Average precision over the first k entries of a ranking.

    Both arrays must already be sorted by descending score. Slicing with [:k]
    is a no-op when fewer than k entries exist, so short rankings are simply
    evaluated on everything they have.
    """
    return average_precision_score(ranked_labels[:k], ranked_scores[:k])


# One pass over the groups instead of filtering and sorting `pred` twice per
# disease. sort=False keeps the diseases in order of first appearance, the
# same order pred.disease.unique() yields.
ap_records = []
for disease, disease_edges in pred.groupby("disease", sort=False):
    ranked = disease_edges.sort_values(by="score", ascending=False)
    labels = ranked["label"].values
    scores = ranked["score"].values

    # NOTE(review): a disease whose labelled edges are all negatives has no
    # positive class, so its AP comes out as 0 (shown as -0.0 in the output).
    # Those rows pull down any mean taken over the unfiltered frame.
    ap_records.append({
        "disease_index": disease,
        "ap_at_10": average_precision_at_k(labels, scores, 10),
        "ap_at_5": average_precision_at_k(labels, scores, 5),
        # k = number of labelled edges available for this disease.
        "k": len(labels),
    })

ap_df = pd.DataFrame(ap_records, columns=["disease_index","ap_at_10","ap_at_5","k"])
ap_df



Unnamed: 0,disease_index,ap_at_10,ap_at_5,k
0,25553,1.0,1.0,3
1,24180,1.0,1.0,19
2,31007,1.0,1.0,2
3,31722,1.0,1.0,55
4,22836,1.0,1.0,1
...,...,...,...,...
8276,34168,-0.0,-0.0,1
8277,27990,-0.0,-0.0,1
8278,25972,-0.0,-0.0,1
8279,18688,-0.0,-0.0,1


In [20]:
# Mean AP@5 over every disease, including those with fewer than 5 labelled edges.
ap_df.ap_at_5.mean().round(2)

0.33

In [11]:
# Diseases with fewer than 5 labelled validation edges.
ap_df[ap_df.k < 5]

Unnamed: 0,disease_index,ap_at_10,ap_at_5,k
0,25553,1.0,1.0,3
2,31007,1.0,1.0,2
4,22836,1.0,1.0,1
10,21097,1.0,1.0,1
11,30091,1.0,1.0,2
...,...,...,...,...
8276,34168,-0.0,-0.0,1
8277,27990,-0.0,-0.0,1
8278,25972,-0.0,-0.0,1
8279,18688,-0.0,-0.0,1


In [21]:
# Mean AP@5 restricted to diseases with at least 5 labelled edges.
ap_df[ap_df.k >= 5].ap_at_5.mean().round(2)

0.97

In [22]:
# Mean AP@10 restricted to diseases with at least 10 labelled edges.
ap_df[ap_df.k >= 10].ap_at_10.mean().round(2)

0.99

In [23]:
# Mean AP for diseases with fewer than 5 labelled edges; with so few rows the
# top-10 cut-off keeps every edge, so ap_at_10 is computed on the full ranking.
ap_df[ap_df.k < 5].ap_at_10.mean().round(2)

0.3

In [None]:
# plotly.express is already imported as `px` in the setup cell at the top of
# the notebook, so it is not imported again here.
# This cell needs `ap_df` from the AP@k cell above; run that cell first.

# Attach the disease rows of the node table to the per-disease AP scores.
aver = pd.merge(ap_df,node_df[node_df.node_type == "disease"], left_on="disease_index",right_index=True, how="left")
fig = px.scatter(
    aver,
    x="degree_gda",
    y="ap_at_10",
    # Plot texts are in Spanish, like the other figures in this notebook.
    title="AP@10 según grado GDA de la enfermedad",
    labels={"degree_gda":"Grado GDA","ap_at_10":"AP@10"},
)
fig.show()

NameError: name 'ap_df' is not defined

# Hits

In [None]:
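# NOTE: stale draft kept for reference; it does not match the current API.
# hits_at_k is a Predictor method called as
# predictor.hits_at_k(node_index, mapped_train, mapped_val) and computes the
# ranking itself, so the call below would fail if uncommented.
# disease_evals = {}
# for disease in tqdm(node_df[node_df.node_type == "disease"].index.values):
#     predictions = predictor.prioritize_one_vs_all(disease)
#     disease_evals[disease] = hits_at_k(disease,predictions,)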

In [79]:
def load_hits_df(model_name,reports_folder=reports_folder):
    """Read the four hits reports written for a model.

    Each file is read from reports_folder + model_name + suffix, using its
    first column as index.

    Returns:
        List of DataFrames in the fixed order
        [total_disease, summary_disease, summary_gene, total_gene].
    """
    suffixes = ("_total_disease.csv","_summary_disease.csv","_summary_gene.csv","_total_gene.csv")
    return [pd.read_csv(reports_folder+model_name+suffix,index_col=0) for suffix in suffixes]

def group_by_range(data_df,group_column,ranges,inplace=True):
    """Add a "bins" column with the np.digitize bin of `group_column`.

    Args:
        data_df: frame to annotate.
        group_column: column whose values are binned.
        ranges: bin edges handed to np.digitize.
        inplace: when True the column is written into `data_df` and nothing is
            returned; when False an annotated copy is returned and `data_df`
            is left untouched.
    """
    target = data_df if inplace else data_df.copy()
    target["bins"] = np.digitize(target[group_column].values, ranges)

    if not inplace:
        return target

def plot_box(data_df,value_cols,title,range_text):
    """Show box plots of the `value_cols` hit counts, one group per bin.

    Args:
        data_df: frame with a "bins" column and the columns in `value_cols`.
        value_cols: hit-count columns to compare; each becomes one colour.
        title: figure title.
        range_text: label for each bin, indexed by bin number.
    """
    long_df = data_df[["bins",*value_cols]].melt("bins")
    long_df = long_df.rename(columns={"value":"hits"})
    # Human-readable bin label, used as the x axis category.
    long_df["Nivel de Evidencia"] = [range_text[bin_number] for bin_number in long_df["bins"]]
    long_df = long_df.sort_values(by="bins")

    fig = px.box(
        long_df,
        y="hits",
        x="Nivel de Evidencia",
        color="variable",
        title=title,
        width=900,
        height=450,
        labels={"hits":"Hits"},
    )
    # Fixed y window: values above 10 fall outside the visible area.
    fig.update_yaxes(range=[-0.5, 10])
    fig.show()

In [81]:
model_name = "sage_ones_no_sampling"
# model_name = "sage_ones_first_negatives_exp_04_07_23__12_07"
hits_df = load_hits_df(model_name)

# Bin edges on degree_gda; the last edge sits just above the largest degree so
# every node falls below it.
max_disease_degree = hits_df[0].degree_gda.max()
max_gene_degree = hits_df[3].degree_gda.max()
disease_ranges = np.array([10,50,100,max_disease_degree+1]).astype(int)
gene_ranges = np.array([5,20,50,100,max_gene_degree+1]).astype(int)
# NOTE(review): these four labels match the disease bins only; gene_ranges has
# one more edge, so plot_box on a gene frame could index past this list.
range_text = ["< 10","10-50","50-100","100 +"]

# load_hits_df returns the two disease frames first, then the two gene frames.
for i,data in enumerate(hits_df):
    ranges = disease_ranges if i<2 else gene_ranges
    # Nodes with degree 0 are dropped before binning.
    hits_df[i] = group_by_range(data[data.degree_gda != 0],"degree_gda",ranges,inplace=False)

value_pairs = [["5_seen","5_new"],["10_seen","10_new"],["50_seen","50_new"]]
for pair in value_pairs:
    plot_box(hits_df[0],pair,"Evaluación Enfermedades",range_text)

In [63]:
# Same analysis as the previous cell, for the model trained with first negatives.
# model_name = "sage_ones_no_sampling"
model_name = "sage_ones_first_negatives_exp_04_07_23__12_07"
hits_df = load_hits_df(model_name)

# Bin edges on degree_gda; the last edge sits just above the largest degree so
# every node falls below it.
max_disease_degree = hits_df[0].degree_gda.max()
max_gene_degree = hits_df[3].degree_gda.max()
disease_ranges = np.array([10,50,100,max_disease_degree+1]).astype(int)
gene_ranges = np.array([5,20,50,100,max_gene_degree+1]).astype(int)
# NOTE(review): these four labels match the disease bins only; gene_ranges has
# one more edge, so plot_box on a gene frame could index past this list.
range_text = ["< 10","10-50","50-100","100 +"]

# load_hits_df returns the two disease frames first, then the two gene frames.
for i,data in enumerate(hits_df):
    ranges = disease_ranges if i<2 else gene_ranges
    # Nodes with degree 0 are dropped before binning.
    hits_df[i] = group_by_range(data[data.degree_gda != 0],"degree_gda",ranges,inplace=False)

value_pairs = [["5_seen","5_new"],["10_seen","10_new"],["50_seen","50_new"]]
for pair in value_pairs:
    plot_box(hits_df[0],pair,"Evaluación Enfermedades",range_text)