# Setup

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import roc_auc_score
from scipy import sparse
import numpy as np
from torch_geometric import seed_everything

from tqdm import tqdm
import sys
sys.path.append("../../..")
from src.models import training_utils,base_model

# Seed every RNG that torch_geometric's seed_everything covers so the run is
# repeatable. The same value also selects the dataset folder below.
seed = 4
seed_everything(seed)

# Folder of the split dataset; the path is keyed by the seed (".../seed_<seed>/").
data_folder = f"../../../data/processed/graph_data_nohubs/merged_types/split_dataset/seed_{seed}/"

In [2]:
# load_data returns a (datasets, node_map) pair; datasets unpacks into the
# train and validation objects used below.
datasets, node_map = training_utils.load_data(data_folder)
train_data, val_data = datasets

# Node table stored next to the split data. Later cells read its node_type,
# node_index and tensor_index columns.
tensor_df = pd.read_csv(data_folder+"tensor_df.csv",index_col=0)

## Test xavier init

In [45]:
# Initialise node features with the "random_xavier" scheme.
# NOTE(review): train_data is reassigned from itself, so re-running this cell
# initialises an already-initialised object — confirm that is harmless.
# NOTE(review): the literal 10 is presumably the feature dimension — TODO confirm
# against training_utils.initialize_features.
train_data = training_utils.initialize_features(train_data,"random_xavier",10)
# Show the resulting features for the "disease" node type.
train_data.x_dict["disease"]

Parameter containing:
tensor([[ 6.9402e-03, -9.9509e-03,  1.5045e-02,  ...,  1.2405e-02,
          4.1738e-03,  7.5350e-05],
        [-1.9038e-02,  1.3949e-03,  2.2808e-03,  ..., -1.3522e-03,
         -7.6680e-03, -5.6774e-03],
        [-1.6820e-02, -1.0448e-02, -3.7490e-03,  ...,  8.8095e-03,
         -1.3806e-02,  1.3130e-03],
        ...,
        [-6.2185e-03, -3.6905e-03,  1.1802e-03,  ...,  1.1522e-02,
         -1.2786e-02, -1.3349e-04],
        [ 1.4714e-02,  6.0671e-03, -1.3541e-02,  ..., -9.2281e-03,
         -1.5613e-02, -1.0152e-03],
        [-2.7347e-03,  5.3925e-03,  1.2545e-02,  ...,  9.0000e-04,
          6.4353e-03, -1.4098e-02]])

## Natural features

Vector LSA

In [82]:
def load_sparse_dataframe(matrix_path,row_path):
    """Load a SciPy sparse matrix from disk as a sparse-backed DataFrame.

    Parameters
    ----------
    matrix_path : str or path-like
        ``.npz`` file readable by ``scipy.sparse.load_npz``.
    row_path : str or path-like
        Text file readable by ``np.loadtxt`` with the row labels; it must
        contain as many labels as the matrix has rows.

    Returns
    -------
    pandas.DataFrame
        Frame with sparse columns whose index holds the row labels. The
        labels are parsed by ``np.loadtxt`` with its default dtype, so they
        come back as floats.
    """
    mat = sparse.load_npz(matrix_path)
    # ndmin=1 keeps a file with a single label as a length-1 array; without it
    # loadtxt returns a 0-d array, which pandas cannot use as an index.
    row = np.loadtxt(row_path, ndmin=1)
    df = pd.DataFrame.sparse.from_spmatrix(mat, index=row)
    return df

# Locations of the stored LSA matrix (.npz) and of the text file with its row labels.
lsa_matrix_path = "../../../data/processed/graph_data_nohubs/LSA_data/lsa_matrix_0.npz"
index_path = "../../../data/processed/graph_data_nohubs/LSA_data/matrix_index_0.txt"

# Read the sparse frame and convert it to a dense one in a single expression.
lsa_matrix = load_sparse_dataframe(
    matrix_path=lsa_matrix_path,
    row_path=index_path,
).sparse.to_dense()

In [83]:
# Display the dense LSA frame; rows are labelled by the values read from the index file.
lsa_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
20754.0,0.134274,-0.017062,-0.051522,-0.060388,-0.006148,-0.016418,-0.015471,-0.035477,-0.002372,-0.025263,...,0.000935,0.016881,-0.034059,-0.016362,-0.004727,0.076864,0.015240,0.020612,0.041545,-0.013124
24165.0,0.137190,-0.300671,0.234870,0.104356,0.092940,0.064492,0.126296,0.037364,0.064942,0.030029,...,-0.012396,-0.026856,-0.005593,-0.007346,-0.000203,-0.001395,0.016450,-0.022085,0.013709,-0.003217
18773.0,0.139381,-0.293749,0.224677,0.104450,0.098511,0.066586,0.169885,0.015898,0.094341,0.006254,...,-0.021818,-0.009280,-0.011640,0.007414,-0.018007,0.019184,0.001507,0.010214,0.018266,-0.007354
18732.0,0.122613,-0.277884,0.227395,0.116528,0.104370,0.083421,0.309792,-0.053349,0.205181,-0.051788,...,-0.008870,-0.014389,0.009127,-0.027230,-0.002844,-0.007976,0.017064,0.005707,-0.010845,0.013501
18728.0,0.129513,-0.281187,0.225254,0.117831,0.101595,0.086694,0.302238,-0.059431,0.208859,-0.044756,...,-0.024839,0.009879,0.009744,-0.009497,0.003490,-0.015645,0.002977,-0.007414,0.002706,0.008735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30820.0,0.156439,0.004854,-0.077726,0.026575,-0.041974,0.075319,-0.029100,0.012105,0.031990,0.083214,...,0.013188,0.021869,0.015668,0.012398,0.014488,0.013122,0.029597,-0.027913,0.005425,-0.009401
30959.0,0.111105,0.020195,-0.001945,-0.031119,-0.029401,-0.035818,-0.025760,0.005888,-0.011574,0.001891,...,0.007074,-0.019973,-0.005719,-0.002065,0.035066,0.030590,-0.035403,0.000772,0.004327,-0.004401
33955.0,0.053066,-0.079916,0.055155,0.007849,0.011768,-0.003534,-0.066150,0.034540,-0.083015,0.046320,...,0.008698,-0.000442,0.030018,-0.002018,0.014918,-0.017016,0.043703,-0.059441,-0.021945,0.027495
32968.0,0.137585,-0.028859,0.004807,-0.047921,-0.053716,-0.054462,-0.056229,-0.001315,-0.055701,0.001595,...,-0.000907,-0.011576,0.012220,-0.004495,-0.018278,-0.023783,-0.008124,0.055886,0.013614,-0.010334


In [84]:
# Percentage of disease nodes without an LSA vector.
# NOTE(review): this treats every row of lsa_matrix as a distinct disease node — confirm.
node_type_counts = tensor_df.node_type.value_counts()
total_number = node_type_counts["disease"]
missing_number = total_number - lsa_matrix.shape[0]
missing_feature_rate = round(100 * missing_number / total_number, 2)
print(f"Missing feature rate of {missing_feature_rate}%")

Missing feature rate of 23.08%


Con un missing rate de este orden (~23 %), según el paper de FP se podrían reemplazar los features faltantes por la media global (global mean) o por ceros.

Ahora solo falta ordenar las filas por tensor_index para que queden alineadas con los nodos.

In [111]:
# Build one feature row per disease node: nodes with an LSA vector keep it,
# the rest receive the column-wise mean of the available vectors. Rows are
# then ordered by tensor_index.
global_mean = lsa_matrix.mean(axis=0).values
# The row labels were read as floats; cast them so they match node_index.
lsa_matrix.index = lsa_matrix.index.astype(int)
has_feature = lsa_matrix.index.values

# Restrict to disease nodes once and reuse the subset for both the
# missing-node lookup and the merge, so rows of other node types cannot
# leak into the disease feature table.
disease_only = tensor_df[tensor_df.node_type == "disease"]
no_feature = list(set(disease_only.node_index.values) - set(has_feature))

# One copy of the global mean per node without features; columns are pinned
# to those of lsa_matrix so the concat aligns column by column.
global_mean_matrix = np.tile(global_mean,(len(no_feature),1))
global_mean_df = pd.DataFrame(global_mean_matrix, index=no_feature, columns=lsa_matrix.columns)
full_feature_df = pd.concat([lsa_matrix,global_mean_df]).reset_index().rename(columns={"index":"node_index"})

# Attach tensor_index, sort by it and use it as the index.
full_feature_df = (
    pd.merge(full_feature_df, disease_only[["node_index","tensor_index"]], on="node_index")
    .sort_values(by="tensor_index")
    .drop(columns=["node_index"])
    .set_index("tensor_index")
)
# Guard against duplicated labels that would silently misalign the rows.
assert len(full_feature_df) == len(disease_only), "expected exactly one feature row per disease node"
full_feature_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,240,241,242,243,244,245,246,247,248,249
tensor_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.109681,-0.018991,-0.003220,-0.005686,-0.003634,-0.008605,-0.000755,0.002757,0.001019,0.002879,...,0.000180,0.000474,-0.000118,0.000564,-0.000392,-0.000240,-0.000087,-0.000053,0.000205,0.000081
1,0.134274,-0.017062,-0.051522,-0.060388,-0.006148,-0.016418,-0.015471,-0.035477,-0.002372,-0.025263,...,0.000935,0.016881,-0.034059,-0.016362,-0.004727,0.076864,0.015240,0.020612,0.041545,-0.013124
2,0.137190,-0.300671,0.234870,0.104356,0.092940,0.064492,0.126296,0.037364,0.064942,0.030029,...,-0.012396,-0.026856,-0.005593,-0.007346,-0.000203,-0.001395,0.016450,-0.022085,0.013709,-0.003217
3,0.139381,-0.293749,0.224677,0.104450,0.098511,0.066586,0.169885,0.015898,0.094341,0.006254,...,-0.021818,-0.009280,-0.011640,0.007414,-0.018007,0.019184,0.001507,0.010214,0.018266,-0.007354
4,0.122613,-0.277884,0.227395,0.116528,0.104370,0.083421,0.309792,-0.053349,0.205181,-0.051788,...,-0.008870,-0.014389,0.009127,-0.027230,-0.002844,-0.007976,0.017064,0.005707,-0.010845,0.013501
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16074,0.156439,0.004854,-0.077726,0.026575,-0.041974,0.075319,-0.029100,0.012105,0.031990,0.083214,...,0.013188,0.021869,0.015668,0.012398,0.014488,0.013122,0.029597,-0.027913,0.005425,-0.009401
16075,0.111105,0.020195,-0.001945,-0.031119,-0.029401,-0.035818,-0.025760,0.005888,-0.011574,0.001891,...,0.007074,-0.019973,-0.005719,-0.002065,0.035066,0.030590,-0.035403,0.000772,0.004327,-0.004401
16076,0.053066,-0.079916,0.055155,0.007849,0.011768,-0.003534,-0.066150,0.034540,-0.083015,0.046320,...,0.008698,-0.000442,0.030018,-0.002018,0.014918,-0.017016,0.043703,-0.059441,-0.021945,0.027495
16077,0.137585,-0.028859,0.004807,-0.047921,-0.053716,-0.054462,-0.056229,-0.001315,-0.055701,0.001595,...,-0.000907,-0.011576,0.012220,-0.004495,-0.018278,-0.023783,-0.008124,0.055886,0.013614,-0.010334


In [112]:
# Convert the aligned feature table to a tensor. torch.tensor keeps the NumPy
# dtype (float64 for default pandas/NumPy floats), whereas the Xavier-initialised
# features above print in torch's default float32; cast explicitly so both
# feature sources share a dtype.
lsa_feature_tensor = torch.tensor(full_feature_df.values, dtype=torch.float32)