In [7]:
# --- UPDATET PREPROCESSING FUNCTION! ----
# Notbeook to create SUBGRAPH in networkx out of random nodes
# subgraph must be ego_graph to maintain connected nodes
# Subgraph should be used for Ablation study with less nodes 

# SECOND PURPOSE:
# create Graph, that contains less relations and therefore less Signal-to-Noise:
# a) delete all rare relations
# b) delete all relations, that dont contain any information for the risk-prediction task, e.g. "Is a", "Is descendeant", "Parent/Child"..

import pandas as pd
import networkx as nx
import random
import numpy as np
import pandas as pd
from collections import defaultdict
from pykeen.triples import TriplesFactory



In [11]:
# FIRST
# preprocess graph

# funktion to create the triple_list containing the node-relation-node triple from the latest graph
# title to name triple list
def create_triple_list(title, preprocessed_graph): ##tripleList):
    #create Triple List
    tripleList=[]
    for u,v,data in preprocessed_graph.edges.data():
        l=[]
        l.append(u)
        l.append(data['edge_code'])
        l.append(v)
        tripleList.append(l)

    #needs triples as ndarray - shape (n,3), dtype:str 
    tripleArray=np.array(tripleList, dtype=str)
    print(type(tripleArray), tripleArray.shape, tripleArray.dtype)
    
    #save TripleList as feather file
    ##p_title = '/home/tilingl/Pykeen/Triple_Lists/'+ title +'.feather'
    # triple array to pandas df
    df = pd.DataFrame(tripleArray, columns=['node1', 'relation_code','node2'])
    df.head()
    df.tail()
    df.to_feather(title)
    print(title)
    #return tripleList
def read_text(path):
    # opening the file in read mode
    my_file = open(path, "r")
    # reading the file
    data = my_file.read()
    # replacing end of line('/n') with ' ' and
    # splitting the text it further when '.' is seen.
    data_into_list = data.split("\n")
    my_file.close()
    return data_into_list


# function to drop specific edgetypes and filter for weight
def preprocess_delete_relations(graph: nx.Graph, exclude_path):
        edge_types = []
        edge_dict = defaultdict(lambda: None)
        
        for u, v, data in graph.edges.data():
            edge_types.append(data["edge_type"])
            # counting the appearence of the edgetypes
            if edge_dict[data["edge_type"]] is None:
                edge_dict[data["edge_type"]] = 0
            else:
                edge_dict[data["edge_type"]] +=1
                
        
        # codes , uniques
        edge_codes, edge_types = pd.factorize(edge_types)
        
        node_types = []
        for n, data in graph.nodes.data():
            node_types.append(data["node_type"])

        node_codes, node_types = pd.factorize(node_types)

        preprocessed_graph = nx.DiGraph()
        preprocessed_graph.add_nodes_from(graph.nodes())

        preprocessed_graph.node_codes = node_codes
        preprocessed_graph.node_types = node_types

        omop_exclude_codes = []
        ot_exclude_dict = defaultdict(lambda: None)
        #drop shortcut bound to edge weight
        omop_exclude_codes.append(list(edge_types).index("Subsumes"))
        omop_exclude_codes.append(list(edge_types).index("Is a"))
        
        #load edge_codes that will be excluded
        irrelevant_edges=read_text(exclude_path)
        print(len(irrelevant_edges))
        # append to drop-list
        for code in irrelevant_edges:
            ot_exclude_dict[list(edge_types).index(code)] = 1
        # TODO: multiple indices?
        ot_exclude_dict[list(edge_types).index("Is descendant of")] = 1
        ot_exclude_dict[list(edge_types).index("Is ancestor of")] =1
        print(ot_exclude_dict)
        c2=0
        c1=0
        
        for (u, v, w), c in zip(graph.edges.data("edge_weight"), edge_codes):
            assert w is not None
            # drop shortcut edges( droped in new graph)
           #if c in omop_exclude_codes and w < 1.0:
                #ontinue
            if w <= 0.01:
                c1+=1
                continue
            
            if ot_exclude_dict[c] is not None:
                c2 +=1
                continue

            preprocessed_graph.add_edge(u, v, edge_weight=w, edge_code=c)
        
        print('counter skipped: ', c1, c2)
        return preprocessed_graph
    
# function to drop schortcut edges from:https://github.com/nebw/ehrgraphs/blob/master/ehrgraphs/data/data.py#L82-L117
def preprocess_graph_heterogeneous(graph: nx.Graph):
        edge_types = []
        for u, v, data in graph.edges.data():
            edge_types.append(data["edge_type"])

        edge_codes, edge_types = pd.factorize(edge_types)

        node_types = []
        for n, data in graph.nodes.data():
            node_types.append(data["node_type"])

        node_codes, node_types = pd.factorize(node_types)

        preprocessed_graph = nx.DiGraph()
        preprocessed_graph.add_nodes_from(graph.nodes())

        preprocessed_graph.node_codes = node_codes
        preprocessed_graph.node_types = node_types
        
        omop_exclude_codes = []
        ot_exclude_codes = []
        #drop shortcut edges =True
        omop_exclude_codes.append(list(edge_types).index("Subsumes"))
        omop_exclude_codes.append(list(edge_types).index("Is a"))
        # TODO: multiple indices?
        ot_exclude_codes.append(list(edge_types).index("Is descendant of"))
        ot_exclude_codes.append(list(edge_types).index("Is ancestor of"))
        print(len(omop_exclude_codes), omop_exclude_codes, '\n', len(ot_exclude_codes), ot_exclude_codes)
        c2=0
        c1=0
        for (u, v, w), c in zip(graph.edges.data("edge_weight"), edge_codes):
            assert w is not None
            

            # drop shortcut edges
            #f c in omop_exclude_codes and w < 1.0:
                #ontinue
            """
            if w <= 0.01:
                c1+=1
                continue"""

            if c in ot_exclude_codes:
                c2 +=1
                continue

            preprocessed_graph.add_edge(u, v, edge_weight=w, edge_code=c)

        #self.edge_types = edge_types
        print('counter scipped:', c1, c2)

        return preprocessed_graph

In [3]:
# Loading the Graph with networkx !

path = '/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/artifacts/graph_full_220413.p'
# load latest graph into networkx
g = nx.read_gpickle(path)


In [4]:
######           BACHELOR THESIS            ################

# examine the networkx graph stats
preprocessed_g = preprocess_graph_heterogeneous(g)

print(f' in Total: number nodes: {g.number_of_nodes()} and number of edges: {g.number_of_edges()}')
print(f' in preprocessed: number nodes: {preprocessed_g.number_of_nodes()} and number of edges: {preprocessed_g.number_of_edges()}')
print(f' difference to preprocessed: {g.number_of_nodes() - preprocessed_g.number_of_nodes()}, {g.number_of_edges()-preprocessed_g.number_of_edges()}')

2 [0, 1] 
 2 [48, 49]
counter scipped: 3374992 163600
 in Total: number nodes: 739579 and number of edges: 24335194
 in preprocessed: number nodes: 739579 and number of edges: 20770287
 difference to preprocessed: 0, 3564907


In [12]:
######           BACHELOR THESIS            ################

# examine the networkx graph stats
# rare_relations
##rare_relations_G = preprocess_delete_relations(g, 'exclude_rare.txt') # 180924

print(f' in Total: number nodes: {rare_relations_G.number_of_nodes()} and number of edges: {rare_relations_G.number_of_edges()}')

# filtering with edge_weight lower 0.01
# attention: check to include w<=0.01 in preprocess_graph_heterogeneous
##less001_G= preprocess_graph_heterogeneous(g)
print(f' in Total: number nodes: {less001_G.number_of_nodes()} and number of edges: {less001_G.number_of_edges()}')

# less relations:
##less_G= preprocess_delete_relations(g, 'exclude_edges.txt')
print(f' in Total: number nodes: {less_G.number_of_nodes()} and number of edges: {less_G.number_of_edges()}')

# normal preprocessing:
# attention: check that w<=00.1 is commented out in preprocess_graph_heterogeneous
default_prepr_G=preprocess_graph_heterogeneous(g)
print(f' in Total: number nodes: {default_prepr_G.number_of_nodes()} and number of edges: {default_prepr_G.number_of_edges()}')


 in Total: number nodes: 739579 and number of edges: 20753877
 in Total: number nodes: 739579 and number of edges: 20770287
 in Total: number nodes: 739579 and number of edges: 17574086
2 [0, 1] 
 2 [48, 49]
counter scipped: 0 163600
 in Total: number nodes: 739579 and number of edges: 24145279


In [13]:
######           BACHELOR THESIS            ################
# identify the number of triples used with the filtered Graphs
"""title_path_list=[]
# rare-relations:
title = '/home/tilingl/Pykeen/Triple_Lists/'+ 'rare_relations-TEMP' +'.feather'
#create_triple_list(title, rare_relations_G)
title_path_list.append(title)

# edge-weigt lower 0.01:
title = '/home/tilingl/Pykeen/Triple_Lists/'+ 'less001_relations-TEMP' +'.feather'
create_triple_list(title, less001_G)
title_path_list.append(title)

# less relations
title = '/home/tilingl/Pykeen/Triple_Lists/'+ 'less_relations-TEMP' +'.feather'
create_triple_list(title, less_G)
title_path_list.append(title)
"""
title = '/home/tilingl/Pykeen/Triple_Lists/'+ 'default_relations-TEMP' +'.feather'
create_triple_list(title, default_prepr_G)
title_path_list.append(title)


# load into pykeen Triple Factory and print stats
for path in title_path_list:
    tripleArray = pd.read_feather(path).to_numpy()
    print('length of the triple array: ', len(tripleArray), type(tripleArray))
    tf = TriplesFactory.from_labeled_triples(tripleArray, create_inverse_triples=True)
    
    print('loading TriplesFactory done ... ')
    print(f'number entites: {tf.num_entities},number relations: {tf.num_relations} number of triples: {tf.num_triples}') 


<class 'numpy.ndarray'> (24145279, 3) <U21
/home/tilingl/Pykeen/Triple_Lists/default_relations-TEMP.feather
length of the triple array:  20753877 <class 'numpy.ndarray'>
loading TriplesFactory done ... 
number entites: 696095,number relations: 218 number of triples: 20753877
length of the triple array:  20770287 <class 'numpy.ndarray'>
loading TriplesFactory done ... 
number entites: 696328,number relations: 406 number of triples: 20770287
length of the triple array:  17574086 <class 'numpy.ndarray'>
loading TriplesFactory done ... 
number entites: 463713,number relations: 148 number of triples: 17574086
length of the triple array:  24145279 <class 'numpy.ndarray'>
loading TriplesFactory done ... 
number entites: 700380,number relations: 406 number of triples: 24145279


In [21]:
    # compare number of edges:
del G
G = preprocess_delete_relations(g, 'exclude_edges.txt') # 180924
# CREATE and SAVE deleted_relations graph in triple list:
titel = '/home/tilingl/Pykeen/Triple_Lists/'+ 'less_001_relations' +'.feather'
create_triple_list(titel, less001_G)

131
defaultdict(<function preprocess_delete_relations.<locals>.<lambda> at 0x7f2c106ef670>, {0: 1, 128: 1, 197: 1, 1: 1, 10: 1, 16: 1, 48: 1, 49: 1, 11: 1, 13: 1, 15: 1, 17: 1, 14: 1, 5: 1, 26: 1, 51: 1, 89: 1, 32: 1, 33: 1, 203: 1, 204: 1, 55: 1, 117: 1, 4: 1, 18: 1, 2: 1, 22: 1, 3: 1, 19: 1, 67: 1, 111: 1, 6: 1, 27: 1, 7: 1, 31: 1, 46: 1, 79: 1, 28: 1, 86: 1, 69: 1, 100: 1, 123: 1, 126: 1, 91: 1, 95: 1, 102: 1, 110: 1, 174: 1, 190: 1, 61: 1, 133: 1, 97: 1, 136: 1, 172: 1, 173: 1, 188: 1, 191: 1, 9: 1, 29: 1, 175: 1, 176: 1, 177: 1, 189: 1, 93: 1, 142: 1, 124: 1, 127: 1, 109: 1, 149: 1, 104: 1, 106: 1, 71: 1, 131: 1, 108: 1, 112: 1, 119: 1, 192: 1, 125: 1, 152: 1, 165: 1, 193: 1, 105: 1, 120: 1, 132: 1, 158: 1, 96: 1, 155: 1, 134: 1, 137: 1, 150: 1, 159: 1, 8: 1, 94: 1, 114: 1, 115: 1, 78: 1, 122: 1, 178: 1, 187: 1, 135: 1, 171: 1, 103: 1, 148: 1, 156: 1, 161: 1, 121: 1, 186: 1, 143: 1, 162: 1, 34: 1, 35: 1, 153: 1, 169: 1, 130: 1, 154: 1, 113: 1, 146: 1, 179: 1, 184: 1, 160: 1, 180: 

In [None]:
print(nx.is_directed(G))
print(nx.density(G))
print('Number of selfloops: ', nx.number_of_selfloops(G))

nx.info(G)
print(len(G.nodes()),len(G.edges()))

# CREATE and SAVE deleted_relations graph in triple list:
titel = '/home/tilingl/Pykeen/Triple_Lists/'+ 'deleted_relations' +'.feather'
create_triple_list(titel, G)

In [5]:
# SECOND
#del G
G = preprocess_graph_heterogeneous(g)
print(nx.is_directed(G))
print(nx.density(G))
print('Number of selfloops: ', nx.number_of_selfloops(G))

nx.info(G)

2 [0, 1] 
 2 [48, 49]
counter scipped: 0 163600
True
4.4143185828372486e-05
Number of selfloops:  5935


'DiGraph with 739579 nodes and 24145279 edges'

In [6]:
# SECOND POINT FIVE
# save Graph structure

titel = '/home/tilingl/Pykeen/Triple_Lists/'+ 'triple_list_filtered_graph_full_v3' +'.feather'
create_triple_list(titel, G)

<class 'numpy.ndarray'> (24145279, 3) <U21
/home/tilingl/Pykeen/Triple_Lists/triple_list_filtered_graph_full_v3.feather


In [14]:
# THIRD
# sample the nodes out of the preprocessed Graph and start building ego_graph
sampled_nodes=random.sample(G.nodes, 4)
Egos_list=[]
for node in sampled_nodes:
    ego = nx.ego_graph(G, node, radius=3, center=True, undirected=False, distance=None)
    Egos_list.append(ego)
    
#nx.draw(Egos_list[0])

since Python 3.9 and will be removed in a subsequent version.
  sampled_nodes=random.sample(G.nodes, 4)


In [15]:
for graph in Egos_list:
    print(nx.is_directed(graph))
    print(nx.density(graph))
    print('Number of selfloops: ', nx.number_of_selfloops(graph))
    print(nx.info(graph))

True
0.02040816326530612
Number of selfloops:  0
DiGraph with 98 nodes and 194 edges
True
4.230503366391334e-05
Number of selfloops:  1
DiGraph with 93437 nodes and 369339 edges
True
2.2505708206613222e-05
Number of selfloops:  0
DiGraph with 176773 nodes and 703270 edges
True
0.014995964136518618
Number of selfloops:  5321
DiGraph with 36810 nodes and 20318621 edges


In [16]:
# FOURTH
# create the triple List containing all the edge data out of the 10 sampled EgoGraphs
tripleList=[]
for graph in Egos_list:
    tripleList = create_triple_list("title", graph, tripleList)

<class 'numpy.ndarray'> (194, 3) <U21
<class 'numpy.ndarray'> (369533, 3) <U21
<class 'numpy.ndarray'> (1072803, 3) <U21
<class 'numpy.ndarray'> (21391424, 3) <U21


In [54]:
# Filter and Drop duplicate Rows in pandas TEST
tripleArray=np.array(tripleList, dtype=str)
print(len(tripleList))
dfd = pd.DataFrame(tripleArray, columns=['node1', 'relation_code','node2'])
dfd.sort_values("node1", inplace=True)
dfd.head()
dupl = dfd.duplicated(subset=['node1', 'relation_code','node2'])
print(dfd.duplicated().sum())
# drop duplicates
dfd.drop_duplicates(inplace=True, keep='first',subset=['node1', 'relation_code','node2'])
dfd

3700078
1965070


Unnamed: 0,node1,relation_code,node2
3654812,OMOP_1110410,20,OMOP_40481919
3654816,OMOP_1110410,21,OT_ENSG00000112038
3654815,OMOP_1110410,20,OMOP_4179955
3654814,OMOP_1110410,20,OMOP_312327
3654813,OMOP_1110410,20,OMOP_438112
...,...,...,...
3419063,phecode_978,197,OMOP_4121653
3419064,phecode_978,197,OMOP_4124845
3419065,phecode_978,197,OMOP_4121636
3419066,phecode_978,199,phecode_978


In [53]:
# PROVE for Filter duplicates
data = {'NODE':['Tom','Tom','Tom', 'nick', 'krish','krish','krish','krish','krish' ,'jack'], 'RELATION':[20, 20, 20,11,33,33,33,34,35, 18], 'NODE2':['Tom','Tom','Tom', 'nndf', 'krish2','krish4','krish2','krish2','krish2' ,'fdgdfg']}
 
# Create DataFrame
testdf = pd.DataFrame(data)
print(testdf.duplicated().sum())
print(testdf.duplicated(subset=['NODE', 'RELATION', 'NODE2']))
testdf

3
0    False
1     True
2     True
3    False
4    False
5    False
6     True
7    False
8    False
9    False
dtype: bool


Unnamed: 0,NODE,RELATION,NODE2
0,Tom,20,Tom
1,Tom,20,Tom
2,Tom,20,Tom
3,nick,11,nndf
4,krish,33,krish2
5,krish,33,krish4
6,krish,33,krish2
7,krish,34,krish2
8,krish,35,krish2
9,jack,18,fdgdfg


In [17]:
# FIFTH
#save TripleList as feather file
p_title = '/home/tilingl/Pykeen/Triple_Lists/'+ 'random_ego_graph3' +'.feather'
#needs triples as ndarray - shape (n,3), dtype:str 
tripleArray=np.array(tripleList, dtype=str)
#triple array to pandas df
df = pd.DataFrame(tripleArray, columns=['node1', 'relation_code','node2'])
print(df)
# drop duplicates
df_filtered = df.drop_duplicates(inplace=False, keep='first',subset=['node1', 'relation_code','node2'])
df_filtered.reset_index(inplace=True, drop=True) # has to reset bc to_feather do not wants deserialized indexes
print(df_filtered)
df_filtered.to_feather(p_title)
print(p_title)
df_filtered

                       node1 relation_code          node2
0              OMOP_40479714             1   OMOP_4033086
1               OMOP_4043304             1   OMOP_4269725
2              OMOP_40484974             1   OMOP_4033086
3               OMOP_4045031             1   OMOP_4260453
4               OMOP_4043303             1   OMOP_4269725
...                      ...           ...            ...
21391419  OT_ENSG00000187824            50   OMOP_4181194
21391420  OT_ENSG00000187824            50    OMOP_194984
21391421  OT_ENSG00000187824            50  OMOP_40488919
21391422  OT_ENSG00000187824            50  OMOP_40484156
21391423  OT_ENSG00000187824            50   OMOP_4171379

[21391424 rows x 3 columns]
                       node1 relation_code          node2
0              OMOP_40479714             1   OMOP_4033086
1               OMOP_4043304             1   OMOP_4269725
2              OMOP_40484974             1   OMOP_4033086
3               OMOP_4045031             1 

Unnamed: 0,node1,relation_code,node2
0,OMOP_40479714,1,OMOP_4033086
1,OMOP_4043304,1,OMOP_4269725
2,OMOP_40484974,1,OMOP_4033086
3,OMOP_4045031,1,OMOP_4260453
4,OMOP_4043303,1,OMOP_4269725
...,...,...,...
21041550,OT_ENSG00000187824,50,OMOP_4181194
21041551,OT_ENSG00000187824,50,OMOP_194984
21041552,OT_ENSG00000187824,50,OMOP_40488919
21041553,OT_ENSG00000187824,50,OMOP_40484156


In [None]:
# Check the numbers of triples in the file:
