In [2]:
import os
import pandas as pd
import networkx as nx
from collections import defaultdict, Counter
import random
import numpy as np
import json
from data_loader.data_loaders import DataLoader
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Analysis of neighbor sets

Graph Synergy uses information from a protein-protein interaction graph to give the model additional information to learn from.
To do so, a set of neighbors is created for every cell and every drug and used as input for the model.
However, the number of neighbors in each set is limited by the variable `n_memory`.

We hypothesize that in many cases the number of neighbors is much higher than `n_memory`, so that many neighbors are ignored. This could have a significant impact on model performance and on the usefulness of the neighbor sets.\
This could also explain why taking more hops into account does not improve performance much.

## Loading the data

In [3]:
# Dataset directory to analyse; swap with the commented-out line below to switch datasets.
data_dir = './data/DrugCombDB'
# data_dir = './data/OncologyScreen'

# NOTE(review): the second positional argument (64) is presumably the batch size — confirm
# against DataLoader's signature. Per the log printed by this cell, constructing the loader
# also builds the graph and the neighbor sets.
data_loader = DataLoader(data_dir, 64)

undirected graph
# proteins: 15970, # drugs: 764, # cells: 76
# protein-protein interactions: 217160, # drug-protein associations: 5290, # cell-protein associations: 27730
constructing neighbor set ... done
constructing neighbor set ... done


In [4]:
# Cell-protein association table: one (cell id, protein id) pair per row, as the rendered columns show.
data_loader.cpi_df

Unnamed: 0,cell,protein
0,47,14099
1,47,10909
2,47,15697
3,47,2134
4,47,13552
...,...,...
27725,30,12064
27726,30,5332
27727,30,6070
27728,30,3765


In [5]:
# Inspect the ids stored in the node map: how many there are, and the full sorted list.
# The sorted dump makes repeated ids visible (the same id shows up more than once).
values = data_loader.node_map_dict.values()
print(len(values), sorted(values), sep='\n')

16810
[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39, 39, 39, 40, 40, 40, 41, 41, 41, 42, 42, 42, 43, 43, 43, 44, 44, 44, 45, 45, 45, 46, 46, 46, 47, 47, 47, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 51, 51, 52, 52, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 57, 57, 57, 58, 58, 58, 59, 59, 59, 60, 60, 60, 61, 61, 61, 62, 62, 62, 63, 63, 63, 64, 64, 64, 65, 65, 65, 66, 66, 66, 67, 67, 67, 68, 68, 68, 69, 69, 69, 70, 70, 70, 71, 71, 71, 72, 72, 72, 73, 73, 73, 74, 74, 74, 75, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, 80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87, 88, 88, 89, 89,

In [6]:
# Mapping from cell id to the list of protein ids associated with that cell
# (a defaultdict(list), per the repr shown in the output).
data_loader.cell_protein_dict

defaultdict(list,
            {0: [14338,
              3589,
              11787,
              3596,
              2574,
              9745,
              529,
              1042,
              4117,
              4630,
              6167,
              2078,
              7711,
              3107,
              1572,
              1573,
              4646,
              15395,
              11300,
              6185,
              2089,
              4139,
              14893,
              13875,
              2612,
              7734,
              1079,
              3128,
              12858,
              10299,
              12358,
              10318,
              2641,
              2130,
              10837,
              11861,
              13399,
              2136,
              2649,
              1621,
              10330,
              13918,
              5215,
              10849,
              12390,
              12391,
              1128,
              6760,
  

In [7]:
# Total number of nodes in the graph; the displayed value (15970) is the "limit" quoted
# in the plot titles and used as the upper x-axis bound of the violin plots further down.
data_loader.graph.number_of_nodes()

15970

#### *data_loader variables*

`cells`: ids of cells\
`cell_protein_dict`: maps each cell id to the list of its associated protein ids\
`cell_neighbor_set`: for each cell, a list with one set of neighbor proteins per hop

`drugs`: ids of drugs\
`drug_protein_dict`: maps each drug id to the list of its associated protein ids\
`drug_neighbor_set`: for each drug, a list with one set of neighbor proteins per hop

### 1. Hop

In [8]:
# First hop: the number of proteins directly associated with each cell.
number_of_neighbors = [
    len(data_loader.cell_protein_dict[cell_id]) for cell_id in data_loader.cells
]

# Histogram of the counts with bins of width 10 between 0 and 1300.
fig = go.Figure(
    data=[
        go.Histogram(
            x=number_of_neighbors,
            xbins={'start': 0, 'end': 1300, 'size': 10},
            marker_color='mediumpurple',
        )
    ]
)
fig.update_layout(title_text='Number of first degree neighbors for each cell (limit 15970)')
fig.show()

# Violin plot of the same counts; the x-axis spans 0..15970 (the graph's node count)
# so the distribution is seen relative to the maximum possible number of neighbors.
fig = go.Figure(
    data=[go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple')]
)
(
    fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
    .update_xaxes(range=[0, 15970])
    .update_layout(title_text='Number of first degree neighbors for each cell (limit 15970)')
)
fig.show()

In [9]:
# First hop: the number of proteins directly associated with each drug.
number_of_neighbors = [
    len(data_loader.drug_protein_dict[drug_id]) for drug_id in data_loader.drugs
]

# Histogram of the counts with bins of width 10 between 0 and 1300.
fig = go.Figure(
    data=[
        go.Histogram(
            x=number_of_neighbors,
            xbins={'start': 0, 'end': 1300, 'size': 10},
            marker_color='mediumpurple',
        )
    ]
)
fig.update_layout(title_text='Number of first degree neighbors for each drug (limit 15970)')
fig.show()

# Violin plot of the same counts; the x-axis spans 0..15970 (the graph's node count)
# so the distribution is seen relative to the maximum possible number of neighbors.
fig = go.Figure(
    data=[go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple')]
)
(
    fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
    .update_xaxes(range=[0, 15970])
    .update_layout(title_text='Number of first degree neighbors for each drug (limit 15970)')
)
fig.show()

### 2. Hop

In [10]:
# Second hop: proteins adjacent (in the graph) to a cell's first-hop proteins,
# not counting the first-hop proteins themselves.
number_of_neighbors = []
for cell_id in data_loader.cells:
    first_hop = set(data_loader.cell_protein_dict[cell_id])
    adjacent = set()
    for protein in first_hop:
        adjacent.update(data_loader.graph.neighbors(protein))
    number_of_neighbors.append(len(adjacent - first_hop))

# Histogram of the counts with bins of width 10 between 0 and 12000.
fig = go.Figure(
    data=[
        go.Histogram(
            x=number_of_neighbors,
            xbins={'start': 0, 'end': 12000, 'size': 10},
            marker_color='mediumpurple',
        )
    ]
)
fig.update_layout(title_text='Number of second degree neighbors for each cell (limit 15970)')
fig.show()

# Violin plot of the same counts on an x-axis spanning 0..15970 (the graph's node count).
fig = go.Figure(
    data=[go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple')]
)
(
    fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
    .update_xaxes(range=[0, 15970])
    .update_layout(title_text='Number of second degree neighbors for each cell (limit 15970)')
)
fig.show()

In [11]:
# Second hop: proteins adjacent (in the graph) to a drug's first-hop proteins,
# not counting the first-hop proteins themselves.
number_of_neighbors = []
for drug_id in data_loader.drugs:
    first_hop = set(data_loader.drug_protein_dict[drug_id])
    adjacent = set()
    for protein in first_hop:
        adjacent.update(data_loader.graph.neighbors(protein))
    number_of_neighbors.append(len(adjacent - first_hop))

# Histogram of the counts with bins of width 10 between 0 and 5000.
fig = go.Figure(
    data=[
        go.Histogram(
            x=number_of_neighbors,
            xbins={'start': 0, 'end': 5000, 'size': 10},
            marker_color='mediumpurple',
        )
    ]
)
fig.update_layout(title_text='Number of second degree neighbors for each drug (limit 15970)')
fig.show()

# Violin plot of the same counts on an x-axis spanning 0..15970 (the graph's node count).
fig = go.Figure(
    data=[go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple')]
)
(
    fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
    .update_xaxes(range=[0, 15970])
    .update_layout(title_text='Number of second degree neighbors for each drug (limit 15970)')
)
fig.show()

### 3. Hop

In [12]:
# for third hop
# Breadth-first expansion starting from each cell's directly associated proteins (hop 1).
# After two expansion steps `frontier` holds the proteins first reached at hop 3.
number_of_neighbors = []
for i in data_loader.cells:
    frontier = set(data_loader.cell_protein_dict[i])
    # Seed the visited set with the hop-1 proteins. Starting from an empty set would let
    # hop-1 proteins be reached again from hop 2 (the graph is undirected) and be counted
    # as third-degree neighbors, which contradicts the second-hop analysis where the
    # first-hop proteins are explicitly excluded.
    visited = set(frontier)
    for _ in range(2):
        reached = set()
        for node in frontier:
            reached.update(data_loader.graph.neighbors(node))
        # Keep only proteins that were not seen at any earlier hop.
        frontier = reached - visited
        visited |= frontier
    number_of_neighbors.append(len(frontier))

# Histogram of the counts with bins of width 10 between 0 and 12000.
fig = go.Figure(go.Histogram(x=number_of_neighbors, xbins=dict(start=0, end=12000, size=10), marker_color='mediumpurple'))
fig.update_layout(title_text='Number of third degree neighbors for each cell (limit 15970)')
fig.show()

# Violin plot of the same counts on an x-axis spanning 0..15970 (the graph's node count).
fig = go.Figure(go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple'))
fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
fig.update_xaxes(range=[0, 15970])
fig.update_layout(title_text='Number of third degree neighbors for each cell (limit 15970)')
fig.show()

In [13]:
# for third hop
# Breadth-first expansion starting from each drug's directly associated proteins (hop 1).
# After two expansion steps `frontier` holds the proteins first reached at hop 3.
number_of_neighbors = []
for i in data_loader.drugs:
    frontier = set(data_loader.drug_protein_dict[i])
    # Seed the visited set with the hop-1 proteins. Starting from an empty set would let
    # hop-1 proteins be reached again from hop 2 (the graph is undirected) and be counted
    # as third-degree neighbors, which contradicts the second-hop analysis where the
    # first-hop proteins are explicitly excluded.
    visited = set(frontier)
    for _ in range(2):
        reached = set()
        for node in frontier:
            reached.update(data_loader.graph.neighbors(node))
        # Keep only proteins that were not seen at any earlier hop.
        frontier = reached - visited
        visited |= frontier
    number_of_neighbors.append(len(frontier))

# Histogram of the counts with bins of width 10 between 0 and 12000.
fig = go.Figure(go.Histogram(x=number_of_neighbors, xbins=dict(start=0, end=12000, size=10), marker_color='mediumpurple'))
fig.update_layout(title_text='Number of third degree neighbors for each drug (limit 15970)')
fig.show()

# Violin plot of the same counts on an x-axis spanning 0..15970 (the graph's node count).
fig = go.Figure(go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple'))
fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
fig.update_xaxes(range=[0, 15970])
fig.update_layout(title_text='Number of third degree neighbors for each drug (limit 15970)')
fig.show()