In [8]:
import os
import pandas as pd
import networkx as nx
from collections import defaultdict, Counter
import random
import numpy as np
import json
from data_loader.data_loaders import DataLoader
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Analysis of neighbor sets

Graph Synergy utilizes info from a Protein-Protein graph to give the model extra information to learn from.
For this, a set of neighbors is created for every cell and drug and used as input for the model.
But the number of neighbors in the set is limited by the variable `n_memory`.

We hypothesize that in many cases the number of neighbors is a lot higher than `n_memory`, so many neighbors are ignored. This could potentially have a significant impact on model performance and the usefulness of the neighbor sets.\
Also this could explain why taking more hops into account does not improve performance a lot.

## Loading the data

In [None]:
# Root directory of the dataset to analyse; swap in the commented-out line
# below to run the same analysis on the other dataset.
data_dir = './data/DrugCombDB'
# data_dir = './data/OncologyScreen'

# NOTE(review): the second positional argument (64) is presumably the batch
# size -- confirm against DataLoader's signature.
# Judging from the recorded output, construction prints graph statistics and
# builds the neighbor sets.
data_loader = DataLoader(data_dir, 64)

undirected graph
# proteins: 15970, # drugs: 764, # cells: 76
# protein-protein interactions: 217160, # drug-protein associations: 5290, # cell-protein associations: 27730
constructing neighbor set ... done
constructing neighbor set ... done


In [32]:
# Total node count of the loaded graph; the recorded output (15970) matches
# the protein count printed while loading the data.
data_loader.graph.number_of_nodes()

15970

#### *data_loader variables*\
`cells`: ids of cells\
`cell_protein_dict`: ids of cells with associated protein\
`cell_neighbor_set`: for each cell, a list of protein sets, one per hop \

`drugs`: ids of drugs\
`drug_protein_dict`: ids of drugs with associated protein\
`drug_neighbor_set`: for each drug, a list of protein sets, one per hop \

### 1. Hop

In [43]:
# First hop: the proteins directly associated with each cell.
# The upper limit is the number of nodes in the graph; it is read from the
# loaded graph (instead of being hard-coded) so titles and axis ranges stay
# correct when `data_dir` points at a different dataset.
n_proteins = data_loader.graph.number_of_nodes()
number_of_neighbors = [len(data_loader.cell_protein_dict[i]) for i in data_loader.cells]
title = f'Number of first degree neighbors for each cell (limit {n_proteins})'

# Histogram of the per-cell neighbor counts.
fig = go.Figure(go.Histogram(x=number_of_neighbors, xbins=dict(start=0, end=1300, size=10), marker_color='mediumpurple'))
fig.update_layout(title_text=title, xaxis_title='Number of neighbors', yaxis_title='Number of cells')
fig.show()

# Violin plot on the full 0..n_proteins scale to show how much of the graph is reached.
fig = go.Figure(go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple'))
fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
fig.update_xaxes(range=[0, n_proteins])
fig.update_layout(title_text=title, xaxis_title='Number of neighbors')
fig.show()

In [44]:
# First hop: the proteins directly associated with each drug.
# The upper limit is the number of nodes in the graph; it is read from the
# loaded graph (instead of being hard-coded) so titles and axis ranges stay
# correct when `data_dir` points at a different dataset.
n_proteins = data_loader.graph.number_of_nodes()
number_of_neighbors = [len(data_loader.drug_protein_dict[i]) for i in data_loader.drugs]
title = f'Number of first degree neighbors for each drug (limit {n_proteins})'

# Histogram of the per-drug neighbor counts.
fig = go.Figure(go.Histogram(x=number_of_neighbors, xbins=dict(start=0, end=1300, size=10), marker_color='mediumpurple'))
fig.update_layout(title_text=title, xaxis_title='Number of neighbors', yaxis_title='Number of drugs')
fig.show()

# Violin plot on the full 0..n_proteins scale to show how much of the graph is reached.
fig = go.Figure(go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple'))
fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
fig.update_xaxes(range=[0, n_proteins])
fig.update_layout(title_text=title, xaxis_title='Number of neighbors')
fig.show()

### 2. Hop

In [45]:
# Second hop: proteins adjacent to a cell's first-hop proteins, not counting
# the first-hop proteins themselves.
# The upper limit is read from the loaded graph so it follows `data_dir`.
n_proteins = data_loader.graph.number_of_nodes()
number_of_neighbors = []
for i in data_loader.cells:
    first_hop = set(data_loader.cell_protein_dict[i])
    reached = set()
    for node in first_hop:
        # graph.neighbors() yields the adjacent nodes; collect them de-duplicated.
        reached.update(data_loader.graph.neighbors(node))
    number_of_neighbors.append(len(reached - first_hop))

# The titles previously said "first degree" (copy-paste from the first-hop cell).
title = f'Number of second degree neighbors for each cell (limit {n_proteins})'

# Histogram of the per-cell second-hop neighbor counts.
fig = go.Figure(go.Histogram(x=number_of_neighbors, xbins=dict(start=0, end=12000, size=10), marker_color='mediumpurple'))
fig.update_layout(title_text=title, xaxis_title='Number of neighbors', yaxis_title='Number of cells')
fig.show()

# Violin plot on the full 0..n_proteins scale to show how much of the graph is reached.
fig = go.Figure(go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple'))
fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
fig.update_xaxes(range=[0, n_proteins])
fig.update_layout(title_text=title, xaxis_title='Number of neighbors')
fig.show()

In [46]:
# Second hop: proteins adjacent to a drug's first-hop proteins, not counting
# the first-hop proteins themselves.
# The upper limit is read from the loaded graph so it follows `data_dir`.
n_proteins = data_loader.graph.number_of_nodes()
number_of_neighbors = []
for i in data_loader.drugs:
    first_hop = set(data_loader.drug_protein_dict[i])
    reached = set()
    for node in first_hop:
        # graph.neighbors() yields the adjacent nodes; collect them de-duplicated.
        reached.update(data_loader.graph.neighbors(node))
    number_of_neighbors.append(len(reached - first_hop))

# The titles previously said "first degree" (copy-paste from the first-hop cell).
title = f'Number of second degree neighbors for each drug (limit {n_proteins})'

# Histogram of the per-drug second-hop neighbor counts.
fig = go.Figure(go.Histogram(x=number_of_neighbors, xbins=dict(start=0, end=5000, size=10), marker_color='mediumpurple'))
fig.update_layout(title_text=title, xaxis_title='Number of neighbors', yaxis_title='Number of drugs')
fig.show()

# Violin plot on the full 0..n_proteins scale to show how much of the graph is reached.
fig = go.Figure(go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple'))
fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
fig.update_xaxes(range=[0, n_proteins])
fig.update_layout(title_text=title, xaxis_title='Number of neighbors')
fig.show()

### 3. Hop

In [47]:
# Third hop: proteins first reached after two expansion steps from a cell's
# first-hop proteins (breadth-first, counting only newly reached proteins).
# The upper limit is read from the loaded graph so it follows `data_dir`.
n_proteins = data_loader.graph.number_of_nodes()
number_of_neighbors = []
for i in data_loader.cells:
    frontier = set(data_loader.cell_protein_dict[i])
    # Seed the visited set with the first-hop proteins. It used to start
    # empty, so first-hop proteins were not excluded and could be counted
    # again as third-hop neighbors, unlike in the second-hop analysis.
    visited = set(frontier)
    for _ in range(2):
        reached = set()
        for node in frontier:
            reached.update(data_loader.graph.neighbors(node))
        # Keep only proteins not seen at any earlier hop.
        frontier = reached - visited
        visited |= frontier
    number_of_neighbors.append(len(frontier))

# The titles previously said "first degree" (copy-paste from the first-hop cell).
title = f'Number of third degree neighbors for each cell (limit {n_proteins})'

# Histogram of the per-cell third-hop neighbor counts.
fig = go.Figure(go.Histogram(x=number_of_neighbors, xbins=dict(start=0, end=12000, size=10), marker_color='mediumpurple'))
fig.update_layout(title_text=title, xaxis_title='Number of neighbors', yaxis_title='Number of cells')
fig.show()

# Violin plot on the full 0..n_proteins scale to show how much of the graph is reached.
fig = go.Figure(go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple'))
fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
fig.update_xaxes(range=[0, n_proteins])
fig.update_layout(title_text=title, xaxis_title='Number of neighbors')
fig.show()

In [48]:
# Third hop: proteins first reached after two expansion steps from a drug's
# first-hop proteins (breadth-first, counting only newly reached proteins).
# The upper limit is read from the loaded graph so it follows `data_dir`.
n_proteins = data_loader.graph.number_of_nodes()
number_of_neighbors = []
for i in data_loader.drugs:
    frontier = set(data_loader.drug_protein_dict[i])
    # Seed the visited set with the first-hop proteins. It used to start
    # empty, so first-hop proteins were not excluded and could be counted
    # again as third-hop neighbors, unlike in the second-hop analysis.
    visited = set(frontier)
    for _ in range(2):
        reached = set()
        for node in frontier:
            reached.update(data_loader.graph.neighbors(node))
        # Keep only proteins not seen at any earlier hop.
        frontier = reached - visited
        visited |= frontier
    number_of_neighbors.append(len(frontier))

# The titles previously said "first degree" (copy-paste from the first-hop cell).
title = f'Number of third degree neighbors for each drug (limit {n_proteins})'

# Histogram of the per-drug third-hop neighbor counts.
fig = go.Figure(go.Histogram(x=number_of_neighbors, xbins=dict(start=0, end=12000, size=10), marker_color='mediumpurple'))
fig.update_layout(title_text=title, xaxis_title='Number of neighbors', yaxis_title='Number of drugs')
fig.show()

# Violin plot on the full 0..n_proteins scale to show how much of the graph is reached.
fig = go.Figure(go.Violin(x=number_of_neighbors, box_visible=True, line_color='mediumpurple'))
fig.update_traces(meanline_visible=True, points='all', jitter=0.05)
fig.update_xaxes(range=[0, n_proteins])
fig.update_layout(title_text=title, xaxis_title='Number of neighbors')
fig.show()