In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Understand Neighbor Gene Component and Co-localization Graphs
This tutorial breaks down the neighbor gene components (NGC) and co-localization graphs that are used as the input for training.
We will use the NanoString CosMx NSCLC dataset (He et al., 2022) as an example.

### Import packages & data

In [2]:
import sys 
import numpy as np
import pandas as pd
import tifffile as tiff
import matplotlib.pyplot as plt 

import Bering as br

In [3]:
# Load the inputs for the tutorial.
# Both spot tables are tab-separated with the first row as header and the
# first column as index, so they are read with one shared set of options.
df_spots_seg, df_spots_unseg = (
    pd.read_csv(table_path, sep='\t', header=0, index_col=0)
    for table_path in ('spots_seg.txt', 'spots_unseg.txt')
)

# Image file and the channel labels handed to BrGraph alongside it
# (presumably one label per image channel — TODO confirm).
img = tiff.imread('image.tif')
channels = ['Nuclei', 'PanCK', 'Membrane']

### Create Bering object

In [4]:
# Bundle both spot tables, the image and the channel names into one Bering object.
# The positional order matters here; the parameter names of BrGraph are not
# visible in this notebook, so keyword arguments are deliberately not used.
bg = br.BrGraph(df_spots_seg, df_spots_unseg, img, channels)
bg  # last expression: display the object's repr

<Bering.objects.bering.Bering_Graph at 0x2af8439f4580>

### Construct graphs

In [5]:
# Build graphs for GCN training purpose.
# The call's return value is not captured; the following cells read the results
# from attributes of `bg` (presumably set by this call — TODO confirm).
# Parameter meanings below are inferred from their names — verify against the
# Bering documentation:
#   n_cells_perClass : presumably the number of cells sampled per class
#   window_width / window_height : presumably the window size, in the same
#                                  units as the spot coordinates
#   n_neighbors : presumably the number of neighbors linked to each node
br.graphs.BuildWindowGraphs(
    bg, 
    n_cells_perClass = 10, 
    window_width = 100.0, 
    window_height = 100.0, 
    n_neighbors = 10, 
)

In [6]:
# Report the number of node features stored on the Bering object.
print(f'Number of node features: {bg.n_node_features}')

Number of node features: 981


In [7]:
# Collection of graphs held by the Bering object
# (presumably populated by br.graphs.BuildWindowGraphs — TODO confirm).
graphs = bg.Graphs_golden
print(f'Number of graphs: {len(graphs)}')

# Inspect the first graph; .cpu() moves its tensors to host memory so they can
# be displayed and converted to NumPy in later cells.
graph = graphs[0].cpu()
print('Type of graph:', type(graph))
graph  # last expression: display the graph's summary repr

Number of graphs: 436
Type of graph: <class 'torch_geometric.data.data.Data'>


Data(x=[994, 981], edge_index=[2, 9940], edge_attr=[9940], y=[994, 10], pos=[994, 4])

### Co-localization graphs

In [8]:
# Node feature matrix: one row per node, one column per node feature.
graph.x

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 8.4660],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 8.4303],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 8.7936],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 4.7216],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 3.7851],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 3.6468]],
       dtype=torch.float64)

### Edges of the graph

In [9]:
# Edge list as a 2 x num_edges tensor; each column holds the two node indices of one edge.
graph.edge_index

tensor([[  0,   0,   0,  ..., 993, 993, 993],
        [  2,   1,   4,  ..., 704, 593, 291]])

### Labels of nodes

Node labels are the cell-type indices of the individual transcripts (one row per node).

In [10]:
# Label tensor with one row per node
# (columns presumably correspond to the label classes — TODO confirm).
graph.y

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 1],
        [0, 0, 0,  ..., 0, 0, 1]])

### Node position matrix

In [11]:
# Convert the node position tensor to a NumPy array, then wrap it in a
# DataFrame whose four columns are labelled for readability.
pos_mtx = graph.pos.numpy()
df_pos = pd.DataFrame(
    data=pos_mtx,
    columns=['molecule_id', 'x', 'y', 'cell_id'],
)

# Show the table dimensions, then preview the first rows.
print(df_pos.shape)
df_pos.head()

(994, 4)


Unnamed: 0,molecule_id,x,y,cell_id
0,100012.0,1258.9501,609.375,292.0
1,100034.0,1265.075,605.1,292.0
2,100038.0,1258.45,608.375,292.0
3,100268.0,1265.899902,603.123108,292.0
4,100293.0,1265.083374,604.583313,292.0
