In [1]:
import csv
import os
import zipfile
from functools import reduce

import networkx as nx
import numpy as np
import pandas as pd
import rustworkx as rx
import torch
import torch.nn.functional as F
from torch_geometric.data import Batch, Data

# Create gene-gene network from PathwayCommons
- Selects important genes (N=3000) from each data source:
    - Genes ranked in the top N by variance in at least 3 of the 4 input matrices
    - Top N genes by centrality from gene-gene network
    - Top N genes with most interactions from drug-target interaction data
- Filters gene-gene interaction network based on selected genes
- Creates feature vectors for each cell line, combining:
    - Gene expression
    - Methylation
    - Copy number variation
    - Mutation data
- Generates graph data structure compatible with PyTorch Geometric:
    - 5,181 genes (nodes)
    - 60 cell lines
    - 630,632 edges (gene-gene interactions)

# The interaction network is obtained from the PathwayCommons [archives](https://www.pathwaycommons.org/archives/)

In [2]:
# Load the four NCI-60 matrices (the first CSV column becomes the index) and
# drop every row that contains at least one missing value.
exp, met, mut, cop = (
    pd.read_csv(csv_path, index_col=0).dropna()
    for csv_path in (
        "../data/nci60_gene_exp.csv",
        "../data/nci60_methylation.csv",
        "../data/nci60_mutation.csv",
        "../data/nci60_copy.csv",
    )
)

In [3]:
def normalize_expression(data):
    """Scale, log-transform, winsorize and filter an expression matrix.

    Steps, applied in order:
      1. Each column is divided by its own sum and multiplied by 1e6
         (the original code refers to this quantity as TPM).
      2. ``log2(value + 1)`` is taken element-wise.
      3. Values are clipped to the 0.1th / 99.9th percentiles computed over
         the whole matrix.
      4. Only rows that are strictly positive in more than 25% of the columns
         are kept.

    Args:
        data: DataFrame of numeric values (rows are filtered, columns are
            normalised independently).

    Returns:
        DataFrame with the same columns as ``data`` and the surviving rows.
    """
    per_million = data / data.sum(axis=0) * 1e6
    logged = np.log2(per_million + 1)

    # Winsorize with matrix-wide percentile bounds.
    lower_bound, upper_bound = np.percentile(logged.values, [0.1, 99.9])
    clipped = logged.clip(lower=lower_bound, upper=upper_bound)

    # A row must be > 0 in strictly more than a quarter of the columns.
    n_columns = clipped.shape[1]
    is_expressed = clipped.gt(0).sum(axis=1) > (n_columns * 0.25)

    return clipped.loc[is_expressed]

In [4]:
# Normalise the raw expression matrix (scaling, log2, winsorizing and row
# filtering are all done inside normalize_expression).
norm_exp = normalize_expression(exp)

In [5]:
# Union of the row labels found in any of the four matrices.
all_genes = set().union(norm_exp.index, met.index, mut.index, cop.index)
print(f"All genes: {len(all_genes)}")

All genes: 24787


In [6]:
# Headerless, tab-separated SIF file: column 0 = source gene,
# column 1 = interaction type, column 2 = target gene.
df = pd.read_csv("../data/PC14.All.hgnc.txt.sif.gz", sep="\t", header=None)
df

Unnamed: 0,0,1,2
0,A1BG,controls-expression-of,A2M
1,A1BG,interacts-with,ABCC6
2,A1BG,interacts-with,ACE2
3,A1BG,interacts-with,ADAM10
4,A1BG,interacts-with,ADAM17
...,...,...,...
2467857,ZZZ3,controls-state-change-of,H3C3
2467858,ZZZ3,controls-state-change-of,H3C4
2467859,ZZZ3,controls-state-change-of,H3C6
2467860,ZZZ3,controls-state-change-of,H3C7


In [7]:
# Genes that are an endpoint of at least one network edge AND are present in
# at least one of the matrices, in sorted order.
common_genes = sorted(set(df[0]).union(df[2]).intersection(all_genes))
print(len(common_genes))

20997


# Align all matrices to the same gene set; genes missing from a matrix are filled with 0 (`fillna(0)`)

In [8]:
# Keep only the edges whose two endpoints are both in common_genes.
df = df[df[0].isin(common_genes) & df[2].isin(common_genes)].reset_index(drop=True)

# Put every matrix on the same sorted gene index; genes absent from a matrix
# become all-zero rows.
norm_exp, met, mut, cop = (
    matrix.reindex(common_genes).fillna(0).sort_index()
    for matrix in (norm_exp, met, mut, cop)
)

In [9]:
# Sanity check: the four matrices should now have identical shapes.
for label, frame in [
    ("Filtered df shape:", df),
    ("norm_exp shape:", norm_exp),
    ("met shape:", met),
    ("mut shape:", mut),
    ("cop shape:", cop),
]:
    print(label, frame.shape)

Filtered df shape: (1531987, 3)
norm_exp shape: (20997, 60)
met shape: (20997, 60)
mut shape: (20997, 60)
cop shape: (20997, 60)


# Choose the top N highest-variance genes from each input matrix and keep the genes selected in 3 or more of the 4 matrices.

In [10]:
# Number of top-ranked genes taken by each selection criterion
# (per-matrix variance, network degree centrality, DTI count).
N = 3000

In [11]:
# Pool the N highest-variance rows of each matrix into one flat list; a gene
# appears once for every matrix that ranks it in its top N.
genesets = []
for data in (norm_exp, met, mut, cop):
    genesets += data.var(axis=1).nlargest(N).index.tolist()

In [12]:
# Count in how many matrices each gene made the top N and keep the genes
# selected by at least 3 of them.
tmp = pd.Series(genesets).value_counts().loc[lambda counts: counts >= 3]
mat_imp = sorted(tmp.index)
mat_imp

['ABCB1',
 'ABCB5',
 'ABCC3',
 'ABCD1',
 'ACO2',
 'ACSL4',
 'ADAM12',
 'ADAM18',
 'ADAM33',
 'ADAMTS1',
 'ADAMTS12',
 'ADAMTS2',
 'ADCY8',
 'ADD2',
 'ADGRL2',
 'AFF2',
 'ALDH3A1',
 'ALPK2',
 'AMOT',
 'AMOTL1',
 'APBB1IP',
 'AR',
 'ARHGAP36',
 'ARHGAP4',
 'ARMCX1',
 'ARMCX2',
 'ARMCX3',
 'ARSH',
 'ARX',
 'ASB15',
 'ATP10A',
 'ATP11A',
 'ATP6V0A1',
 'ATP8B1',
 'AUTS2',
 'AWAT1',
 'BCAT1',
 'BCL9L',
 'BEX1',
 'BEX4',
 'BEX5',
 'BICC1',
 'BMP7',
 'BNC2',
 'C6orf132',
 'CA5B',
 'CA8',
 'CABYR',
 'CARD16',
 'CAV2',
 'CCBE1',
 'CCDC178',
 'CCNB3',
 'CDH1',
 'CDH11',
 'CDH4',
 'CDH6',
 'CDK14',
 'CDKN2A',
 'CDR1',
 'CELF2',
 'CENPB',
 'CENPV',
 'CFH',
 'CFTR',
 'CHRDL1',
 'CHRM2',
 'CLDN2',
 'CLSTN3',
 'COBL',
 'COL11A1',
 'COL1A1',
 'COL1A2',
 'COL4A1',
 'COL4A2',
 'COL5A1',
 'COL5A2',
 'COL6A2',
 'COL7A1',
 'CPE',
 'CPNE1',
 'CREB5',
 'CSAG1',
 'CTHRC1',
 'CYP24A1',
 'DAW1',
 'DCT',
 'DEF6',
 'DLG3',
 'DLX5',
 'DOK6',
 'DSC1',
 'DSG4',
 'DUSP4',
 'DZIP1',
 'ECHDC3',
 'EDA2R',
 'EDIL3',
 'EDN

# Choose the top N genes by degree centrality in the gene-gene network.

In [13]:
# Build a graph from the edge list (column 0 -> column 2); the interaction
# type in column 1 is not used here.
G = nx.from_pandas_edgelist(df, source=0, target=2)
degree_centrality = nx.degree_centrality(G)

In [14]:
# Rank nodes by descending degree centrality (stable sort, so ties keep the
# dict's insertion order), take the first N, then sort them by name.
top_nodes = sorted(
    sorted(degree_centrality, key=degree_centrality.get, reverse=True)[:N]
)
top_nodes

['A2M',
 'AAAS',
 'AACS',
 'AAK1',
 'AAR2',
 'AARS2',
 'AASDHPPT',
 'AATF',
 'ABAT',
 'ABCB1',
 'ABCC1',
 'ABCC2',
 'ABCC3',
 'ABCE1',
 'ABCG2',
 'ABI1',
 'ABI2',
 'ABL1',
 'ABLIM1',
 'ABT1',
 'ACAA1',
 'ACAA2',
 'ACACA',
 'ACAD9',
 'ACADM',
 'ACADVL',
 'ACAT1',
 'ACAT2',
 'ACE2',
 'ACIN1',
 'ACLY',
 'ACO2',
 'ACOT2',
 'ACOT7',
 'ACSL1',
 'ACSL3',
 'ACSL4',
 'ACSL5',
 'ACSM5',
 'ACSS1',
 'ACSS2',
 'ACTA1',
 'ACTA2',
 'ACTB',
 'ACTBL2',
 'ACTC1',
 'ACTG1',
 'ACTG2',
 'ACTL6A',
 'ACTN1',
 'ACTN2',
 'ACTN3',
 'ACTN4',
 'ACTR1A',
 'ACTR2',
 'ACTR3',
 'ADAR',
 'ADARB1',
 'ADIPOQ',
 'ADM',
 'ADORA1',
 'ADORA2A',
 'ADORA2B',
 'ADPGK',
 'ADRB2',
 'ADRM1',
 'AFG3L2',
 'AGO1',
 'AGO2',
 'AGPAT1',
 'AGR2',
 'AGRN',
 'AGT',
 'AGTR1',
 'AHCY',
 'AHCYL1',
 'AHCYL2',
 'AHNAK',
 'AHR',
 'AHSA1',
 'AHSG',
 'AIFM1',
 'AIMP1',
 'AIMP2',
 'AK2',
 'AK3',
 'AK4',
 'AKAP1',
 'AKAP9',
 'AKR1B1',
 'AKR1C3',
 'AKT1',
 'AKT2',
 'AKT3',
 'ALAD',
 'ALAS1',
 'ALB',
 'ALDH1A1',
 'ALDH1A3',
 'ALDH1B1',
 'ALDH2',
 'AL

# Choose the top N genes with the most drug-target interactions (DTI)

In [15]:
# Drug-gene score table; rows with a missing value are dropped.
dti = pd.read_csv(
    "../../DTI-quantification/data/drug_gene_score.csv.gz",
    usecols=["NSC", "gene", "log_Y"],
).dropna()

# Genes that are an endpoint of at least one edge in the filtered network.
nodes = sorted(set(df[0]).union(df[2]))

# Restrict to network genes, then to drugs (NSC ids) listed in dataset.csv.
dti = dti.loc[lambda table: table["gene"].isin(nodes)].loc[
    lambda table: table["NSC"].isin(
        sorted(set(pd.read_csv("../data/dataset.csv", usecols=["NSC"])["NSC"]))
    )
]

# value_counts() is ordered by descending count, so the first N index
# entries are the N genes with the most rows.
top_dti = sorted(dti["gene"].value_counts().index[:N])
top_dti

['AAK1',
 'AASDH',
 'AASDHPPT',
 'ABCA1',
 'ABCA10',
 'ABCA12',
 'ABCA13',
 'ABCA2',
 'ABCA3',
 'ABCA4',
 'ABCA5',
 'ABCA6',
 'ABCA7',
 'ABCA9',
 'ABCB1',
 'ABCB10',
 'ABCB11',
 'ABCB4',
 'ABCB5',
 'ABCB6',
 'ABCB7',
 'ABCC1',
 'ABCC10',
 'ABCC12',
 'ABCC2',
 'ABCC3',
 'ABCC4',
 'ABCC5',
 'ABCC6',
 'ABCC9',
 'ABCD1',
 'ABCD3',
 'ABCG1',
 'ABCG2',
 'ABCG8',
 'ABL1',
 'ABL2',
 'ACAA1',
 'ACAA2',
 'ACACB',
 'ACADSB',
 'ACAT1',
 'ACAT2',
 'ACCS',
 'ACE',
 'ACHE',
 'ACKR3',
 'ACO1',
 'ACO2',
 'ACOX2',
 'ACP5',
 'ACR',
 'ACSBG1',
 'ACSL3',
 'ACSL4',
 'ACSL5',
 'ACSM3',
 'ACSS2',
 'ACSS3',
 'ACTA1',
 'ACTA2',
 'ACTBL2',
 'ACTC1',
 'ACTL7B',
 'ACTL8',
 'ACTL9',
 'ACTR10',
 'ACTR1A',
 'ACTR1B',
 'ACTRT1',
 'ACTRT2',
 'ACTRT3',
 'ACVR1',
 'ACY1',
 'ADA',
 'ADAM10',
 'ADAM17',
 'ADAMTS1',
 'ADAT2',
 'ADCY1',
 'ADCY2',
 'ADCY4',
 'ADCY5',
 'ADCY6',
 'ADCY8',
 'ADCY9',
 'ADD1',
 'ADD2',
 'ADD3',
 'ADGRA3',
 'ADGRL3',
 'ADH1A',
 'ADH1B',
 'ADH5',
 'ADH6',
 'ADI1',
 'ADORA2B',
 'ADRA1A',
 'ADRA1B',
 

In [16]:
# Final candidate set: a gene is kept if any of the three criteria selected it.
genes = sorted(set().union(top_nodes, mat_imp, top_dti))
len(genes)

5187

In [17]:
# Print every selected gene, one per line (this emits the complete list).
for i in genes:
    print(i)

A2M
AAAS
AACS
AAK1
AAR2
AARS2
AASDH
AASDHPPT
AATF
ABAT
ABCA1
ABCA10
ABCA12
ABCA13
ABCA2
ABCA3
ABCA4
ABCA5
ABCA6
ABCA7
ABCA9
ABCB1
ABCB10
ABCB11
ABCB4
ABCB5
ABCB6
ABCB7
ABCC1
ABCC10
ABCC12
ABCC2
ABCC3
ABCC4
ABCC5
ABCC6
ABCC9
ABCD1
ABCD3
ABCE1
ABCG1
ABCG2
ABCG8
ABI1
ABI2
ABL1
ABL2
ABLIM1
ABT1
ACAA1
ACAA2
ACACA
ACACB
ACAD9
ACADM
ACADSB
ACADVL
ACAT1
ACAT2
ACCS
ACE
ACE2
ACHE
ACIN1
ACKR3
ACLY
ACO1
ACO2
ACOT2
ACOT7
ACOX2
ACP5
ACR
ACSBG1
ACSL1
ACSL3
ACSL4
ACSL5
ACSM3
ACSM5
ACSS1
ACSS2
ACSS3
ACTA1
ACTA2
ACTB
ACTBL2
ACTC1
ACTG1
ACTG2
ACTL6A
ACTL7B
ACTL8
ACTL9
ACTN1
ACTN2
ACTN3
ACTN4
ACTR10
ACTR1A
ACTR1B
ACTR2
ACTR3
ACTRT1
ACTRT2
ACTRT3
ACVR1
ACY1
ADA
ADAM10
ADAM12
ADAM17
ADAM18
ADAM33
ADAMTS1
ADAMTS12
ADAMTS2
ADAR
ADARB1
ADAT2
ADCY1
ADCY2
ADCY4
ADCY5
ADCY6
ADCY8
ADCY9
ADD1
ADD2
ADD3
ADGRA3
ADGRL2
ADGRL3
ADH1A
ADH1B
ADH5
ADH6
ADI1
ADIPOQ
ADM
ADORA1
ADORA2A
ADORA2B
ADPGK
ADRA1A
ADRA1B
ADRA1D
ADRA2A
ADRA2C
ADRB1
ADRB2
ADRB3
ADRM1
AEBP2
AFF2
AFG3L2
AFP
AGMAT
AGO1
AGO2
AGPAT1
AGPS
AGR2
AGRN
AGT
AGTR1

In [18]:
# Keep edges between selected genes and give the columns names:
# s = source gene, v = interaction type, t = target gene.
# NOTE: this cell is not re-runnable on its own; after the rename the
# integer column labels 0/2 no longer exist.
df = (
    df.loc[df[0].isin(genes) & df[2].isin(genes)]
    .reset_index(drop=True)
    .rename(columns={0: "s", 1: "v", 2: "t"})
)
df

Unnamed: 0,s,v,t
0,A2M,interacts-with,ABCC6
1,A2M,interacts-with,ACTB
2,A2M,interacts-with,ADAMTS12
3,A2M,interacts-with,ADAMTS1
4,A2M,interacts-with,AGO2
...,...,...,...
695208,ZWILCH,controls-state-change-of,STAG2
695209,ZWILCH,in-complex-with,ZWINT
695210,ZWINT,controls-state-change-of,STAG2
695211,ZYX,controls-state-change-of,VASP


In [19]:
# Selected genes that still take part in at least one edge after filtering.
updated_genes = sorted(set(df["s"]).union(df["t"]))

In [20]:
# Print every gene that remains in the filtered network, one per line
# (this emits the complete list).
for i in updated_genes:
    print(i)

A2M
AAAS
AACS
AAK1
AAR2
AARS2
AASDH
AASDHPPT
AATF
ABAT
ABCA1
ABCA10
ABCA12
ABCA13
ABCA2
ABCA3
ABCA4
ABCA5
ABCA6
ABCA7
ABCA9
ABCB1
ABCB10
ABCB11
ABCB4
ABCB5
ABCB6
ABCB7
ABCC1
ABCC10
ABCC12
ABCC2
ABCC3
ABCC4
ABCC5
ABCC6
ABCC9
ABCD1
ABCD3
ABCE1
ABCG1
ABCG2
ABCG8
ABI1
ABI2
ABL1
ABL2
ABLIM1
ABT1
ACAA1
ACAA2
ACACA
ACACB
ACAD9
ACADM
ACADSB
ACADVL
ACAT1
ACAT2
ACCS
ACE
ACE2
ACHE
ACIN1
ACKR3
ACLY
ACO1
ACO2
ACOT2
ACOT7
ACOX2
ACP5
ACR
ACSBG1
ACSL1
ACSL3
ACSL4
ACSL5
ACSM3
ACSM5
ACSS1
ACSS2
ACSS3
ACTA1
ACTA2
ACTB
ACTBL2
ACTC1
ACTG1
ACTG2
ACTL6A
ACTL7B
ACTL8
ACTL9
ACTN1
ACTN2
ACTN3
ACTN4
ACTR10
ACTR1A
ACTR1B
ACTR2
ACTR3
ACTRT1
ACTRT2
ACTRT3
ACVR1
ACY1
ADA
ADAM10
ADAM12
ADAM17
ADAM18
ADAM33
ADAMTS1
ADAMTS12
ADAMTS2
ADAR
ADARB1
ADAT2
ADCY1
ADCY2
ADCY4
ADCY5
ADCY6
ADCY8
ADCY9
ADD1
ADD2
ADD3
ADGRA3
ADGRL2
ADGRL3
ADH1A
ADH1B
ADH5
ADH6
ADI1
ADIPOQ
ADM
ADORA1
ADORA2A
ADORA2B
ADPGK
ADRA1A
ADRA1B
ADRA1D
ADRA2A
ADRA2C
ADRB1
ADRB2
ADRB3
ADRM1
AEBP2
AFF2
AFG3L2
AFP
AGMAT
AGO1
AGO2
AGPAT1
AGPS
AGR2
AGRN
AGT
AGTR1

# Total # of Genes

In [21]:
# Number of genes (graph nodes) that remain after edge filtering.
len(updated_genes)

5181

In [22]:
# Restrict every matrix to the genes that remain in the network; row order
# of each matrix is preserved.
mut, norm_exp, met, cop = (
    matrix.loc[matrix.index.isin(updated_genes)]
    for matrix in (mut, norm_exp, met, cop)
)

In [23]:
# Sanity check: all four matrices should have one row per remaining gene.
for label, frame in [
    ("exp shape:", norm_exp),
    ("met shape:", met),
    ("mut shape:", mut),
    ("cop shape:", cop),
]:
    print(label, frame.shape)

exp shape: (5181, 60)
met shape: (5181, 60)
mut shape: (5181, 60)
cop shape: (5181, 60)


In [24]:
# One-hot encode the interaction type: each distinct value of "v" becomes a
# 0/1 integer column placed after the s/t endpoint columns.
tmp = pd.concat(
    [df[["s", "t"]], pd.get_dummies(df["v"]).astype(int)],
    axis=1,
)
tmp

Unnamed: 0,s,t,catalysis-precedes,controls-expression-of,controls-phosphorylation-of,controls-state-change-of,controls-transport-of,in-complex-with,interacts-with
0,A2M,ABCC6,0,0,0,0,0,0,1
1,A2M,ACTB,0,0,0,0,0,0,1
2,A2M,ADAMTS12,0,0,0,0,0,0,1
3,A2M,ADAMTS1,0,0,0,0,0,0,1
4,A2M,AGO2,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
695208,ZWILCH,STAG2,0,0,0,1,0,0,0
695209,ZWILCH,ZWINT,0,0,0,0,0,1,0
695210,ZWINT,STAG2,0,0,0,1,0,0,0
695211,ZYX,VASP,0,0,0,1,0,0,0


In [25]:
# Number of distinct genes appearing as either endpoint.
len(set(tmp["s"]).union(tmp["t"]))

5181

In [26]:
# Collapse parallel edges: one row per (s, t) pair, with the one-hot
# interaction-type columns summed over the merged rows.
tmp = tmp.groupby(["s", "t"], as_index=False).sum()
tmp

Unnamed: 0,s,t,catalysis-precedes,controls-expression-of,controls-phosphorylation-of,controls-state-change-of,controls-transport-of,in-complex-with,interacts-with
0,A2M,ABCC6,0,0,0,0,0,0,1
1,A2M,ACTB,0,0,0,0,0,0,1
2,A2M,ADAMTS1,0,0,0,0,0,0,1
3,A2M,ADAMTS12,0,0,0,0,0,0,1
4,A2M,AGO2,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
630627,ZRANB1,ZYX,0,0,0,0,0,0,1
630628,ZWILCH,STAG2,0,0,0,1,0,0,0
630629,ZWILCH,ZWINT,0,0,0,0,0,1,0
630630,ZWINT,STAG2,0,0,0,1,0,0,0


In [27]:
# Persist the aggregated edge table as gzip-compressed CSV. The relative path
# has no directory component, so the file lands in the current working
# directory (unlike the other outputs, which are written under ../data).
tmp.to_csv("gene_graph.csv.gz", compression="gzip")

In [28]:
# Split the edge table into endpoints and per-edge feature columns.
graph = tmp.loc[:, ["s", "t"]]
edge_features = tmp.drop(columns=["s", "t"])

# Assign each gene a node index following sorted gene-name order.
nodes = sorted(set(tmp["s"]).union(tmp["t"]))
node_to_idx = dict(zip(nodes, range(len(nodes))))

In [29]:
# edge_index has shape [2, num_edges]: row 0 holds the source node index and
# row 1 the target node index of every edge, in the row order of `tmp`.
# Building the two rows directly (instead of an [E, 2] list that is then
# transposed) and fixing dtype=torch.long keeps the shape and dtype correct
# even when there are no edges.
edge_index = torch.tensor(
    [
        [node_to_idx[s] for s in tmp["s"]],
        [node_to_idx[t] for t in tmp["t"]],
    ],
    dtype=torch.long,
)

# One float row per edge; columns are selected by name (everything except the
# s/t endpoints) rather than by position.
edge_attr = torch.tensor(
    tmp.drop(columns=["s", "t"]).to_numpy(), dtype=torch.float
)

# Generate gene-gene graph with input x and edge info

In [30]:
def create_node_features_dict(matrix1, matrix2, matrix3, matrix4, genes, node_to_idx):
    """Build one node-feature tensor per column of the input matrices.

    For every column label of ``matrix1`` the values of that column are read
    from all four matrices and stacked into a ``(len(genes), 4)`` float
    tensor whose feature axis follows the argument order
    ``(matrix1, matrix2, matrix3, matrix4)``.

    Rows are ordered by ascending ``node_to_idx[gene]``; when ``genes``
    covers node indices ``0 .. len(genes) - 1``, row ``k`` therefore holds
    the features of the gene whose node index is ``k``. (The previous
    implementation indexed the already-extracted rows with
    ``node_to_idx[gene]``, which applies the inverse permutation and is only
    correct when ``genes`` is already in node-index order.)

    Args:
        matrix1, matrix2, matrix3, matrix4: DataFrames indexed by gene and
            sharing the column labels of ``matrix1``.
        genes: Iterable of gene labels to extract; each must be present in
            every matrix index and in ``node_to_idx``.
        node_to_idx: Mapping from gene label to integer node index.

    Returns:
        dict mapping each column label of ``matrix1`` to a
        ``torch.float`` tensor of shape ``(len(genes), 4)``.

    Raises:
        KeyError: if a gene is missing from ``node_to_idx`` or from one of
            the matrices, or a column is missing from a matrix.
    """
    # Decide the row order once: genes sorted by their node index.
    ordered_genes = sorted(genes, key=lambda gene: node_to_idx[gene])
    matrices = (matrix1, matrix2, matrix3, matrix4)

    features_dict = {}
    for column in matrix1.columns:
        # One 1-D array per matrix, all in node-index order; stacking on the
        # last axis yields a 4-dimensional feature vector per gene.
        per_matrix = [
            matrix.loc[ordered_genes, column].to_numpy() for matrix in matrices
        ]
        features_dict[column] = torch.tensor(
            np.stack(per_matrix, axis=-1), dtype=torch.float
        )

    return features_dict

In [31]:
# Rebind `genes` to the row labels of the filtered expression matrix; this
# is the gene order used to extract node features below.
genes = norm_exp.index.tolist()
genes

['A2M',
 'AAAS',
 'AACS',
 'AAK1',
 'AAR2',
 'AARS2',
 'AASDH',
 'AASDHPPT',
 'AATF',
 'ABAT',
 'ABCA1',
 'ABCA10',
 'ABCA12',
 'ABCA13',
 'ABCA2',
 'ABCA3',
 'ABCA4',
 'ABCA5',
 'ABCA6',
 'ABCA7',
 'ABCA9',
 'ABCB1',
 'ABCB10',
 'ABCB11',
 'ABCB4',
 'ABCB5',
 'ABCB6',
 'ABCB7',
 'ABCC1',
 'ABCC10',
 'ABCC12',
 'ABCC2',
 'ABCC3',
 'ABCC4',
 'ABCC5',
 'ABCC6',
 'ABCC9',
 'ABCD1',
 'ABCD3',
 'ABCE1',
 'ABCG1',
 'ABCG2',
 'ABCG8',
 'ABI1',
 'ABI2',
 'ABL1',
 'ABL2',
 'ABLIM1',
 'ABT1',
 'ACAA1',
 'ACAA2',
 'ACACA',
 'ACACB',
 'ACAD9',
 'ACADM',
 'ACADSB',
 'ACADVL',
 'ACAT1',
 'ACAT2',
 'ACCS',
 'ACE',
 'ACE2',
 'ACHE',
 'ACIN1',
 'ACKR3',
 'ACLY',
 'ACO1',
 'ACO2',
 'ACOT2',
 'ACOT7',
 'ACOX2',
 'ACP5',
 'ACR',
 'ACSBG1',
 'ACSL1',
 'ACSL3',
 'ACSL4',
 'ACSL5',
 'ACSM3',
 'ACSM5',
 'ACSS1',
 'ACSS2',
 'ACSS3',
 'ACTA1',
 'ACTA2',
 'ACTB',
 'ACTBL2',
 'ACTC1',
 'ACTG1',
 'ACTG2',
 'ACTL6A',
 'ACTL7B',
 'ACTL8',
 'ACTL9',
 'ACTN1',
 'ACTN2',
 'ACTN3',
 'ACTN4',
 'ACTR10',
 'ACTR1A',
 'ACTR

In [32]:
# Per-gene feature order follows the argument order here:
# expression, methylation, copy number, mutation.
node_features_dict = create_node_features_dict(
    norm_exp, met, cop, mut, genes, node_to_idx
)

In [33]:
# One graph per column of the feature matrices: node features differ, while
# every Data object is given the same edge_index / edge_attr tensors.
gene_dict = {
    column_label: Data(
        x=node_features,
        edge_index=edge_index,
        edge_attr=edge_attr,
        num_nodes=len(genes),
    )
    for column_label, node_features in node_features_dict.items()
}

In [34]:
# Display the full dict to check the tensor shapes of every graph.
gene_dict

{'MCF7': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'MDA_MB_231': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'HS578T': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'BT_549': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'T47D': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'SF_268': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'SF_295': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'SF_539': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'SNB_19': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'SNB_75': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 7], num_nodes=5181),
 'U251': Data(x=[5181, 4], edge_index=[2, 630632], edge_attr=[630632, 

In [35]:
# Serialise the dict of Data objects to disk.
# NOTE(review): the saved object holds Data instances rather than plain
# tensors, so loading it presumably needs torch_geometric importable (and
# possibly weights_only=False on recent torch versions) - confirm.
torch.save(gene_dict, "../data/gene_dict.pt")

In [36]:
# Write the gene labels as a single CSV row, in the row order of norm_exp.
with open("../data/genes.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(list(norm_exp.index))

In [37]:
# Show the final normalised expression matrix (rows: genes kept in the graph).
norm_exp

Unnamed: 0,MCF7,MDA_MB_231,HS578T,BT_549,T47D,SF_268,SF_295,SF_539,SNB_19,SNB_75,...,PC_3,DU_145,786_0,A498,ACHN,CAKI_1,RXF_393,SN12C,TK_10,UO_31
A2M,0.225771,0.084206,0.450539,0.000000,0.033559,0.164135,0.095539,1.226725,2.023682,9.322696,...,0.383955,0.464141,0.030977,0.230616,0.263501,0.259579,0.111258,0.218692,0.000000,0.000000
AAAS,5.604620,1.215550,4.470427,4.966709,4.785006,3.828443,5.499151,5.081841,4.625646,4.942454,...,5.979123,4.191172,1.864918,4.520794,5.370099,5.955231,5.850001,5.903750,5.701371,5.940361
AACS,3.652581,1.215550,3.969199,2.706955,3.347405,1.839384,4.370416,4.600043,3.967272,3.636667,...,4.627743,3.026299,1.249876,3.715183,4.378396,4.377031,4.736299,3.866441,4.323526,5.149219
AAK1,2.340749,2.984039,3.935033,2.938034,1.683549,2.160348,4.140527,4.261694,3.434542,3.931575,...,4.121379,2.371406,1.100904,3.670057,3.435497,3.183060,4.740632,3.282365,3.542252,3.331940
AAR2,5.340818,4.453930,5.467703,6.150546,4.874533,2.976026,5.111947,5.957258,5.942065,5.545356,...,5.963699,3.386227,2.264128,4.598547,4.585235,5.330156,5.722185,5.220406,5.665074,5.645103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZRANB1,4.656755,4.058069,4.813769,3.854903,3.076673,2.004447,3.520435,4.342655,4.674100,3.723256,...,4.437840,2.481705,1.399804,2.605146,3.644118,4.166165,4.580663,3.688621,4.040161,4.130576
ZRANB3,2.274943,1.078560,1.405708,1.355907,0.624125,0.339236,0.918418,1.575985,0.997485,1.124123,...,2.102011,0.840456,0.250853,1.289521,1.972937,1.168955,2.936232,0.726580,2.049484,1.792172
ZWILCH,5.617952,5.682779,4.251918,3.875954,3.386889,2.047135,5.009880,4.842032,3.382470,4.523799,...,5.250870,3.037455,1.834512,3.488880,4.526237,4.580914,5.502519,3.712835,5.205146,5.174041
ZWINT,6.999310,6.389284,4.931084,4.785747,5.938215,4.637025,6.355634,5.913293,3.857115,6.351014,...,7.698155,4.215935,3.378216,5.154189,6.150063,5.423675,6.447294,7.123263,6.401519,6.446546
