In [1]:
import pandas as pd
import pickle
import numpy as np
import torch


In [None]:
cancer_type = 'Neuroblastoma'
#cancer_type = 'Non-Small Cell Lung Cancer_Lung Neuroendocrine Tumor'
BASE_PATH = "./Data/"
ppi = "Reactome"
remove_rpl = "_noRPL"
remove_commonE = ""
useSTD = "STD"
crispr_threshold_pos = -1.5
cell_feat_name = "cnv"
gene_feat_name = 'cgp'

with open(BASE_PATH+f"multigraphs/{cancer_type.replace(' ', '_')}_{ppi}{remove_rpl}_{useSTD}{remove_commonE}_crispr{str(crispr_threshold_pos).replace('.','_')}.pickle", 'rb') as handle:
    mg_obj = pickle.load(handle) #3columns: gene_A (gene or cell line), gene_B, type (scaffold or depmap)
#type2nodes and int2gene are both functions defined in NetworkAnalys
#all_genes_int is a list of integers that represent the genes 
#all_genes_name is a list of gene names
all_genes_int = mg_obj.type2nodes['gene'] 
all_genes_name = [mg_obj.int2gene[i] for i in all_genes_int]

# PPI obj
ppi_obj = mg_obj.getEdgeType_subset(edge_type='scaffold') #gene-gene interactions
ppi_obj_new_gene2int = {n:i for i, n in enumerate(all_genes_name)}
ppi_obj_new_int2gene = {v:k for k, v in ppi_obj_new_gene2int.items()}
ppi_interactions = ppi_obj.getInteractionNamed() #function defined in multigraph.py, it returns a dataframe with 2 columns "Gene_A", "Gene_B"
ppi_interactions = ppi_interactions.map(lambda x: ppi_obj_new_gene2int[x]) #replace gene names with their corresponding integers

# DEP obj
dep_obj = mg_obj.getEdgeType_subset(edge_type='depmap')
cells = [k for k, v in mg_obj.node_type_names.items() if v == 'cell']
cell2int = {c:i for i, c in enumerate(cells)}
int2cell = {v:k for k, v in cell2int.items()}
dep_interactions = dep_obj.getInteractionNamed() #2 columns: Gene_A (=cell line) and Gene_B (=gene)
dep_genes = [dep_obj.int2gene[i] for i in dep_obj.type2nodes['gene']] #list of genes with a dependecy edge

dep_interactions.loc[~dep_interactions.Gene_A.isin(cells), ['Gene_A', 'Gene_B']] = \
    dep_interactions.loc[~dep_interactions.Gene_A.isin(cells), ['Gene_B', 'Gene_A']].values # assure that all values in Gene_A are cells, otherwise switch with the gene column

assert dep_interactions.Gene_A.isin(cells).sum() == dep_interactions.shape[0] #all Gene_A should be cell lines
dep_interactions = dep_interactions.map(lambda x: cell2int[x] if x in cell2int else ppi_obj_new_gene2int[x]) #map cell lines and genes to their resp. integers
dep_interactions = dep_interactions[['Gene_B', 'Gene_A']] #switch columns (gene, cell line)
print(dep_interactions.shape)

Returning UndirectedInteractionNetwork object.
Continuing with Gene_A and Gene_B as columns for the nodes
14034 Nodes and 278974 interactions
Returning UndirectedInteractionNetwork object.
Continuing with Gene_A and Gene_B as columns for the nodes
1037 Nodes and 14099 interactions
(14099, 2)


In [3]:

path = BASE_PATH+'Depmap/OmicsCNGene.csv'
ccle_cnv = pd.read_csv(path, header=0, index_col=0)
ccle_cnv.columns = [i.split(' ')[0] for i in ccle_cnv.columns] 
ccle_cnv = ccle_cnv[ccle_cnv.columns[ccle_cnv.isna().sum() == 0]] #remove columns with missing values
ccle_cnv = ccle_cnv.loc[list(set(cells) & set(ccle_cnv.index))] #filter only the cells that are in the cell lines

hvg_q = ccle_cnv.std().quantile(q=0.95)  #compute the 95th percentile of the standard deviation per gene accross all cells
hvg_final = ccle_cnv.std()[ccle_cnv.std() >= hvg_q].index #select genes with high variance (top 5%)

ccle_cnv_hvg = ccle_cnv[hvg_final]
#cell_feat = torch.from_numpy(ccle_cnv_hvg.loc[cell2int.keys()].values).to(torch.float) #cell feature matrix

In [4]:
print(ccle_cnv_hvg.shape)
print(ccle_cnv_hvg.head())


(37, 1218)
             CROCCP2     CROCC  BX284668.2  BX284668.5    RNU1-2      TYW3  \
ACH-000149  2.325705  2.192354    2.325705    2.325705  2.325705  0.978057   
ACH-002083  0.722578  0.722578    0.722578    0.722578  0.722578  0.722578   
ACH-000227  0.534714  0.534714    0.534714    0.534714  0.534714  0.534714   
ACH-001300  0.922100  0.628134    0.601304    0.601304  0.601304  0.950683   
ACH-001603  0.509894  0.509894    0.509894    0.509894  0.509894  1.000209   

                LHX8   SLC44A5     ACADM   RABGGTB  ...     GPR85    SMIM30  \
ACH-000149  0.978057  0.978057  0.978057  0.978057  ...  1.483138  1.483138   
ACH-002083  0.722578  0.722578  0.722578  0.722578  ...  1.428792  1.428792   
ACH-000227  0.534714  0.534714  0.534714  0.534714  ...  1.034856  1.034856   
ACH-001300  0.950683  0.950683  0.950683  0.950683  ...  1.394232  1.394232   
ACH-001603  1.000209  1.000209  1.000209  1.000209  ...  1.003859  1.003859   

             PPP1R3A     FOXP2      ING3     

Dit is dus mijn cell feature matrix (CNV data). Wat ik nu moet doen is de top 100 of 200 meest variabele data eruit nemen en kijken of deze genen ook in de literatuur van neuroblastoma voorkomen.

In [5]:
#Select the 200 columns with highest variance in ccle_cnv_hvg
cell_feat = ccle_cnv_hvg.loc[:, ccle_cnv_hvg.var().nlargest(200).index]
 
print(cell_feat.shape)
print(cell_feat)


(37, 200)
                MYCNOS      MYCNUT        MYCN        DDX1  LINC01804  \
ACH-000149    1.411926    1.411926    1.411926    1.411926   1.411926   
ACH-002083    2.010568    2.010568    2.010568    2.010568   2.010568   
ACH-000227  146.633202  146.633202  146.633202    0.988912   0.988912   
ACH-001300    0.946186    0.946186    0.946186    0.946186   0.946186   
ACH-001603   32.897795   32.897795   32.897795    1.484014   1.484014   
ACH-000203   97.644305   97.644305   97.644305   97.644305  97.644305   
ACH-000260    1.039791    1.039791    1.039791    1.039791   1.039791   
ACH-001301   20.681171   20.681171   20.681171   20.681171  20.681171   
ACH-000259  118.338606  118.338606  118.338606    1.440714   1.440714   
ACH-000310   28.482029   28.482029   28.482029   28.482029  28.482029   
ACH-001338  132.342626  132.342626  132.342626  129.622738   0.947914   
ACH-000312    1.497977    1.497977    1.497977    1.497977   1.497977   
ACH-002922    1.136062    1.136062    1.1

In [6]:
# extract column names (out of ccle_cnv_hvg)
gene_names = cell_feat.columns
print(gene_names)
print(len(gene_names))
 

Index(['MYCNOS', 'MYCNUT', 'MYCN', 'DDX1', 'LINC01804', 'CYRIA', 'NBAS',
       'GBP2', 'GBP7', 'GBP4',
       ...
       'PITPNM2', 'MIR4304', 'PITPNM2-AS1', 'MPHOSPH9', 'C12orf65', 'CDK2AP1',
       'SBNO1', 'MIR8072', 'RILPL2', 'LRRC43'],
      dtype='object', length=200)
200


In [7]:
list(gene_names)

['MYCNOS',
 'MYCNUT',
 'MYCN',
 'DDX1',
 'LINC01804',
 'CYRIA',
 'NBAS',
 'GBP2',
 'GBP7',
 'GBP4',
 'AC083805.1',
 'ATP23',
 'LINC02403',
 'LINC02375',
 'LINC02411',
 'LINC02393',
 'LINC00507',
 'LINC02441',
 'LINC02368',
 'MIR3612',
 'CTDSP2',
 'LYZ',
 'YEATS4',
 'CNOT2',
 'SLC35E3',
 'MDM2',
 'CPM',
 'AC020637.1',
 'LINC02388',
 'TMEM132C',
 'AVIL',
 'MIR26A2',
 'TSFM',
 'MARCHF9',
 'CYP27B1',
 'METTL1',
 'EEF1AKMT3',
 'CDK4',
 'AGAP2-AS1',
 'TSPAN31',
 'MIR6759',
 'AGAP2',
 'CPSF6',
 'MIR1279',
 'FRS2',
 'MIR3913-1',
 'MIR3913-2',
 'C1GALT1C1L',
 'PRANCR',
 'CCT2',
 'TRIB2',
 'MIR3125',
 'MIR4262',
 'MIR3681',
 'LINC01799',
 'LINC01628',
 'LINC01828',
 'LINC01829',
 'ETAA1',
 'LINC02831',
 'LINC00276',
 'LRATD1',
 'MYRFL',
 'RAB3IP',
 'PTPRB',
 'KCNMB4',
 'LRRC10',
 'BEST3',
 'AC025263.1',
 'NUP107',
 'TSPAN8',
 'PTPRR',
 'NTSR2',
 'AC106875.1',
 'MIR3681HG',
 'LPIN1',
 'MIR548S',
 'PLEKHH2',
 'RAP1B',
 'SNORA70G',
 'AC090061.1',
 'CAMKMT',
 'LINC00958',
 'RASSF10-DT',
 'RASSF10',


Nu is de volgende stap opzoeken of deze genen effectief voorkomen in de literatuur van neuroblastoma.


Chat-GPTs answer: look up for confirmation in literature!!!!

Genes Associated with Neuroblastoma
From the provided list, here are genes that are commonly associated with neuroblastoma (NB), based on their established roles in this disease:

MYCN:
Primary oncogene in neuroblastoma. MYCN amplification is a hallmark of high-risk neuroblastoma and is used as a clinical biomarker to stratify patient risk.

DDX1:
Located close to MYCN on chromosome 2p. Often co-amplified with MYCN in neuroblastoma and thought to be involved in tumor progression.

ALK:
A major oncogene in neuroblastoma, frequently mutated in familial and sporadic neuroblastoma. ALK amplifications or point mutations are implicated in NB pathogenesis.

NBAS:
Located near MYCN; while less characterized in neuroblastoma, its proximity to MYCN suggests possible involvement in cases of co-amplification.

MDM2:
A regulator of p53, which can play a role in tumorigenesis. Overexpression or amplification of MDM2 is implicated in neuroblastoma pathogenesis, particularly in MYCN-amplified tumors.

CDK4:
Frequently amplified in neuroblastoma, particularly in the context of MYCN amplification. Promotes cell cycle progression and tumor growth.

CNOT2:
A gene linked to transcriptional regulation, including MYCN-driven pathways in neuroblastoma. However, its role is less directly studied.

Why Do Other Genes Have High CNV Variability?
Copy number variability (CNV) reflects structural alterations in the genome, such as amplifications or deletions. High CNV variability across these genes may arise due to the following reasons:

1. Proximity to Known Amplification Hotspots
Many of the listed genes are located in genomic regions known to undergo frequent amplifications or deletions in neuroblastoma or other cancers.
Example: Genes like LINC01804, DDX1, and NBAS are in the 2p24 region, near MYCN, a hotspot for amplification in neuroblastoma.
2. Chromosomal Instability
Neuroblastoma is characterized by chromosomal instability, leading to frequent gains/losses in specific regions:
2p amplification: MYCN, DDX1, NBAS.
1p deletion: Genes such as CNOT2 and others.
11q deletion: A common feature in non-MYCN-amplified neuroblastoma.
3. Role in Tumorigenesis
Even if not directly linked to neuroblastoma, genes with high variability might:
Act as drivers (genes that promote tumor growth when altered).
Be part of the tumor's adaptation to stress, such as hypoxia or therapy resistance.
4. Passenger Alterations
Some high-variability genes may simply be "passenger" alterations. These occur due to proximity to amplified driver genes like MYCN but do not directly contribute to tumorigenesis.
5. Regulatory Non-Coding RNAs
Genes like LINC02403 and LINC00507 are long non-coding RNAs (lncRNAs). Variability in these genes may alter transcriptional or epigenetic landscapes, indirectly influencing cancer progression.
6. Role in Cellular Pathways Commonly Dysregulated in Neuroblastoma
Genes like METTL1, CPSF6, and CDK4 participate in pathways (e.g., cell cycle, DNA damage repair, or metabolism) that are frequently dysregulated in neuroblastoma.
7. Technical Artifacts
CNV variability in some genes may stem from limitations in detecting CNVs accurately or regions prone to sequencing bias.
Summary
Neuroblastoma-associated genes: MYCN, ALK, CDK4, NBAS, DDX1, MDM2.
High CNV variability in the other genes could be attributed to chromosomal instability, proximity to neuroblastoma hotspots, functional relevance in tumorigenic pathways, or passenger alterations. Further analysis (e.g., gene expression or functional studies) would clarify their specific role in neuroblastoma progression.


PHD van Jonathan Linusgabre also mentions: 
MYCN, ALK, MDM2, CDK4, ODC1


## Expression data

In [8]:
path = BASE_PATH+'Depmap/OmicsExpressionProteinCodingGenesTPMLogp1.csv'
ccle_expression = pd.read_csv(path, header=0, index_col=0)
ccle_expression.columns = [i.split(' ')[0] for i in ccle_expression.columns]
# subset_nodes = list(set(ccle_expression.columns) & set(all_genes_name))
cancer_expression = ccle_expression.loc[list(set(cells) & set(ccle_expression.index))]

hvg_q = cancer_expression.std().quantile(q=0.95) #threshold for high variance genes
hvg_final = cancer_expression.std()[cancer_expression.std() >= hvg_q].index #select genes with high variance

cancer_expression_hvg = cancer_expression[hvg_final]
# cancer_expression_full = pd.concat([cancer_expression,
#                                     pd.DataFrame(np.tile(cancer_expression.mean().values, (len(set(cells) - set(cancer_expression.index)), 1)),
#                                                  index=list(set(cells) - set(cancer_expression.index)), columns=cancer_expression.columns)])

#filling the missing entries with the mean expression values of the corresponding genes.
cancer_expression_full = pd.concat([cancer_expression_hvg,
                                    pd.DataFrame(np.tile(cancer_expression_hvg.mean().values, (len(set(cells) - set(cancer_expression_hvg.index)), 1)),
                                                index=list(set(cells) - set(cancer_expression_hvg.index)), columns=cancer_expression_hvg.columns)])
#cancer_expression_full[:] = np.random.uniform(0, 10, size=cancer_expression_full.shape) #delete this line!!! It generates random values (between 0 and 10) for each cell 
#cell_feat = torch.from_numpy(cancer_expression_full.loc[cell2int.keys()].values).to(torch.float) #cell featture matrix with rows as cell lines and columns as genes with high variance


In [9]:
print(cancer_expression_full.shape)
print(cancer_expression_full.head())

(37, 960)
              TSPAN6     FUCA2      CD99  TMEM176A      ICA1     CFLAR  \
ACH-000149  3.776104  6.132166  7.518220  0.545968  5.266412  4.526695   
ACH-000227  4.033863  0.575312  6.310340  0.201634  8.303507  3.388190   
ACH-001300  3.346248  3.695994  6.394548  0.713696  8.165661  1.941106   
ACH-001603  3.152183  1.389567  5.432291  2.185867  7.311703  2.536053   
ACH-000203  4.364572  3.124328  7.132577  2.189034  8.049141  3.735522   

                TFPI     HSPB6     ITGA3     CRLF1  ...    CT45A1    FBXO17  \
ACH-000149  1.839960  0.815575  8.900474  4.678635  ...  0.367371  5.208673   
ACH-000227  0.014355  0.344828  0.871844  2.666757  ...  0.000000  5.394034   
ACH-001300  0.056584  0.505891  1.389567  1.416840  ...  0.000000  4.481557   
ACH-001603  0.042644  0.042644  2.397803  2.604071  ...  0.000000  4.355439   
ACH-000203  1.150560  1.137504  6.448901  3.917432  ...  0.000000  4.452859   

             CT45A10      CD24    ZNF595     DOC2B     H4C12      CBSL

In [10]:
# select the 200 columns with highest variance in cancer_expression_full
cell_feat = cancer_expression_full.loc[:, cancer_expression_full.var().nlargest(200).index]
print(cell_feat.shape)
#names of the genes with high variance
gene_names = cell_feat.columns

(37, 200)


In [11]:
list(gene_names)

['RPS4Y1',
 'MYCN',
 'S100A6',
 'PAGE5',
 'DLK1',
 'COL3A1',
 'XAGE1A',
 'S100A11',
 'STMN2',
 'ARHGAP36',
 'MAGEA4',
 'TGFBI',
 'ANXA1',
 'SST',
 'COL1A1',
 'NPY',
 'ANXA2',
 'S100A10',
 'SERPINE1',
 'IGF2',
 'CHGA',
 'VIP',
 'NNAT',
 'S100A16',
 'RBP1',
 'GTSF1',
 'DBH',
 'FABP5',
 'NSG2',
 'FOS',
 'MSN',
 'C7',
 'CRH',
 'CD99',
 'LUM',
 'RGS5',
 'LGALS1',
 'RGS4',
 'CHGB',
 'CTAG2',
 'GDAP1L1',
 'MAGEA10',
 'MAGEA3',
 'KRT18',
 'MAGEA6',
 'FN1',
 'EMP3',
 'DDX1',
 'GAGE2A',
 'DKK1',
 'CYGB',
 'IFI16',
 'GAL',
 'S100A4',
 'CAVIN1',
 'DDC',
 'ENPP2',
 'DDX3Y',
 'RAB34',
 'MAGEA12',
 'TH',
 'TFAP2B',
 'CCDC80',
 'HGF',
 'THBS1',
 'GFRA2',
 'PRAME',
 'IL13RA2',
 'TMEM176B',
 'CSAG1',
 'CD44',
 'IFITM3',
 'CRABP1',
 'EIF1AY',
 'CRABP2',
 'UBB',
 'TMEM176A',
 'RTN1',
 'ASS1',
 'RET',
 'ITGA3',
 'CYRIA',
 'SLITRK6',
 'MMP14',
 'XAGE1B',
 'BGN',
 'STMN4',
 'CYB561',
 'NT5E',
 'DCN',
 'KCNQ2',
 'HLA-C',
 'PRSS3',
 'MT2A',
 'HTRA1',
 'PRRX1',
 'TGM2',
 'CPLX2',
 'GPR50',
 'GCHFR',
 'EGR1',
 '

Geen enkele can deze komt overeen met wat in Jonathan zijn PhD staat!!

ChatGPT:

Genes Highly Associated with Neuroblastoma
MYCN:

Key Role: MYCN amplification is a hallmark of high-risk neuroblastoma and is strongly associated with poor prognosis.
Mechanism: MYCN drives oncogenesis by promoting cell proliferation and suppressing differentiation.
PHOX2B:

Key Role: Mutations in PHOX2B are linked to familial cases of neuroblastoma.
Mechanism: It is crucial for the development of the autonomic nervous system and neural crest-derived tissues.
ASCL1:

Key Role: Plays a role in neuroblast differentiation and is highly expressed in neuroblastoma cells.
Mechanism: Functions as a transcription factor in neural lineage commitment.
CXCR4:

Key Role: Overexpressed in neuroblastoma and associated with metastasis.
Mechanism: CXCR4 mediates chemotaxis and cell migration, critical for metastatic spread.
RET:

Key Role: Activating mutations in RET are implicated in neuroblastoma and other neuroendocrine tumors.
Mechanism: RET signaling promotes cell survival and proliferation.
TH (Tyrosine Hydroxylase):

Key Role: TH is a marker of neural crest-derived tumors, including neuroblastoma.
Mechanism: Catalyzes dopamine synthesis, which is a characteristic feature of neuroblastoma cells.
CHGA (Chromogranin A):

Key Role: A neuroendocrine marker often elevated in neuroblastoma.
SST (Somatostatin):

Key Role: Somatostatin expression is observed in some neuroblastoma subtypes and serves as a diagnostic marker.
TGFBI:

Key Role: Promotes extracellular matrix remodeling and metastasis in aggressive neuroblastomas.
IGF2:

Key Role: The IGF2 pathway is implicated in neuroblastoma proliferation and survival.