In [148]:
import pandas as pd
import numpy as np
import torch


# LOAD DATA

In [149]:
# Read the DepMap tables used below: the cell-line metadata table and the two
# per-gene somatic-mutation matrices (hotspot and damaging).
cells = pd.read_csv("./Data/Depmap/Model.csv")

hotspot_mutations, damaging_mutations = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Data/Depmap/OmicsSomaticMutationsMatrixHotspot.csv",
        "./Data/Depmap/OmicsSomaticMutationsMatrixDamaging.csv",
    )
)




In [150]:
# Preview the first rows of each table.
# The mutation matrices have 543 and 19098 columns respectively; that count
# includes the 'Unnamed: 0' identifier column, the remaining columns are genes.
print(
    cells.head(),
    hotspot_mutations.head(),
    damaging_mutations.head(),
    sep="\n",
)

      ModelID  PatientID CellLineName StrippedCellLineName DepmapModelType  \
0  ACH-000001  PT-gj46wT  NIH:OVCAR-3            NIHOVCAR3           HGSOC   
1  ACH-000002  PT-5qa3uk        HL-60                 HL60             AML   
2  ACH-000003  PT-puKIyc        CACO2                CACO2            COAD   
3  ACH-000004  PT-q4K2cp          HEL                  HEL             AML   
4  ACH-000005  PT-q4K2cp   HEL 92.1.7              HEL9217             AML   

        OncotreeLineage     OncotreePrimaryDisease  \
0  Ovary/Fallopian Tube   Ovarian Epithelial Tumor   
1               Myeloid     Acute Myeloid Leukemia   
2                 Bowel  Colorectal Adenocarcinoma   
3               Myeloid     Acute Myeloid Leukemia   
4               Myeloid     Acute Myeloid Leukemia   

                    OncotreeSubtype OncotreeCode LegacyMolecularSubtype  ...  \
0  High-Grade Serous Ovarian Cancer        HGSOC                    NaN  ...   
1            Acute Myeloid Leukemia          A

In [151]:
# (rows, columns) of each mutation matrix. The column count includes the
# 'Unnamed: 0' identifier column, so the number of genes is one less.
print(hotspot_mutations.shape, damaging_mutations.shape, sep="\n")

(1929, 543)
(1929, 19098)


In [152]:
# Load the saved graph object (presumably a heterogeneous gene/cell graph for
# neuroblastoma, judging by the file name — TODO confirm).
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load files from a trusted source. Newer PyTorch releases default to
# weights_only=True, which may refuse a pickled graph object — confirm against
# the installed version.

heteroData_obj = torch.load("./Data/multigraphs/heteroData_gene_cell_Neuroblastoma_cgp_cnv_META2.pt")


# FILTER NEUROBLASTOMA

In [153]:
# Identifiers stored under the graph's 'cell' entry (presumably DepMap ModelIDs
# — TODO confirm).
neuroblastoma_ids = heteroData_obj['cell'].names

# Keep only the rows whose first column (the cell-line identifier) is one of
# the graph's cells. No index is set: the frames keep their original row index
# and all of their columns.
hotspot_neuroblastoma = hotspot_mutations.loc[
    lambda frame: frame.iloc[:, 0].isin(neuroblastoma_ids)
]
damaging_neuroblastoma = damaging_mutations.loc[
    lambda frame: frame.iloc[:, 0].isin(neuroblastoma_ids)
]



In [154]:
# (rows, columns) after restricting both matrices to the graph's cell lines.
print(hotspot_neuroblastoma.shape, damaging_neuroblastoma.shape, sep="\n")

(39, 543)
(39, 19098)


# Remove cells not in heterodata

# Remove genes that are never mutated
i.e. genes whose column is 0 for every neuroblastoma cell line (column sum = 0)

In [155]:
# Drop every column that is 0 in all of the selected cell lines. The test is
# "not equal to 0", so the string-valued 'Unnamed: 0' identifier column is
# always retained (and a NaN entry would also count as non-zero).
hotspot_neuroblastoma = hotspot_neuroblastoma.loc[:, hotspot_neuroblastoma.ne(0).any(axis=0)]
damaging_neuroblastoma = damaging_neuroblastoma.loc[:, damaging_neuroblastoma.ne(0).any(axis=0)]

# Remaining (rows, columns); the column count still includes the identifier column.
print(hotspot_neuroblastoma.shape, damaging_neuroblastoma.shape, sep="\n")

(39, 13)
(39, 922)


# Count

In [156]:
def _gene_count_table(mutation_matrix):
    """Sum each gene column of a mutation matrix and return a sorted table.

    Parameters
    ----------
    mutation_matrix : pd.DataFrame
        One row per cell line and one column per gene. An 'Unnamed: 0'
        identifier column is dropped first if it is present.

    Returns
    -------
    tuple[pd.Series, pd.DataFrame]
        The per-column sums, and a ('Gene', 'Count') table sorted by 'Count'
        in descending order. 'Gene' is the column label up to the first space.
    """
    # errors='ignore' keeps this cell re-runnable after a later cell has
    # already removed the identifier column from the frames in place.
    counts = mutation_matrix.drop(columns='Unnamed: 0', errors='ignore').sum()
    table = pd.DataFrame({
        'Gene': counts.index.str.split(' ').str[0],
        'Count': counts.values,
    }).sort_values('Count', ascending=False)
    return counts, table


# NOTE(review): these are column sums, so they equal the number of mutated cell
# lines only if every entry is 0 or 1 — confirm the value encoding of the matrices.
hotspot_counts, hotspot_df = _gene_count_table(hotspot_neuroblastoma)
damaging_counts, damaging_df = _gene_count_table(damaging_neuroblastoma)


In [157]:
# Per-gene hotspot totals, largest first (hotspot_df is already sorted by 'Count').
print(hotspot_df)


       Gene  Count
4      TP53   22.0
7       ALK    8.0
9      TERT    7.0
0      NRAS    2.0
3       MAX    2.0
6   SMARCA4    2.0
1     GATA3    1.0
2      KRAS    1.0
5       NF1    1.0
8    PIK3CA    1.0
10    HLA-A    1.0
11     RAC1    1.0


 ALK --> Jonathan PhD

TP53 --> tumor suppressor??

In [162]:
# Table size, followed by the 25 genes with the largest damaging-mutation totals.
print(damaging_df.shape, damaging_df.head(25), sep="\n")

(921, 2)
        Gene  Count
318     TP53   24.0
329      NF1    7.0
15    ARID1A    4.0
885     WWC3    3.0
72   RPS6KC1    3.0
167     MMP1    3.0
858    FANCC    3.0
308   PIEZO1    3.0
220   ZNF664    3.0
189     PRB3    3.0
760   DNAH11    3.0
258    TYRO3    3.0
25     IL23R    3.0
558    CRTAP    2.0
99      NRG3    2.0
336   CAVIN1    2.0
562  SLC6A20    2.0
566   IMPDH2    2.0
200   FIGNL2    2.0
330   SLFN11    2.0
920   TPM3P2    2.0
327    USP22    2.0
552     ARSA    2.0
404    FCGBP    2.0
323    MYH13    2.0


In [159]:
# Look up CDK4 in the damaging-mutation table: case-insensitive substring match
# on 'Gene', with missing gene names treated as non-matching. An empty result
# means the gene has no column left after the never-mutated filter.
damaging_df.loc[lambda table: table['Gene'].str.contains('CDK4', case=False, na=False)]


Unnamed: 0,Gene,Count


ATR, PARP, 

# Count cells with 0 mutations

In [160]:
# Remove the 'Unnamed: 0' identifier column so that only gene columns are summed.
# Each frame is handled independently, so a column that is missing from one
# frame can neither raise a KeyError nor leave the other frame untouched;
# errors='ignore' makes re-running this cell a no-op.
hotspot_neuroblastoma = hotspot_neuroblastoma.drop(columns='Unnamed: 0', errors='ignore')
damaging_neuroblastoma = damaging_neuroblastoma.drop(columns='Unnamed: 0', errors='ignore')

# Per-cell-line total over the hotspot gene columns.
row_sums = hotspot_neuroblastoma.sum(axis=1)

# Number of cell lines whose hotspot total is 0, i.e. no hotspot mutation recorded.
zero_sum_count = (row_sums == 0).sum()
print(zero_sum_count)


12


In [161]:
# Per-cell-line total over the damaging-mutation columns (assumes the
# 'Unnamed: 0' identifier column has already been removed from the frame).
row_sums = damaging_neuroblastoma.sum(axis=1)

# Number of cell lines whose damaging total is 0.
zero_sum_count = row_sums.eq(0).sum()
print(zero_sum_count)


0
