# Deep Learning Analysis for Non-Small Cell Lung Cancer

In [2]:
import lightning as L
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchvision import datasets

In [32]:
# Build a synthetic (fake) cancer cohort to prototype the pipeline on.
np.random.seed(42)  # pin the global NumPy RNG so the table is repeatable

num_samples = 2621

# Each column is drawn on its own line and in this exact order: every call
# advances the shared global RNG, so the order fixes the generated values.
patient_ids = [f"PID_{i}" for i in range(1, num_samples + 1)]
mutation_status = np.random.choice([0, 1], size=num_samples)
copy_number_variant = np.random.randint(0, 10, size=num_samples)
structural_variant = np.random.randint(0, 5, size=num_samples)
smoking_status = np.random.choice([1, 0], size=num_samples)
prior_treatment = np.random.choice([1, 0], size=num_samples)
overall_survival_months = np.random.uniform(6, 60, size=num_samples).round(2)

data = {
    "PatientID": patient_ids,
    "MutationStatus": mutation_status,
    "CopyNumberVariant": copy_number_variant,
    "StructuralVariant": structural_variant,
    "SmokingStatus": smoking_status,
    "PriorTreatment": prior_treatment,
    "OverallSurvivalMonths": overall_survival_months,
}

# PatientID becomes the row index; the survival column stays last.
cancer_df = pd.DataFrame(data).set_index("PatientID")
print(cancer_df.head())

           MutationStatus  CopyNumberVariant  StructuralVariant  \
PatientID                                                         
PID_1                   0                  3                  3   
PID_2                   1                  3                  0   
PID_3                   0                  5                  3   
PID_4                   0                  9                  0   
PID_5                   0                  5                  4   

           SmokingStatus  PriorTreatment  OverallSurvivalMonths  
PatientID                                                        
PID_1                  0               1                  34.09  
PID_2                  1               0                  36.62  
PID_3                  0               1                  36.30  
PID_4                  1               0                  31.67  
PID_5                  1               0                  48.69  


In [33]:
# Number of rows in the synthetic cohort table.
cancer_df.shape[0]

2621

In [34]:
from sklearn.preprocessing import OneHotEncoder


class CancerDataset(torch.utils.data.Dataset):
    """Tensor-backed dataset that splits a DataFrame into features and target.

    All columns except the target become the feature matrix ``x``; the target
    column becomes ``y`` with a trailing singleton dimension. Both are stored
    as ``float32`` tensors.

    Args:
        df: DataFrame whose columns are all convertible to ``float32``.
        target_column: Name of the column to predict. When ``None`` (the
            default) the last column of ``df`` is used.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
        IndexError: If ``df`` has no columns and no target is given.
    """

    def __init__(self, df: pd.DataFrame, target_column: "str | None" = None):
        if target_column is None:
            # Backward-compatible default: the last column is the target.
            target_column = df.columns[-1]

        features = df.drop(columns=[target_column])
        self.x = torch.tensor(features.to_numpy(), dtype=torch.float32)
        # unsqueeze(1) turns the 1-D target vector into a column, (n,) -> (n, 1).
        self.y = torch.tensor(
            df[target_column].to_numpy(), dtype=torch.float32
        ).unsqueeze(1)

    def __len__(self) -> int:
        """Return the number of samples (rows of the source DataFrame)."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at ``idx``."""
        return self.x[idx], self.y[idx]

In [36]:
# Wrap the synthetic table in the torch Dataset defined above.
cancer_ds = CancerDataset(cancer_df)
# Show the first (features, target) pair as a quick sanity check.
cancer_ds[0]

(tensor([0., 3., 3., 0., 1.]), tensor([34.0900]))

In [44]:
class CancerDataModule(L.LightningDataModule):
    """Lightning data module that splits one dataset into train/val/test.

    The dataset is split 60/40 into train and "rest", and the rest is split
    50/50 into validation and test (60/20/20 overall).

    Args:
        dataset: Any sized, indexable dataset accepted by ``random_split``.
        batch_size: Batch size used by all three dataloaders.
        seed: Optional integer seed for the split. ``None`` (the default)
            keeps using torch's global RNG.
    """

    def __init__(self, dataset, batch_size=32, seed=None):
        # The base class sets up internal state that Lightning relies on, so
        # it must be initialised before the module is used.
        super().__init__()
        self.dataset = dataset
        self.batch_size = batch_size
        self.seed = seed
        # Populated by setup(); None means "not split yet".
        self.ds_train = None
        self.ds_rest = None
        self.ds_val = None
        self.ds_test = None

    def setup(self, stage=None):
        """Create the train/val/test subsets once.

        Lightning calls ``setup`` once per stage. Re-running a random split
        on each call would move samples between train and test, so later
        calls are no-ops.
        """
        if self.ds_train is not None:
            return

        split_kwargs = {}
        if self.seed is not None:
            split_kwargs["generator"] = torch.Generator().manual_seed(self.seed)

        self.ds_train, self.ds_rest = random_split(
            self.dataset, [0.6, 0.4], **split_kwargs
        )
        self.ds_val, self.ds_test = random_split(
            self.ds_rest, [0.5, 0.5], **split_kwargs
        )

    def train_dataloader(self):
        """Return the shuffled training dataloader."""
        return DataLoader(self.ds_train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        """Return the validation dataloader (no shuffling)."""
        return DataLoader(self.ds_val, batch_size=self.batch_size)

    def test_dataloader(self):
        """Return the test dataloader (no shuffling)."""
        return DataLoader(self.ds_test, batch_size=self.batch_size)


In [51]:
# Build the data module around the full dataset.
cancer_dm = CancerDataModule(cancer_ds)
# setup() is called by hand here so the resulting splits can be inspected
# directly in the notebook.
cancer_dm.setup()

In [52]:
# Sizes of the train / validation / test subsets, in that order.
tuple(
    len(subset)
    for subset in (cancer_dm.ds_train, cancer_dm.ds_val, cancer_dm.ds_test)
)

(1573, 524, 524)

In [76]:
# Load the tab-separated mutation table from the nsclc_tcga_broad_2016 study
# folder, skipping the first line of the file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids columns with mixed types.
# NOTE(review): the warning this cell emitted looks like pandas' mixed-dtype
# warning -- confirm it no longer appears.
mutation_data = pd.read_csv(
    "../../nsclc_tcga_broad_2016/data_mutations.txt",
    sep="\t",
    skiprows=1,
    low_memory=False,
)

  mutation_data = pd.read_csv("../../nsclc_tcga_broad_2016/data_mutations.txt", sep="\t", skiprows=1)


In [77]:
# Number of distinct tumor sample barcodes in the mutation table.
len(mutation_data["Tumor_Sample_Barcode"].unique())

1144

In [78]:
# Number of distinct gene symbols in the mutation table.
gene_symbols = mutation_data["Hugo_Symbol"].unique()
len(gene_symbols)

17959

In [54]:
# Peek at one training example to sanity-check the split.
first_item = cancer_dm.ds_train[0]
data = first_item[0]
label = first_item[1]
print(f"Data: {data}", f"Label: {label}", sep="\n")

Data: tensor([1., 3., 1., 1., 0.])
Label: tensor([40.8300])


In [None]:
class BaseDLRegressorModel(L.LightningModule):
    """Unfinished stub for a Lightning regression model.

    The constructor below accepts its arguments but does nothing with them
    yet.
    """

    def __init__(self, model, loss_fn, optimizer, lr=0.001):
        """Placeholder constructor; no argument is stored or used yet.

        Args:
            model: Presumably the network to train -- TODO confirm.
            loss_fn: Presumably the loss callable -- TODO confirm.
            optimizer: Presumably an optimizer class or factory -- TODO confirm.
            lr: Presumably the learning rate; defaults to 0.001.
        """
        # NOTE(review): super().__init__() is not called here. The base class
        # will most likely need it before any attribute or submodule is
        # assigned -- add it when this stub is implemented.
        pass