In [7]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Deep Learning Analysis

In [34]:
import json

import lightning as L
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from torchmetrics import MeanSquaredError
from torchvision import datasets

from utils.helper_classes import CancerDataModule, CancerDataset
from utils.models import CancerDataAutoEncoder, BaseCancerRegressor

In [17]:
# Pin NumPy's global RNG so every random draw in this cell is reproducible.
np.random.seed(42)

# Size of the simulated cohort: number of patients and genes per patient.
num_samples, num_genes = 2621, 1800

# Column labels for the mutation matrix: Gene_1 ... Gene_<num_genes>.
gene_cols = [f"Gene_{gene_no}" for gene_no in range(1, num_genes + 1)]

# Clinical covariates. The draws are evaluated top to bottom; that order matters
# because they all consume the same seeded global generator.
clinical_data = dict(
    PatientID=[f"PID_{patient_no}" for patient_no in range(1, num_samples + 1)],
    CopyNumberVariant=np.random.randint(low=0, high=10, size=num_samples),
    SmokingStatus=np.random.choice([1, 0], size=num_samples),
    OverallSurvivalMonths=np.round(
        np.random.uniform(low=6, high=60, size=num_samples), 2
    ),
)

# Binary gene-mutation matrix (0 = wildtype, 1 = mutated), drawn after the
# clinical columns; each entry is 1 with probability 0.03.
mutation_matrix = np.random.choice(
    [0, 1],
    size=(num_samples, num_genes),
    p=[0.97, 0.03],
)
mutation_df = pd.DataFrame(
    data=mutation_matrix,
    index=clinical_data["PatientID"],
    columns=gene_cols,
)

# Clinical columns first, gene columns after, aligned on the patient identifier.
cancer_df = pd.concat(
    [pd.DataFrame(clinical_data).set_index("PatientID"), mutation_df],
    axis="columns",
)

# Preview only the first five patients and first ten columns to keep output short.
print(cancer_df.head().iloc[:, :10])


       CopyNumberVariant  SmokingStatus  OverallSurvivalMonths  Gene_1  \
PID_1                  6              1                  30.92       0   
PID_2                  3              1                  59.10       0   
PID_3                  7              0                  48.48       0   
PID_4                  4              0                  47.78       0   
PID_5                  6              1                  46.38       0   

       Gene_2  Gene_3  Gene_4  Gene_5  Gene_6  Gene_7  
PID_1       0       0       0       0       0       0  
PID_2       0       0       0       0       0       0  
PID_3       0       0       0       0       0       0  
PID_4       0       0       0       1       0       0  
PID_5       0       0       0       0       0       0  


In [22]:
# Wrap the simulated cohort frame in the project's dataset class, pass that
# dataset to the project's data module, and call setup() eagerly so the data
# module can be inspected directly in this notebook (its `ds_train` attribute
# is read in a later cell to size the model summary).
# NOTE(review): what setup() builds (e.g. how train/val/test splits are made)
# is defined in utils.helper_classes and is not visible here — confirm.
cancer_ds = CancerDataset(cancer_df)
cancer_dm = CancerDataModule(cancer_ds)
cancer_dm.setup()

In [None]:
# Read the checkpoint path of the best autoencoder. Only the first line of the
# results file is used; surrounding whitespace and the trailing newline are
# stripped.
with open('results/02/results.txt', 'r') as file:
    best_ae_model_path = file.readline().strip()

# Fail fast with a clear message: an empty or blank first line would otherwise
# only surface later as an opaque error inside checkpoint loading.
if not best_ae_model_path:
    raise ValueError(
        "results/02/results.txt does not contain a checkpoint path on its first line"
    )

In [12]:
# Load the hyper-parameters of the best autoencoder trial from JSON.
# The file is opened explicitly as UTF-8 so decoding does not depend on the
# platform's default locale encoding.
with open('results/02/best_trial_params.json', 'r', encoding='utf-8') as json_file:
    best_model_params = json.load(json_file)

# Echo the parameters so the notebook records which configuration is used.
print(best_model_params)

{'latent_size': 129, 'n_layers': 5, 'dropout': 0.04892076156671582}


In [15]:
# Restore the autoencoder from the checkpoint path read earlier, forwarding the
# best-trial hyper-parameters (latent_size, n_layers, dropout) as keyword
# arguments to load_from_checkpoint.
# NOTE(review): presumably these kwargs reach the model constructor so the
# rebuilt architecture matches the saved weights — confirm against
# CancerDataAutoEncoder in utils.models.
cancer_autoencoder = CancerDataAutoEncoder.load_from_checkpoint(
    best_ae_model_path,
    **best_model_params
)

In [29]:
class SimpleNN(nn.Module):
    """Small feed-forward network: one hidden ReLU layer and a single output unit.

    Args:
        input_dim: Number of features in each input vector.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # fc1 is created before fc2, so seeded weight initialisation and the
        # state-dict key order are the same as they have always been.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 and return the result (last dimension of size 1)."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

# The network's input width is the autoencoder's latent size from the best
# trial; the hidden layer is fixed at 128 units.
head_input_dim = best_model_params['latent_size']
simple_nn = SimpleNN(input_dim=head_input_dim, hidden_dim=128)

In [30]:
from torchinfo import summary

# Take the model's input width from the first training sample: element 0 of
# that sample is measured with len().
# NOTE(review): presumably element 0 is the feature vector — confirm in CancerDataset.
sample_features = cancer_dm.ds_train[0][0]
input_dim = len(sample_features)

# Summarise a regressor built from the restored autoencoder and the small
# network, using a single-row dummy batch. Kept as the last expression so the
# notebook displays the table.
summary(BaseCancerRegressor(cancer_autoencoder, simple_nn), input_size=(1, input_dim))

Layer (type:depth-idx)                   Output Shape              Param #
BaseCancerRegressor                      [1, 1]                    --
├─CancerDataAutoEncoder: 1-1             --                        8,871,850
│    └─Sequential: 2-1                   [1, 129]                  --
│    │    └─Linear: 3-1                  [1, 1468]                 (2,646,804)
│    │    └─ReLU: 3-2                    [1, 1468]                 --
│    │    └─Dropout: 3-3                 [1, 1468]                 --
│    │    └─Linear: 3-4                  [1, 1134]                 (1,665,846)
│    │    └─ReLU: 3-5                    [1, 1134]                 --
│    │    └─Dropout: 3-6                 [1, 1134]                 --
│    │    └─Linear: 3-7                  [1, 800]                  (908,000)
│    │    └─ReLU: 3-8                    [1, 800]                  --
│    │    └─Dropout: 3-9                 [1, 800]                  --
│    │    └─Linear: 3-10                 [1, 466]    

In [37]:
from utils.helper_functions import create_classifier_trainer

# The helper returns three objects, bound here as the trainer, a CSV logger and
# a checkpoint object for the run named "cancer_regressor".
# NOTE(review): the helper is named "classifier" but is used for a regressor —
# confirm its monitored metric/mode suit a regression (MSE) objective.
trainer, regressor_csv_logger, regressor_checkpoint = create_classifier_trainer("cancer_regressor")
# Combine the restored autoencoder with the small network into the regressor.
cancer_regressor = BaseCancerRegressor(cancer_autoencoder, simple_nn)
# NOTE(review): a fresh CancerDataModule is built here instead of reusing
# `cancer_dm`; if its setup() splits the data randomly, the split trained on
# may differ from the one inspected earlier — confirm this is intended.
trainer.fit(
    cancer_regressor,
    datamodule=CancerDataModule(cancer_ds),
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name           | Type                  | Params | Mode 
-----------------------------------------------------------------
0 | auto_encoder   | CancerDataAutoEncoder | 14.5 M | train
1 | neural_network | SimpleNN              | 16.8 K | train
2 | loss_metric    | MeanSquaredError      | 0      | train
3 | val_metric     | MeanSquaredError      | 0      | train
4 | test_metric    | MeanSquaredError      | 0      | train
-----------------------------------------------------------------
16.8 K    Trainable params
14.5 M    Non-trainable params
14.6 M    Total params
58.245    Total estimated model params size (MB)
43        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]