In [1]:
import os
import pandas as pd
import numpy as np

# Paths to data.
# The defaults are the original author's local Windows paths. Set the
# CSV_LABELS_DIR / TOP_BINS_PATH environment variables to run the notebook on
# another machine without editing this cell.
data_folder = os.environ.get("CSV_LABELS_DIR", r"C:\Users\sejac\Desktop\csv_labels")
top_bins_path = os.environ.get("TOP_BINS_PATH", r"C:\Users\sejac\Desktop\top_1000_bins_u_test.csv")

# Function to calculate z-scores with special handling for zero entries
def calculate_z_scores(data):
    """
    Z-score the non-zero entries of `data`, leaving zero entries at zero.

    The mean and the population standard deviation (NumPy default, ddof=0)
    are computed only from entries that are neither zero nor NaN, so zeros
    and missing values do not distort the statistics.

    Args:
        data: Array-like of numeric values (list, NumPy array, ...).
            The input is never modified.

    Returns:
        A new float64 NumPy array with the same shape as `data` where
          * zero entries stay 0.0,
          * NaN entries stay NaN,
          * every other entry becomes (value - mean) / std.
        If there are no usable entries, or all usable entries are identical
        (std == 0), every non-NaN entry is 0.0.
    """
    values = np.asarray(data, dtype=float)
    is_missing = np.isnan(values)
    usable = (values != 0) & ~is_missing

    # Start from zeros: zero entries and both degenerate cases below are then
    # already correct, and the result is always float (never an int array).
    z_scores = np.zeros_like(values)
    z_scores[is_missing] = np.nan

    # No non-zero values at all: bail out before taking the mean of an empty
    # slice (which would emit a RuntimeWarning and yield NaN statistics).
    if not usable.any():
        return z_scores

    usable_values = values[usable]
    mean = usable_values.mean()
    std = usable_values.std()

    # Avoid division by zero in case all non-zero values are the same
    if std == 0:
        return z_scores

    z_scores[usable] = (usable_values - mean) / std
    return z_scores

# Load the bin IDs to keep (the 'Bin_ID' column of the top-bins CSV)
top_bins = pd.read_csv(top_bins_path)['Bin_ID']

# One single-row DataFrame per patient is collected here and concatenated once
all_patients_data = []

# Loop through all patient files.
# os.listdir() returns entries in arbitrary, platform-dependent order; sorting
# makes the row order of the final table (and therefore any seeded split made
# from it later) reproducible across machines and runs.
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".csv"):
        continue

    # Determine cancer label from filename
    if "_0_" in filename:
        label = 0  # Non-cancer
    elif "_1_" in filename:
        label = 1  # Cancer
    else:
        raise ValueError(f"Unexpected filename format: {filename}")

    # Load patient data
    file_path = os.path.join(data_folder, filename)
    patient_data = pd.read_csv(file_path)

    # Create unique Bin_ID (Chromosome + Start)
    patient_data['Bin_ID'] = patient_data['Chromosome'] + ":" + patient_data['Start'].astype(str)

    # Filter for top bins. The explicit .copy() gives an independent frame so
    # the column assignment below does not write into a slice of the original
    # (which triggers pandas' SettingWithCopyWarning).
    patient_data = patient_data[patient_data['Bin_ID'].isin(top_bins)].copy()

    # Ensure consistent data length (after filtering for top bins)
    if len(patient_data) != len(top_bins):
        print(f"Inconsistent data length for {filename}. Skipping.")
        continue

    # Normalize Percent_Methylation using z-scores (per patient, across bins)
    patient_data['Percent_Methylation'] = calculate_z_scores(
        patient_data['Percent_Methylation'].values
    )

    # Reduce to a Series indexed by Bin_ID, then turn it into a single row
    # named after the file so that pd.concat can align columns by Bin_ID.
    patient_row = patient_data.set_index('Bin_ID')['Percent_Methylation']
    patient_row = patient_row.to_frame(name=filename).T
    patient_row['Label'] = label  # Add label column

    all_patients_data.append(patient_row)

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not all_patients_data:
    raise ValueError(f"No usable patient CSV files found in {data_folder}")

# Combine all patient data into a single DataFrame (one row per patient)
final_data = pd.concat(all_patients_data).reset_index(drop=True)

# Separate labels from features
labels = final_data['Label']
features = final_data.drop('Label', axis=1)

# Fill missing values with the column mean (reassignment instead of inplace).
# NOTE(review): the means are computed over all patients, i.e. before any
# train/test split - consider computing them on the training split only.
features = features.fillna(features.mean())

print("Data normalization and preparation completed.")
print(f"Features shape: {features.shape}, Labels shape: {labels.shape}")


Data normalization and preparation completed.
Features shape: (517, 1000), Labels shape: (517,)


In [2]:
# Sanity-check the prepared data before it is handed to the model.
# These expectations used to live only in comments; asserting them makes a
# broken preparation step fail here instead of somewhere inside training.
assert features.shape[0] == labels.shape[0], "features and labels must describe the same patients"
assert set(labels.unique()) <= {0, 1}, "labels must be binary (0 = non-cancer, 1 = cancer)"

print(f"Features shape: {features.shape}")  # (num_patients, num_bins)
print(f"Labels shape: {labels.shape}")      # (num_patients,)
# Start the Series on its own line so its first row is not glued to the caption
print(f"Sample labels:\n{labels.head()}")

Features shape: (517, 1000)
Labels shape: (517,)
Sample labels: 0    0
1    0
2    0
3    1
4    0
Name: Label, dtype: int64


In [3]:
import torch
import torch.nn as nn

class SparseConnectionLayer(nn.Module):
    """
    Layer in which every output neuron reads a fixed, small window of inputs.

    Output neuron `i` is connected to `connections_per_output` consecutive
    input neurons starting at `(i * connections_per_output) % input_size`,
    wrapping around the end of the input. Each output has its own weight
    vector over its window plus a scalar bias.
    """

    def __init__(self, input_size, output_size, connections_per_output):
        """
        Custom sparse layer with localized sparse connections.
        Args:
            input_size: Total number of neurons in the input layer.
            output_size: Total number of neurons in the output layer.
            connections_per_output: Number of input neurons connected to each output neuron.
        """
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.connections_per_output = connections_per_output

        # Predefine sparse connectivity (kept as a plain list of lists so that
        # existing code inspecting `layer.connections` keeps working)
        self.connections = self._generate_connections()

        # The same pattern as an index tensor, used by the vectorised forward
        # pass. Registered as a *non-persistent* buffer: it follows
        # `.to(device)` with the module but is left out of `state_dict()`, so
        # checkpoints keep exactly the keys "weights" and "bias".
        self.register_buffer(
            "_connection_index",
            torch.tensor(self.connections, dtype=torch.long).reshape(
                output_size, connections_per_output
            ),
            persistent=False,
        )

        # Parameters: weights and biases for each output neuron
        self.weights = nn.Parameter(torch.randn(output_size, connections_per_output))
        self.bias = nn.Parameter(torch.zeros(output_size))

    def _generate_connections(self):
        """
        Generate a fixed sparse connection pattern.
        Returns:
            A list of input neuron indices for each output neuron.
        """
        connections = []
        for i in range(self.output_size):
            # First input of this neuron's window; windows tile the input and
            # wrap around once they run past the last input neuron.
            start_idx = (i * self.connections_per_output) % self.input_size
            connections.append(
                [(start_idx + j) % self.input_size for j in range(self.connections_per_output)]
            )
        return connections

    def forward(self, x):
        """
        Forward pass with sparse connections.
        Args:
            x: Input tensor of shape [batch_size, input_size]. Any number of
               leading dimensions (including none) is accepted; the last
               dimension is the input-neuron axis.
        Returns:
            Tensor of shape [batch_size, output_size] (leading dimensions of
            `x` are preserved).
        """
        # Gather every output neuron's input window in one indexing operation
        # instead of looping over the output neurons in Python.
        # Shape: [..., output_size, connections_per_output]
        gathered = x[..., self._connection_index]

        # Per-neuron weighted sum over its window, plus bias.
        # Equal to the per-neuron matmul up to floating-point rounding.
        return (gathered * self.weights).sum(dim=-1) + self.bias


class CancerPredictionModelSparse(nn.Module):
    """
    Binary classifier: four sparse layers followed by three dense layers.

    Layer widths are input_size -> 300 -> 100 -> 50 -> 20 (sparse) and then
    20 -> 10 -> 5 -> 1 (fully connected). ReLU follows every layer except the
    last, whose output goes through a sigmoid.
    """

    def __init__(self, input_size=1000, connections_per_output=3):
        """
        Args:
            input_size: Number of input features (bins). Defaults to 1000,
                the width the model was originally hard-coded for.
            connections_per_output: Number of inputs each sparse output
                neuron reads. Defaults to 3, the original hard-coded value.
        """
        super().__init__()
        # Sparse layers. Attribute names and construction order are kept as
        # they were, so state_dict keys and seeded weight initialisation are
        # unchanged when the defaults are used.
        self.hidden1 = SparseConnectionLayer(
            input_size=input_size, output_size=300, connections_per_output=connections_per_output
        )
        self.hidden2 = SparseConnectionLayer(
            input_size=300, output_size=100, connections_per_output=connections_per_output
        )
        self.hidden3 = SparseConnectionLayer(
            input_size=100, output_size=50, connections_per_output=connections_per_output
        )
        self.hidden4 = SparseConnectionLayer(
            input_size=50, output_size=20, connections_per_output=connections_per_output
        )

        # Fully connected layers
        self.fc1 = nn.Linear(20, 10)  # Fully connected layer (20 -> 10)
        self.fc2 = nn.Linear(10, 5)   # Fully connected layer (10 -> 5)
        self.fc3 = nn.Linear(5, 1)    # Fully connected layer (5 -> 1)

        # Activation functions
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        Args:
            x: Input tensor of shape [batch_size, input_size].
        Returns:
            Tensor of shape [batch_size, 1] holding the sigmoid output.
        """
        # Every layer except the last is followed by ReLU
        relu_layers = (
            self.hidden1, self.hidden2, self.hidden3, self.hidden4,
            self.fc1, self.fc2,
        )
        for layer in relu_layers:
            x = self.relu(layer(x))

        # Final sigmoid activation for binary classification
        return self.sigmoid(self.fc3(x))


In [5]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# Split / loader settings, named so they are easy to find and change
TEST_FRACTION = 0.2   # share of all patients held out for testing
VAL_FRACTION = 0.2    # share of the remaining patients used for validation
SPLIT_SEED = 42
BATCH_SIZE = 32

# Convert features and labels to NumPy
X = features.values  # Shape: (num_patients, num_bins)
y = labels.values    # Shape: (num_patients,)

# Split data into training, validation, and test sets.
# stratify keeps the cancer / non-cancer ratio the same in every subset, which
# matters on a dataset this small; the subset sizes are unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=VAL_FRACTION, random_state=SPLIT_SEED, stratify=y_train
)

# Convert to PyTorch tensors (labels as float, as required by BCE loss).
# NOTE(review): the label tensors are 1-D ([N]) while
# CancerPredictionModelSparse returns [N, 1]; the training loop (not visible
# here) presumably reshapes one of them before computing the loss - confirm.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training data is shuffled; the seeded generator makes the batch
# order reproducible when this cell is re-run.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Training data size: {len(train_loader.dataset)}")
print(f"Validation data size: {len(val_loader.dataset)}")
print(f"Test data size: {len(test_loader.dataset)}")


Training data size: 330
Validation data size: 83
Test data size: 104
