In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import Dataset, DataLoader

# Custom Dataset for Medical Data
class MedicalDataset(Dataset):
    """Map-style dataset holding a feature tensor and a matching label tensor.

    Both inputs are converted to ``torch.FloatTensor`` once at construction;
    indexing afterwards only slices the stored tensors.
    """

    def __init__(self, features, labels):
        # Convert both arrays up front so per-item access does no conversion.
        self.features, self.labels = (
            torch.FloatTensor(data) for data in (features, labels)
        )

    def __len__(self):
        # Number of samples is taken from the feature tensor.
        return len(self.features)

    def __getitem__(self, idx):
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

# Data Preprocessing Function
# (source column, points per category, points used for unmapped/missing values)
# The per-factor maxima sum to 100, which defines the 0-100 severity scale.
_SEVERITY_FACTORS = (
    ('diagnoses.tumor_grade',
     {'G1': 5, 'G2': 15, 'G3': 25, 'G4': 30, 'GX': 15}, 15),
    ('demographic.vital_status',
     {'Alive': 0, 'Dead': 25}, 12),
    ('diagnoses.metastasis_at_diagnosis',
     {'Yes': 20, 'No': 0, 'yes': 20, 'no': 0}, 5),
    ('diagnoses.prior_malignancy',
     {'Yes': 15, 'No': 0, 'yes': 15, 'no': 0}, 5),
    ('diagnoses.last_known_disease_status',
     {'Tumor free': 0, 'With tumor': 10, 'not reported': 5}, 5),
)

# Source column name -> short feature name used after selection.
_FEATURE_MAPPING = {
    'demographic.gender': 'gender',
    'demographic.race': 'race',
    'demographic.ethnicity': 'ethnicity',
    'diagnoses.age_at_diagnosis': 'age_at_diagnosis',
    'demographic.vital_status': 'vital_status',
    'diagnoses.tumor_grade': 'tumor_grade',
    'diagnoses.morphology': 'morphology',
    'diagnoses.site_of_resection_or_biopsy': 'site_of_biopsy',
    'diagnoses.laterality': 'laterality',
    'diagnoses.prior_malignancy': 'prior_malignancy',
    'diagnoses.prior_treatment': 'prior_treatment',
    'diagnoses.synchronous_malignancy': 'another_malignancy',
    'exposures.alcohol_history': 'alcohol_history',
    'exposures.alcohol_intensity': 'alcohol_intensity',
    'exposures.cigarettes_per_day': 'tobacco_frequency',
    'exposures.tobacco_smoking_onset_year': 'tobacco_onset',
    'diagnoses.metastasis_at_diagnosis': 'metastasis',
    'diagnoses.last_known_disease_status': 'disease_status',
    'diagnoses.progression_or_recurrence': 'progression',
    'demographic.days_to_death': 'days_to_death',
    'follow_ups.karnofsky_performance_status': 'karnofsky_score',
    'diagnoses.who_cns_grade': 'who_grade',
}

# Short feature names treated as numeric; every other selected feature is
# handled as categorical.
_NUMERICAL_FEATURES = (
    'age_at_diagnosis', 'alcohol_intensity', 'tobacco_frequency',
    'tobacco_onset', 'days_to_death', 'karnofsky_score',
)


def _derive_severity_level(df):
    """Build a 0-100 severity Series (indexed like ``df``) from the factor columns."""
    print("\nCreating severity_level from existing features...")
    severity = np.zeros(len(df))

    for column, points, fallback in _SEVERITY_FACTORS:
        if column not in df.columns:
            continue
        # Positional (not index-aligned) addition: one score per row of df.
        scores = df[column].map(points).fillna(fallback)
        severity += scores.to_numpy(dtype=float)

    target = pd.Series(np.clip(severity, 0, 100), index=df.index,
                       name='severity_level')
    print(f"Severity level created. Range: {target.min():.1f} - {target.max():.1f}")
    print(f"Mean severity: {target.mean():.1f}")
    return target


def preprocess_medical_data(df, create_severity=True):
    """
    Preprocesses NIH Glioblastoma medical data with categorical encoding and normalization

    Steps: obtain the ``severity_level`` target (existing column, or derived
    from the factor columns when ``create_severity`` is true), select and
    rename the known feature columns, drop rows with a missing target,
    median-impute numeric features, label-encode categorical features,
    standardise numeric features that have non-zero variance, and scale the
    target from 0-100 down to 0-1.

    The input frame is not modified; a derived target is kept local to this
    function.

    NOTE(review): the columns used to derive the severity score (tumor grade,
    vital status, metastasis, prior malignancy, disease status) are also
    selected as model inputs, so a derived target is a function of the
    features. Confirm this is intended.

    Args:
        df: DataFrame with the dotted source column names listed in
            ``_FEATURE_MAPPING``. Columns that are absent are skipped.
        create_severity: When true and ``df`` has no ``severity_level``
            column, derive one from ``_SEVERITY_FACTORS``.

    Returns:
        Tuple ``(X, y, label_encoders, scaler)``:
        ``X`` is a float32 array of shape (rows, features); ``y`` is an array
        of shape (rows, 1) holding the target divided by 100;
        ``label_encoders`` maps each categorical feature name to its fitted
        ``LabelEncoder``; ``scaler`` is the ``StandardScaler`` (fitted only if
        at least one numeric column had variance) or ``None`` when there are
        no numeric features.

    Raises:
        KeyError: If ``severity_level`` is missing and ``create_severity`` is
            false.
    """
    if 'severity_level' in df.columns:
        target = df['severity_level']
    elif create_severity:
        target = _derive_severity_level(df)
    else:
        raise KeyError(
            "'severity_level' column not found and create_severity=False"
        )

    # Keep only the mapped columns that are actually present, in mapping order.
    present = {orig: new for orig, new in _FEATURE_MAPPING.items()
               if orig in df.columns}
    if not present:
        print("Warning: none of the expected feature columns were found; "
              "the feature matrix will have no columns.")

    # Rows without a target cannot be used for supervised training.
    keep = target.notna().to_numpy()
    X_processed = df[list(present)].rename(columns=present).loc[keep].copy()
    y = target.to_numpy(dtype=float)[keep].reshape(-1, 1)

    numerical_cols = [col for col in _NUMERICAL_FEATURES
                      if col in X_processed.columns]
    categorical_cols = [col for col in X_processed.columns
                        if col not in numerical_cols]

    print(f"\nNumerical features: {numerical_cols}")
    print(f"Categorical features: {categorical_cols}")

    # Numeric columns: coerce unparsable entries to NaN, then median-impute.
    for col in numerical_cols:
        values = pd.to_numeric(X_processed[col], errors='coerce').astype(float)
        if values.notna().any():
            X_processed[col] = values.fillna(values.median())
        else:
            # No usable value at all: fall back to a constant column.
            X_processed[col] = 0
            print(f"Warning: Column '{col}' has all missing values, filled with 0")

    # Categorical columns: explicit 'Unknown' level, then integer codes.
    label_encoders = {}
    for col in categorical_cols:
        as_text = X_processed[col].fillna('Unknown').astype(str)
        encoder = LabelEncoder()
        X_processed[col] = encoder.fit_transform(as_text)
        label_encoders[col] = encoder

    # Standardise numeric columns; constant columns are left untouched
    # because their standard deviation is zero.
    scaler = StandardScaler() if numerical_cols else None
    cols_to_scale = []
    for col in numerical_cols:
        spread = X_processed[col].std()
        if spread > 0:
            cols_to_scale.append(col)
        elif spread == 0:
            print(f"Warning: Column '{col}' has zero variance, skipping normalization")
    if cols_to_scale:
        X_processed[cols_to_scale] = scaler.fit_transform(X_processed[cols_to_scale])

    X_final = X_processed.to_numpy().astype(np.float32)

    # Last line of defence so no NaN/Inf reaches the network.
    if not np.isfinite(X_final).all():
        print("Warning: Found NaN or Inf values in processed data. Replacing with 0.")
        X_final = np.nan_to_num(X_final, nan=0.0, posinf=0.0, neginf=0.0)

    # Normalize target (0-100 to 0-1)
    y_normalized = y / 100.0

    print(f"\nFinal processed shape: {X_final.shape}")
    if y_normalized.size:
        # min()/max() raise on an empty array, so only report when rows remain.
        print(f"Target range: {y_normalized.min():.2f} - {y_normalized.max():.2f}")

    return X_final, y_normalized, label_encoders, scaler

# Feedforward Neural Network Model
class MedicalFeedforwardNN(nn.Module):
    """Three hidden stages followed by a sigmoid-activated single output.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout, with
    dropout probabilities 0.3, 0.3 and 0.2. The output lies in (0, 1) and is
    rescaled to 0-100 by the caller.
    """

    def __init__(self, input_dim, hidden_dim1=128, hidden_dim2=64, hidden_dim3=32):
        super().__init__()

        widths = (input_dim, hidden_dim1, hidden_dim2, hidden_dim3)
        drop_probs = (0.3, 0.3, 0.2)

        # Register fcN / bnN / reluN / dropoutN for N = 1..3, in that order,
        # so parameter names and creation order match the explicit layout.
        for stage, prob in enumerate(drop_probs, start=1):
            fan_in, fan_out = widths[stage - 1], widths[stage]
            setattr(self, f"fc{stage}", nn.Linear(fan_in, fan_out))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(fan_out))
            setattr(self, f"relu{stage}", nn.ReLU())
            setattr(self, f"dropout{stage}", nn.Dropout(prob))

        # Output head: last hidden width -> one severity score in (0, 1).
        self.fc4 = nn.Linear(hidden_dim3, 1)
        self.sigmoid = nn.Sigmoid()

        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-uniform weights and zero biases for every Linear layer."""
        linear_layers = (mod for mod in self.modules() if isinstance(mod, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        hidden = x
        for stage in (1, 2, 3):
            hidden = getattr(self, f"fc{stage}")(hidden)
            hidden = getattr(self, f"bn{stage}")(hidden)
            hidden = getattr(self, f"relu{stage}")(hidden)
            hidden = getattr(self, f"dropout{stage}")(hidden)

        # Squash the single logit into (0, 1).
        return self.sigmoid(self.fc4(hidden))

# Training Function
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=50):
    """Train ``model`` in place and return the per-epoch average loss.

    For every batch: move tensors to ``device``, run the forward pass,
    back-propagate, clip the global gradient norm to 1.0 and step the
    optimizer. Batches whose loss is NaN are skipped and excluded from the
    epoch average. Progress is printed every 10th epoch.

    Args:
        model: Module to optimise; switched to training mode.
        train_loader: Iterable yielding ``(features, labels)`` tensor pairs.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with one float per epoch: the mean loss over the batches that
        were actually used, or ``inf`` when no batch contributed.
    """
    loss_history = []
    model.train()

    for epoch in range(num_epochs):
        running_loss = 0.0
        used_batches = 0

        for features, labels in train_loader:
            features = features.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            outputs = model(features)
            loss = criterion(outputs, labels)

            # A NaN loss would poison the weights on step(); drop the batch.
            if torch.isnan(loss):
                print(f"NaN loss detected at epoch {epoch+1}, skipping batch")
                continue

            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            running_loss += loss.item()
            used_batches += 1

        avg_loss = running_loss / used_batches if used_batches > 0 else float('inf')
        loss_history.append(avg_loss)

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

    return loss_history

# Evaluation Function
def evaluate_model(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and report MAE / RMSE on the 0-100 scale.

    Model outputs and labels are both multiplied by 100 before the metrics
    are computed. Returns ``(predictions, actuals)`` as flat NumPy arrays.
    """
    model.eval()
    pred_chunks = []
    true_chunks = []

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            scores = model(batch_x.to(device))

            # Scale back to 0-100, one flat array per batch.
            pred_chunks.append(scores.cpu().numpy().ravel() * 100)
            true_chunks.append(batch_y.cpu().numpy().ravel() * 100)

    predictions = np.concatenate(pred_chunks) if pred_chunks else np.array([])
    actuals = np.concatenate(true_chunks) if true_chunks else np.array([])

    residuals = predictions - actuals
    mae = np.abs(residuals).mean()
    rmse = np.sqrt(np.square(residuals).mean())

    print(f'\nEvaluation Results:')
    print(f'Mean Absolute Error: {mae:.2f}')
    print(f'Root Mean Squared Error: {rmse:.2f}')

    return predictions, actuals

# Example Usage
if __name__ == "__main__":
    # Path to the NIH Glioblastoma CSV export; point this at your local copy.
    csv_file_path = 'Downloads/NIH Glioblastoma data.csv'

    try:
        # Read CSV file with low_memory=False to handle mixed types
        print(f"Loading data from {csv_file_path}...")
        df = pd.read_csv(csv_file_path, low_memory=False)

        print(f"Data loaded successfully! Shape: {df.shape}")
        print(f"\nSample columns: {list(df.columns[:20])}")
        print(f"\nFirst few rows preview:")
        print(df.iloc[:3, :10])

        # The preprocessing function will create severity_level if needed
        print("\nPreprocessing data...")

    except FileNotFoundError:
        print(f"File '{csv_file_path}' not found. Creating synthetic data for demonstration...")
        # Synthetic fallback. The keys use the dotted source column names that
        # preprocess_medical_data selects on; with the short post-rename names
        # no feature column would match and the model would get zero inputs.
        # The draw order is unchanged so seeded values stay the same.
        np.random.seed(42)
        n_samples = 1000

        sample_data = {
            'demographic.gender': np.random.choice(['Male', 'Female'], n_samples),
            'demographic.race': np.random.choice(['White', 'Black', 'Asian', 'Other'], n_samples),
            'demographic.ethnicity': np.random.choice(['Hispanic', 'Non-Hispanic'], n_samples),
            'diagnoses.age_at_diagnosis': np.random.randint(20, 90, n_samples),
            'demographic.vital_status': np.random.choice(['Alive', 'Dead', 'Unknown'], n_samples),
            'diagnoses.tumor_grade': np.random.choice(['G1', 'G2', 'G3', 'G4'], n_samples),
            'diagnoses.morphology': np.random.choice(['8000/3', '8000/6', '8010/3'], n_samples),
            'diagnoses.site_of_resection_or_biopsy': np.random.choice(['Lung', 'Breast', 'Colon', 'Prostate'], n_samples),
            'diagnoses.laterality': np.random.choice(['Left', 'Right', 'Bilateral', 'None'], n_samples),
            'diagnoses.prior_malignancy': np.random.choice(['Yes', 'No'], n_samples),
            'diagnoses.prior_treatment': np.random.choice(['Yes', 'No'], n_samples),
            'diagnoses.synchronous_malignancy': np.random.choice(['Yes', 'No'], n_samples),
            'exposures.alcohol_history': np.random.choice(['Yes', 'No', 'Former'], n_samples),
            'exposures.alcohol_intensity': np.random.randint(0, 10, n_samples),
            'exposures.cigarettes_per_day': np.random.randint(0, 40, n_samples),
            'exposures.tobacco_smoking_onset_year': np.random.randint(10, 50, n_samples),
            # The remaining columns are not part of the feature mapping and are
            # ignored by preprocessing; they are kept to preserve the RNG stream.
            'menopause_status': np.random.choice(['Pre', 'Post', 'N/A'], n_samples),
            'allergies': np.random.choice(['Yes', 'No'], n_samples),
            'pregnancy_outcome': np.random.choice(['Live Birth', 'None', 'Other'], n_samples),
            'num_pregnancies': np.random.randint(0, 6, n_samples),
            'tumor_descriptors': np.random.choice(['Metastatic', 'Premalignant', 'Localized'], n_samples),
            'severity_level': np.random.randint(0, 101, n_samples)  # Target: 0-100
        }

        df = pd.DataFrame(sample_data)

    # Preprocess data
    X, y, encoders, scaler = preprocess_medical_data(df, create_severity=True)

    print(f"\nProcessed data shape: {X.shape}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of samples: {X.shape[0]}")

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Create datasets and loaders
    train_dataset = MedicalDataset(X_train, y_train)
    test_dataset = MedicalDataset(X_test, y_test)

    batch_size = 32
    # BatchNorm1d cannot compute batch statistics from a single sample in
    # training mode, so drop the trailing batch only when it would hold
    # exactly one row.
    drop_singleton_tail = len(train_dataset) % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_singleton_tail)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    input_dim = X.shape[1]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = MedicalFeedforwardNN(input_dim).to(device)

    # Loss and optimizer
    criterion = nn.MSELoss()  # Mean Squared Error for regression
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Train model
    print("Training Medical Cancer Severity Prediction Model...")
    train_model(model, train_loader, criterion, optimizer, device, num_epochs=100)

    # Evaluate model
    predictions, actuals = evaluate_model(model, test_loader, device)

    # Save only the weights; loading requires re-creating the model with the
    # same input_dim.
    torch.save(model.state_dict(), 'medical_cancer_severity_model.pth')
    print("\nModel saved as 'medical_cancer_severity_model.pth'")
 

Loading data from Downloads/NIH Glioblastoma data.csv...
Data loaded successfully! Shape: (21634, 164)

Sample columns: ['primary_site', 'disease_type', 'updated_datetime', 'case_id', 'submitter_id', 'index_date', 'state', 'consent_type', 'project.project_id', 'demographic.ethnicity', 'demographic.gender', 'demographic.race', 'demographic.vital_status', 'demographic.age_at_index', 'demographic.submitter_id', 'demographic.days_to_birth', 'demographic.demographic_id', 'demographic.age_is_obfuscated', 'demographic.updated_datetime', 'demographic.days_to_death']

First few rows preview:
  primary_site disease_type                  updated_datetime  \
0        Brain      Gliomas  2025-01-05T15:37:46.919964-06:00   
1        Brain      Gliomas  2025-01-05T15:37:46.919964-06:00   
2        Brain      Gliomas  2025-01-05T15:37:46.919964-06:00   

                                case_id  submitter_id index_date     state  \
0  0078b0c4-68a9-483b-9aab-61156d263213  TCGA-14-1034  Diagnosis  relea