In [None]:
# Side Notes/ideas
# Paients that have died due to the cancer can measure the severity through the grade of tumor, metastasis at diagnosis, prior malignancy, last known disease status, and time from diagnosis to death.
# In the current dataset there is a bias towards patients that are dead, in the testing dataset need to ensure that there is a balance of alive and dead patients
# Use this NN to classify severity level of cancer in alive patients
# as input the NN will take the features mentioned above and output a severity level from 1-5 (1 being least severe, 5 being most severe)


In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

# ============== FEATURE WEIGHTS CONFIGURATION ==============
# Assign importance weights for each feature (tune these based on medical research)
# Higher values = more important for severity prediction
FEATURE_WEIGHTS = {
    'demographic.gender': 0.3,                              # Low importance (demographic)
    'demographic.race': 0.2,                                # Low importance (demographic)
    'demographic.ethnicity': 0.2,                           # Low importance (demographic)
    'diagnoses.age_at_diagnosis': 0.6,                      # Moderate (age affects prognosis)
    'demographic.vital_status': 1.0,                        # Important (outcome indicator)
    'diagnoses.tumor_grade': 1.5,                           # VERY IMPORTANT (critical for severity)
    'diagnoses.laterality': 0.3,                            # Low importance (side of tumor)
    'diagnoses.prior_malignancy': 0.8,                      # Important (history matters)
    'diagnoses.prior_treatment': 0.9,                       # Important (treatment affects severity)
    'diagnoses.synchronous_malignancy': 0.7,               # Moderate (multiple cancers)
    'diagnoses.metastasis_at_diagnosis': 1.5,              # VERY IMPORTANT (critical indicator)
    'diagnoses.last_known_disease_status': 1.3,            # VERY IMPORTANT (current status)
    'diagnoses.progression_or_recurrence': 1.2,            # Very Important (progression matters)
    'demographic.days_to_death': 1.1,                       # Very Important (survival time)
    'follow_ups.karnofsky_performance_status': 1.0,        # Important (performance score)
    'diagnoses.who_cns_grade': 1.4                         # VERY IMPORTANT (WHO grade is critical)
}

# ============== 1. DEFINE THE NEURAL NETWORK ==============
# Updated for 5-class classification (severity levels 1-5)
class CancerSeverityPredictor(nn.Module):
    def __init__(self, input_size, num_classes=5):
        super(CancerSeverityPredictor, self).__init__()
        
        # Architecture: input -> 128 -> 64 -> 32 -> num_classes
        self.layer1 = nn.Linear(input_size, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.3)
        
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.3)
        
        self.layer3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.dropout3 = nn.Dropout(0.2)
        
        # Output layer: 5 classes for severity levels 1-5
        self.output = nn.Linear(32, num_classes)
        
        # Weight initialization
        self._init_weights()
    
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.constant_(m.bias, 0)
    
    def forward(self, x):
        # Layer 1
        x = self.layer1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout1(x)
        
        # Layer 2
        x = self.layer2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        
        # Layer 3
        x = self.layer3(x)
        x = self.bn3(x)
        x = torch.relu(x)
        x = self.dropout3(x)
        
        # Output layer (logits for 5 classes)
        x = self.output(x)
        return x

# ============== 2. PREPARE DATA ==============
# Load medical dataset
def load_medical_data():
    try:
        # Load the actual CSV file
        csv_path = r'C:\Users\Meghaj Kabra\Desktop\AI Health 2026 Competition\output.csv'
        df = pd.read_csv(csv_path, low_memory=False)
        print(f"‚úì Loaded data: {df.shape}")
        
        feature_mapping = {
        'demographic.gender': 'gender',
        'demographic.race': 'race',
        'demographic.ethnicity': 'ethnicity',
        'diagnoses.age_at_diagnosis': 'age_at_diagnosis',
        'demographic.vital_status': 'vital_status',
        'diagnoses.tumor_grade': 'tumor_grade',
        'diagnoses.laterality': 'laterality',
        'diagnoses.prior_malignancy': 'prior_malignancy',
        'diagnoses.prior_treatment': 'prior_treatment',
        'diagnoses.synchronous_malignancy': 'another_malignancy',
        'diagnoses.metastasis_at_diagnosis': 'metastasis',
        'diagnoses.last_known_disease_status': 'disease_status',
        'diagnoses.progression_or_recurrence': 'progression',
        'demographic.days_to_death': 'days_to_death',
        'follow_ups.karnofsky_performance_status': 'karnofsky_score',
        'diagnoses.who_cns_grade': 'who_grade'
        }
        
        features = list(feature_mapping.keys())
        # Check which columns exist
        available_cols = [col for col in feature_mapping if col in df.columns]
        print(f"‚úì Found {len(available_cols)} features: {available_cols[:5]}...")
        
        # Extract features
        X_df = df[available_cols].copy()
        
        # CREATE SEVERITY TARGET (1-5 scale)
        # You can modify this logic based on your medical data
        severity = np.ones(len(df))  # Default severity 1
        
        # Encode categorical features
        for col in X_df.columns:
            if X_df[col].dtype == 'object':
                le = LabelEncoder()
                X_df[col] = le.fit_transform(X_df[col].fillna('Unknown'))
        
        # Fill missing numeric values
        X_df = X_df.fillna(X_df.median())
        
        # ============== APPLY FEATURE WEIGHTS ==============
        # Multiply each feature by its importance weight
        print("\n‚öñÔ∏è  Applying feature weights:")
        for col_idx, col_name in enumerate(available_cols):
            if col_name in FEATURE_WEIGHTS:
                weight = FEATURE_WEIGHTS[col_name]
                X_df.iloc[:, col_idx] *= weight
                print(f"  {col_name}: weight = {weight}")
        
        X = X_df.values.astype(np.float32)
        
        # Create severity labels (1-5) - MODIFY THIS BASED ON YOUR MEDICAL DATA
        # This is a placeholder - you should replace with actual severity scoring
        y = severity.astype(int)
        
        print(f"‚úì Processed shape: X={X.shape}, y={y.shape}")
        print(f"‚úì Severity distribution: {np.bincount(y)}")
        return X, y
        
    except Exception as e:
        print(f"‚úó Error loading data: {e}")
        print("‚ö† Using synthetic data for demonstration")
        np.random.seed(42)
        X = np.random.rand(1000, len(FEATURE_WEIGHTS))
        # Apply weights to synthetic data
        for i, weight in enumerate(FEATURE_WEIGHTS.values()):
            X[:, i] *= weight
        y = np.random.randint(1, 6, 1000)  # Severity 1-5
        return X, y


X, y = load_medical_data()

# Normalize features (after weight application)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Convert to PyTorch tensors
X_train = torch.FloatTensor(X_train)
y_train = torch.LongTensor(y_train)  # Changed to LongTensor for classification
X_test = torch.FloatTensor(X_test)
y_test = torch.LongTensor(y_test)

# ============== 3. SETUP TRAINING ==============
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nüñ•Ô∏è Using device: {device}")

model = CancerSeverityPredictor(input_size=X_train.shape[1], num_classes=5).to(device)
criterion = nn.CrossEntropyLoss()  # Multi-class classification loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move data to device
X_train = X_train.to(device)
y_train = y_train.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)

# ============== 4. TRAINING LOOP ==============
num_epochs = 100
batch_size = 32

print("\nüìä Training Cancer Severity Prediction Model (1-5 Classification)...")
for epoch in range(num_epochs):
    model.train()
    
    # Mini-batch training
    permutation = torch.randperm(X_train.size()[0])
    epoch_loss = 0
    num_batches = 0
    
    for i in range(0, X_train.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch_x, batch_y = X_train[indices], y_train[indices]
        
        # Skip batch if size is 1 (BatchNorm issue)
        if batch_x.size(0) == 1:
            continue
        
        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        
        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        epoch_loss += loss.item()
        num_batches += 1
    
    # Print progress
    if (epoch + 1) % 10 == 0:
        avg_loss = epoch_loss / num_batches if num_batches > 0 else 0
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')

# ============== 5. EVALUATION ==============
model.eval()
with torch.no_grad():
    # Get predictions
    train_logits = model(X_train)
    train_pred = torch.argmax(train_logits, dim=1) + 1  # +1 to convert 0-4 to 1-5
    
    test_logits = model(X_test)
    test_pred = torch.argmax(test_logits, dim=1) + 1  # +1 to convert 0-4 to 1-5
    
    # Calculate accuracy
    train_acc = (train_pred == y_train).float().mean()
    test_acc = (test_pred == y_test).float().mean()

print(f"\n‚úÖ Results:")
print(f"Training Accuracy: {train_acc*100:.2f}%")
print(f"Test Accuracy: {test_acc*100:.2f}%")

# Show sample predictions
print(f"\nüîç Sample Predictions (first 10 test samples):")
print(f"{'Actual':<8} {'Predicted':<10} {'Confidence':<12}")
print("-" * 30)
for i in range(min(10, len(y_test))):
    actual = y_test[i].item()
    predicted = test_pred[i].item()
    confidence = torch.nn.functional.softmax(test_logits[i], dim=0).max().item() * 100
    print(f"{actual:<8} {predicted:<10} {confidence:.1f}%")

# ============== 6. SAVE MODEL ==============
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scaler': scaler,
    'input_size': X_train.shape[1],
    'feature_weights': FEATURE_WEIGHTS
}, 'cancer_severity_model.pth')
print(f"\nüíæ Model saved as 'cancer_severity_model.pth'")
print(f"üìã Feature Weights Summary:")
print(f"   Highest weighted: tumor_grade, metastasis, who_cns_grade")
print(f"   Lowest weighted: gender, race, ethnicity, tobacco factors")


‚úì Loaded data: (21634, 164)
‚úì Found 16 features: ['demographic.gender', 'demographic.race', 'demographic.ethnicity', 'diagnoses.age_at_diagnosis', 'demographic.vital_status']...

‚öñÔ∏è  Applying feature weights:
  demographic.gender: weight = 0.3
  demographic.race: weight = 0.2
  demographic.ethnicity: weight = 0.2
  diagnoses.age_at_diagnosis: weight = 0.6
  demographic.vital_status: weight = 1.0
  diagnoses.tumor_grade: weight = 1.5
  diagnoses.laterality: weight = 0.3
  diagnoses.prior_malignancy: weight = 0.8
  diagnoses.prior_treatment: weight = 0.9
  diagnoses.synchronous_malignancy: weight = 0.7
  diagnoses.metastasis_at_diagnosis: weight = 1.5
  diagnoses.last_known_disease_status: weight = 1.3
  diagnoses.progression_or_recurrence: weight = 1.2
  demographic.days_to_death: weight = 1.1
  follow_ups.karnofsky_performance_status: weight = 1.0
  diagnoses.who_cns_grade: weight = 1.4
‚úì Processed shape: X=(21634, 16), y=(21634,)
‚úì Severity distribution: [    0 21634]

üñ

1        0.0
2        0.0
3        0.0
4        0.0
        ... 
21629    0.3
21630    0.3
21631    0.0
21632    0.3
21633    0.3
Name: demographic.gender, Length: 21634, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_df.iloc[:, col_idx] *= weight
1        0.6
2        0.6
3        0.6
4        0.6
        ... 
21629    1.0
21630    1.0
21631    0.6
21632    0.6
21633    0.6
Name: demographic.race, Length: 21634, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_df.iloc[:, col_idx] *= weight
1        0.6
2        0.6
3        0.6
4        0.6
        ... 
21629    0.4
21630    0.4
21631    0.6
21632    0.6
21633    0.6
Name: demographic.ethnicity, Length: 21634, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X_df.iloc[:, col_idx] *= weight
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
21629    1

Epoch [10/100], Loss: 0.0001
Epoch [20/100], Loss: 0.0000
Epoch [30/100], Loss: 0.0000
Epoch [40/100], Loss: 0.0000
Epoch [50/100], Loss: 0.0000
Epoch [60/100], Loss: 0.0000
Epoch [70/100], Loss: 0.0000
Epoch [80/100], Loss: 0.0000
Epoch [90/100], Loss: 0.0000
Epoch [100/100], Loss: 0.0000

‚úÖ Results:
Training Accuracy: 0.00%
Test Accuracy: 0.00%

üîç Sample Predictions (first 10 test samples):
Actual   Predicted  Confidence  
------------------------------
1        2          100.0%
1        2          100.0%
1        2          100.0%
1        2          100.0%
1        2          100.0%
1        2          100.0%
1        2          100.0%
1        2          100.0%
1        2          100.0%
1        2          100.0%

üíæ Model saved as 'cancer_severity_model.pth'
üìã Feature Weights Summary:
   Highest weighted: tumor_grade, metastasis, who_cns_grade
   Lowest weighted: gender, race, ethnicity, tobacco factors
