In [None]:
import numpy as np
import pandas as pd
import ast
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [None]:
import pandas as pd

train_cohort = pd.read_csv('cohort_train.csv')
train_ids = train_cohort['study_id'].to_list()

test_cohort = pd.read_csv('cohort_test.csv')
test_ids = test_cohort['study_id'].to_list()

# Load embeddings
embeddings_df = pd.read_csv('../NLP_processing/embeddings/intense_pneumonia_embeddings/embedded_reports_mean.csv')  


embeddings_train = embeddings_df[embeddings_df['study_id'].isin(train_ids)]


embeddings_test= embeddings_df[embeddings_df['study_id'].isin(test_ids)]

print(f"Test embeddings shape: {embeddings_train .shape}")
print(f"Test embeddings shape: {embeddings_test.shape}")

Test embeddings shape: (2207, 14)
Test embeddings shape: (2207, 14)


In [None]:
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, 
    recall_score, precision_score, confusion_matrix,
    classification_report
)
import numpy as np


class SVMRadiologyClassifier:
    def __init__(self, train_df, test_df, embeddings_train, embeddings_test):
        """
        Initialize the SVM classifier with separate training and test datasets.
        
        Parameters:
            train_df: DataFrame containing the training cohort reports and metadata
            test_df: DataFrame containing the test cohort reports and metadata
            embeddings_train: Pre-computed embeddings for training cohort
            embeddings_test: Pre-computed embeddings for test cohort
        """
        self.train_df = train_df
        self.test_df = test_df
        
        # Convert string embeddings to numpy arrays if needed
        self.embeddings_train = np.array([
            np.array(ast.literal_eval(emb)) if isinstance(emb, str) else emb 
            for emb in embeddings_train['embedding']
        ])
        self.embeddings_test = np.array([
            np.array(ast.literal_eval(emb)) if isinstance(emb, str) else emb 
            for emb in embeddings_test['embedding']
        ])
        
        self.model = None
        self.scaler = StandardScaler()
        self.misclassified_cases = None
        
    def prepare_data(self):
        """
        Prepare and scale the data from the separate train and test cohorts.
        """
        # Get labels
        y_train = self.train_df['Y'].values
        y_test = self.test_df['Y'].values
        
        # Scale the embeddings
        X_train = self.scaler.fit_transform(self.embeddings_train)
        X_test = self.scaler.transform(self.embeddings_test)
        
        return X_train, X_test, y_train, y_test

    
    
    def train_and_evaluate(self, kernel='rbf', C=2.0, probability=True): # made C slightly higher for tighter margin 
        """
        Train the SVM model and evaluate its performance on the test set.
        Handles different sizes between training and test sets.
        """
        X_train, X_test, y_train, y_test = self.prepare_data()
        

        self.model = SVC(
            kernel=kernel,
            C=C,
            probability=probability,
            random_state=42
        )
        

        self.model.fit(X_train, y_train)
    
        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]
        

        if len(y_test) != len(y_pred):
            print(f"Note: Train set has {len(y_train)} samples, Test set has {len(y_test)} samples")
            y_pred = y_pred[:len(y_test)]
            y_pred_proba = y_pred_proba[:len(y_test)]
        

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba),
            'f1': f1_score(y_test, y_pred),
            'sensitivity': tp / (tp + fn),  # True Positive Rate / Recall
            'specificity': tn / (tn + fp)   # True Negative Rate
        }
        
        # Examine misclassified
        misclassified_mask = y_test != y_pred
        self.misclassified_cases = self.test_df[misclassified_mask].copy()
        self.misclassified_cases['predicted_label'] = y_pred[misclassified_mask]
        self.misclassified_cases['true_label'] = y_test[misclassified_mask]
        self.misclassified_cases['prediction_probability'] = y_pred_proba[misclassified_mask]
    
        return metrics

    
    def analyze_misclassified_cases(self):
        """
        Analyze cases where the model made mistakes.
        """
        if self.misclassified_cases is None:
            print("Please run train_and_evaluate first.")
            return
        
        print("\nAnalysis of Misclassified Cases:")
        print(f"Total misclassified cases: {len(self.misclassified_cases)}")
        
        # Analyze false positives and negatives
        false_positives = self.misclassified_cases[
            (self.misclassified_cases['predicted_label'] == 1) & 
            (self.misclassified_cases['true_label'] == 0)
        ]
        
        false_negatives = self.misclassified_cases[
            (self.misclassified_cases['predicted_label'] == 0) & 
            (self.misclassified_cases['true_label'] == 1)
        ]
        
        print(f"\nFalse Positives: {len(false_positives)} cases")
        print(f"False Negatives: {len(false_negatives)} cases")
        
        # Display example cases
        def display_cases(cases, case_type, n=5):
            print(f"\nExample {case_type} (showing {min(n, len(cases))} cases):")
            for _, case in cases.head(n).iterrows():
                print(f"\nStudy ID: {case['study_id']}")
                print(f"Confidence: {case['prediction_probability']:.3f}")
                print("-" * 80)
        
        display_cases(false_positives, "False Positives")
        display_cases(false_negatives, "False Negatives")
        
        return false_positives, false_negatives

def main():
    """
    Main function to run the SVM classification pipeline.
    """
    # Run Classifier
    classifier = SVMRadiologyClassifier(
        train_cohort, 
        test_cohort, 
        embeddings_train, 
        embeddings_test
    )
    
    # Train and evaluate with different kernel options
    kernels = ['rbf', 'linear']
    best_metrics = None
    best_kernel = None
    
    for kernel in kernels:
        print(f"\nTraining with {kernel} kernel:")
        metrics = classifier.train_and_evaluate(kernel=kernel)
        
        print("\nModel Performance Metrics:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"AUC-ROC: {metrics['auc']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Sensitivity: {metrics['sensitivity']:.4f}")
        
        if best_metrics is None or metrics['auc'] > best_metrics['auc']:
            best_metrics = metrics
            best_kernel = kernel
    
    print(f"\nBest performing kernel: {best_kernel}")
    
    false_positives, false_negatives = classifier.analyze_misclassified_cases()
    return classifier, best_metrics, (false_positives, false_negatives)
    

In [9]:
# loading XGBOOST Prediction Learning Data 
classifier, metrics, error_analysis = main()


Training with rbf kernel:
Note: Train set has 2207 samples, Test set has 552 samples

Model Performance Metrics:
Accuracy: 0.7065
AUC-ROC: 0.5236
F1 Score: 0.1649
Sensitivity: 0.1143

Training with linear kernel:
Note: Train set has 2207 samples, Test set has 552 samples

Model Performance Metrics:
Accuracy: 0.6486
AUC-ROC: 0.4935
F1 Score: 0.1917
Sensitivity: 0.1643

Best performing kernel: rbf

Analysis of Misclassified Cases:
Total misclassified cases: 194

False Positives: 77 cases
False Negatives: 117 cases

Example False Positives (showing 5 cases):

Study ID: 58561179
Confidence: 0.241
--------------------------------------------------------------------------------

Study ID: 59450170
Confidence: 0.248
--------------------------------------------------------------------------------

Study ID: 50964535
Confidence: 0.243
--------------------------------------------------------------------------------

Study ID: 55766889
Confidence: 0.245
------------------------------------------

In [4]:
def train_and_evaluate(self, kernel='rbf', C=1.0, probability=True):
    """
    Train the SVM model and evaluate its performance on the test set.
    Handles different sizes between training and test sets.
    """
    X_train, X_test, y_train, y_test = self.prepare_data()
    
    # Initialize and train SVM model
    self.model = SVC(
        kernel=kernel,
        C=C,
        probability=probability,
        random_state=42
    )
    
    # Train the model
    self.model.fit(X_train, y_train)
    
    # Make predictions on test set
    y_pred = self.model.predict(X_test)
    y_pred_proba = self.model.predict_proba(X_test)[:, 1]
    
    # Ensure predictions align with test labels
    if len(y_test) != len(y_pred):
        print(f"Note: Train set has {len(y_train)} samples, Test set has {len(y_test)} samples")
        y_pred = y_pred[:len(y_test)]
        y_pred_proba = y_pred_proba[:len(y_test)]
    
    # Calculate confusion matrix for sensitivity calculation
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_pred_proba),
        'f1': f1_score(y_test, y_pred),
        'sensitivity': tp / (tp + fn),  # True Positive Rate / Recall
        'specificity': tn / (tn + fp)   # True Negative Rate
    }
    
    # Store misclassified cases
    misclassified_mask = y_test != y_pred
    self.misclassified_cases = self.test_df[misclassified_mask].copy()
    self.misclassified_cases['predicted_label'] = y_pred[misclassified_mask]
    self.misclassified_cases['true_label'] = y_test[misclassified_mask]
    self.misclassified_cases['prediction_probability'] = y_pred_proba[misclassified_mask]
    
    return metrics

def main():
    """
    Main function to run the SVM classification pipeline.
    """
    # Initialize and run the classifier
    classifier = SVMRadiologyClassifier(
        train_cohort, 
        test_cohort, 
        embeddings_train, 
        embeddings_test
    )
    
    # Train and evaluate with different kernel options
    kernels = ['rbf', 'linear']
    best_metrics = None
    best_kernel = None
    
    for kernel in kernels:
        print(f"\nTraining with {kernel} kernel:")
        metrics = classifier.train_and_evaluate(kernel=kernel)
        
        print("\nModel Performance Metrics:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"AUC-ROC: {metrics['auc']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Sensitivity: {metrics['sensitivity']:.4f}")
        print(f"Specificity: {metrics['specificity']:.4f}")
        
        if best_metrics is None or metrics['auc'] > best_metrics['auc']:
            best_metrics = metrics
            best_kernel = kernel
    
    print(f"\nBest performing kernel: {best_kernel}")
    
    false_positives, false_negatives = classifier.analyze_misclassified_cases()
    return classifier, best_metrics, (false_positives, false_negatives)

In [None]:
    
    def train_and_evaluate(self, kernel='rbf', C=1.0, probability=True):
        """
        Train the SVM model and evaluate its performance on the test set.
        Handles different sizes between training and test sets.
        """
        X_train, X_test, y_train, y_test = self.prepare_data()
        
        # Initialize and train SVM model
        self.model = SVC(
            kernel=kernel,
            C=C,
            probability=probability,
            random_state=42
        )
        
        # Train the model
        self.model.fit(X_train, y_train)
        
        # Make predictions on test set
        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]
        
        # Ensure predictions align with test labels
        if len(y_test) != len(y_pred):
            print(f"Note: Train set has {len(y_train)} samples, Test set has {len(y_test)} samples")
            y_pred = y_pred[:len(y_test)]
            y_pred_proba = y_pred_proba[:len(y_test)]
        
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba),
            'f1': f1_score(y_test, y_pred)
            'sensitivity': recall_score(y_true, y_pred),  # Same as recall
        }
        
        # make sure we store the misclassified cases
        misclassified_mask = y_test != y_pred
        self.misclassified_cases = self.test_df[misclassified_mask].copy()
        self.misclassified_cases['predicted_label'] = y_pred[misclassified_mask]
        self.misclassified_cases['true_label'] = y_test[misclassified_mask]
        self.misclassified_cases['prediction_probability'] = y_pred_proba[misclassified_mask]
        
        return metrics