In [6]:
import numpy as np
import pandas as pd
import ast
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [7]:
import pandas as pd

train_cohort = pd.read_csv('cohort_train.csv')
train_ids = train_cohort['study_id'].to_list()

test_cohort = pd.read_csv('cohort_test.csv')
test_ids = test_cohort['study_id'].to_list()

# Load embeddings
embeddings_df = pd.read_csv('../NLP_processing/embeddings/intense_pneumonia_embeddings/embedded_reports_mean.csv')  

embeddings_train = embeddings_df[embeddings_df['study_id'].isin(train_ids)]
embeddings_test= embeddings_df[embeddings_df['study_id'].isin(test_ids)]

print(f"Test embeddings shape: {embeddings_train .shape}")
print(f"Test embeddings shape: {embeddings_test.shape}")

Test embeddings shape: (2207, 14)
Test embeddings shape: (552, 14)


In [8]:
import pandas as pd
import torch

# Load the embedded reports tensor
embedded_reports_tensor = torch.load('../NLP_processing/embeddings/intense_pneumonia_embeddings/attention_results_embedded_reports.pt')

# Extract the relevant data from the tensor
embeddings = embedded_reports_tensor['embeddings']
study_ids = embedded_reports_tensor['study_ids']

# Create a DataFrame to associate embeddings with study IDs
embeddings_df = pd.DataFrame({
    'study_id': study_ids,
    'embedding': list(embeddings)  # Convert array rows to list for pandas compatibility
})

# Load training and testing cohort study IDs
train_cohort = pd.read_csv('cohort_train.csv')
train_ids = train_cohort['study_id'].to_list()

test_cohort = pd.read_csv('cohort_test.csv')
test_ids = test_cohort['study_id'].to_list()

# Filter embeddings for training and testing cohorts
embeddings_train = embeddings_df[embeddings_df['study_id'].isin(train_ids)]
embeddings_test = embeddings_df[embeddings_df['study_id'].isin(test_ids)]

print(f"Training embeddings shape: {embeddings_train.shape}")
print(f"Test embeddings shape: {embeddings_test.shape}")

Training embeddings shape: (2207, 2)
Test embeddings shape: (552, 2)


  embedded_reports_tensor = torch.load('../NLP_processing/embeddings/intense_pneumonia_embeddings/attention_results_embedded_reports.pt')


In [10]:
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score, 
    recall_score, precision_score, confusion_matrix,
    classification_report
)
import numpy as np

class SVMRadiologyClassifier:
    def __init__(self, train_df, test_df, embeddings_train, embeddings_test):
        """
        Initialize the SVM classifier with separate training and test datasets.
        
        Parameters:
            train_df: DataFrame containing the training cohort reports and metadata
            test_df: DataFrame containing the test cohort reports and metadata
            embeddings_train: Pre-computed embeddings for training cohort
            embeddings_test: Pre-computed embeddings for test cohort
        """
        self.train_df = train_df
        self.test_df = test_df
        
        # Convert string embeddings to numpy arrays if needed
        self.embeddings_train = np.array([
            np.array(ast.literal_eval(emb)) if isinstance(emb, str) else emb 
            for emb in embeddings_train['embedding']
        ])
        self.embeddings_test = np.array([
            np.array(ast.literal_eval(emb)) if isinstance(emb, str) else emb 
            for emb in embeddings_test['embedding']
        ])
        
        self.model = None
        self.scaler = StandardScaler()
        self.misclassified_cases = None
        
    def prepare_data(self):
        """
        Prepare and scale the data from the separate train and test cohorts.
        """
        # Get labels
        y_train = self.train_df['Y'].values
        y_test = self.test_df['Y'].values
        
        # Scale the embeddings
        X_train = self.scaler.fit_transform(self.embeddings_train)
        X_test = self.scaler.transform(self.embeddings_test)
        
        return X_train, X_test, y_train, y_test
    
    
    def train_and_evaluate(self, kernel='rbf', C=2.0, probability=True): # made C slightly higher for tighter margin 
        """
        Train the SVM model and evaluate its performance on the test set.
        Handles different sizes between training and test sets.
        """
        X_train, X_test, y_train, y_test = self.prepare_data()
        

        self.model = SVC(
            kernel=kernel,
            C=C,
            probability=probability,
            random_state=42
        )

        self.model.fit(X_train, y_train)
    
        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]
        

        if len(y_test) != len(y_pred):
            print(f"Note: Train set has {len(y_train)} samples, Test set has {len(y_test)} samples")
            y_pred = y_pred[:len(y_test)]
            y_pred_proba = y_pred_proba[:len(y_test)]
        

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba),
            'f1': f1_score(y_test, y_pred),
            'sensitivity': tp / (tp + fn),  # True Positive Rate / Recall
            'specificity': tn / (tn + fp)   # True Negative Rate
        }
        
        # Examine misclassified
        misclassified_mask = y_test != y_pred
        self.misclassified_cases = self.test_df[misclassified_mask].copy()
        self.misclassified_cases['predicted_label'] = y_pred[misclassified_mask]
        self.misclassified_cases['true_label'] = y_test[misclassified_mask]
        self.misclassified_cases['prediction_probability'] = y_pred_proba[misclassified_mask]
    
        return metrics

    
    def analyze_misclassified_cases(self):
        """
        Analyze cases where the model made mistakes.
        """
        if self.misclassified_cases is None:
            print("Please run train_and_evaluate first.")
            return
        
        print("\nAnalysis of Misclassified Cases:")
        print(f"Total misclassified cases: {len(self.misclassified_cases)}")
        
        # Analyze false positives and negatives
        false_positives = self.misclassified_cases[
            (self.misclassified_cases['predicted_label'] == 1) & 
            (self.misclassified_cases['true_label'] == 0)
        ]
        
        false_negatives = self.misclassified_cases[
            (self.misclassified_cases['predicted_label'] == 0) & 
            (self.misclassified_cases['true_label'] == 1)
        ]
        
        print(f"\nFalse Positives: {len(false_positives)} cases")
        print(f"False Negatives: {len(false_negatives)} cases")
        
        # Display example cases
        def display_cases(cases, case_type, n=5):
            print(f"\nExample {case_type} (showing {min(n, len(cases))} cases):")
            for _, case in cases.head(n).iterrows():
                print(f"\nStudy ID: {case['study_id']}")
                print(f"Confidence: {case['prediction_probability']:.3f}")
                print("-" * 80)
        
        display_cases(false_positives, "False Positives")
        display_cases(false_negatives, "False Negatives")
        
        return false_positives, false_negatives

def main():
    """
    Main function to run the SVM classification pipeline.
    """
    # Run Classifier
    classifier = SVMRadiologyClassifier(
        train_cohort, 
        test_cohort, 
        embeddings_train, 
        embeddings_test
    )
    
    # Train and evaluate with different kernel options
    kernels = ['rbf', 'linear']
    best_metrics = None
    best_kernel = None
    
    for kernel in kernels:
        print(f"\nTraining with {kernel} kernel:")
        metrics = classifier.train_and_evaluate(kernel=kernel)
        
        print("\nModel Performance Metrics:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"AUC-ROC: {metrics['auc']:.4f}")
        print(f"F1 Score: {metrics['f1']:.4f}")
        print(f"Sensitivity: {metrics['sensitivity']:.4f}")
        
        if best_metrics is None or metrics['auc'] > best_metrics['auc']:
            best_metrics = metrics
            best_kernel = kernel
    
    print(f"\nBest performing kernel: {best_kernel}")
    
    false_positives, false_negatives = classifier.analyze_misclassified_cases()
    return classifier, best_metrics, (false_positives, false_negatives)
    

In [12]:
# loading XGBOOST Prediction Learning Data 
classifier, metrics, error_analysis = main()


Training with rbf kernel:

Model Performance Metrics:
Accuracy: 0.7482
AUC-ROC: 0.5379
F1 Score: 0.0142
Sensitivity: 0.0071

Training with linear kernel:

Model Performance Metrics:
Accuracy: 0.5942
AUC-ROC: 0.4358
F1 Score: 0.1825
Sensitivity: 0.1786

Best performing kernel: rbf

Analysis of Misclassified Cases:
Total misclassified cases: 224

False Positives: 109 cases
False Negatives: 115 cases

Example False Positives (showing 5 cases):

Study ID: 57219438
Confidence: 0.237
--------------------------------------------------------------------------------

Study ID: 54264165
Confidence: 0.239
--------------------------------------------------------------------------------

Study ID: 57321775
Confidence: 0.237
--------------------------------------------------------------------------------

Study ID: 52128893
Confidence: 0.242
--------------------------------------------------------------------------------

Study ID: 57298755
Confidence: 0.238
----------------------------------------

In [14]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import re
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance, KeyBERTInspired
from hdbscan import HDBSCAN
from umap import UMAP

class IntegratedRadiologyClassifier:
    def __init__(self, pretrained_model_name, save_path='../../models/'):
        """
        Initialize the integrated classifier with custom embedding model and BERTopic.
        """
        self.save_path = Path(save_path)
        self.embedder = RadiologyReportEmbedder(pretrained_model_name)
        self.scaler = StandardScaler()
        self.setup_bertopic()
        self.svm = None
        self.misclassified_cases = None
        
    def setup_bertopic(self):
        """
        Initialize BERTopic with custom configuration.
        """
        # UMAP configuration
        self.umap_model = UMAP(
            n_neighbors=30,
            n_components=5,
            min_dist=0.0,
            metric='cosine'
        )
        
        # HDBSCAN configuration
        self.hdbscan_model = HDBSCAN(
            min_samples=20,
            gen_min_span_tree=True,
            prediction_data=True,
            min_cluster_size=20,
            metric='euclidean',
            cluster_selection_method='leaf'
        )
        
        # Vectorizer configuration
        self.vectorizer_model = CountVectorizer(
            strip_accents='unicode',
            stop_words='english',
            ngram_range=(1, 3),
            max_df=0.5
        )
        
        # CTF-IDF configuration
        self.ctfidf_model = ClassTfidfTransformer(
            bm25_weighting=False,
            reduce_frequent_words=True
        )
        
        # Representation model configuration
        self.representation_model = [
            KeyBERTInspired(top_n_words=30, random_state=42),
            MaximalMarginalRelevance(diversity=0.8)
        ]
        
        # Initialize BERTopic
        self.topic_model = BERTopic(
            umap_model=self.umap_model,
            hdbscan_model=self.hdbscan_model,
            ctfidf_model=self.ctfidf_model,
            vectorizer_model=self.vectorizer_model,
            representation_model=self.representation_model,
            top_n_words=30,
            calculate_probabilities=True
        )
    
    def process_and_embed_reports(self, reports_df):
        """
        Process reports and create embeddings.
        """
        # Combine relevant sections
        reports_df['combined_text'] = reports_df.apply(
            lambda x: f"Findings: {x['findings']} Impression: {x['impression']}", 
            axis=1
        )
        
        # Create embeddings
        embeddings = self.embedder.create_embeddings(reports_df['combined_text'].tolist())
        return embeddings
    
    def fit_transform_topics(self, reports_df, embeddings):
        """
        Fit BERTopic model and transform documents to topic space.
        """
        docs = reports_df['combined_text'].tolist()
        topics, probabilities = self.topic_model.fit_transform(
            docs, 
            embeddings=embeddings
        )
        return topics, probabilities
    
    def prepare_data(self, train_df, test_df, train_embeddings, test_embeddings):
        """
        Prepare data for SVM classification.
        """
        # Get topic probabilities for both sets
        _, train_probs = self.topic_model.transform(
            train_df['combined_text'].tolist(),
            train_embeddings
        )
        _, test_probs = self.topic_model.transform(
            test_df['combined_text'].tolist(),
            test_embeddings
        )
        
        # Scale probabilities
        X_train = self.scaler.fit_transform(train_probs)
        X_test = self.scaler.transform(test_probs)
        
        # Get labels
        y_train = train_df['Y'].values
        y_test = test_df['Y'].values
        
        return X_train, X_test, y_train, y_test
    
    def train_and_evaluate(self, train_df, test_df, kernel='rbf', C=2.0):
        """
        Train SVM and evaluate performance.
        """
        # Create embeddings
        train_embeddings = self.process_and_embed_reports(train_df)
        test_embeddings = self.process_and_embed_reports(test_df)
        
        # Fit topic model and prepare data
        self.fit_transform_topics(train_df, train_embeddings)
        X_train, X_test, y_train, y_test = self.prepare_data(
            train_df, test_df, train_embeddings, test_embeddings
        )
        
        # Train SVM
        self.svm = SVC(kernel=kernel, C=C, probability=True, random_state=42)
        self.svm.fit(X_train, y_train)
        
        # Make predictions
        y_pred = self.svm.predict(X_test)
        y_pred_proba = self.svm.predict_proba(X_test)[:, 1]
        
        # Calculate metrics
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba),
            'f1': f1_score(y_test, y_pred),
            'sensitivity': tp / (tp + fn),
            'specificity': tn / (tn + fp)
        }
        
        # Store misclassified cases
        self._store_misclassified(test_df, y_test, y_pred, y_pred_proba)
        
        return metrics
    
    def _store_misclassified(self, test_df, y_true, y_pred, y_pred_proba):
        """
        Store misclassified cases for analysis.
        """
        misclassified_mask = y_true != y_pred
        self.misclassified_cases = test_df[misclassified_mask].copy()
        self.misclassified_cases['predicted_label'] = y_pred[misclassified_mask]
        self.misclassified_cases['true_label'] = y_true[misclassified_mask]
        self.misclassified_cases['prediction_probability'] = y_pred_proba[misclassified_mask]
    
    def save_model(self, model_name):
        """
        Save the complete model including BERTopic and SVM.
        """
        save_dir = self.save_path / model_name
        save_dir.mkdir(parents=True, exist_ok=True)
        
        # Save BERTopic model
        self.topic_model.save(
            str(save_dir / "bertopic_model"),
            serialization="pytorch",
            save_ctfidf=True
        )
        
        # Save SVM model
        torch.save(self.svm, save_dir / "svm_model.pt")
        
        # Save scaler
        torch.save(self.scaler, save_dir / "scaler.pt")
        
        # Save topic info
        topic_info = self.topic_model.get_topic_info()
        topic_info.to_csv(save_dir / "topic_info.csv")
        
    def load_model(self, model_name):
        """
        Load a previously saved model.
        """
        model_dir = self.save_path / model_name
        
        # Load BERTopic
        self.topic_model = BERTopic.load(
            str(model_dir / "bertopic_model")
        )
        
        # Load SVM and scaler
        self.svm = torch.load(model_dir / "svm_model.pt")
        self.scaler = torch.load(model_dir / "scaler.pt")

def run_classification_pipeline(train_df, test_df, pretrained_model_name, model_save_path):
    """
    Run the complete classification pipeline.
    """
    # Initialize classifier
    classifier = IntegratedRadiologyClassifier(
        pretrained_model_name,
        save_path=model_save_path
    )
    
    # Train and evaluate with different kernels
    kernels = ['rbf', 'linear']
    best_metrics = None
    best_kernel = None
    
    for kernel in kernels:
        print(f"\nTraining with {kernel} kernel:")
        metrics = classifier.train_and_evaluate(train_df, test_df, kernel=kernel)
        
        print("\nModel Performance Metrics:")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.4f}")
        
        if best_metrics is None or metrics['auc'] > best_metrics['auc']:
            best_metrics = metrics
            best_kernel = kernel
    
    print(f"\nBest performing kernel: {best_kernel}")
    
    # Save the best model
    classifier.save_model(f"radiology_classifier_{best_kernel}")
    
    return classifier, best_metrics