### Model the Severity of the Pneumonia Case

In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [None]:
import pandas as pd

train_cohort = pd.read_csv('cohort_train.csv')
train_ids = train_cohort['study_id'].to_list()

test_cohort = pd.read_csv('cohort_test.csv')
test_ids = test_cohort['study_id'].to_list()

# Load embeddings
embeddings_df = pd.read_csv('../NLP_processing/embeddings/intense_pneumonia_embeddings/embedded_reports_mean.csv')  

# Filter embeddings for the training set
train_df = embeddings_df[embeddings_df['study_id'].isin(train_ids)]

# Filter embeddings for the testing set
tes_df = embeddings_df[embeddings_df['study_id'].isin(test_ids)]

# Save filtered embeddings (optional, if needed)
train_df.to_csv('train_embeddings.csv', index=False)
test_df.to_csv('test_embeddings.csv', index=False)

# Print shapes to verify
print(f"Train embeddings shape: {train_embeddings_df.shape}")
print(f"Test embeddings shape: {test_embeddings_df.shape}")

Test embeddings shape: (2207, 14)
Test embeddings shape: (2207, 14)


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import ast
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score


class RadiologyReportClassifier:
    def __init__(self, train_df, test_df, embeddings_train, embeddings_test):
        """
        Initialize the classifier with separate training and test datasets.
        
        Parameters:
            train_df: DataFrame containing the training cohort reports and metadata
            test_df: DataFrame containing the test cohort reports and metadata
            embeddings_train: Pre-computed embeddings for training cohort
            embeddings_test: Pre-computed embeddings for test cohort
        """
        self.train_df = train_df
        self.test_df = test_df
        
        # Convert string embeddings to numpy arrays
        self.embeddings_train = np.array([
            np.array(ast.literal_eval(emb)) for emb in embeddings_train['embedding']
        ])
        self.embeddings_test = np.array([
            np.array(ast.literal_eval(emb)) for emb in embeddings_test['embedding']
        ])
        
        self.model = None
        self.misclassified_cases = None
        
    def prepare_data(self):
        """
        Prepare the data from the separate train and test cohorts.
        
        Returns:
            X_train: Training embeddings
            X_test: Test embeddings
            y_train: Training labels
            y_test: Test labels
        """
        # Get labels from the DataFrames
        y_train = self.train_df['Y'].values
        y_test = self.test_df['Y'].values
    
        
        return self.embeddings_train, self.embeddings_test, y_train, y_test
    
    def train_and_evaluate(self):
        """
        Train the model and evaluate its performance on the test set.
        Stores misclassified cases for later analysis.
        """
        # Prepare the data
        X_train, X_test, y_train, y_test = self.prepare_data()
        
        # Initialize and train XGBoost model
 
        self.model = xgb.XGBClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=5,
            random_state=42,
            alpha=0.5  # L1 regularization term
        )
        
        # Train the model
        self.model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)[:, 1]
        
        # Calculate metrics
        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'auc': roc_auc_score(y_test, y_pred_proba),
            'f1': f1_score(y_test, y_pred)
        }
        
        # Store misclassified cases
        misclassified_mask = y_test != y_pred
        self.misclassified_cases = self.test_df[misclassified_mask].copy()
        self.misclassified_cases['predicted_label'] = y_pred[misclassified_mask]
        self.misclassified_cases['true_label'] = y_test[misclassified_mask]
        self.misclassified_cases['prediction_probability'] = y_pred_proba[misclassified_mask]
        
        return metrics
    
    def analyze_misclassified_cases(self):
        """
        Provide detailed analysis of cases where the model made mistakes.
        Shows both false positives and false negatives with their reports.
        """
        if self.misclassified_cases is None:
            print("Please run train_and_evaluate first.")
            return
        
        print("\nAnalysis of Misclassified Cases:")
        print(f"Total misclassified cases: {len(self.misclassified_cases)}")
        
        # Analyze false positives and negatives
        false_positives = self.misclassified_cases[
            (self.misclassified_cases['predicted_label'] == 1) & 
            (self.misclassified_cases['true_label'] == 0)
        ]
        
        false_negatives = self.misclassified_cases[
            (self.misclassified_cases['predicted_label'] == 0) & 
            (self.misclassified_cases['true_label'] == 1)
        ]
        
        print(f"\nFalse Positives: {len(false_positives)} cases")
        print(f"False Negatives: {len(false_negatives)} cases")
        
        # Display example cases with their reports
        def display_cases(cases, case_type, n=5):
            print(f"\nExample {case_type} (showing {min(n, len(cases))} cases):")
            for _, case in cases.head(n).iterrows():
                print(f"\nStudy ID: {case['study_id']}")
                print(f"Confidence: {case['prediction_probability']:.3f}")
                print("-" * 80)
        
        display_cases(false_positives, "False Positives")
        display_cases(false_negatives, "False Negatives")
        
        return false_positives, false_negatives

def main():
    """
    Main function to run the classification pipeline using separate cohort files.
    """
    # Load your train and test cohorts
    train_df = train_cohort
    test_df = test_cohort
    
    # Load the corresponding embeddings
    # Modify this part based on how your embeddings are stored
    embeddings_train = train_embeddings_df 
    embeddings_test = test_embeddings_df   
    
    # Initialize and run the classifier
    classifier = RadiologyReportClassifier(
        train_df, 
        test_df, 
        embeddings_train, 
        embeddings_test
    )
    
    # Train and evaluate
    metrics = classifier.train_and_evaluate()
    
    # Print performance metrics
    print("\nModel Performance Metrics:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"AUC-ROC: {metrics['auc']:.4f}")
    print(f"F1 Score: {metrics['f1']:.4f}")
    
    # Analyze misclassified cases
    false_positives, false_negatives = classifier.analyze_misclassified_cases()
    
    return classifier, metrics, (false_positives, false_negatives)

In [23]:
# loading XGBOOST Prediction Learning Data 
classifier, metrics, error_analysis = main()


Training with rbf kernel:
Note: Train set has 2207 samples, Test set has 552 samples

Model Performance Metrics:
Accuracy: 0.7464
AUC-ROC: 0.5246
F1 Score: 0.0000

Training with linear kernel:
Note: Train set has 2207 samples, Test set has 552 samples

Model Performance Metrics:
Accuracy: 0.6576
AUC-ROC: 0.5007
F1 Score: 0.1957

Best performing kernel: rbf

Analysis of Misclassified Cases:
Total misclassified cases: 189

False Positives: 72 cases
False Negatives: 117 cases

Example False Positives (showing 5 cases):

Study ID: 59450170
Confidence: 0.245
--------------------------------------------------------------------------------

Study ID: 50964535
Confidence: 0.242
--------------------------------------------------------------------------------

Study ID: 55766889
Confidence: 0.242
--------------------------------------------------------------------------------

Study ID: 57696885
Confidence: 0.242
--------------------------------------------------------------------------------

