### Model the Severity of the Pneumonia Case

In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [38]:
import torch

# Load the BERTopic HDBSCAN probabilities with weights_only=False
doc_topic_probabilities = torch.load('../../models/BERTopic_All_Pneumonia_Type/representations/specter_representation_keybert_mmr/specter_probabilities_keybert_mmr.pt',
                                     weights_only=False)

print("Successfully loaded HDBSCAN probabilities!")
doc_topic_probabilities.shape

pneumonia_type_df=pd.read_csv('../../NLP_processing/NER_embeddings/pneumonia_type/radgraph_with_embeddings.csv')


# Remove NaN or empty radgraph_text entries
pneumonia_type_df = pneumonia_type_df[pneumonia_type_df['radgraph_text'].notna()]
pneumonia_type_df = pneumonia_type_df[pneumonia_type_df['radgraph_text'].str.strip() != '']

# Extract clean docs and embeddings
docs = pneumonia_type_df['radgraph_text'].astype(str).tolist()
embeddings = np.vstack(pneumonia_type_df['embedding'].values)

# Ensure shape consistency
assert len(docs) == embeddings.shape[0], "Mismatch between docs and embeddings!"

Successfully loaded HDBSCAN probabilities!


In [40]:
# Convert probabilities to a DataFrame

# Ensure 'study_id' exists and extract it
study_ids = pneumonia_type_df["study_id"].tolist()


topic_prob_df = pd.DataFrame(doc_topic_probabilities, columns=[f"Topic_{i}" for i in range(doc_topic_probabilities.shape[1])])

# Add study_id to align with original data
topic_prob_df["study_id"] = study_ids

# Save this as a CSV for verification
topic_prob_df.to_csv("bertopic_study_probabilities.csv", index=False)
print("✅ Saved BERTopic probabilities with study IDs to 'bertopic_study_probabilities.csv'")

✅ Saved BERTopic probabilities with study IDs to 'bertopic_study_probabilities.csv'


In [41]:
import pandas as pd

train_cohort = pd.read_csv('cohort_train.csv')
train_ids = train_cohort['study_id'].to_list()

test_cohort = pd.read_csv('cohort_test.csv')
test_ids = test_cohort['study_id'].to_list()

# Merge with training cohort
train_df = train_cohort.merge(topic_prob_df, on="study_id", how="inner")

# Merge with testing cohort
test_df = test_cohort.merge(topic_prob_df, on="study_id", how="inner")

# Save the final datasets
train_df.to_csv("train_bertopic_probabilities.csv", index=False)
test_df.to_csv("test_bertopic_probabilities.csv", index=False)

# Print shapes to verify
print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")

Train dataset shape: (1876, 45)
Test dataset shape: (462, 45)


In [42]:
# Load BERTopic probability embeddings for the training and test sets
train_embeddings_df = pd.read_csv("train_bertopic_probabilities.csv")
test_embeddings_df = pd.read_csv("test_bertopic_probabilities.csv")

# Ensure 'study_id' column exists in embeddings
if 'study_id' not in train_embeddings_df.columns or 'study_id' not in test_embeddings_df.columns:
    raise ValueError("Error: 'study_id' column is missing in embeddings CSV files.")

# Merge the BERTopic embeddings with the train/test cohort metadata
train_cohort = train_cohort.merge(train_embeddings_df, on="study_id", how="inner")
test_cohort = test_cohort.merge(test_embeddings_df, on="study_id", how="inner")

In [49]:
import pandas as pd
import numpy as np

# Load the cohort files
train_cohort = pd.read_csv("cohort_train.csv")
test_cohort = pd.read_csv("cohort_test.csv")

# Load the BERTopic probabilities DataFrame
embeddings_df = pd.read_csv("bertopic_study_probabilities.csv")  # Replace with actual path

# Extract only `study_id` and topic columns (Topic_0, Topic_1, ..., Topic_33)
topic_columns = [col for col in embeddings_df.columns if col.startswith("Topic_")]
study_topic_df = embeddings_df[["study_id"] + topic_columns]

# Filter training and testing sets based on `study_id`
train_df = study_topic_df[study_topic_df["study_id"].isin(train_cohort["study_id"])]
test_df = study_topic_df[study_topic_df["study_id"].isin(test_cohort["study_id"])]

# Convert topic probabilities to NumPy arrays
X_train = train_df[topic_columns].to_numpy()
X_test = test_df[topic_columns].to_numpy()

# Labels for training/testing (assuming `Y` is the label column in the cohort files)
y_train = train_cohort["Y"].values
y_test = test_cohort["Y"].values

# Print shapes to verify
print(f"Train shape: {X_train.shape}, Labels: {y_train.shape}")
print(f"Test shape: {X_test.shape}, Labels: {y_test.shape}")

# Save filtered train & test data
train_df.to_csv("train_embeddings.csv", index=False)
test_df.to_csv("test_embeddings.csv", index=False)


Train shape: (1876, 34), Labels: (2207,)
Test shape: (462, 34), Labels: (552,)


In [51]:
import pandas as pd

# Load Train & Test Cohorts
train_cohort = pd.read_csv("cohort_train.csv")
test_cohort = pd.read_csv("cohort_test.csv")

# Load BERTopic Probabilities (Make sure this file contains study_id + topic columns)
topic_prob_df = pd.read_csv("bertopic_study_probabilities.csv")  

# Merge with Training Cohort
train_df = train_cohort.merge(topic_prob_df, on="study_id", how="inner")

# Merge with Testing Cohort
test_df = test_cohort.merge(topic_prob_df, on="study_id", how="inner")

# Save the final datasets
train_df.to_csv("train_bertopic_probabilities.csv", index=False)
test_df.to_csv("test_bertopic_probabilities.csv", index=False)

# Print shapes to verify
print(f"✅ Train dataset shape: {train_df.shape}")
print(f"✅ Test dataset shape: {test_df.shape}")

✅ Train dataset shape: (1876, 45)
✅ Test dataset shape: (462, 45)


In [52]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

# Load Processed Train & Test Data
train_df = pd.read_csv("train_bertopic_probabilities.csv")
test_df = pd.read_csv("test_bertopic_probabilities.csv")

# Identify Topic Columns (Automatically detect Topic_* columns)
topic_columns = [col for col in train_df.columns if col.startswith("Topic_")]

# Extract Features (Topic Probabilities) and Labels (Y)
X_train = train_df[topic_columns].to_numpy()
X_test = test_df[topic_columns].to_numpy()
y_train = train_df["Y"].values  # Labels
y_test = test_df["Y"].values  # Labels

# Train XGBoost Model
model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Compute Metrics
metrics = {
    "accuracy": accuracy_score(y_test, y_pred),
    "auc": roc_auc_score(y_test, y_pred_proba),
    "f1": f1_score(y_test, y_pred)
}

# Print Performance Metrics
print("\n📊 Model Performance Metrics:")
print(f"✅ Accuracy: {metrics['accuracy']:.4f}")
print(f"✅ AUC-ROC: {metrics['auc']:.4f}")
print(f"✅ F1 Score: {metrics['f1']:.4f}")



📊 Model Performance Metrics:
✅ Accuracy: 0.7857
✅ AUC-ROC: 0.6020
✅ F1 Score: 0.1391
