In [None]:
import pickle
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold
import os

# Read the preprocessed bundle back from disk.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# this is only safe when the pickle comes from a trusted source.
with open(".../processed_data.pkl", "rb") as f:
    data = pickle.load(f)

# Pull the four entries the rest of the notebook relies on out of the bundle.
structured_data, text_embeddings, target, id_array = (
    data[key]
    for key in ("structured_data", "text_embeddings", "target", "ids")
)

def _as_positional(values):
    """Return *values* in a form where ``values[idx]`` selects rows by position."""
    if isinstance(values, (pd.DataFrame, pd.Series)):
        # Plain ``[]`` on pandas objects is label/column based; go through the
        # underlying array so integer fold indices always mean row positions.
        return values.to_numpy()
    if hasattr(values, "shape"):
        # NumPy arrays (and other array-likes exposing ``shape``) are left
        # untouched so existing callers see no change.
        return values
    return np.asarray(values)


# Run K-Fold cross-validation with a Gradient Boosting Classifier
def run_gbc_cv(features, labels, id_array, n_splits=5):
    """Collect out-of-fold GBC probabilities for every sample.

    A fresh ``GradientBoostingClassifier(random_state=42)`` with otherwise
    default parameters is fitted on each training split of a shuffled
    ``KFold`` (``random_state=42``) and used to score the held-out split.

    Args:
        features: Feature matrix, one row per sample. NumPy arrays, pandas
            DataFrames and nested sequences are accepted.
        labels: Target values aligned with ``features``. A pandas Series, a
            NumPy array or a plain sequence are all accepted.
        id_array: Sample identifiers aligned with ``features``.
        n_splits: Number of folds.

    Returns:
        pandas.DataFrame with columns ``ID``, ``True_Label`` and
        ``Predicted_Probability``. Rows are ordered fold by fold, not in the
        original sample order.

    Raises:
        ValueError: If ``features``, ``labels`` and ``id_array`` do not have
            the same number of rows.
    """
    # The fold indices produced by KFold are positions, so every input must be
    # indexed positionally. Mixing ``.iloc`` for labels with plain ``[]`` for
    # the other inputs breaks as soon as a caller passes a different container.
    features = _as_positional(features)
    labels = _as_positional(labels)
    id_array = _as_positional(id_array)

    n_samples = features.shape[0]
    if labels.shape[0] != n_samples or id_array.shape[0] != n_samples:
        raise ValueError(
            "features, labels and id_array must have the same number of rows "
            f"(got {n_samples}, {labels.shape[0]} and {id_array.shape[0]})"
        )

    # NOTE(review): KFold is not stratified; with a rare class a training fold
    # could end up single-class, in which case predict_proba has no column 1.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_ids = []
    all_true_labels = []
    all_pred_probs = []

    for train_idx, val_idx in kf.split(features):
        # Initialize the Gradient Boosting Classifier with default parameters
        model = GradientBoostingClassifier(random_state=42)
        model.fit(features[train_idx], labels[train_idx])

        # Column 1 holds the probability of ``model.classes_[1]``.
        pred_probs = model.predict_proba(features[val_idx])[:, 1]

        all_ids.extend(id_array[val_idx])
        all_true_labels.extend(labels[val_idx])
        all_pred_probs.extend(pred_probs)

    return pd.DataFrame({
        'ID': all_ids,
        'True_Label': all_true_labels,
        'Predicted_Probability': all_pred_probs,
    })

def _cv_and_export(features, csv_name, description):
    """Run 5-fold GBC CV on *features*, write the result to *csv_name* and report it."""
    fold_results = run_gbc_cv(features, target, id_array, n_splits=5)
    fold_results.to_csv(csv_name, index=False)
    print(f"GBC CV results for {description} saved as '{csv_name}'.")
    return fold_results


# Structured features on their own
results_structured = _cv_and_export(
    structured_data, "gbc_results_structured.csv", "structured data"
)

# Text embeddings on their own
results_text = _cv_and_export(
    text_embeddings, "gbc_results_unstructured_gpt2.csv", "text embeddings"
)

# Both modalities side by side: structured columns first, then the embeddings
combined_features = np.concatenate((structured_data, text_embeddings), axis=1)
results_combined = _cv_and_export(
    combined_features, "gbc_results_all_gpt2.csv", "combined data"
)


In [None]:
import pandas as pd
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score, precision_score, recall_score
import numpy as np
import matplotlib.pyplot as plt
import time

# Start the wall-clock timer for this cell: the execution-time report printed
# at the end of the cell computes `end_time - start_time`.
start_time = time.time()

# File names (only filenames, no directory path)
file_structured = "gbc_results_structured.csv"
file_text = "gbc_results_unstructured_gpt2.csv"
file_combined = "gbc_results_all_gpt2.csv"

# Load the CSV files
df_structured = pd.read_csv(file_structured)
df_text = pd.read_csv(file_text)
df_combined = pd.read_csv(file_combined)

# Merge the three datasets on the "ID" column.
# The first merge suffixes the clashing columns with '_combined' / '_structured'.
# In the second merge nothing clashes any more (the left-hand columns are
# already suffixed), so the '_text' suffix is never applied and the text-only
# columns keep their bare names 'True_Label' and 'Predicted_Probability'.
# NOTE(review): assumes each ID occurs once per file; duplicated IDs would
# multiply rows in the merged frame - confirm against the data.
merged_df = df_combined.merge(df_structured, on='ID', suffixes=('_combined', '_structured'))\
                       .merge(df_text, on='ID', suffixes=('', '_text'))

# Copy the combined run's true labels into a dedicated reference column
merged_df['New_True_Label'] = merged_df['True_Label_combined']

# Extract true labels and predicted probabilities
y_true = merged_df['New_True_Label']
y_pred_comb = merged_df['Predicted_Probability_combined']      # Combined results
y_pred_struct = merged_df['Predicted_Probability_structured']  # Structured-only results
y_pred_text = merged_df['Predicted_Probability']               # Unstructured-only results

# Pick the ROC operating point nearest the ideal corner (FPR = 0, TPR = 1)
def find_optimal_cutoff(y_true, y_pred_prob):
    """Return the score threshold whose ROC point is closest to (0, 1).

    Args:
        y_true: Ground-truth binary labels.
        y_pred_prob: Predicted scores or probabilities for the positive class.

    Returns:
        Tuple ``(optimal_threshold, fpr, tpr, thresholds)`` where the last
        three items are the arrays returned by ``roc_curve``.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_pred_prob)

    # Euclidean distance of every ROC point from the top-left corner.
    miss_rate = 1 - tpr
    corner_distance = np.sqrt(miss_rate**2 + fpr**2)

    # argmin keeps the first index when several points tie.
    nearest = np.argmin(corner_distance)
    return thresholds[nearest], fpr, tpr, thresholds

# Derive the optimal threshold and ROC coordinates for each prediction set,
# in the order combined -> structured -> text. The thresholds array returned
# as the fourth item is not needed here and lands in `_`.
(
    (opt_thresh_comb, fpr_comb, tpr_comb, _),
    (opt_thresh_struct, fpr_struct, tpr_struct, _),
    (opt_thresh_text, fpr_text, tpr_text, _),
) = [
    find_optimal_cutoff(y_true, scores)
    for scores in (y_pred_comb, y_pred_struct, y_pred_text)
]

# Compute threshold-dependent evaluation metrics
def calculate_metrics(y_true, y_pred_prob, threshold):
    """Binarise probabilities at *threshold* and score them against *y_true*.

    Args:
        y_true: Ground-truth binary labels encoded as 0 / 1.
        y_pred_prob: Predicted probabilities for the positive class.
        threshold: Samples with probability ``>= threshold`` are predicted 1.

    Returns:
        Tuple ``(accuracy, precision, sensitivity, specificity)``.
        Specificity is ``nan`` when ``y_true`` contains no negative samples.
    """
    # Vectorised equivalent of "1 if prob >= threshold else 0" per element.
    y_pred = (np.asarray(y_pred_prob) >= threshold).astype(int)

    # labels=[0, 1] pins the matrix to 2x2 even when a class is absent from
    # both y_true and y_pred; without it the matrix collapses to 1x1 and the
    # true-negative / false-positive counts cannot be read out.
    tn, fp = confusion_matrix(y_true, y_pred, labels=[0, 1])[0]

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    sensitivity = recall_score(y_true, y_pred)  # Sensitivity equals recall

    # Specificity is undefined without negatives; report nan explicitly rather
    # than relying on a 0/0 division warning.
    negatives = tn + fp
    specificity = tn / negatives if negatives else float("nan")
    return accuracy, precision, sensitivity, specificity

# Score every modality at its own optimal cutoff
metrics_comb, metrics_struct, metrics_text = (
    calculate_metrics(y_true, scores, cutoff)
    for scores, cutoff in (
        (y_pred_comb, opt_thresh_comb),
        (y_pred_struct, opt_thresh_struct),
        (y_pred_text, opt_thresh_text),
    )
)

model_names = ['Combined Data', 'Structured Data', 'Unstructured Data']
model_cutoffs = [opt_thresh_comb, opt_thresh_struct, opt_thresh_text]
model_metrics = [metrics_comb, metrics_struct, metrics_text]

# Report the cutoff and the four metrics, one modality after another
for report_name, report_cutoff, report_values in zip(model_names, model_cutoffs, model_metrics):
    acc_value, prec_value, sens_value, spec_value = report_values
    print(f"Optimal Threshold for {report_name}: {report_cutoff}")
    print(f"Metrics: Accuracy={acc_value}, Precision={prec_value}, Sensitivity={sens_value}, Specificity={spec_value}")

# Persist the merged per-sample predictions
merged_df.to_csv("merged_gbc_cv_results.csv", index=False)
print("Merged GBC CV results saved as 'merged_gbc_cv_results.csv'.")

# Assemble the summary table: one row per modality, one column per metric.
# zip(*model_metrics) transposes the per-model tuples into per-metric columns.
results = {'Model': list(model_names), 'Optimal Cutoff': list(model_cutoffs)}
for metric_name, metric_column in zip(
    ('Accuracy', 'Precision', 'Sensitivity', 'Specificity'),
    zip(*model_metrics),
):
    results[metric_name] = list(metric_column)
results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv("gbc_cv_metrics_results.csv", index=False)
print("GBC CV metrics results saved as 'gbc_cv_metrics_results.csv'.")

# Draw the three ROC curves on a shared set of axes
plt.figure()
for curve_fpr, curve_tpr, curve_color, curve_label in (
    (fpr_comb, tpr_comb, 'red', 'Combined'),
    (fpr_struct, tpr_struct, 'blue', 'Structured'),
    (fpr_text, tpr_text, 'green', 'Unstructured'),
):
    curve_auc = auc(curve_fpr, curve_tpr)
    plt.plot(curve_fpr, curve_tpr, color=curve_color, lw=2, label=f'{curve_label} (AUC = {curve_auc:.3f})')
# Diagonal chance line for reference
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for GBC Models')
plt.legend(loc="lower right")
# Save before show(): the figure is written to disk first, then displayed
plt.savefig("gbc_ROC_three_models.png", dpi=900)
plt.show()

# Elapsed wall-clock time; relies on `start_time` having been recorded earlier
end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"Total execution time: {elapsed_seconds:.2f} seconds")
