In [None]:
import pickle

# Read the serialized bundle of preprocessed inputs from disk.
# NOTE(review): pickle.load can execute arbitrary code from a crafted file;
# only load "processed_data.pkl" when it comes from a trusted source.
with open("processed_data.pkl", mode="rb") as pkl_file:
    data = pickle.load(pkl_file)

# Pull out the four entries the rest of the notebook works with.
structured_data_normalized, text_embeddings, target, id_array = (
    data[key] for key in ("structured_data", "text_embeddings", "target", "ids")
)

print("Processed data loaded successfully.")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold
import pandas as pd
import numpy as np

def run_lr_cv_tuned(features, labels, id_array, n_splits=5):
    """
    Run K-Fold cross-validation with a grid-searched LogisticRegression.

    For every outer fold a GridSearchCV (3 inner folds, ROC-AUC scoring) is
    fitted on the training split only; its refitted best estimator then scores
    the held-out split. Each sample therefore gets exactly one out-of-fold
    prediction.

    Parameters
    ----------
    features : array-like
        Feature matrix with one row per sample. Must support row selection
        with an integer index array (e.g. a NumPy array).
    labels : array-like
        One label per row of ``features``. A pandas Series, NumPy array or
        list is accepted; values are taken positionally.
    id_array : array-like
        One identifier per row of ``features``, taken positionally.
    n_splits : int, default 5
        Number of outer folds.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``True_Label`` and ``Predicted_Probability`` (the
        second column of ``predict_proba``). Rows are emitted fold by fold,
        not in the original sample order.

    Raises
    ------
    ValueError
        If ``labels`` or ``id_array`` do not have one entry per feature row.
    """
    # Coerce to NumPy so positional indexing works for Series, lists and arrays
    # alike (the previous `.iloc` / fancy-index calls only worked for a pandas
    # object and an ndarray respectively).
    labels_arr = np.asarray(labels)
    ids_arr = np.asarray(id_array)

    # np.shape falls back to the object's own `.shape`, so this also works for
    # inputs that do not define len().
    n_samples = np.shape(features)[0]
    if len(labels_arr) != n_samples or len(ids_arr) != n_samples:
        raise ValueError(
            "features, labels and id_array must have the same number of samples "
            f"(got {n_samples}, {len(labels_arr)} and {len(ids_arr)})."
        )

    # Define hyperparameter grid.
    param_grid = {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],  # l1 may require a different solver.
        'solver': ['lbfgs'] # Default solver for L2.
    }

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    all_ids = []
    all_true_labels = []
    all_pred_probs = []

    for train_idx, val_idx in kf.split(features):
        X_train = features[train_idx]
        X_val = features[val_idx]
        y_train = labels_arr[train_idx]
        y_val = labels_arr[val_idx]

        # Hyperparameters are tuned on the training split only, so the
        # validation split never influences model selection.
        base_lr = LogisticRegression(random_state=42, max_iter=1000)
        grid_search = GridSearchCV(base_lr, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_lr = grid_search.best_estimator_
        pred_probs = best_lr.predict_proba(X_val)[:, 1]

        all_ids.extend(ids_arr[val_idx])
        all_true_labels.extend(y_val)
        all_pred_probs.extend(pred_probs)

    results_df = pd.DataFrame({
        'ID': all_ids,
        'True_Label': all_true_labels,
        'Predicted_Probability': all_pred_probs
    })
    return results_df

# Sample identifiers come from the loaded pickle bundle (`id_array`). The
# previous `df['ID'].values` referenced a `df` that is never created in this
# notebook, so the cell failed with NameError on a fresh kernel.
# NOTE(review): assumes data["ids"] is aligned row-for-row with the feature
# matrices — confirm against the preprocessing step.
sample_ids = np.asarray(id_array)

# For Structured data.
lr_results_structured = run_lr_cv_tuned(structured_data_normalized, target, sample_ids, n_splits=5)
lr_results_structured.to_csv("lr_results_structured.csv", index=False)
print("Tuned Logistic Regression Structured results saved as 'lr_results_structured.csv'.")

# For Text data.
lr_results_text = run_lr_cv_tuned(text_embeddings, target, sample_ids, n_splits=5)
lr_results_text.to_csv("lr_results_unstructured_gpt2.csv", index=False)
print("Tuned Logistic Regression Text results saved as 'lr_results_unstructured_gpt2.csv'.")

# For Combined data, concatenate structured and text features column-wise.
combined_features_lr = np.concatenate((structured_data_normalized, text_embeddings), axis=1)
lr_results_combined = run_lr_cv_tuned(combined_features_lr, target, sample_ids, n_splits=5)
lr_results_combined.to_csv("lr_results_all_gpt2.csv", index=False)
print("Tuned Logistic Regression Combined results saved as 'lr_results_all_gpt2.csv'.")


In [None]:
import pandas as pd
from sklearn.metrics import roc_curve, auc, confusion_matrix, accuracy_score, precision_score, recall_score
import numpy as np
import matplotlib.pyplot as plt
import time

# Load the CSV files saved from CV.
file1 = "lr_results_structured.csv"
file2 = "lr_results_unstructured_gpt2.csv"
file3 = "lr_results_all_gpt2.csv"

df_structured_lr = pd.read_csv(file1)
df_text_lr = pd.read_csv(file2)
df_combined_lr = pd.read_csv(file3)

# Merge the datasets on the 'ID' column.
# validate='one_to_one' makes pandas raise MergeError if an ID occurs more than
# once on either side; without it duplicated IDs would silently multiply rows
# and distort every metric computed below.
# After the first merge both overlapping columns carry a suffix, so the second
# merge has no name clashes and the text-model columns keep their plain names
# ('True_Label', 'Predicted_Probability').
merged_lr_df = (
    df_combined_lr
    .merge(df_structured_lr, on='ID', suffixes=('_combined', '_structured'), validate='one_to_one')
    .merge(df_text_lr, on='ID', suffixes=('', '_text'), validate='one_to_one')
)

# Copy the combined-model true labels into a single reference column.
merged_lr_df['New_True_Label'] = merged_lr_df['True_Label_combined']

# Extract true labels and predicted probabilities.
y_true_lr = merged_lr_df['New_True_Label']
y_pred_comb_lr = merged_lr_df['Predicted_Probability_combined']      # Combined results.
y_pred_struct_lr = merged_lr_df['Predicted_Probability_structured']  # Structured-only.
y_pred_text_lr = merged_lr_df['Predicted_Probability']               # Unstructured-only (unsuffixed, see above).

# Function to find the optimal cutoff.
def find_optimal_cutoff(y_true, y_pred_prob):
    """Pick the ROC threshold whose operating point is nearest to (FPR=0, TPR=1).

    Returns a 4-tuple ``(optimal_threshold, fpr, tpr, thresholds)``; the last
    three entries are the arrays produced by ``roc_curve``.
    """
    roc_fpr, roc_tpr, roc_thresholds = roc_curve(y_true, y_pred_prob)
    # Euclidean distance of every ROC point from the top-left corner.
    miss_rate = 1 - roc_tpr
    corner_distance = np.sqrt(np.square(miss_rate) + np.square(roc_fpr))
    best_pos = corner_distance.argmin()
    return roc_thresholds[best_pos], roc_fpr, roc_tpr, roc_thresholds

# Per model: optimal cut-off plus the ROC coordinates (reused for plotting).
# The fourth return value (the raw threshold array) is discarded.
opt_thresh_comb_lr, fpr_comb_lr, tpr_comb_lr, _ = find_optimal_cutoff(
    y_true=y_true_lr, y_pred_prob=y_pred_comb_lr
)
opt_thresh_struct_lr, fpr_struct_lr, tpr_struct_lr, _ = find_optimal_cutoff(
    y_true=y_true_lr, y_pred_prob=y_pred_struct_lr
)
opt_thresh_text_lr, fpr_text_lr, tpr_text_lr, _ = find_optimal_cutoff(
    y_true=y_true_lr, y_pred_prob=y_pred_text_lr
)

# Function to calculate metrics.
def calculate_metrics(y_true, y_pred_prob, threshold):
    """Binarize probabilities at ``threshold`` and score them against ``y_true``.

    Parameters
    ----------
    y_true : array-like
        True labels, encoded as 0 (negative) and 1 (positive).
    y_pred_prob : array-like
        Predicted scores/probabilities for the positive class.
    threshold : float
        Scores greater than or equal to this value are predicted as 1.

    Returns
    -------
    tuple
        ``(accuracy, precision, sensitivity, specificity)``. Specificity is
        NaN when ``y_true`` contains no negative samples.
    """
    # Vectorized thresholding; a score equal to the cut-off counts as positive.
    y_pred = (np.asarray(y_pred_prob) >= threshold).astype(int)

    # Pinning labels keeps the matrix 2x2 even when only one class shows up in
    # y_true/y_pred; otherwise a 1x1 matrix would break the unpacking below.
    # Row 0 holds the true-negative class: [TN, FP].
    tn, fp = confusion_matrix(y_true, y_pred, labels=[0, 1])[0]

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    sensitivity = recall_score(y_true, y_pred)  # Sensitivity = Recall.

    # Specificity is undefined without negatives; report NaN explicitly rather
    # than relying on a 0/0 division.
    negatives = tn + fp
    specificity = tn / negatives if negatives else float("nan")
    return accuracy, precision, sensitivity, specificity

# Score every model at its own optimal cut-off.
# Each result is a tuple: (accuracy, precision, sensitivity, specificity).
metrics_comb_lr = calculate_metrics(
    y_true=y_true_lr, y_pred_prob=y_pred_comb_lr, threshold=opt_thresh_comb_lr
)
metrics_struct_lr = calculate_metrics(
    y_true=y_true_lr, y_pred_prob=y_pred_struct_lr, threshold=opt_thresh_struct_lr
)
metrics_text_lr = calculate_metrics(
    y_true=y_true_lr, y_pred_prob=y_pred_text_lr, threshold=opt_thresh_text_lr
)

# Human-readable summary, one threshold line and one metrics line per model.
print(f'Optimal Threshold for Combined Data: {opt_thresh_comb_lr}')
print(f'Metrics: Accuracy={metrics_comb_lr[0]}, Precision={metrics_comb_lr[1]}, Sensitivity={metrics_comb_lr[2]}, Specificity={metrics_comb_lr[3]}')
print(f'Optimal Threshold for Structured Data: {opt_thresh_struct_lr}')
print(f'Metrics: Accuracy={metrics_struct_lr[0]}, Precision={metrics_struct_lr[1]}, Sensitivity={metrics_struct_lr[2]}, Specificity={metrics_struct_lr[3]}')
print(f'Optimal Threshold for Unstructured Data: {opt_thresh_text_lr}')
print(f'Metrics: Accuracy={metrics_text_lr[0]}, Precision={metrics_text_lr[1]}, Sensitivity={metrics_text_lr[2]}, Specificity={metrics_text_lr[3]}')

# Persist the merged per-sample predictions.
merged_lr_df.to_csv("merged_lr_cv_results.csv", index=False)
print("Merged LR CV results saved as 'merged_lr_cv_results.csv'.")

# Assemble the metrics table: one row per model, one column per metric.
# The metric columns are generated from their position in the metrics tuples.
results_lr = {
    'Model': ['Combined Data', 'Structured Data', 'Unstructured Data'],
    'Optimal Cutoff': [opt_thresh_comb_lr, opt_thresh_struct_lr, opt_thresh_text_lr],
    **{
        metric_name: [metrics_comb_lr[pos], metrics_struct_lr[pos], metrics_text_lr[pos]]
        for pos, metric_name in enumerate(('Accuracy', 'Precision', 'Sensitivity', 'Specificity'))
    },
}
results_lr_df = pd.DataFrame(results_lr)
print(results_lr_df)
results_lr_df.to_csv("lr_cv_metrics_results.csv", index=False)
print("LR CV metrics results saved as 'lr_cv_metrics_results.csv'.")

# Draw the three ROC curves on one axes, with the AUC shown in each legend entry.
auc_comb_lr = auc(fpr_comb_lr, tpr_comb_lr)
auc_struct_lr = auc(fpr_struct_lr, tpr_struct_lr)
auc_text_lr = auc(fpr_text_lr, tpr_text_lr)

fig, ax = plt.subplots()
ax.plot(fpr_comb_lr, tpr_comb_lr, color='red', lw=2, label=f'Combined (AUC = {auc_comb_lr:.3f})')
ax.plot(fpr_struct_lr, tpr_struct_lr, color='blue', lw=2, label=f'Structured (AUC = {auc_struct_lr:.3f})')
ax.plot(fpr_text_lr, tpr_text_lr, color='green', lw=2, label=f'Unstructured (AUC = {auc_text_lr:.3f})')
# Diagonal reference line (TPR == FPR).
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curves for Logistic Regression Models',
)
ax.legend(loc="lower right")
fig.savefig("lr_ROC_three_models.png", dpi=900)
plt.show()
