In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, matthews_corrcoef, confusion_matrix)

# Cohort 1: read the athlete data, keep only rows whose group label contains
# "post 6 hours" (rows with a missing label are excluded via na=False), then
# remove the identifier columns and renumber the rows from 0.
# NOTE(review): "OrigiinalGroupID" is spelled this way in the code; presumably
# it matches the CSV header -- confirm before renaming.
cohort_1 = pd.read_csv("AthleteTBIData.csv")
cohort_1_filtered = (
    cohort_1[cohort_1["OrigiinalGroupID"].str.contains("post 6 hours", na=False)]
    .drop(columns=['SampleID', 'OrigiinalGroupID'])
    .reset_index(drop=True)
)

# Cohort 2: read the external data, discard eight rows by index label, remove
# the identifier column and renumber the rows from 0.
# NOTE(review): the reason these specific rows are excluded is not recorded
# here -- document the exclusion criterion.
cohort_2 = pd.read_csv("ExternalTBIData.csv")
cohort_2_filtered = (
    cohort_2
    .drop(index=[59, 62, 63, 64, 67, 79, 81, 83])
    .drop(columns=['SampleID'])
    .reset_index(drop=True)
)

# plot correlation matrix
def plot_correlation_matrix(data, cohort_name, top_n=20):
    """Show a heatmap of pairwise correlations for a subset of columns.

    The correlation matrix of ``data`` is computed with ``DataFrame.corr()``.
    Columns are then ranked by the variance of their correlation coefficients
    (i.e. the variance of each column of the correlation matrix, not the
    variance of the raw measurements), and the ``top_n`` highest-ranked
    columns are plotted against each other.

    Args:
        data: DataFrame whose columns are correlated with one another.
        cohort_name: Label inserted into the figure title.
        top_n: Maximum number of columns to include in the heatmap.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    _, ax = plt.subplots(figsize=(12, 8))
    corr = data.corr()

    # Rank columns by how much their correlation coefficients vary and keep
    # the square sub-matrix for the highest-ranked ones.
    coefficient_spread = corr.var()
    selected = coefficient_spread.nlargest(top_n).index
    corr_subset = corr.loc[selected, selected]

    sns.heatmap(corr_subset, annot=False, cmap="coolwarm", linewidths=0.5, ax=ax)
    ax.set_title(f"Correlation Matrix for {cohort_name} Cohort (Top {top_n} Features)")
    plt.show()


# preprocessing and training func
def train_logistic_regression(data, cohort_name, target_col="GroupNo",
                              test_size=0.2, random_state=42):
    """Train and evaluate a logistic-regression classifier on one cohort.

    The data is split into stratified train/test partitions, missing feature
    values are filled with the *training-set* column means, features are
    standardized with a scaler fitted on the training partition only, and a
    ``LogisticRegression`` model is fitted. Test-set metrics are printed.

    The caller's DataFrame is never modified.

    Args:
        data: DataFrame containing the feature columns and ``target_col``.
        cohort_name: Label used in the printed messages.
        target_col: Name of the label column.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed used for the split and for the model.

    Returns:
        The fitted ``LogisticRegression`` model. It expects features
        standardized the same way as during training; the scaler itself is
        not returned.

    Notes:
        Metrics rely on scikit-learn's default positive label and on a
        2x2 confusion matrix -- assumes ``target_col`` is a binary 0/1 label;
        TODO confirm against the data.
    """
    # Rows without a label cannot be used for supervised training, and
    # mean-imputing a class label would produce values that are not classes.
    # Rebinding the local name leaves the caller's frame untouched.
    label_missing = data[target_col].isnull()
    if label_missing.any():
        print(f"Warning: Dropping {int(label_missing.sum())} row(s) with missing {target_col} in {cohort_name} dataset.")
        data = data.loc[~label_missing]

    X = data.drop(columns=[target_col])
    y = data[target_col]

    # Stratified hold-out split (80/20 with the default test_size).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Impute after splitting, using statistics from the training rows only,
    # so that no information from the test rows leaks into preprocessing.
    # fillna() without inplace returns new frames instead of editing `data`.
    if X.isnull().any().any():
        print(f"Warning: Missing values detected in {cohort_name} dataset. Imputing with training-set column mean.")
        train_means = X_train.mean()
        X_train = X_train.fillna(train_means)
        X_test = X_test.fillna(train_means)

    # Standardize features; the scaler is fitted on the training rows only.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit the classifier.
    model = LogisticRegression(random_state=random_state, max_iter=1000)
    model.fit(X_train, y_train)

    # Hard predictions and the predicted probability of the second class
    # in model.classes_.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Performance metrics on the held-out rows.
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_prob)
    mcc = matthews_corrcoef(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    # Guard against division by zero when the test set has no negatives.
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # Report results.
    print(f"Performance for {cohort_name} cohort:")
    print(f"Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall (Sensitivity): {recall:.4f}, ")
    print(f"Specificity: {specificity:.4f}, F1 Score: {f1:.4f}, AUC-ROC: {auc_roc:.4f}, MCC: {mcc:.4f}\n")

    return model

# Fit one classifier per cohort; each call also prints that cohort's
# test-set metrics.
athlete_model = train_logistic_regression(data=cohort_1_filtered, cohort_name="Athlete")
external_model = train_logistic_regression(data=cohort_2_filtered, cohort_name="External")


Performance for Athlete cohort:
Accuracy: 0.5455, Precision: 0.5000, Recall (Sensitivity): 0.6000, 
Specificity: 0.6000, F1 Score: 0.5455, AUC-ROC: 0.4667, MCC: 0.1000

Performance for External cohort:
Accuracy: 0.7500, Precision: 0.6000, Recall (Sensitivity): 0.6000, 
Specificity: 0.8182, F1 Score: 0.6000, AUC-ROC: 0.7818, MCC: 0.4182

