In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [None]:
df = pd.read_csv("data/individual_features_chagas.csv")

# Step 1: Split patient_ids
# Ids and labels are taken from ONE groupby result so that position i refers
# to the same patient in both arrays. Previously the ids came from
# `unique()` (first-appearance order) and the labels from a default groupby
# (sorted keys), which only line up when the CSV is already sorted by
# patient_id; otherwise `stratify` received labels of the wrong patients.
# sort=False keeps the first-appearance order that `unique()` produced.
# NOTE(review): `.first()` assumes class_1 is constant within a patient — confirm.
labels_by_patient = df.groupby('patient_id', sort=False)['class_1'].first()
patient_ids = labels_by_patient.index.values
patient_labels = labels_by_patient.values

# Split at patient level so that no patient contributes rows to both sets.
train_ids, test_ids = train_test_split(
    patient_ids, test_size=0.3, stratify=patient_labels, random_state=42
)

# Step 2: Filter rows by patient_ids
train_df = df[df['patient_id'].isin(train_ids)].copy()
test_df = df[df['patient_id'].isin(test_ids)].copy()

# Step 3: Extract features and labels
# Everything that is not listed in drop_cols is used as a model feature.
drop_cols = ['patient_id', 'weight', 'Obito_MS', 'Time', 'class_1', 'event']
X_train = train_df.drop(columns=drop_cols).values
y_train = train_df["class_1"].values
groups_train = train_df["patient_id"].values

X_test = test_df.drop(columns=drop_cols).values
y_test = test_df["class_1"].values
groups_test = test_df["patient_id"].values

## Utils


In [None]:
# Re-import the local `utils` module so that edits made to it while the
# kernel is running are picked up before its names are imported below.
from importlib import reload
import utils

reload(utils)

# Project helpers used by the model cells.
# NOTE(review): StratifiedGroupKFold is taken from the local `utils` module
# here, not from sklearn.model_selection — confirm this is intended.
from utils import (
    preprocess,
    find_best_fold,
    calculate_metrics,
    plot_confusion_matrix,
    display_kfold_scores,
    apply_grid_search_grouped,
    extract_params_and_k,
    get_kfold_results_grouped,
    gmean_scorer,
    CorrelationFeatureReducer,
    StratifiedGroupKFold
)


from sklearn.decomposition import PCA

## Constants


In [None]:
# Names shared by the pipelines below and by extract_params_and_k.
MODEL_PREFIX = "clf"        # pipeline step name of the classifier
K_KEY = "select__k"         # grid key for the selector step's `k`
REDUCTION_TYPE = "kbest"    # which entry of SELECTOR_MAP to use

# Available feature-reduction steps, keyed by reduction type.
SELECTOR_MAP = dict(
    kbest=SelectKBest(f_classif),
    corr=CorrelationFeatureReducer(),
    pca=PCA(),
)
SELECTOR = SELECTOR_MAP[REDUCTION_TYPE]

# Seeded 5-fold splitter.
# NOTE(review): `cv` is not referenced by any of the cells shown here.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

## Logistic Regression


In [None]:
# Scale -> select features -> classify; tuned as one pipeline.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("select", SELECTOR),
        ("clf", LogisticRegression(solver="liblinear", max_iter=1000, random_state=42)),
    ]
)

# Candidate feature counts: fixed steps plus "keep every feature".
# The feature count is read from X_train; the previous `X.shape[-1]` referred
# to a name that is never defined in the visible cells (NameError on a fresh
# kernel). Fixed steps that are not smaller than the feature count are
# dropped so the grid never asks for more features than exist and never
# lists the same k twice.
n_features = X_train.shape[-1]
k_candidates = [k for k in (10, 20, 30, 40, 50) if k < n_features] + [n_features]

param_grid = {
    K_KEY: k_candidates,  # same key that extract_params_and_k is given below
    "clf__penalty": ["l1", "l2"],
    "clf__C": [0.001, 0.01, 0.1, 1, 10],
    "clf__class_weight": [None, "balanced"],
}

# Patient ids are passed as groups alongside the F1 scoring choice.
params = apply_grid_search_grouped(
    X_train, y_train, estimator=pipeline, param_grid=param_grid, scoring='f1', groups=groups_train
)

best_params, best_k = extract_params_and_k(params, MODEL_PREFIX, K_KEY)

print(f"Best params: {best_params}")
print(f"Best k: {best_k}")

In [None]:
# Reduce both splits with the k chosen by the grid search.
# NOTE(review): presumably `preprocess` mirrors the pipeline's scale + select
# steps — confirm in utils.
X_train_, X_test_ = preprocess(
    X_train,
    X_test,
    y_train,
    k=best_k,
    reduction_type=REDUCTION_TYPE,
)

# Refit logistic regression with the tuned hyper-parameters on the reduced
# training split.
model = LogisticRegression(
    solver="liblinear",
    max_iter=1000,
    random_state=42,
    **best_params,
).fit(X_train_, y_train)

# Segment-level (row-level) evaluation on the held-out patients.
y_pred = model.predict(X_test_)
print("Results from individual segments:")
_ = calculate_metrics(y_test, y_pred, display=True)

In [None]:
# Pair every test segment with its patient id, prediction and true label in
# a scratch frame, so test_df itself is left untouched (y_pred / y_test are
# row-aligned with test_df).
segment_votes = test_df[["patient_id"]].assign(y_pred=y_pred, y_true=y_test)

# One group per patient.
grouped = segment_votes.groupby("patient_id")

# Strict majority vote: a patient is predicted positive only when more than
# half of their segments are predicted positive (ties count as 0).
# For 0/1 predictions this matches thresholding the per-patient mean at 0.5.
patient_preds = (
    grouped["y_pred"]
    .agg(lambda votes: int(votes.sum() > (len(votes) // 2)))
    .values
)
patient_labels = grouped["y_true"].first().values

print("Results per-patient:")
_ = calculate_metrics(patient_labels, patient_preds, display=True)

In [None]:
# Confusion matrix of the per-patient (majority-vote) predictions for logistic regression.
plot_confusion_matrix(patient_labels, patient_preds, title="Confusion Matrix L.R.")

In [None]:
# Fresh, unfitted estimator carrying the tuned hyper-parameters.
kfold_model = LogisticRegression(
    solver="liblinear", max_iter=1000, random_state=42, **best_params
)

# Grouped k-fold evaluation on the training patients only.
metrics = get_kfold_results_grouped(
    model=kfold_model,
    X=X_train,
    y=y_train,
    groups=groups_train,
    best_k=best_k,
    preprocess_reduction_type=REDUCTION_TYPE,
)

display_kfold_scores(metrics)

## SVC


In [None]:
# Scale -> select features -> classify; tuned as one pipeline.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("select", SELECTOR),
        ("clf", SVC(random_state=42)),
    ]
)

# Candidate feature counts: fixed steps plus "keep every feature".
# The feature count is read from X_train; the previous `X.shape[-1]` referred
# to a name that is never defined in the visible cells (NameError on a fresh
# kernel). Fixed steps that are not smaller than the feature count are
# dropped so the grid never asks for more features than exist and never
# lists the same k twice.
n_features = X_train.shape[-1]
k_candidates = [k for k in (10, 20, 30, 40, 50) if k < n_features] + [n_features]

param_grid = {
    K_KEY: k_candidates,  # same key that extract_params_and_k is given below
    "clf__kernel": ["linear", "poly", "rbf", "sigmoid"],
    "clf__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "clf__gamma": ["scale", "auto"],
    "clf__class_weight": [None, "balanced"],
}

# Patient ids are passed as groups alongside the F1 scoring choice.
params = apply_grid_search_grouped(
    X_train, y_train, estimator=pipeline, param_grid=param_grid, scoring='f1', groups=groups_train
)

best_params, best_k = extract_params_and_k(params, MODEL_PREFIX, K_KEY)

print(f"Best params: {best_params}")
print(f"Best k: {best_k}")

In [None]:
# Reduce both splits with the k chosen by the grid search.
# NOTE(review): presumably `preprocess` mirrors the pipeline's scale + select
# steps — confirm in utils.
X_train_, X_test_ = preprocess(
    X_train,
    X_test,
    y_train,
    k=best_k,
    reduction_type=REDUCTION_TYPE,
)

# Refit the SVC with the tuned hyper-parameters on the reduced training split.
model = SVC(random_state=42, **best_params).fit(X_train_, y_train)

# Segment-level (row-level) evaluation on the held-out patients.
y_pred = model.predict(X_test_)

print("Results from individual segments:")
_ = calculate_metrics(y_test, y_pred, display=True)

In [None]:
# Pair every test segment with its patient id, prediction and true label in
# a scratch frame, so test_df itself is left untouched (y_pred / y_test are
# row-aligned with test_df).
segment_votes = test_df[["patient_id"]].assign(y_pred=y_pred, y_true=y_test)

# One group per patient.
grouped = segment_votes.groupby("patient_id")

# Strict majority vote: a patient is predicted positive only when more than
# half of their segments are predicted positive (ties count as 0).
# For 0/1 predictions this matches thresholding the per-patient mean at 0.5.
patient_preds = (
    grouped["y_pred"]
    .agg(lambda votes: int(votes.sum() > (len(votes) // 2)))
    .values
)
patient_labels = grouped["y_true"].first().values

print("Results per-patient:")
_ = calculate_metrics(patient_labels, patient_preds, display=True)

In [None]:
# Confusion matrix of the per-patient (majority-vote) predictions for the SVC model.
plot_confusion_matrix(patient_labels, patient_preds, title="Confusion Matrix SVM")

In [None]:
# Fresh, unfitted estimator carrying the tuned hyper-parameters.
kfold_model = SVC(random_state=42, **best_params)

# Grouped k-fold evaluation on the training patients only.
metrics = get_kfold_results_grouped(
    model=kfold_model,
    X=X_train,
    y=y_train,
    groups=groups_train,
    best_k=best_k,
    preprocess_reduction_type=REDUCTION_TYPE,
)

display_kfold_scores(metrics)

## KNN


In [None]:
# Scale -> select features -> classify; tuned as one pipeline.
pipeline = Pipeline(
    [
        ("scaler", StandardScaler()),
        ("select", SELECTOR),
        ("clf", KNeighborsClassifier()),
    ]
)

# Candidate feature counts: fixed steps plus "keep every feature".
# The feature count is read from X_train; the previous `X.shape[-1]` referred
# to a name that is never defined in the visible cells (NameError on a fresh
# kernel). Fixed steps that are not smaller than the feature count are
# dropped so the grid never asks for more features than exist and never
# lists the same k twice.
n_features = X_train.shape[-1]
k_candidates = [k for k in (10, 20, 30, 40, 50) if k < n_features] + [n_features]

param_grid = {
    K_KEY: k_candidates,  # same key that extract_params_and_k is given below
    "clf__n_neighbors": [3, 5, 7, 9, 11, 13],
    "clf__weights": ["uniform", "distance"],
    "clf__p": [1, 2, 3],
}

# Patient ids are passed as groups alongside the F1 scoring choice.
params = apply_grid_search_grouped(
    X_train, y_train, estimator=pipeline, param_grid=param_grid, scoring='f1', groups=groups_train
)

best_params, best_k = extract_params_and_k(params, MODEL_PREFIX, K_KEY)

print(f"Best params: {best_params}")
print(f"Best k: {best_k}")

In [None]:
# Reduce both splits with the k chosen by the grid search.
# NOTE(review): presumably `preprocess` mirrors the pipeline's scale + select
# steps — confirm in utils.
X_train_, X_test_ = preprocess(
    X_train,
    X_test,
    y_train,
    k=best_k,
    reduction_type=REDUCTION_TYPE,
)

# Refit KNN with the tuned hyper-parameters on the reduced training split.
model = KNeighborsClassifier(**best_params).fit(X_train_, y_train)

# Segment-level (row-level) evaluation on the held-out patients.
y_pred = model.predict(X_test_)

print("Results from individual segments:")
_ = calculate_metrics(y_test, y_pred, display=True)

In [None]:
# Pair every test segment with its patient id, prediction and true label in
# a scratch frame, so test_df itself is left untouched (y_pred / y_test are
# row-aligned with test_df).
segment_votes = test_df[["patient_id"]].assign(y_pred=y_pred, y_true=y_test)

# One group per patient.
grouped = segment_votes.groupby("patient_id")

# Strict majority vote: a patient is predicted positive only when more than
# half of their segments are predicted positive (ties count as 0).
# For 0/1 predictions this matches thresholding the per-patient mean at 0.5.
patient_preds = (
    grouped["y_pred"]
    .agg(lambda votes: int(votes.sum() > (len(votes) // 2)))
    .values
)
patient_labels = grouped["y_true"].first().values

print("Results per-patient:")
_ = calculate_metrics(patient_labels, patient_preds, display=True)

In [None]:
# Confusion matrix of the per-patient (majority-vote) predictions for KNN.
# The title is passed by keyword, as in the L.R. and SVM calls, so the call
# does not depend on the helper's positional parameter order.
plot_confusion_matrix(patient_labels, patient_preds, title="Confusion matrix KNN")

In [None]:
# Fresh, unfitted estimator carrying the tuned hyper-parameters.
kfold_model = KNeighborsClassifier(**best_params)

# Grouped k-fold evaluation on the training patients only.
metrics = get_kfold_results_grouped(
    model=kfold_model,
    X=X_train,
    y=y_train,
    groups=groups_train,
    best_k=best_k,
    preprocess_reduction_type=REDUCTION_TYPE,
)

display_kfold_scores(metrics)