In [28]:
import numpy as np
import pandas as pd
from pykalman import KalmanFilter
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Read the training split from disk.
data_cleaned = pd.read_csv('../../../Data_processing/Outputs/train.csv')

# Keep only subjects that have at least one positive (label == 1) record,
# then discard every row that contains a missing value in any column.
positive_subjects = data_cleaned.loc[data_cleaned['label'] == 1, 'usubjid'].unique()
data_cleaned = data_cleaned[data_cleaned['usubjid'].isin(positive_subjects)].dropna()

# Column names for the model inputs and the prediction target.
# NOTE(review): the feature matrix built later in this cell selects the columns
# in the order AlloSure, AlloMap, ady_dna, which differs from this list.
target = 'label'
features = ['aval_AlloMap', 'aval_AlloSure', 'ady_dna']

# Permute the rows inside each subject (fixed seed), then order them by subject
# and by ady_dna. Because of the sort, the shuffle can at most influence the
# relative order of rows that tie on (usubjid, ady_dna).
shuffled_df = (
    data_cleaned
    .groupby('usubjid')
    .sample(frac=1, random_state=42)
    .sort_values(['usubjid', 'ady_dna'])
)

# Re-group on a clean 0..n-1 index so that each subject forms one sequence.
grouped_data = shuffled_df.reset_index(drop=True).groupby('usubjid')

# One array per subject: inputs (AlloSure, AlloMap, ady_dna) and labels.
patient_frames = [frame for _subject_id, frame in grouped_data]
X_grouped = [
    frame[['aval_AlloSure', 'aval_AlloMap', "ady_dna"]].values
    for frame in patient_frames
]
y_grouped = [frame['label'].values for frame in patient_frames]


# Subject-level 80% / 20% split: the first 80% of the per-subject sequences
# (in group order) go to training, the remainder to testing.
split_index = int(0.80 * len(X_grouped))

X_train_grouped, X_test_grouped = X_grouped[:split_index], X_grouped[split_index:]
y_train_grouped, y_test_grouped = y_grouped[:split_index], y_grouped[split_index:]

# Stack the per-subject arrays row-wise into flat train / test arrays.
X_train = np.concatenate(X_train_grouped, axis=0)
X_test = np.concatenate(X_test_grouped, axis=0)
y_train = np.concatenate(y_train_grouped, axis=0)
y_test = np.concatenate(y_test_grouped, axis=0)

# No feature scaling is applied here: an earlier MinMaxScaler step over
# `features` is currently disabled.

# Training inputs / labels; only X is passed to the Kalman filter below.
X, y = X_train, y_train

# State and observation spaces both have one dimension per input column,
# and the initial state mean is the zero vector.
n_features = X.shape[1]
kf = KalmanFilter(
    n_dim_state=n_features,
    n_dim_obs=n_features,
    initial_state_mean=np.zeros(n_features),
)

# Fit the filter parameters with 10 EM iterations. X is the concatenation of
# all training subjects' rows, so it is presented to EM as a single sequence.
kf = kf.em(X, n_iter=10)

# Run the fitted filter over the (likewise concatenated) test rows to obtain
# the filtered state means and covariances.
state_means, state_covariances = kf.filter(X_test)


# The first two filtered state components are read as the AlloSure and
# AlloMap estimates (same column order as the model inputs).
predicted_allosure, predicted_allomap = state_means[:, 0], state_means[:, 1]

# Binary prediction: positive when the AlloSure estimate exceeds 0.5.
exceeds_threshold = predicted_allosure > 0.5
predicted_labels = exceeds_threshold.astype(int)

# Alternative rule (disabled): require both estimates to exceed their cut-offs.
# predicted_labels = np.where((predicted_allosure > 0.5) & (predicted_allomap > 30), 1, 0)


# Evaluate the thresholded predictions against the held-out labels.
print("Classification Report:")
print(classification_report(y_test, predicted_labels))

print("Confusion Matrix:")
print(confusion_matrix(y_test, predicted_labels))

# ROC AUC needs a continuous ranking score, not the hard 0/1 labels.
# `prob_scores` was previously never defined in this cell, so a fresh
# "Restart & Run All" raised NameError (or silently reused a value left over
# from another cell). Use the filtered AlloSure estimate, i.e. the quantity
# that is thresholded to produce `predicted_labels`.
# NOTE(review): if the previously reported AUC was computed from a different
# score, the number printed here may change - confirm the intended score.
prob_scores = predicted_allosure
auc = roc_auc_score(y_test, prob_scores)
print(f"AUC: {auc}")

accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy}")

Classification Report:
              precision    recall  f1-score   support

         0.0       0.74      0.83      0.78       302
         1.0       0.61      0.46      0.52       167

    accuracy                           0.70       469
   macro avg       0.67      0.65      0.65       469
weighted avg       0.69      0.70      0.69       469

Confusion Matrix:
[[252  50]
 [ 90  77]]
AUC: 0.6740095967006384
Accuracy: 0.7014925373134329
