# No data transformation, use ady_dna as addtional features

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression

# Load the cleaned dataset
data_cleaned = pd.read_csv('../../../Data_processing/Outputs/train.csv')
#optional
data_cleaned = data_cleaned.loc[data_cleaned['usubjid'].isin(data_cleaned[data_cleaned['label'] == 1]["usubjid"].unique())]

data_cleaned.dropna(inplace=True)

# Preprocess the data
features = ['aval_AlloMap', 'aval_AlloSure', 'ady_dna']
target = 'label'

# Group data by 'usubjid' to maintain sequence structure
grouped_data = data_cleaned.groupby('usubjid')

# Shuffle and sort grouped data to maintain sequence order
shuffled_df = grouped_data.sample(frac=1, random_state=42)
shuffled_df = shuffled_df.sort_values(['usubjid', 'ady_dna'])

# Reset the index of the shuffled dataframe
grouped_data = shuffled_df.reset_index(drop=True)
grouped_data = grouped_data.groupby('usubjid')

X_grouped = [group[features].values for name, group in grouped_data]
y_grouped = [group[target].values for name, group in grouped_data]

# Calculate the split index for 80%/20%
split_index = int(0.80 * len(X_grouped))

# Split the sequences into training and testing sets
X_train_grouped = X_grouped[:split_index]
X_test_grouped = X_grouped[split_index:]
y_train_grouped = y_grouped[:split_index]
y_test_grouped = y_grouped[split_index:]

# Concatenate the sequences to form the training and testing sets
X_train = np.concatenate(X_train_grouped)
X_test = np.concatenate(X_test_grouped)
y_train = np.concatenate(y_train_grouped)
y_test = np.concatenate(y_test_grouped)


# Initialize and train the Logistic Regression model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train, y_train)

# Predict probabilities on the test set
y_pred_probs = logistic_model.predict_proba(X_test)[:, 1]  # Probability of the positive class

# Predict the class labels on the test set
y_pred = logistic_model.predict(X_test)
predicted_labels = logistic_model.predict(X_test)


# Generate and print classification metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Calculate AUC-ROC
auc = roc_auc_score(y_test, y_pred_probs)
print(f"AUC-ROC: {auc}")

accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy}")


Classification Report:
              precision    recall  f1-score   support

         0.0       0.65      0.99      0.78       302
         1.0       0.57      0.02      0.05       167

    accuracy                           0.65       469
   macro avg       0.61      0.51      0.41       469
weighted avg       0.62      0.65      0.52       469

Confusion Matrix:
[[299   3]
 [163   4]]
AUC-ROC: 0.6487885156838641
Accuracy: 0.6460554371002132


# Allosure baseline

In [5]:
# Calculate AUC-ROC
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])
auc = roc_auc_score(y, X[:, 1])
print(f"AUC-ROC: {auc}")

#create a threshold for allosure
predicted_allosure = X_test[:, 1]
predicted_labels = (predicted_allosure > 0.1).astype(int)
# Evaluate the performance
print("Classification Report:")
print(classification_report(y_test, predicted_labels))
print("Confusion Matrix:")
print(confusion_matrix(y_test, predicted_labels))

AUC-ROC: 0.6380095628358352
Classification Report:
              precision    recall  f1-score   support

         0.0       0.76      0.64      0.70       302
         1.0       0.49      0.63      0.55       167

    accuracy                           0.64       469
   macro avg       0.63      0.64      0.62       469
weighted avg       0.66      0.64      0.64       469

Confusion Matrix:
[[194 108]
 [ 62 105]]


# Allomap baseline

In [3]:
# Calculate AUC-ROC
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])
auc = roc_auc_score(y_test, X_test[:, 0])
print(f"AUC-ROC: {auc}")

#create a threshold for allomap
predicted_allomap = X_test[:, 0]
predicted_labels = (predicted_allomap > 35).astype(int)
# Evaluate the performance
print("Classification Report:")
print(classification_report(y_test, predicted_labels))
print("Confusion Matrix:")
print(confusion_matrix(y_test, predicted_labels))

AUC-ROC: 0.5894436292976961
Classification Report:
              precision    recall  f1-score   support

         0.0       0.67      0.89      0.77       302
         1.0       0.53      0.22      0.31       167

    accuracy                           0.65       469
   macro avg       0.60      0.55      0.54       469
weighted avg       0.62      0.65      0.60       469

Confusion Matrix:
[[270  32]
 [131  36]]
