In [1]:
# Standard library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Data Visualization
from pandas.plotting import scatter_matrix

# Preprocessing and Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

# Model and Estimators
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier

# Model Selection and Validation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, classification_report, roc_curve, log_loss

# Imbalanced Dataset Handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.calibration import CalibratedClassifierCV
from scipy.stats import uniform


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# kNN

In [2]:
# Load the training dataset
data = pd.read_csv("NHANES_data_train.csv")

# Specify numeric and categorical columns
numeric_columns = ['Income', 'Age', 'Diastolic', 'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL', 'kidneys_eGFR']
categorical_columns = ['Sex', 'Race', 'Edu', 'Diabetes', 'CurrentSmoker', 'isActive', 'isInsured']

# Define preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', KNNImputer()),  
    ('scaler', StandardScaler()),  
    ('feature_selection', SelectKBest(f_classif, k=10))  
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_columns),
    ('cat', categorical_pipeline, categorical_columns)
])

# Define a pipeline for resampling and classification
model_pipeline = ImPipeline([
    ('preprocessor', preprocessor),
    ('resampling', SMOTE(random_state=42)), 
    ('classifier', KNeighborsClassifier())  
])

def remap_labels(target):
    return 0 if target == 2 else target

targets = data['MI'].apply(remap_labels)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data.drop(['MI'], axis=1), targets, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'classifier__n_neighbors': [11],  
    'classifier__weights': ['distance'],  
    'preprocessor__num__feature_selection__k': [10]  
}

# Perform grid search with cross-validation
clf = GridSearchCV(model_pipeline, param_grid, scoring='roc_auc', cv=5)
clf.fit(X_train, y_train)

# Get the best estimator
best_clf = clf.best_estimator_

# Fit the calibrated classifier on the validation data
X_train_calib, X_val_calib, y_train_calib, y_val_calib = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
calibrated_clf = CalibratedClassifierCV(best_clf, cv='prefit')
calibrated_clf.fit(X_val_calib, y_val_calib)

y_prob = calibrated_clf.predict_proba(X_test)[:, 1]
y_pred = calibrated_clf.predict(X_test)  # Predicting the classes

# Output evaluation metrics
print(f"Best Parameters: {clf.best_params_}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

Best Parameters: {'classifier__n_neighbors': 11, 'classifier__weights': 'distance', 'preprocessor__num__feature_selection__k': 10}
ROC AUC Score: 0.7032812279960569
Accuracy: 0.8333333333333334
Confusion Matrix: 
[[670 119]
 [ 17  10]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.98      0.85      0.91       789
           1       0.08      0.37      0.13        27

    accuracy                           0.83       816
   macro avg       0.53      0.61      0.52       816
weighted avg       0.95      0.83      0.88       816



In [3]:
# Apply the trained model to new data
new_data = pd.read_csv("NHANES_test_data_4_students.csv")
if 'MI' in new_data.columns:
    new_data.drop(columns=['MI'], inplace=True)

# Predict probabilities on the new dataset
new_probabilities = calibrated_clf.predict_proba(new_data.drop(['ParticipantID'], axis=1))[:, 1]

# Create a DataFrame with predictions
predictions_df = pd.DataFrame({
    'ParticipantID': new_data['ParticipantID'],
    'MI_Probability': new_probabilities
})

# Save the predictions to a CSV file
predictions_df.to_csv("kNN_pred.csv", index=False)
print("Participant ID and MI probabilities saved to kNN_pred.csv.")

Participant ID and MI probabilities saved to kNN_pred.csv.


In [4]:
print("Script to create custom true_MI_labels.csv to be used in calculations for the Evaluator, provoded by Andy Markland")
dataset = pd.read_csv("NHANES_data_train.csv")

partition = dataset.sample(frac=0.5)

group_1 = partition[partition['MI'] == 1]
group_2 = partition[partition['MI'] == 2]

count_group_1 = len(group_1)
count_group_2 = len(group_2)

if count_group_1 < count_group_2:
    group_2_downsampled = group_2.sample(n=count_group_1)
    balanced_dataset = pd.concat([group_1, group_2_downsampled])
else:
    group_1_downsampled = group_1.sample(n=count_group_2)
    balanced_dataset = pd.concat([group_1_downsampled, group_2])

balanced_dataset.reset_index(drop=True, inplace=True)

new_data = balanced_dataset
trueLabels =  new_data[['ParticipantID', 'MI']]
trueLabels['MI'] = trueLabels['MI'].replace({2: 0, 1: 1})
print(trueLabels)
if 'MI' in new_data.columns:
    new_data.drop(columns=['MI'], inplace=True)

# Predict probabilities on the new dataset
new_probabilities = calibrated_clf.predict_proba(new_data.drop(['ParticipantID'], axis=1))[:, 1]
print(clf.classes_)

# Create a DataFrame with predictions
predictions_df = pd.DataFrame({
    'ParticipantID': new_data['ParticipantID'],
    'MI_Probability': new_probabilities
})

# Save the predictions to a CSV file
predictions_df.to_csv("kNN_pred_proba_KL.csv", index=False)
trueLabels.to_csv("true_MI_Label_kNN.csv", index=False)
trueLabels.to_csv("true_MI_Label_kNN.csv", index=False)
print("Participant ID and MI probabilities saved to kNN_pred_proba_K.csv with true_MI_Label_kNN.csv.")

Script to create custom true_MI_labels.csv to be used in calculations for the Evaluator, provoded by Andy Markland
     ParticipantID  MI
0             3193   1
1             4022   1
2             1565   1
3             2538   1
4             1637   1
..             ...  ..
155           2462   0
156           3001   0
157           2619   0
158           2692   0
159           2785   0

[160 rows x 2 columns]
[0 1]
Participant ID and MI probabilities saved to kNN_pred_proba_K.csv with true_MI_Label_kNN.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trueLabels['MI'] = trueLabels['MI'].replace({2: 0, 1: 1})


# Logistic Regression

In [5]:
# Load the training dataset
data = pd.read_csv("NHANES_data_train.csv")

print("Takes a while but i could not get it to be any more efficent while being quick")

# Define preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler())])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_columns),
    ('cat', categorical_pipeline, categorical_columns)])

# Adjust the logistic regression classifier for Elastic Net regularization
logistic_regression = LogisticRegression(max_iter=10000, solver='saga', penalty='elasticnet', l1_ratio=0.5, random_state=42)

# Define a pipeline for resampling, logistic regression, and calibration
model_pipeline = ImPipeline([
    ('preprocessor', preprocessor),
    ('resampling', SMOTE(random_state=42)),
    ('classifier', CalibratedClassifierCV(estimator=logistic_regression, cv=5))  # Corrected parameter name
])

# Split the dataset
X = data.drop(['MI'], axis=1)
y = data['MI'].apply(remap_labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter distribution for hyperparameter tuning
param_distributions = {
    'classifier__estimator__C': uniform(0.01, 10),
    'classifier__estimator__l1_ratio': uniform(0, 1)  # Appropriate for 'elasticnet' penalty
}

# Perform randomized search with cross-validation focusing on log loss
clf = RandomizedSearchCV(model_pipeline, param_distributions, n_iter=10, scoring='neg_log_loss', cv=3, verbose=1, random_state=42)
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

print(f"Best Parameters: {clf.best_params_}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")
print(f"Log Loss: {log_loss(y_test, y_prob)}")


Takes a while but i could not get it to be any more efficent while being quick
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'classifier__estimator__C': 0.21584494295802448, 'classifier__estimator__l1_ratio': 0.9699098521619943}
ROC AUC Score: 0.7912970004224757
Accuracy: 0.7610294117647058
Confusion Matrix: 
[[603 186]
 [  9  18]]
Classification Report: 
              precision    recall  f1-score   support

           0       0.99      0.76      0.86       789
           1       0.09      0.67      0.16        27

    accuracy                           0.76       816
   macro avg       0.54      0.72      0.51       816
weighted avg       0.96      0.76      0.84       816

Log Loss: 0.4969409124622444


In [6]:
# Apply the trained model to new data
new_data = pd.read_csv("NHANES_test_data_4_students.csv")
if 'MI' in new_data.columns:
    new_data.drop(columns=['MI'], inplace=True)

# Predict probabilities on the new dataset
new_probabilities = clf.predict_proba(new_data.drop(['ParticipantID'], axis=1))[:, 1]

# Create a DataFrame with predictions
predictions_df = pd.DataFrame({
    'ParticipantID': new_data['ParticipantID'],
    'MI_Probability': new_probabilities
})

# Save the predictions to a CSV file
predictions_df.to_csv("LogisticRegression_pred.csv", index=False)
print("Participant ID and MI probabilities saved to LogisticRegression_pred.csv.")

Participant ID and MI probabilities saved to LogisticRegression_pred.csv.


In [7]:
print("Script to create custom true_MI_labels.csv to be used in calculations for the Evaluator")
dataset = pd.read_csv("NHANES_data_train.csv")

partition = dataset.sample(frac=0.5)

group_1 = partition[partition['MI'] == 1]
group_2 = partition[partition['MI'] == 2]

count_group_1 = len(group_1)
count_group_2 = len(group_2)

if count_group_1 < count_group_2:
    group_2_downsampled = group_2.sample(n=count_group_1)
    balanced_dataset = pd.concat([group_1, group_2_downsampled])
else:
    group_1_downsampled = group_1.sample(n=count_group_2)
    balanced_dataset = pd.concat([group_1_downsampled, group_2])

balanced_dataset.reset_index(drop=True, inplace=True)

new_data = balanced_dataset
trueLabels =  new_data[['ParticipantID', 'MI']]
trueLabels['MI'] = trueLabels['MI'].replace({2: 0, 1: 1})
print(trueLabels)
if 'MI' in new_data.columns:
    new_data.drop(columns=['MI'], inplace=True)

# Predict probabilities on the new dataset
new_probabilities = clf.predict_proba(new_data.drop(['ParticipantID'], axis=1))[:, 1]
print(clf.classes_)

# Create a DataFrame with predictions
predictions_df = pd.DataFrame({
    'ParticipantID': new_data['ParticipantID'],
    'MI_Probability': new_probabilities
})

# Save the predictions to a CSV file
predictions_df.to_csv("LogisticRegression_pred_proba_KL.csv", index=False)
trueLabels.to_csv("true_MI_Label_log.csv", index=False)
print("Participant ID and MI probabilities saved to LogisticRegression_pred_proba_KL.csv with true_MI_Label_log.csv.")

Script to create custom true_MI_labels.csv to be used in calculations for the Evaluator
     ParticipantID  MI
0             1913   1
1              684   1
2              842   1
3              442   1
4             2787   1
..             ...  ..
111           3489   0
112            536   0
113             61   0
114           2839   0
115           1472   0

[116 rows x 2 columns]
[0 1]
Participant ID and MI probabilities saved to LogisticRegression_pred_proba_KL.csv with true_MI_Label_log.csv.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trueLabels['MI'] = trueLabels['MI'].replace({2: 0, 1: 1})
