In [1]:
# Standard library imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Model and Estimators
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Preprocessing and Pipeline
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Model selection and validation
from sklearn.model_selection import train_test_split, GridSearchCV

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_auc_score, classification_report, roc_curve

# Imbalanced dataset handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline

# Data Visualization
from pandas.plotting import scatter_matrix


# kNN

In [2]:
# Load the training dataset
data = pd.read_csv("NHANES_data_train.csv")

# Specify numeric and categorical columns
numeric_columns = ['Income', 'Age', 'Diastolic', 'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL', 'kidneys_eGFR']
categorical_columns = ['Sex', 'Race', 'Edu', 'Diabetes', 'CurrentSmoker', 'isActive', 'isInsured']

# Define preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler())])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_columns),
    ('cat', categorical_pipeline, categorical_columns)])

# Define a pipeline for resampling and classification
model_pipeline = ImPipeline([
    ('preprocessor', preprocessor),
    ('resampling', SMOTE(random_state=42)),
    ('classifier', KNeighborsClassifier(n_neighbors=11))  # Adjust based on best parameters if needed
])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data.drop(['MI'], axis=1), data['MI'], test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {'classifier__n_neighbors': [3, 5, 7, 9, 11]}

# Perform grid search with cross-validation
clf = GridSearchCV(model_pipeline, param_grid, scoring='roc_auc', cv=11)
clf.fit(X_train, y_train)

# Evaluation
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]  # Probability predictions

# Output evaluation metrics
print(f"Best Parameters: {clf.best_params_}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")



Best Parameters: {'classifier__n_neighbors': 11}
ROC AUC Score: 0.7510679247054406
Accuracy: 0.7426470588235294
Confusion Matrix: 
[[ 16  11]
 [199 590]]
Classification Report: 
              precision    recall  f1-score   support

           1       0.07      0.59      0.13        27
           2       0.98      0.75      0.85       789

    accuracy                           0.74       816
   macro avg       0.53      0.67      0.49       816
weighted avg       0.95      0.74      0.83       816



In [6]:
# Apply the trained model to new data
new_data = pd.read_csv("NHANES_test_data_4_students.csv")
if 'MI' in new_data.columns:
    new_data.drop(columns=['MI'], inplace=True)

# Predict probabilities on the new dataset
new_probabilities = clf.predict_proba(new_data.drop(['ParticipantID'], axis=1))[:, 1]

# Create a DataFrame with predictions
predictions_df = pd.DataFrame({
    'ParticipantID': new_data['ParticipantID'],
    'MI_Probability': new_probabilities
})

# Save the predictions to a CSV file
predictions_df.to_csv("kNN_pred.csv", index=False)
print("Participant ID and MI probabilities saved to kNN_pred.csv.")


Participant ID and MI probabilities saved to kNN_pred.csv.


# Logistic Regression

In [4]:
# Load the training dataset
data = pd.read_csv("NHANES_data_train.csv")

# Specify numeric and categorical columns
numeric_columns = ['Income', 'Age', 'Diastolic', 'Systolic', 'Pulse', 'BMI', 'HDL', 'Trig', 'LDL', 'TCHOL', 'kidneys_eGFR']
categorical_columns = ['Sex', 'Race', 'Edu', 'Diabetes', 'CurrentSmoker', 'isActive', 'isInsured']

# Define preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler())])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_columns),
    ('cat', categorical_pipeline, categorical_columns)])

# Define a pipeline for resampling and logistic regression classification
model_pipeline = ImPipeline([
    ('preprocessor', preprocessor),
    ('resampling', SMOTE(random_state=42)),
    ('classifier', LogisticRegression(C=0.01, penalty='l2', solver='liblinear', max_iter=1000, random_state=42))
])


# Split the dataset into training and test sets
X = data.drop(['MI'], axis=1)
y = data['MI']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning, including solver compatibility
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l2'],  # 'l1' removed due to solver compatibility
    'classifier__solver': ['lbfgs', 'liblinear', 'saga']  # Added compatible solvers
}

# Perform grid search with cross-validation
clf = GridSearchCV(model_pipeline, param_grid, scoring='roc_auc', cv=5, verbose=1)
clf.fit(X_train, y_train)


# Evaluation
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]  # Probability predictions

print(f"Best Parameters: {clf.best_params_}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_prob)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")


Fitting 5 folds for each of 15 candidates, totalling 75 fits
Best Parameters: {'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
ROC AUC Score: 0.7907336994789467
Accuracy: 0.7573529411764706
Confusion Matrix: 
[[ 18   9]
 [189 600]]
Classification Report: 
              precision    recall  f1-score   support

           1       0.09      0.67      0.15        27
           2       0.99      0.76      0.86       789

    accuracy                           0.76       816
   macro avg       0.54      0.71      0.51       816
weighted avg       0.96      0.76      0.84       816



In [7]:
# Apply the trained model to new data
new_data = pd.read_csv("NHANES_test_data_4_students.csv")
if 'MI' in new_data.columns:
    new_data.drop(columns=['MI'], inplace=True)

# Predict probabilities on the new dataset
new_probabilities = clf.predict_proba(new_data.drop(['ParticipantID'], axis=1))[:, 1]

# Create a DataFrame with predictions
predictions_df = pd.DataFrame({
    'ParticipantID': new_data['ParticipantID'],
    'MI_Probability': new_probabilities
})

# Save the predictions to a CSV file
predictions_df.to_csv("LogisticRegression_pred.csv", index=False)
print("Participant ID and MI probabilities saved to LogisticRegression_pred.csv.")

Participant ID and MI probabilities saved to LogisticRegression_pred.csv.
