In [28]:
from sklearn.impute import KNNImputer
import pandas as pd

# Load your dataset
df = pd.read_csv('NHANES_data_train.csv')  # Adjust the path to your dataset

# Assuming 'ParticipantID' is a column in your DataFrame, set it as the index
df.set_index('ParticipantID', inplace=True)

# Separate features and target if they're all in the same DataFrame
X = df.drop('MI', axis=1)  # Exclude 'MI' which is the target variable
y = df['MI']

# Instantiate the KNNImputer
imputer = KNNImputer(n_neighbors=5)  # Adjust 'n_neighbors' as needed

# Apply imputation to the DataFrame
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Print the imputed DataFrame to verify
print(X_imputed.head())


   Income  Sex   Age  Race  Edu  Diastolic    Systolic      Pulse   BMI   HDL  \
0    4.56  1.0  70.0   3.0  3.2  69.000000  124.333333  61.333333  25.6  46.0   
1    5.00  2.0  49.0   3.0  5.0  64.000000  102.666667  66.000000  22.4  63.0   
2    1.59  1.0  54.0   4.0  3.2  96.000000  160.000000  71.533333  25.4  60.0   
3    2.37  1.0  73.0   3.0  2.8  58.000000  134.666667  62.666667  34.0  41.0   
4    5.00  1.0  42.0   3.0  3.4  82.333333  116.000000  63.666667  30.4  49.0   

    Trig    LDL  TCHOL  kidneys_eGFR  Diabetes  CurrentSmoker  isActive  \
0  186.0  207.0  289.0     77.244722       2.0            2.0       2.0   
1   60.0  108.0  183.0    101.217970       2.0            2.0       2.0   
2  102.0  179.0  261.0     71.863349       2.0            2.0       2.0   
3  105.0  110.0  172.0     72.459827       2.0            2.0       1.0   
4   98.0  203.0  275.0     79.009375       2.0            2.0       2.0   

   isInsured  
0        1.0  
1        1.0  
2        1.0  
3 

In [29]:
from sklearn.model_selection import train_test_split

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)


In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the kNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # Adjust n_neighbors as needed
knn.fit(X_train_scaled, y_train)

# Predict probabilities for the test set
y_pred_proba_knn = knn.predict_proba(X_test_scaled)[:, 1]

# Create a DataFrame with ParticipantID and predicted probabilities for kNN
output_df_knn = pd.DataFrame({'ParticipantID': X_test.index, 'Pred_Probability': y_pred_proba_knn})

# Save the DataFrame to a CSV file
output_df_knn.to_csv('kNN_pred.csv', index=False)


In [27]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the logistic regression model
logreg = LogisticRegression(max_iter=1000)  # Adjust max_iter as needed
logreg.fit(X_train_scaled, y_train)

# Predict probabilities for the test set using logistic regression
y_pred_proba_logreg = logreg.predict_proba(X_test_scaled)[:, 1]

# Create a DataFrame with ParticipantID and predicted probabilities for logistic regression
output_df_logreg = pd.DataFrame({'ParticipantID': X_test.index, 'Pred_Probability': y_pred_proba_logreg})

# Save the DataFrame to a CSV file
output_df_logreg.to_csv('regression_pred.csv', index=False)


In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Assuming X_train_scaled and X_test_scaled are already defined

# Define a pipeline combining a scaler and the kNN classifier
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()), 
    ('knn', KNeighborsClassifier())
])

# Define a parameter grid to search over
param_grid = {
    'knn__n_neighbors': range(1, 31),
    'knn__weights': ['uniform', 'distance']
}

# Setup grid search
grid_search_knn = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='accuracy')

# Perform grid search
grid_search_knn.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search_knn.best_params_)
print("Best cross-validation score:", grid_search_knn.best_score_)

# Predict with the best estimator
y_pred_proba_knn = grid_search_knn.predict_proba(X_test)[:, 1]


Best parameters: {'knn__n_neighbors': 6, 'knn__weights': 'distance'}
Best cross-validation score: 0.9641437818844597
