In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.metrics import confusion_matrix, recall_score, precision_score, roc_auc_score, roc_curve, accuracy_score, RocCurveDisplay

import matplotlib.pyplot as plt

In [None]:
# Load the data
df = pd.read_excel("gym.xlsx")
df.head()

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
indAtts = ["Age", "Income", "Hours"]
depAtt = "Enroll"

In [None]:
# Separate features and target variable
Xs = df[indAtts]
Xs.head()

In [None]:
y = df[depAtt]
y


In [None]:
# Scale the features
scaler = StandardScaler()
XsScaled = scaler.fit_transform(Xs)
XsScaled

In [None]:
# Combine scaled features and target into a new DataFrame
dfScaled = pd.DataFrame(XsScaled, columns=Xs.columns)
dfScaled[depAtt] = y.astype('category')
dfScaled

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    dfScaled[indAtts],
    dfScaled[depAtt],
    test_size=0.4,
    random_state=1,
    stratify=dfScaled[depAtt]
)


In [None]:
X_train.head()

In [None]:
# Perform k-NN classification with cross-validation to find the best k
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': list(range(1, 11))}
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [None]:
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

In [None]:
# Display detailed results
results = pd.DataFrame(grid_search.cv_results_)
results

In [None]:
# Predict the test set with the best estimator found by the grid search
best_knn = grid_search.best_estimator_
# Optional manual override of k (left disabled):
# new_k = 15  # Replace with the desired k value
# best_knn.set_params(n_neighbors=new_k)
predictions = best_knn.predict(X_test)
# Actual and predicted labels side by side, indexed like y_test
results_df = pd.DataFrame({'Actual': y_test}).assign(Predicted=predictions)
results_df

In [None]:
# Confusion matrix.
# scikit-learn's layout is rows = actual class, columns = predicted class;
# conf_matrix keeps that layout because later cells index into it directly.
conf_matrix = confusion_matrix(y_test, predictions)

# Convert confusion matrix to DataFrame with predicted as rows and actual as columns.
# The array is transposed first so that the row/column labels match the data:
# without the transpose, false positives and false negatives appear swapped.
conf_matrix_df = pd.DataFrame(conf_matrix.T, index=['Predicted_0', 'Predicted_1'], columns=['Actual_0', 'Actual_1'])
print("Confusion Matrix as DataFrame:")
print(conf_matrix_df)


In [None]:
# Calculate metrics on the held-out test set
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
# Specificity = TN / (TN + FP). Row 0 of conf_matrix is the first actual class
# (treated here as the negative class): [0, 0] = TN, [0, 1] = FP.
specificity = conf_matrix[0, 0] / (conf_matrix[0, 0] + conf_matrix[0, 1])
# F1 is the harmonic mean of precision and recall. When both are 0 (e.g. the
# model predicts no positives) the ratio is undefined; report 0.0 in that case,
# which is also scikit-learn's default convention, instead of dividing by zero.
pr_sum = precision + recall
f1_score = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0.0

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall (Sensitivity): {recall}")
print(f"Specificity: {specificity}")
print(f"F1 Score: {f1_score}")

In [None]:
# Predict probabilities for ROC curve
probs= best_knn.predict_proba(X_test)[:, 1]
probs

In [None]:
roc_auc = roc_auc_score(y_test, probs)
print("ROC AUC:", roc_auc)

In [None]:
# Draw the ROC curve for the test-set scores, with the diagonal as reference
fpr, tpr, _ = roc_curve(y_test, probs)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')  # dashed black diagonal from (0, 0) to (1, 1)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # upper limit slightly above 1.0
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()