In [5]:
import pandas as pd
import numpy as np
import altair as alt

from sklearn import metrics  
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier




In [24]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
knn = KNeighborsClassifier()

data = pd.read_csv('filtered_data_full.csv')
y = data['Grade'] # series not df
X = data.drop(columns=['Grade'])

scores = cross_val_score(knn, X, y, cv=cv, scoring='f1_weighted')  # or accuracy
print(f"Cross-validated F1-score: {scores.mean():.4f}")

Cross-validated F1-score: 0.5735


In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Define the parameter grid to search over
param_grid = {
    'n_neighbors': list(range(3, 15)),         # Number of neighbors to consider
    'weights': ['uniform', 'distance'],     # Weight function used in prediction
    'metric': ['euclidean', 'manhattan', 'minkowski']    # Distance metric
}

# Initialize GridSearchCV with cross-validation and the parameter grid
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring="f1_weighted",n_jobs=-1, verbose=1)

# Fit GridSearchCV on the training data
grid_search.fit(X, y)

# Print the best parameters and the best score
print(f"Best hyperparameters: {grid_search.best_params_}")
print(f"F1-Score: {grid_search.best_score_:.4f}")

# Evaluate the best model on the test data
test_accuracy = grid_search.best_estimator_.score(X, y)
print(f"Test set accuracy: {test_accuracy:.4f}")

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best hyperparameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'uniform'}
F1-Score: 0.5765
Test set accuracy: 0.8193


In [29]:
# Access the cross-validation results
results = grid_search.cv_results_

# The 'mean_test_score' will give the average F1 score for each hyperparameter combination
best_f1_score = results['mean_test_score'][grid_search.best_index_]

print("Best F1 score from GridSearchCV:", best_f1_score)

Best F1 score from GridSearchCV: 0.5764690374984492


In [30]:
import altair as alt
import pandas as pd

# Get the results from GridSearchCV
results = pd.DataFrame(grid_search.cv_results_)

# Create a base Altair chart with line and scatter plots
base = alt.Chart(results).encode(
    x=alt.X('param_n_neighbors:O', title="n_neighbors"),  # Treat `param_n_neighbors` as ordinal for line chart
    y=alt.Y('mean_test_score:Q', title="F1-Score", scale=alt.Scale(domain=[0.45, 0.6])),    # Quantitative for the test score
)

# Line plot for the mean_test_score vs param_n_neighbors, colored by param_weights
line_plot = base.mark_line().encode(
    color=alt.Color('param_weights:N', title="weights"),  # Different colors for each weight type
    detail='param_weights:N'  # Ensure line segmentation by param_weights
)

# Scatter plot for the mean_test_score vs param_n_neighbors, colored by param_weights
scatter_plot = base.mark_point(filled=True, size=100).encode(
    color='param_weights:N',  # Different colors for each weight type
)

# Combine the line and scatter plots, and facet by `param_metric`
final_chart = (line_plot + scatter_plot).facet(
    column=alt.Column('param_metric:N', title="metric"),  # Separate the plots by `param_metric`
).resolve_scale(
    y='shared'  # Share the y-axis scale across facets
)

# Display the chart
final_chart


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Get the best model from GridSearchCV
best_knn = grid_search.best_estimator_

# Predict the labels for the test set
y_pred = best_knn.predict(X_test)

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot the confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=set(y_test), yticklabels=set(y_test))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()
