In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics  
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [5]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf = RandomForestClassifier(random_state=42)

data = pd.read_csv('filtered_data_full.csv')
y = data['Grade'] # series not df
X = data.drop(columns=['Grade'])

scores = cross_val_score(clf, X, y, cv=cv, scoring='f1_weighted')  # or accuracy
print("Cross-validated F1-score:", scores.mean())

Cross-validated F1-score: 0.830364547625577


In [13]:
# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 75, 100, 125, 200], # [50, 75, 100, 125, 150],  # Number of trees
    'max_depth': [None, 5, 10, 15, 20],  # Maximum depth of each tree
    'min_samples_split': [2, 3, 4], # Number of samples required to split a node
    'min_samples_leaf': [1, 2, 3, 4] # Number of samples required in leaf
}

# Initialize the RandomForest classifier
clf = RandomForestClassifier(random_state=42)

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',  # You can change this based on your evaluation metric
    n_jobs=-1,  # Use all available processors
    verbose=1  # To see progress
)

# Fit GridSearchCV
grid_search.fit(X, y)

# Best parameters from GridSearchCV
print("Best parameters:", grid_search.best_params_)

# Use the best estimator from grid search
best_clf = grid_search.best_estimator_

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 200}


In [14]:
# Access the cross-validation results
cv_results = grid_search.cv_results_

# The 'mean_test_score' will give the average F1 score for each hyperparameter combination
best_f1_score = cv_results['mean_test_score'][grid_search.best_index_]

print("Best F1 score from GridSearchCV:", best_f1_score)

Best F1 score from GridSearchCV: 0.866169016169016
