In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

# Read the wine dataset and discard every row that contains a missing value.
data = pd.read_csv("wine.csv").dropna()

# All columns are already numeric, so no float/int conversion is needed.

# The "Class" column is the target; every other column is a feature.
y = data["Class"]
X = data.drop(columns="Class")

# Hold out 30% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Candidate hyperparameter values to search over.
param_grid = dict(
    n_estimators=[200, 500, 800],
    max_depth=[4, 5, 6],
)

# Grid search over every combination above, scored by accuracy with
# 5-fold cross-validation; n_jobs=-1 lets it run in parallel.
rf_test_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_test_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination and its cross-validated accuracy.
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Use the best model found by the grid search to predict on the test split.
best_rf = grid_search.best_estimator_
y_best_pred = best_rf.predict(X_test)

# Compute both evaluation metrics first, then report them.
conf_matrix = confusion_matrix(y_test, y_best_pred)
best_test_accuracy = accuracy_score(y_test, y_best_pred)

print("Confusion matrix of best model:")
print(conf_matrix)
print(f"Test accuracy of best model: {best_test_accuracy}")


Best Parameters: {'max_depth': 4, 'n_estimators': 200}
Best Cross-Validation Accuracy: 0.9753
Confusion matrix of best model:
[[18  0  0]
 [ 0 21  0]
 [ 0  0 15]]
Test accuracy of best model: 1.0
