In [46]:
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
# import graphviz

In [33]:
# Load the raw dataset from a CSV file (relative path, resolved against the
# kernel's working directory).
dataset_path = '../data/dataset.csv'
df = pd.read_csv(dataset_path)

In [34]:
# Separate the label from the predictors: `Class` is the target to predict,
# every remaining column of `df` is used as a feature.
y = df['Class']
X = df.drop(columns='Class')

In [52]:
# Check the dimensions of the feature matrix and the target vector.
for label, data in (("Dimensiones de X:", X), ("Dimensiones de y:", y)):
    print(label, data.shape)

# Show the first rows of each, one after the other.
print(X.head(), y.head(), sep="\n")

Dimensiones de X: (178, 13)
Dimensiones de y: (178,)
   Alcohol  Malic acid   Ash  Alcalinity  Magnesium  Total phenols  \
0    14.23        1.71  2.43        15.6        127           2.80   
1    13.20        1.78  2.14        11.2        100           2.65   
2    13.16        2.36  2.67        18.6        101           2.80   
3    14.37        1.95  2.50        16.8        113           3.85   
4    13.24        2.59  2.87        21.0        118           2.80   

   Flavanoids  Nonflavanoid phenols  Proanthocyanins  Color intensity   Hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   OD280/OD315 of diluted wines  Proline  
0       

In [38]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [53]:
# Confirm the sizes of the training and test feature matrices.
for label, split in (("Tamaño de X_train:", X_train), ("Tamaño de X_test:", X_test)):
    print(label, split.shape)

Tamaño de X_train: (142, 13)
Tamaño de X_test: (36, 13)


In [60]:
# Search space for the randomised hyperparameter search.
# scipy's randint(low, high) draws integers from low up to high - 1: the upper
# bound is EXCLUDED, so the effective ranges are the ones noted per entry.
param_dist = dict(
    n_estimators=randint(50, 200),      # number of trees: 50..199
    max_depth=randint(5, 20),           # maximum tree depth: 5..19
    min_samples_split=randint(2, 10),   # minimum samples to split a node: 2..9
)

In [61]:
# Baseline random forest: 100 trees, fixed seed so results are reproducible.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [62]:
# Train the baseline forest on the training split.
# RandomizedSearchCV works on clones of the estimator it is given, so this fit
# only matters to cells that call `rf_classifier.predict(...)` directly.
rf_classifier.fit(X=X_train, y=y_train)

In [63]:
# Set up the randomised hyperparameter search over `param_dist`:
# 20 sampled candidates, each scored by accuracy under 5-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    random_state=42,
)

In [64]:
# Run the hyperparameter search; only the training split is used, so the test
# set stays untouched for the final evaluation.
random_search.fit(X=X_train, y=y_train)

In [65]:
# Report the best hyperparameter combination found by the search.
best_params = random_search.best_params_
print("Mejores hiperparámetros:", best_params)

Mejores hiperparámetros: {'max_depth': 19, 'min_samples_split': 4, 'n_estimators': 121}


In [68]:
# Evaluate the optimised model: take the estimator the search refit with the
# best hyperparameters and predict on the held-out test set.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X=X_test)

In [69]:
# Baseline predictions from the untuned forest, kept for comparison.
# They are stored under their own name on purpose: assigning them to `y_pred`
# would overwrite the tuned model's predictions, and the evaluation further
# down would then report the baseline's scores instead of the optimised
# model's.
y_pred_baseline = rf_classifier.predict(X_test)

In [70]:
# Predict a single example: the first test row, kept as a one-row DataFrame so
# it has the same column layout the model was trained on.
# NOTE(review): this queries the baseline `rf_classifier`, not the tuned
# `best_model` — confirm that is intended.
sample = X_test.head(1)
prediction = rf_classifier.predict(sample)


In [71]:
# Score the test-set predictions held in `y_pred`: overall accuracy first,
# then the per-class precision / recall / F1 breakdown. The numbers describe
# whichever model produced the current `y_pred`.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
print("\nClassification Report:\n", classification_rep)


Accuracy: 1.00

Classification Report:
               precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

