In [1]:
# Step 1: Import necessary libraries and load the wine dataset

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ShuffleSplit

# Load wine dataset
wine = load_wine()
X, y = wine.data, wine.target

# Step 2: Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Use random search CV to hyperparameter tune the Decision Tree
param_dist = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_classifier = DecisionTreeClassifier()
random_search = RandomizedSearchCV(dt_classifier, param_distributions=param_dist, n_iter=100, cv=5, random_state=42)
random_search.fit(X_train, y_train)

# Best hyperparameters
best_params = random_search.best_params_

# Step 4: Evaluate the Decision Tree on the test set
best_dt_model = random_search.best_estimator_
y_pred_dt = best_dt_model.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

print(f"Best hyperparameters: {best_params}")
print(f"Decision Tree accuracy on test set: {accuracy_dt}")


Best hyperparameters: {'splitter': 'random', 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': None, 'criterion': 'gini'}
Decision Tree accuracy on test set: 0.9166666666666666


In [2]:
from sklearn.ensemble import RandomForestClassifier

# Step 1: Create 10 subsets of the training dataset using ShuffleSplit
ss = ShuffleSplit(n_splits=10, test_size=0.2, random_state=42)

# Step 2: Train 1 decision tree on each subset using the best hyperparameter values
forest_models = []

for train_index, _ in ss.split(X_train):
    subset_X_train, subset_y_train = X_train[train_index], y_train[train_index]

    dt_model = DecisionTreeClassifier(**best_params)
    dt_model.fit(subset_X_train, subset_y_train)
    
    forest_models.append(dt_model)

# Step 3: Evaluate all the trees on the test dataset
forest_accuracies = []

for dt_model in forest_models:
    y_pred_rf = dt_model.predict(X_test)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    forest_accuracies.append(accuracy_rf)

# Compare with Decision Tree accuracy
average_accuracy_rf = sum(forest_accuracies) / len(forest_accuracies)

print(f"Random Forest average accuracy on test set: {average_accuracy_rf}")

# Compare with Decision Tree accuracy
if average_accuracy_rf > accuracy_dt:
    print("Random Forest is performing better than the single Decision Tree.")
else:
    print("Random Forest is not performing better than the single Decision Tree.")


Random Forest average accuracy on test set: 0.9083333333333334
Random Forest is not performing better than the single Decision Tree.
