# Supervised Learning with Decision Trees

In [2]:
import os
from PIL import Image
import numpy as np 
import logging

#Dataset Directory 
alphabet_dir = "ASL_Dataset/Train"

images = []
labels = []

logging.basicConfig(filename='training.log', level=logging.INFO, format='%(asctime)s %(message)s')

In [3]:
def log_accuracy(depth, train, valid):

    logging.info(f'The Training Accuracy for max_depth {depth} is: {train}')
    logging.info(f'The Validation Accuracy for max_depth {depth} is: {train}')
    logging.info(" ")


In [4]:
max_images_per_folder = 100

for folder_name in os.listdir(alphabet_dir):
    folder_path = os.path.join(alphabet_dir, folder_name)

    image_counter = 0

    for filename in os.listdir(folder_path):
        if filename.endswith(".jpg") or filename.endswith(".png"):
            if image_counter >= max_images_per_folder:
                break
            image_path = os.path.join(folder_path, filename)
            image = Image.open(image_path)

            # Resize the image to 256x256 pixels
            image = image.resize((256, 256))

            image = np.array(image)
            images.append(image)
            label = folder_name
            labels.append(label)
            image_counter += 1

images = np.array(images)
labels = np.array(labels)

In [5]:
len(labels)

2800

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_rem, y_train, y_rem = train_test_split(images, labels, test_size=0.4, random_state=2411)

X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5)

X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
X_valid = X_valid.reshape(X_valid.shape[0], -1)



In [16]:
for max_d in range(1,31):
    classifier = DecisionTreeClassifier(max_depth=max_d, random_state=2411)

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    training_score = classifier.score(X_train, y_train)
    validation_score = classifier.score(X_valid, y_valid)

    log_accuracy(max_d, training_score, validation_score)
    print('')


In [7]:
import ray 
from joblib import parallel_backend

In [8]:
from ray.util.joblib import register_ray 

In [9]:
register_ray()

In [10]:
ray.init()

2023-06-02 12:03:57,513	INFO worker.py:1625 -- Started a local Ray instance.


0,1
Python version:,3.8.12
Ray version:,2.4.0


In [11]:
with parallel_backend("ray"):
    for max_d in range(1,6):
        classifier = DecisionTreeClassifier(max_depth=max_d, random_state=2411)

        classifier.fit(X_train, y_train, n_jobs=-1)
        y_pred = classifier.predict(X_test)

        training_score = classifier.score(X_train, y_train)
        validation_score = classifier.score(X_valid, y_valid)

        log_accuracy(max_d, training_score, validation_score)
        print('')

TypeError: fit() got an unexpected keyword argument 'n_jobs'

In [7]:
import os
  
n_cpu = os.cpu_count()
print("Number of CPUs in the system:", n_cpu)

Number of CPUs in the system: 16


In [8]:
from sklearn.model_selection import GridSearchCV

params = {'criterion': ['gini', 'entropy'],
                    'max_depth': list(range(1, 16))}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=2411), params, verbose=4, cv=3, n_jobs=-1)
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2411),
             n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15]},
             verbose=4)

In [9]:
grid_search_cv.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=8, random_state=2411)