In [2]:
import os
from PIL import Image
import numpy as np 

# Root of the training split: one sub-folder per class, named after its label.
alphabet_dir = "ASL_Dataset/Train"

# Accumulators filled by the loading cell (pixel arrays and their class names).
images, labels = [], []

In [3]:
# Cap on how many images are read from each class folder.
max_images_per_folder = 100
# Target (width, height) every image is resized to.
image_size = (256, 256)
# Accepted image extensions; matched case-insensitively below.
image_extensions = (".jpg", ".png")

# Collect into cell-local lists so re-running this cell starts from scratch.
# `images`/`labels` are rebound to NumPy arrays at the end of the cell, and
# arrays have no `.append`, so appending to those names breaks on a second run.
loaded_images = []
loaded_labels = []

# sorted() makes the traversal order deterministic, and with it the choice of
# files that fall under the per-folder cap; os.listdir() order is arbitrary.
for folder_name in sorted(os.listdir(alphabet_dir)):
    folder_path = os.path.join(alphabet_dir, folder_name)

    # Skip stray files (e.g. .DS_Store) that sit next to the class folders;
    # calling os.listdir() on them would raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        continue

    image_filenames = [
        filename
        for filename in sorted(os.listdir(folder_path))
        if filename.lower().endswith(image_extensions)
    ]

    for filename in image_filenames[:max_images_per_folder]:
        image_path = os.path.join(folder_path, filename)

        # The context manager closes the underlying file handle as soon as the
        # pixels have been copied out, instead of leaking one handle per image.
        with Image.open(image_path) as image:
            # Force 3-channel RGB so greyscale / RGBA / palette files all yield
            # arrays of the same shape and np.array() below stays rectangular.
            resized = image.convert("RGB").resize(image_size)
            loaded_images.append(np.array(resized))

        # The folder name is the class label of every image inside it.
        loaded_labels.append(folder_name)

images = np.array(loaded_images)
labels = np.array(loaded_labels)

In [4]:
# Sanity check: number of labels collected (one label is stored per loaded image).
len(labels)

2800

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

import numpy as np

# Features are the image arrays, targets are the class (folder) names.
X = images
y = labels

# Single seed shared by the data splits and the classifier so that
# Restart & Run All reproduces the same numbers.
random_seed = 2411
# Percentile of the per-sample top class probability used as the confidence
# cut-off when picking pseudo-labelled samples.
confidence_percentile = 90


def _select_high_confidence(model, X_pool, percentile=confidence_percentile):
    """Pseudo-label ``X_pool`` and pick the samples the model is most sure of.

    Parameters
    ----------
    model : fitted classifier exposing ``predict`` and ``predict_proba``.
    X_pool : 2-D array of flattened samples to pseudo-label.
    percentile : percentile of the per-sample maximum class probability that a
        sample must reach to be selected.

    Returns
    -------
    (threshold, indices, pseudo_labels)
        ``threshold`` is the probability cut-off, ``indices`` are the row
        indices of ``X_pool`` at or above it, and ``pseudo_labels`` holds the
        predicted label of every row of ``X_pool`` (not only the selected ones).
    """
    # predict_proba is evaluated once and reused for threshold and selection.
    top_probability = model.predict_proba(X_pool).max(axis=1)
    threshold = np.percentile(top_probability, percentile)
    # NOTE(review): a depth-limited tree emits only a few distinct
    # probabilities, so many samples may tie at the threshold and `>=` can
    # select far more than the top 10% -- confirm this is intended.
    indices = np.where(top_probability >= threshold)[0]
    return threshold, indices, model.predict(X_pool)


# 20% of the data keeps its labels (the "labelled" set), exactly as before.
X_labeled, X_rest, y_labeled, y_rest = train_test_split(
    X, y, test_size=0.8, stratify=y, random_state=random_seed
)
# The remainder is split into the unlabelled pool used for pseudo-labelling and
# a held-out test set whose true labels are only used for the final score.
X_unlabeled, X_test, y_unlabeled_hidden, y_test = train_test_split(
    X_rest, y_rest, test_size=0.25, stratify=y_rest, random_state=random_seed
)

# Flatten every image into one feature vector per sample.
X_labeled = X_labeled.reshape(X_labeled.shape[0], -1)
X_unlabeled = X_unlabeled.reshape(X_unlabeled.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

params = {'criterion': ['entropy'],
          'max_depth': [8]}
grid_search_cv = GridSearchCV(
    DecisionTreeClassifier(random_state=random_seed),
    params,
    verbose=4,
    cv=3,
    n_jobs=-1,
)

num_iterations = 1

# Round 0 trains on the labelled data only; each of the following
# `num_iterations` rounds retrains on the labelled data plus the confident
# pseudo-labels produced by the previous round's model.
X_combined, y_combined = X_labeled, y_labeled
for round_index in range(num_iterations + 1):
    grid_search_cv.fit(X_combined, y_combined)
    model = grid_search_cv.best_estimator_

    confidence_threshold, high_confidence_indices, pseudo_labels = (
        _select_high_confidence(model, X_unlabeled)
    )
    high_confidence_samples = X_unlabeled[high_confidence_indices]
    high_confidence_labels = pseudo_labels[high_confidence_indices]

    # Always restart from the trusted labelled set so that pseudo-labels from
    # earlier rounds are replaced rather than accumulated.
    X_combined = np.concatenate((X_labeled, high_confidence_samples), axis=0)
    y_combined = np.concatenate((y_labeled, high_confidence_labels), axis=0)

# best_score_ is the mean cross-validation score on the training data of the
# last fit, which includes the model's own pseudo-labels -- it is not an
# accuracy against ground truth, so it is reported separately.
cv_score = grid_search_cv.best_score_
print(f"Cross-validation score on pseudo-labelled training data: {cv_score}")

# Accuracy of the final model on held-out samples with their true labels.
accuracy = model.score(X_test, y_test)
print(f"Final Model Accuracy: {accuracy}")
