In [1]:
# define constants
NUM_IMAGES_PER_CLASS = 1000
TEST_SPLIT_PERCENTAGE = 0.15
TRAIN_VAL_SPLIT_PERCENTAGE = 1 - TEST_SPLIT_PERCENTAGE  # the rest is for training and validation
BATCH_SIZE = 50  # number of images to copy at once
RANDOM_SEED = 42  # seed for reproducibility

In [2]:
PROJECT_DATASET_DIR = '/content/drive/MyDrive/project_dataset'
CLASSES_ORIGINAL = ['airplane_cabin', 'hockey_arena', 'movie_theater', 'staircase', 'supermarket']
CLASSES_NEW = ['airplane_cabin', 'hockey_arena', 'movie_theater', 'staircase', 'supermarket']

# Implement the Normalization and Augumentation.


*   Normalization for training validation and test.
*   Augumentation for training.



In [13]:
# standard library imports
import os  #  directory and file operations
import shutil  #  copying files
import  time  #  adding delays

# installed library imports
from sklearn.model_selection import train_test_split  #  splitting datasets
from PIL import Image  #  image processing
import torchvision.transforms as transforms  #  data augmentation
import numpy as np  #  numerical operations
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle

In [4]:
TRAIN_VAL_DIR = os.path.join(PROJECT_DATASET_DIR, 'train_val')
TEST_DIR = os.path.join(PROJECT_DATASET_DIR, 'test')

TRAIN_DIR = os.path.join(PROJECT_DATASET_DIR, 'train')  # new directory
VALIDATION_DIR = os.path.join(PROJECT_DATASET_DIR, 'validation')  # new directory

IMAGE_SIZE = (256, 256)
VALIDATION_SPLIT = 15 / 85  # train_val is 85% of the total dataset, we want the validation set to be 15% of the total dataset
CLASSES = ['airplane_cabin', 'hockey_arena', 'movie_theater', 'staircase', 'supermarket']
RANDOM_SEED = 42
BATCH_SIZE = 100

# Load image data and labels for each dataset.


*   Number of training data: 14000
*   Number of validation/test data: 750





In [14]:
def load_images_and_labels(base_dir):
    """
    Load image data and labels from a specified directory. This function traverses each class directory, loads each image, and assigns labels based on the directory name.

    Inputs:
    - base_dir: A string, the path to the directory containing subdirectories for each class.

    Outputs:
    - A tuple containing two numpy arrays:
      1. `data`: an array of flattened image data.
      2. `labels`: an array of labels corresponding to the images.

    Each class is expected to have its own subdirectory within `base_dir` with the name of the class.
    """
    # Initialize lists to store image data and labels
    data = []
    labels = []

    # Define the classes and create a mapping from class names to a unique integer
    classes = ['airplane_cabin', 'hockey_arena', 'movie_theater', 'staircase', 'supermarket']
    class_labels = {cls_name: idx for idx, cls_name in enumerate(classes)}

    # Iterate over each class directory in the base directory
    for cls_name in classes:
        class_dir = os.path.join(base_dir, cls_name)
        for img_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_name)
            img = Image.open(img_path)  # Open the image directly
            img_data = np.array(img).flatten()  # Flatten the image data into a one-dimensional vector
            data.append(img_data)
            labels.append(class_labels[cls_name])

    # Convert the lists of data and labels to numpy arrays and return
    return np.array(data), np.array(labels)

# Load training, validation, and test data
train_data, train_labels = load_images_and_labels(TRAIN_DIR)
validation_data, validation_labels = load_images_and_labels(VALIDATION_DIR)
test_data, test_labels = load_images_and_labels(TEST_DIR)

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Data Scaling
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_data)
validation_scaled = scaler.transform(validation_data)

# Create a decision tree model
model_dt = DecisionTreeClassifier(max_depth=10, min_samples_split=5, min_samples_leaf=2, criterion='entropy', random_state=RANDOM_SEED)
model_dt.fit(train_scaled, train_labels)

# Evaluate the model using test data
validation_predictions_dt = model_dt.predict(validation_data)
vali_accuracy_dt = accuracy_score(validation_labels, validation_predictions_dt)
print(f'Test Accuracy for model_dt: {vali_accuracy_dt}')

Test Accuracy for model_dt: 0.46266666666666667


# Supervised Decision Tree with 5-fold validation

need to modify with the vali part

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd

# need to modify here
data = #full_train_data
labels = #full_train_labels

# Define parameters
max_depths = [10, 20, 30, None]
min_samples_splits = [2, 5, 10]
min_samples_leafs = [1, 2, 4]
criterions = ['gini', 'entropy']

# Initialize a list to record the results
results = []

# Create an instance of StratifiedKFold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


for depth in max_depths:
    for min_samples_split in min_samples_splits:
        for min_samples_leaf in min_samples_leafs:
            for criterion in criterions:
                scores = []  # Store the accuracy for each fold
                # Train model on each fold
                for train_index, test_index in kf.split(data, labels):
                    X_train, X_test = data[train_index], data[test_index]
                    y_train, y_test = labels[train_index], labels[test_index]

                    # Create a decision tree model
                    model = DecisionTreeClassifier(
                        max_depth=depth,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        criterion=criterion,
                        random_state=RANDOM_SEED
                    )

                    model.fit(X_train, y_train)
                    predictions = model.predict(X_test)

                    # Calculate and record the accuracy
                    score = accuracy_score(y_test, predictions)
                    scores.append(score)

                # Calculate the average accuracy under the current parameter combination
                mean_score = np.mean(scores)
                results.append({
                    'max_depth': depth,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf,
                    'criterion': criterion,
                    'mean_accuracy': mean_score
                })

# Convert results into a DataFrame
results_df = pd.DataFrame(results)

# Find the best parameter combination
best_params = results_df.loc[results_df['mean_accuracy'].idxmax()]
print("Best parameters found:", best_params)

# Save the results to a CSV file
results_df.to_csv('/content/drive/MyDrive/manual_gridsearch_results.csv', index=False)


# Generating the labeled and unlabeled data for Semi-Supervised Decision Tree

In [None]:
def load_images_and_split(base_dir, labeled_ratio=0.2):
    """
    Load images from a directory and split them into labeled and unlabeled datasets.

    Args:
    - base_dir (str): Path to the directory containing subdirectories for each class.
    - labeled_ratio (float): The fraction of data to keep as labeled.

    Returns:
    - labeled_data (np.array): Array of labeled data.
    - labeled_labels (list): List of labels for the labeled data.
    - unlabeled_data (np.array): Array of unlabeled data.
    """
    data = []
    labels = []
    classes = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
    class_labels = {cls_name: idx for idx, cls_name in enumerate(classes)}

    # Load all data and labels
    for cls_name in classes:
        class_dir = os.path.join(base_dir, cls_name)
        for img_name in os.listdir(class_dir):
            img_path = os.path.join(class_dir, img_name)
            img = Image.open(img_path)
            img_data = np.array(img).flatten()
            data.append(img_data)
            labels.append(class_labels[cls_name])

    # Convert to numpy arrays
    data = np.array(data)
    labels = np.array(labels)

    # Split into labeled and unlabeled
    labeled_indices, unlabeled_indices = train_test_split(
        np.arange(len(labels)),
        test_size=1-labeled_ratio,
        random_state= RANDOM_SEED,
        stratify=labels  # Keep the distribution of classes consistent
    )

    labeled_data = data[labeled_indices]
    labeled_labels = labels[labeled_indices]
    unlabeled_data = data[unlabeled_indices]

    return labeled_data, labeled_labels, unlabeled_data

labeled_data, labeled_labels, unlabeled_data = load_images_and_split(TRAIN_DIR, labeled_ratio=0.2)

print("Labeled data shape:", labeled_data.shape)
print("Unlabeled data shape:", unlabeled_data.shape)

# Semi-Supervised Decision Tree

 *need to add validation*

In [None]:


import matplotlib.pyplot as plt

# Create the decision tree model
model = DecisionTreeClassifier(random_state=42)

# Initial training: Use the labeled data to train the model
model.fit(labeled_data, labeled_labels)

# Threshold list: decrease from 95% to 90% to 85% to 80%
thresholds = [0.95, 0.90, 0.85, 0.80]
test_accuracies = []  # Store the test accuracy for each threshold

# Iterate through each threshold
for confidence_threshold in thresholds:
    pseudo_labels = []  # Reset the pseudo label list for each loop

    # Predict the labels for the unlabeled data
    probabilities = model.predict_proba(unlabeled_data)
    max_probabilities = np.max(probabilities, axis=1)

    # Select high-confidence predictions to use as pseudo labels
    for i, prob in enumerate(max_probabilities):
        if prob > confidence_threshold:
            pseudo_labels.append((unlabeled_data[i], np.argmax(probabilities[i])))

    # If there are enough high-confidence predictions, add them as pseudo labels to the training data
    if pseudo_labels:
        pseudo_data, pseudo_labels = zip(*pseudo_labels)
        new_data = np.vstack([labeled_data] + list(pseudo_data))
        new_labels = np.hstack([labeled_labels] + list(pseudo_labels))

        # Retrain the model with the new training data including pseudo labels
        model.fit(new_data, new_labels)

    # Measure the model's performance on the original labeled test set
    test_accuracy = model.score(test_data, test_labels)
    test_accuracies.append(test_accuracy)
    print(f'Test Accuracy at threshold {confidence_threshold*100}%: {test_accuracy}')

# Plot the test accuracies
plt.figure(figsize=(10, 6))
plt.plot([str(th*100) for th in thresholds], test_accuracies, marker='o', linestyle='-', color='b')
plt.title('Test Accuracy vs Confidence Threshold')
plt.xlabel('Confidence Threshold (%)')
plt.ylabel('Test Accuracy')
plt.grid(True)
plt.show()


# Semi-Supervised Decision Tree with 5-fold validation