# Imports

In [1]:
import os
from PIL import Image

import numpy as np

import torch.utils.data
import torchvision.datasets as datasets
from torchvision import transforms
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
import pickle
import os

## Paths

In [4]:
# Base directory of the project; the empty string makes every path below
# relative to the current working directory.
root_path = ''

# Directory that holds the training/testing folders and all derived artifacts.
# It is created up front so later cells can write into it unconditionally.
train_path = os.path.join(root_path, 'train')
os.makedirs(train_path, exist_ok=True)

# Feature Extraction
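Extracts the raw features from the data: loads the training and test images, converts them to NumPy arrays, and saves the class-name mappings.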

In [5]:
# Input folders (one sub-folder per class for training, a flat folder of
# images for testing) and the output folder for everything this notebook saves.
train_data = os.path.join(train_path, 'training')
test_data = os.path.join(train_path, 'testing')

dst = os.path.join(train_path, 'dimension_reduced_data')

os.makedirs(dst, exist_ok=True)

# Shared preprocessing: every image, train or test, is resized to 244x244 and
# turned into a channel-first tensor so both arrays have identical layout.
initial_transforms = transforms.Compose([
    transforms.Resize((244, 244)),
    transforms.ToTensor(),
])

dataset = datasets.ImageFolder(train_data, transform=initial_transforms)

# converts training data tensors to numpy arrays
n = len(dataset)
X = np.zeros((n, 3, 244, 244))
y = np.zeros(n)

for i, (inputs, labels) in enumerate(tqdm(dataset)):
    X[i] = inputs.numpy()
    y[i] = labels

# class to index dictionary
class_to_idx = dataset.class_to_idx
with open(os.path.join(dst, 'class_to_idx.pkl'), 'wb') as handle:
    pickle.dump(class_to_idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

# class names numpy array (position i holds the name of class index i)
idx_to_class = {v: k for k, v in class_to_idx.items()}
class_names = [idx_to_class[i] for i in range(len(idx_to_class))]
np.save(os.path.join(dst, 'class_names.npy'), class_names)


# converts test data to numpy arrays
test_files = os.listdir(test_data)

n = len(test_files)
X_test = np.zeros((n, 3, 244, 244))

# Each test image gets its own row (the index comes from this loop, not from
# the training loop above) and goes through the same preprocessing as the
# training images; a raw H x W x C uint8 array would not fit the
# (3, 244, 244) slot. The file handle is closed as soon as it is converted.
for i, file in enumerate(test_files):
    with Image.open(os.path.join(test_data, file)) as img:
        # ImageFolder loads training images as RGB, so do the same here.
        X_test[i] = initial_transforms(img.convert('RGB')).numpy()

100%|██████████| 9377/9377 [00:30<00:00, 305.50it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'train/testing'

## Normalize Data

In [None]:
scaler = StandardScaler()

# normalize training data: flatten each image to one feature vector, then
# learn the per-feature mean/std from the training set only
n, d1, d2, d3 = X.shape
X = X.reshape((n, d1 * d2 * d3))
X = scaler.fit_transform(X)

# normalize testing data with the statistics learned from the training set;
# refitting on the test set would scale train and test differently
n, d1, d2, d3 = X_test.shape
X_test = X_test.reshape((n, d1 * d2 * d3))
X_test = scaler.transform(X_test)

## PCA
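Reduces the dimensionality of the data with PCA, keeping enough components to explain 90% of the variance, so that the resulting data matrix is full rank.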

In [3]:
# Keep the smallest number of principal components that explains 90% of the
# variance of the training data.
pca = PCA(0.90)

# Learn the projection on the training data only, then apply that same
# projection to the test data. Refitting on the test set would yield a
# different basis (and possibly a different number of components), making the
# two arrays incompatible.
X = pca.fit_transform(X)
X_test = pca.transform(X_test)

NameError: name 'X' is not defined

## LDA
Supervised dimension reduction on data that will be used for training

In [None]:
# Fit the discriminant projection on the labelled training data only.
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)

# Project both sets with the fitted model.
X, X_test = lda.transform(X), lda.transform(X_test)

## Saves data
- training data: [dimension_reduced_data/X.npy](dimension_reduced_data/X.npy)
- labels: [dimension_reduced_data/y.npy](dimension_reduced_data/y.npy)
- testing data: [dimension_reduced_data/X_test.npy](dimension_reduced_data/X_test.npy)

In [None]:
# Write the reduced training data, its labels, and the reduced test data
# into the output folder, one .npy file per array.
for name, array in (('X.npy', X), ('y.npy', y), ('X_test.npy', X_test)):
    np.save(os.path.join(dst, name), array)