In [1]:
import os
from PIL import Image

import numpy as np

import torch.utils.data
import torchvision.datasets as datasets
from torchvision import transforms
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
import pickle
import os



# Path to the Git repo as seen from the Google Colab notebook; if that
# directory is missing, fall back to an empty prefix (plain relative paths).
path = './'
root_path = path if os.path.isdir(path) else ''
data_dir = os.path.join(root_path, 'training')

# Directory that receives the saved artifacts (label maps, reduced features).
dimension_reduced_data = os.path.join(root_path, 'dimension_reduced_data')
# exist_ok=True replaces the check-then-create pair: no race between the
# existence test and the creation, and re-running the script is harmless.
os.makedirs(dimension_reduced_data, exist_ok=True)

# First pass over the training images: resize and convert to tensors only.
# Normalization is added later, once the dataset statistics are known.
initial_transforms = transforms.Compose([
    transforms.Resize((244, 244)),
    transforms.ToTensor(),
])

dataset = datasets.ImageFolder(data_dir, transform=initial_transforms)

def get_mean_std(loader):
    """Compute the per-channel mean and standard deviation of all images.

    Running per-channel sums of the pixel values and of their squares are
    accumulated over every batch, so the result is exact for the whole
    dataset regardless of how it is split into batches (a smaller final
    batch is weighted correctly).

    Args:
        loader: Iterable yielding ``(images, labels)`` pairs, where
            ``images`` is a 4-D tensor laid out as
            (batch, channels, height, width). The labels are ignored.

    Returns:
        A ``(mean, std)`` tuple of 1-D float32 tensors with one entry per
        channel. ``std`` is the population standard deviation.

    Raises:
        ValueError: If ``loader`` yields no pixels at all.
    """
    num_pixels = 0
    channel_sum = 0.0
    channel_sq_sum = 0.0
    for images, _ in loader:
        batch_size, _num_channels, height, width = images.shape
        # Number of pixels contributing to each channel in this batch.
        num_pixels += batch_size * height * width
        # Accumulate in float64 so that E[x^2] - E[x]^2 does not lose
        # precision when summing over many pixels.
        images = images.double()
        channel_sum = channel_sum + images.sum(dim=(0, 2, 3))
        channel_sq_sum = channel_sq_sum + (images * images).sum(dim=(0, 2, 3))

    if num_pixels == 0:
        raise ValueError("loader yielded no images; cannot compute mean/std")

    mean = channel_sum / num_pixels
    # Clamp guards against a tiny negative variance caused by rounding.
    variance = (channel_sq_sum / num_pixels - mean * mean).clamp(min=0.0)
    std = variance.sqrt()

    return mean.float(), std.float()

# Estimate the normalization statistics from the un-normalized training set.
batch_size = 32
loader = torch.utils.data.DataLoader(dataset, shuffle=True, batch_size=batch_size)
mean, std = get_mean_std(loader)

# Same preprocessing as the first pass, now followed by normalization with
# the statistics computed above.
data_transforms = transforms.Compose(
    [
        transforms.Resize((244, 244)),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
)

dataset = datasets.ImageFolder(data_dir, transform=data_transforms)

# Persist the class-name -> index mapping next to the reduced data.
class_to_idx = dataset.class_to_idx
class_to_idx_file = os.path.join(dimension_reduced_data, 'class_to_idx.pkl')
with open(class_to_idx_file, 'wb') as handle:
    pickle.dump(class_to_idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Invert the mapping and list the class names ordered by their index.
idx_to_class = dict((index, name) for name, index in class_to_idx.items())
class_names = [idx_to_class[index] for index in range(len(idx_to_class))]

class_names_file = os.path.join(dimension_reduced_data, 'class_names.npy')
np.save(class_names_file, class_names)

# Materialize the whole training set as dense arrays: one image tensor and
# one label per sample.
n_train = len(dataset)
X_train = np.zeros((n_train, 3, 244, 244))
y_train = np.zeros(n_train)

for i, sample in enumerate(tqdm(dataset)):
    inputs, labels = sample
    X_train[i] = inputs.numpy()
    y_train[i] = labels






# Load the validation split with the same preprocessing and the normalization
# statistics that were computed on the training images.
data_dir = os.path.join(root_path, 'validation')
data_transforms = transforms.Compose(
    [
        transforms.Resize((244, 244)),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
)
val_dataset = datasets.ImageFolder(data_dir, transform=data_transforms)

# Dense arrays holding every validation image and its label.
n_val = len(val_dataset)
X_val = np.zeros((n_val, 3, 244, 244))
y_val = np.zeros(n_val)

for i, sample in enumerate(tqdm(val_dataset)):
    inputs, labels = sample
    X_val[i] = inputs.numpy()
    y_val[i] = labels






# Flatten every image into a single feature row: (n, d1, d2, d3) -> (n, d1*d2*d3).
n_val, d1, d2, d3 = X_val.shape
X_val = X_val.reshape(n_val, -1)

n_train, d1, d2, d3 = X_train.shape
X_train = X_train.reshape(n_train, -1)

# Standardize features using statistics fitted on the training split only,
# then apply the same scaling to the validation split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Labels are unaffected by the feature transforms; save them once here.
for labels_file, label_array in (('y_train.npy', y_train), ('y_val.npy', y_val)):
    np.save(os.path.join(dimension_reduced_data, labels_file), label_array)





# Switches selecting which reduction stages run. With both enabled, LDA is
# fitted on the PCA output rather than on the standardized pixels.
apply_pca = True
apply_lda = True

if apply_pca:
    pca = PCA(0.90)
    X_train = pca.fit_transform(X_train)
    X_val = pca.transform(X_val)
    # NOTE: only the projected features are saved; the fitted PCA model is
    # not persisted (pickle `pca` here if it is needed later).
    for features_file, features in (('X_train_pca.npy', X_train), ('X_val_pca.npy', X_val)):
        np.save(os.path.join(dimension_reduced_data, features_file), features)

if apply_lda:
    lda = LinearDiscriminantAnalysis()
    X_train = lda.fit_transform(X_train, y_train)
    X_val = lda.transform(X_val)

    # The output file names record whether PCA preceded LDA.
    # NOTE: the fitted LDA model is not persisted either.
    if apply_pca:
        train_file, val_file = 'X_train_pca_lda.npy', 'X_val_pca_lda.npy'
    else:
        train_file, val_file = 'X_train_lda.npy', 'X_val_lda.npy'

    np.save(os.path.join(dimension_reduced_data, train_file), X_train)
    np.save(os.path.join(dimension_reduced_data, val_file), X_val)

100%|██████████| 9916/9916 [00:22<00:00, 435.53it/s]
100%|██████████| 154/154 [00:00<00:00, 414.48it/s]
