# Preprocessing
This notebook manages the data pipeline and performs feature extraction for testing.

## Install Dependencies

In [None]:
# Installs dependencies into the running kernel's environment via the %pip magic.
# The package list is left as a placeholder ("...") and must be filled in before running.
%pip install ...

## Imports

In [1]:
import os
import pickle

import numpy as np
import torchvision.datasets as datasets

from autocrop import Cropper
from PIL import Image
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from torchvision import transforms
from tqdm import tqdm

## Root Path

In [2]:
# Base directory for everything this notebook reads and writes.
# An empty string makes all derived paths relative to the current working directory.
root_path = ''
train_path = os.path.join(root_path, 'train')
grade_path = os.path.join(root_path, 'grade')

# Create both working directories up front so later cells can write into them.
os.makedirs(train_path, exist_ok=True)
os.makedirs(grade_path, exist_ok=True)

# Data Pipeline
Preparing the data for feature extraction

## Crop Data
The sorted images will be cropped and saved in testing/ <br>
Data will be put into subdirectories organized by labels

In [None]:
# Folder holding the raw images and folder receiving the cropped results.
src = os.path.join(root_path, 'training_validation_set_0226')
dst = os.path.join(train_path, 'testing')

# Images the cropper cannot process are kept under rejected/<name of dst>/.
# Only the last component of `dst` is reused: joining `rej` with the full `dst`
# path would nest the whole path a second time, and would silently discard
# `rej` altogether if `dst` were absolute.
rej = os.path.join(train_path, 'rejected')
rejsubdir = os.path.join(rej, os.path.basename(dst))

os.makedirs(dst, exist_ok=True)
os.makedirs(rej, exist_ok=True)
os.makedirs(rejsubdir, exist_ok=True)

# NOTE(review): cropped files are written flat into `dst`, while the feature
# extraction cell loads train/training through ImageFolder -- confirm which
# step arranges the images into per-label subfolders.

# Only files with these (case-insensitive) extensions are processed.
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

# autocropper configured for 244 x 244 output
cropper = Cropper(244, 244)

rejected_count = 0

for filename in tqdm(os.listdir(src)):
    if not filename.lower().endswith(image_extensions):
        continue

    src_file = os.path.join(src, filename)

    # crop() hands back None when it could not produce a crop for the image
    cropped_array = cropper.crop(src_file)

    if cropped_array is not None:
        # saves successfully cropped image in dst
        Image.fromarray(cropped_array).save(os.path.join(dst, filename))
    else:
        # saves the untouched original in the rejected folder; the context
        # manager closes the source file handle once the copy is written
        with Image.open(src_file) as img:
            img.save(os.path.join(rejsubdir, filename))

        rejected_count += 1

print(f'Number of rejected images: {rejected_count}')

## Rejected Data
Handles rejected data that autocropper could not recognize <br>
The data will be saved to testing/

# Feature Extraction
Extracts important features from data

In [None]:
# Folder read through ImageFolder and folder receiving the extracted artefacts.
data_dir = os.path.join(train_path, 'training')
dimension_reduced_data = os.path.join(train_path, 'dimension_reduced_data')

os.makedirs(dimension_reduced_data, exist_ok=True)

# Single source of truth for the image size used by the transform and the array below.
image_size = (244, 244)

# Resize every image to a fixed size, then convert it to a tensor.
initial_transforms = transforms.Compose([
    transforms.Resize(image_size),
    transforms.ToTensor(),
])

dataset = datasets.ImageFolder(data_dir, transform=initial_transforms)

# converts tensors to numpy arrays
n = len(dataset)
X = np.zeros((n, 3, *image_size))
# Labels are class indices, so keep them in an integer array instead of the
# float64 default of np.zeros.
y = np.zeros(n, dtype=np.int64)

for i, (inputs, labels) in enumerate(tqdm(dataset)):
    X[i] = inputs.numpy()
    y[i] = labels

# class to index dictionary
class_to_idx = dataset.class_to_idx
with open(os.path.join(dimension_reduced_data, 'class_to_idx.pkl'), 'wb') as handle:
    pickle.dump(class_to_idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

# class names numpy array, ordered so that position i holds the name of class index i
idx_to_class = {v: k for k, v in class_to_idx.items()}
class_names = [idx_to_class[i] for i in range(len(idx_to_class))]
np.save(os.path.join(dimension_reduced_data, 'class_names.npy'), class_names)

## Normalize Data

In [None]:
# Flatten the trailing axes of every sample into a single feature vector.
n, *image_dims = X.shape
X = X.reshape((n, int(np.prod(image_dims))))

# Fit the scaler on the flattened data, then standardise it feature by feature.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

## PCA
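Unsupervised dimension reduction: keeps the principal components that explain 90% of the variance, so that the feature matrix passed to LDA is full rank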

In [None]:
# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share of the variance.
pca = PCA(n_components=0.90)
X = pca.fit_transform(X)

## LDA
Supervised dimension reduction on data that will be used for training

In [None]:
# Fit LDA on the PCA-reduced features with their class labels, then project
# the same features onto the discriminant axes.
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
X = lda.transform(X)

## Saves data
- labels: [dimension_reduced_data/y.npy](dimension_reduced_data/y.npy)
- data: [dimension_reduced_data/X.npy](dimension_reduced_data/X.npy)

In [None]:
# Persist the reduced features and their labels next to each other.
for file_name, array in (('X.npy', X), ('y.npy', y)):
    np.save(os.path.join(dimension_reduced_data, file_name), array)