# Preprocessing
This notebook manages the data pipeline and performs feature extraction for testing.

## Install Dependencies

In [None]:
%pip install ...

## Imports

In [32]:
import os
import pickle

import numpy as np
import torchvision.datasets as datasets

from math import ceil, floor
from tqdm import tqdm
from PIL import Image
from autocrop import Cropper
from torchvision import transforms
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Root Path

In [33]:
# Base directory for everything this notebook reads and writes.
# The empty string makes every path below relative to the current working directory.
root_path = ''

# Output locations under root_path; later cells write beneath train_path.
train_path = os.path.join(root_path, 'train')
grade_path = os.path.join(root_path, 'grade')

# Create both directories up front; exist_ok keeps the cell safe to re-run.
os.makedirs(train_path, exist_ok=True)
os.makedirs(grade_path, exist_ok=True)

# Data Pipeline
Preparing the data for feature extraction

## Crop Data
The source images are cropped automatically and saved in train/testing/ <br>
Images that the autocropper cannot process are saved in train/rejected/testing/

In [34]:
# Raw input images, and the folder that receives the successfully cropped ones.
src = os.path.join(root_path, 'training_validation_set_0226')
dst = os.path.join(train_path, 'testing')

# Images the autocropper cannot handle are set aside in rejected/testing/.
rej = os.path.join(train_path, 'rejected')
rejsubdir = os.path.join(rej, 'testing')

# Create every output folder once, up front. rejected/testing/ is created even
# when nothing ends up rejected, so code that lists it later finds a directory.
os.makedirs(dst, exist_ok=True)
os.makedirs(rejsubdir, exist_ok=True)

# Only files with one of these extensions (compared case-insensitively) are processed.
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

# autocropper, configured with 244 for both dimensions
# NOTE(review): 244 may have been intended as 224 - confirm against the model input size.
cropper = Cropper(244, 244)

rejected_count = 0

for filename in tqdm(os.listdir(src)):
    if not filename.lower().endswith(image_extensions):
        continue

    src_file = os.path.join(src, filename)

    # crop() yields an image array, or None when it could not crop the image
    cropped_array = cropper.crop(src_file)

    if cropped_array is not None:
        # saves successfully cropped image in train/testing/
        Image.fromarray(cropped_array).save(os.path.join(dst, filename))
    else:
        # saves rejected image in rejected/testing/; the context manager
        # closes the source file handle once the copy has been written
        with Image.open(src_file) as img:
            img.save(os.path.join(rejsubdir, filename))

        rejected_count += 1

print(f'Number of rejected images: {rejected_count}')

100%|██████████| 157/157 [00:03<00:00, 40.00it/s]

Number of rejected images: 4





## Rejected Data
Crops the rejected images that the autocropper could not process around their center and resizes them to 244×244 <br>
The results are saved to train/testing/ and also to train/rejected/testing_cropped/

In [43]:
# Images set aside by the autocrop step, and the two places the fixed copies go:
# the main testing folder plus a separate folder holding only these fallback crops.
src = os.path.join(train_path, 'rejected', 'testing')
dst = os.path.join(train_path, 'testing')

cropped = os.path.join(train_path, 'rejected', 'testing_cropped')

# src is created as well so the cell still runs when no image was rejected.
os.makedirs(src, exist_ok=True)
os.makedirs(dst, exist_ok=True)
os.makedirs(cropped, exist_ok=True)

# Only files with one of these extensions (compared case-insensitively) are processed;
# anything else in the folder (e.g. OS metadata files) is skipped.
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')

# Final size of every fallback crop.
output_size = (244, 244)

for file in os.listdir(src):
    if not file.lower().endswith(image_extensions):
        continue

    with Image.open(os.path.join(src, file)) as img:
        w, h = img.size

        # Largest centered square that fits inside the image. Using the shorter
        # side handles landscape as well as portrait images; a box based only on
        # the width would extend past the top and bottom of a landscape image.
        side = min(w, h)
        left = (w - side) // 2
        top = (h - side) // 2

        square = img.crop((left, top, left + side, top + side))
        square = square.resize(output_size)

    square.save(os.path.join(dst, file))
    square.save(os.path.join(cropped, file))