# Preprocessing
This notebook manages the data pipeline and performs feature extraction for testing.

## Install Dependencies

In [None]:
%pip install ...

## Imports

In [5]:
import os
import cv2
import torch.utils.data

import numpy as np
import matplotlib.image as mpimg

from tqdm import tqdm
from autocrop import Cropper
from torchvision import transforms
from PIL import Image, ImageEnhance
from collections import defaultdict

## Root Path

In [6]:
# base directory that every data folder below is joined onto;
# left empty, those folders resolve relative to the current working directory
root_path = ''

# Data Pipeline
Preparing the data for feature extraction

## Sort Data
The raw image files will be renamed, labeled, and saved in sorted_test_data/

In [7]:
# raw images (and file_mapping.txt) are read from here ...
test_path = os.path.join(root_path, 'training_validation_set_0226')
# ... and the renamed, labelled copies are written here
test_dst = os.path.join(root_path, 'sorted_test_data')

# short aliases used by the rest of this cell
src, dst = test_path, test_dst

def is_image_file(filename: str) -> bool:
    """Return True if ``filename`` ends with a PNG or JPEG extension.

    The comparison is case-insensitive, so mixed-case names such as
    ``photo.Jpg`` are accepted alongside ``photo.jpg`` and ``photo.JPG``.
    """
    extensions = ('.png', '.jpg', '.jpeg')
    # str.endswith accepts a tuple, so a single call covers every extension
    return filename.lower().endswith(extensions)

# mapping from filename to label
filename_to_label = {}

# mapping from filename to PIL Image object
# NOTE(review): Image.open is lazy, so every entry keeps its underlying file
# open until the pixel data is read; a very large dataset could run out of
# file descriptors here.
filename_to_img = {}

# mapping from label to list of image files
label_to_filenames = defaultdict(list)

# each non-blank line of file_mapping.txt is split on whitespace into
# a (filename, label) pair
with open(os.path.join(src, 'file_mapping.txt')) as file_mapping:
    for line in file_mapping:
        fields = line.split()
        if not fields:
            # tolerate blank lines (e.g. a trailing empty line at end of file)
            continue

        # a non-blank line without exactly two fields still fails loudly
        filename, label = fields
        if is_image_file(filename):
            filename_to_label[filename] = label
            filename_to_img[filename] = Image.open(os.path.join(src, filename))
            label_to_filenames[label].append(filename)

# keep each label's files in a deterministic (sorted) order
for filenames in label_to_filenames.values():
    filenames.sort()

# list of labels
labels = list(label_to_filenames.keys())

def save_image(filename: str):
    """Save one loaded image into ``dst`` as ``<label>_<prefix>.jpeg``.

    ``filename`` must be a key of both ``filename_to_img`` and
    ``filename_to_label``. ``<prefix>`` is the part of ``filename`` before its
    first underscore. The module-level ``dst`` is read at call time.
    """
    # dst is already rooted at root_path, so it is used as-is; joining
    # root_path onto it again would duplicate a relative root
    # (root/root/sorted_test_data) and miss the folder the crop step reads
    parent = dst
    os.makedirs(parent, exist_ok=True)

    img = filename_to_img[filename]

    label = filename_to_label[filename]
    # presumably the leading token is a capture date -- verify against the raw filenames
    date = filename.split('_')[0]
    new_filename = '_'.join([label, date + '.jpeg'])

    # The JPEG writer rejects modes with alpha or a palette (e.g. RGBA / P
    # from PNG inputs), so convert anything it cannot store to RGB first.
    if img.mode not in ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr'):
        img = img.convert('RGB')

    img.save(os.path.join(parent, new_filename))

# main function
def sort_data(exclude_labels=('wufangyuan',)):
    """Save every mapped image under its label-based name, skipping excluded labels.

    Args:
        exclude_labels: iterable of labels whose images are not saved.
            Defaults to the single label that used to be hard-coded here.
    """
    # a set gives the same membership test without inserting a key per lookup
    excluded = set(exclude_labels)

    for label, filenames in label_to_filenames.items():
        if label in excluded:
            print(f'excluding {label}')
            continue

        for filename in filenames:
            save_image(filename)

    print('finished sorting data')

sort_data()

finished sorting data


## Crop Data
The sorted images will be cropped and saved in testing/ <br>
Data will be put into subdirectories organized by labels

In [None]:
test_path = os.path.join(root_path, 'sorted_test_data')
test_dst = os.path.join(root_path, 'testing')
rej_path = os.path.join(root_path, 'rejected')

src = test_path
dst = test_dst

# Rejected images belong in rejected/testing/[label]/. Join on the folder name
# rather than on dst itself: dst already contains root_path, so joining it
# would repeat a relative root and, for an absolute root, collapse to dst --
# sending rejected images straight into testing/.
rej = os.path.join(rej_path, os.path.basename(dst))

os.makedirs(dst, exist_ok=True)
os.makedirs(rej, exist_ok=True)

# autocropper; the two arguments are presumably the output width and height -- confirm against autocrop
cropper = Cropper(224, 224)

for filename in tqdm(os.listdir(src)):
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
        continue

    src_file = f'{src}/{filename}'

    # creates subdirectory by labels (the label is the text before the first underscore)
    label = filename.split('_')[0]
    subdir = os.path.join(dst, label)
    os.makedirs(subdir, exist_ok=True)

    # crops image; a None result is treated as a rejection
    cropped_array = cropper.crop(src_file)
    print(f'attempting to save {src_file}')

    if cropped_array is not None:
        # saves successfully cropped image in subdir
        img = Image.fromarray(cropped_array)
        img.save(f'{subdir}/{filename}')
        print(f'saved to {subdir}/{filename}')
    else:
        rejsubdir = os.path.join(rej, label)
        os.makedirs(rejsubdir, exist_ok=True)

        # saves rejected image in rejected/testing/[label]/;
        # the with-block closes the source file once it has been written out
        with Image.open(src_file) as img:
            img.save(f'{rejsubdir}/{filename}')
        print(f'rejected to {rejsubdir}/{filename}')

## Rejected Data
Handles the rejected images that the autocropper could not crop <br>
The resulting data will be saved to testing/

# Feature Extraction
Extracts important features from data

## PCA
Dimensionality reduction on the data to obtain a full-rank matrix

## LDA
Supervised dimensionality reduction on the data that will be used for testing <br>
The NumPy arrays will be saved in
- [dimension_reduced_data/X_test_pca_lda.npy](dimension_reduced_data/X_test_pca_lda.npy)
- [dimension_reduced_data/y_test_pca_lda.npy](dimension_reduced_data/y_test_pca_lda.npy)
    - for validation