# Preprocessing
This notebook manages the data pipeline and performs feature extraction for testing.

## Install Dependencies

In [1]:
%pip install ...

[31mERROR: Invalid requirement: '...'[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Imports

In [2]:
import os
import pickle

import numpy as np
import torchvision.datasets as datasets

from math import ceil, floor
from tqdm import tqdm
from PIL import Image
from autocrop import Cropper
from torchvision import transforms
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import accuracy_score


## Root Path

In [3]:
root_path = ''
train_path = train_path = os.path.join(root_path, 'train')
grade_path = os.path.join(root_path, 'grade')

os.makedirs(train_path, exist_ok=True)
os.makedirs(grade_path, exist_ok=True)

# Data Pipeline
Preparing the data for feature extraction

## Crop Data
The sorted images will be cropped and saved in testing/ <br>
Data will be put into subdirectories organized by labels

In [4]:
src = os.path.join(root_path, 'training_validation_set_0226')
dst = os.path.join(train_path, 'testing')

rej = os.path.join(train_path, 'rejected')

os.makedirs(dst, exist_ok=True)
os.makedirs(rej, exist_ok=True)

# autocropper
cropper = Cropper(244, 244)

rejected_count = 0

for filename in tqdm(os.listdir(src)):
    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
        # crops image
        cropped_array = cropper.crop(f'{src}/{filename}')

        if type(cropped_array) != type(None):
            # saves successfully cropped image in subdir
            img = Image.fromarray(cropped_array)
            img.save(f'{dst}/{filename}')
        else:
            rejsubdir = os.path.join(rej, 'testing')
            os.makedirs(rejsubdir, exist_ok=True)

            # saves rejected image in rejected/testing/
            img = Image.open(f'{src}/{filename}')
            img.save(f'{rejsubdir}/{filename}')

            rejected_count += 1

print(f'Number of rejected images: {rejected_count}')

  0%|          | 0/156 [00:00<?, ?it/s]

100%|██████████| 156/156 [00:02<00:00, 72.02it/s]

Number of rejected images: 4





## Rejected Data
Manually crops rejected data that autocropper could not recognize <br>
The data will be saved to train/testing/

In [5]:
src = os.path.join(train_path, 'rejected', 'testing')
dst = os.path.join(train_path, 'testing')

cropped = os.path.join(train_path, 'rejected', 'testing_cropped')

os.makedirs(dst, exist_ok=True)
os.makedirs(cropped, exist_ok=True)

for file in os.listdir(src):
    img = Image.open(f'{src}/{file}')
    w, h = img.size

    left = 0
    right = w
    top = floor((h - w) / 2)
    bottom = h - ceil((h - w) / 2)

    img = img.crop((left, top, right, bottom))
    img = img.resize((244, 244))
    
    img.save(f'{dst}/{file}')
    img.save(f'{cropped}/{file}')

## Import PCA, LDA, SVM Models and Scaler

In [6]:
with open('./train/dimension_reduced_data/pca.pkl', 'rb') as f:
    pca = pickle.load(f)

with open('./train/dimension_reduced_data/lda.pkl', 'rb') as f:
    lda = pickle.load(f)

with open('./train/dimension_reduced_data/standardScaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

with open('./train/dimension_reduced_data/svm.pkl', 'rb') as f:
    svm_model = pickle.load(f)

In [7]:
import pandas as pd

test_path = './train/testing/'
resolution = 50  # Ensure this matches what you used in training
initial_transforms = transforms.Compose([
    transforms.Resize((resolution, resolution)),
    transforms.ToTensor(),
])

with open('./train/dimension_reduced_data/class_to_idx.pkl', 'rb') as f:
    class_to_idx = pickle.load(f)

# Invert the dictionary to create an index to class mapping
idx_to_class = {v: k for k, v in class_to_idx.items()}


predictions = []

# Process each image in the test dataset
for image_name in tqdm(os.listdir(test_path)):
    if image_name.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
        image_path = os.path.join(test_path, image_name)
        image = Image.open(image_path)
        image = initial_transforms(image)

        # Convert image to numpy array
        image_array = np.array(image).reshape(1, -1)
        image_array = image_array.reshape((image_array.shape[0], -1))
        
        # Apply the same preprocessing as done for training data
        image_array = scaler.transform(image_array)
        image_array = pca.transform(image_array)
        image_array = lda.transform(image_array)
        
        # Predict the class
        predicted_class = svm_model.predict(image_array)
        
        # Store the results
        predictions.append([image_name, idx_to_class[int(predicted_class[0])]])

# Convert the list to a DataFrame
predictions_df = pd.DataFrame(predictions, columns=['filename', 'predictions'])

# Save the predictions to a CSV file
predictions_df.to_csv('./grade/predictions.csv', index=False)

  explained_variance_ = (S**2) / (n_samples - 1)
  0%|          | 0/155 [00:00<?, ?it/s]


ValueError: X has 1 features, but LinearDiscriminantAnalysis is expecting 563 features as input.

In [None]:
from collections import defaultdict

src = 'training_validation_set_0226'

def is_image_file(filename: str) -> bool:
    extensions = ['.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG']
    return any(filename.endswith(extension) for extension in extensions)

# mapping from filename to label
filename_to_label = {}

# mapping from filename to PIL Image object
filename_to_img = {}

# mapping from label to list of image files
label_to_filenames = defaultdict(list)

with open(src + '/file_mapping.txt') as file_mapping:
    for line in file_mapping:
        filename, label = line.split()
        if is_image_file(filename):
            filename_to_label[filename] = label
            filename_to_img[filename] = Image.open(f'{src}/{filename}')
            label_to_filenames[label].append(filename)

In [None]:
rejected_files = [
    '0208_16.jpeg',
    '0215_3.jpeg',
    '0220_11.jpeg',
    '0222_25.jpeg'
]

with open('grade/predictions.csv') as predictions:
    for line in predictions:
        file, label = line.split(',')
        if is_image_file(file):
            ground_truth: str
            ground_truth = filename_to_label[file]

            if ground_truth.strip() != label.strip():
                if file in rejected_files:
                    print('rejected')
                    
                print(f'file:{file}\tpred:{label.strip()}\t\t\ttruth:{ground_truth.strip()}')
                pred = class_to_idx[label.strip()]
                truth = class_to_idx[ground_truth.strip()]
                print(f'\t\t\tpred:{pred}\t\t\t\ttruth:{truth}')
                print('')

file:0208_26.jpeg	pred:lozanoroberto			truth:vennavellirajashekarreddy
			pred:9				truth:24

file:0208_10.jpeg	pred:zuluagagonzalezisabel			truth:yashasvi
			pred:31				truth:27

file:0220_19.jpeg	pred:liuhongji			truth:banmingkai
			pred:8				truth:1

rejected
file:0208_16.jpeg	pred:chientingwei			truth:chenziang
			pred:3				truth:2

file:0220_29.jpeg	pred:perambuduruvishnu			truth:sampagaonrahul
			pred:14				truth:17

file:0220_30.jpeg	pred:lozanoroberto			truth:vennavellirajashekarreddy
			pred:9				truth:24

file:0208_15.jpeg	pred:virvadianisargjyotin			truth:lozanoroberto
			pred:25				truth:9

rejected
file:0220_11.jpeg	pred:chientingwei			truth:huangjiaoyan
			pred:3				truth:5

file:0208_2.jpeg	pred:virvadianisargjyotin			truth:chientingwei
			pred:25				truth:3

file:0222_3.jpeg	pred:zhouchuandi			truth:selinayu
			pred:29				truth:18

