# Preprocessing
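This notebook runs the test-time data pipeline: it crops the test images, applies the saved scaler, PCA and LDA feature transforms, and predicts a label for each image with the trained SVM.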

## Install Dependencies

In [94]:
%pip install ...

[31mERROR: Invalid requirement: '...'[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Imports

In [95]:
import os
import pickle

import numpy as np
import torchvision.datasets as datasets

from math import ceil, floor
from tqdm import tqdm
from PIL import Image
from autocrop import Cropper
from torchvision import transforms
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import accuracy_score

## Root Path

In [96]:
# Base directory for every path built from it; '' resolves to the current
# working directory.
root_path = ''
train_path = os.path.join(root_path, 'train')
grade_path = os.path.join(root_path, 'grade')

# Create both working directories up front so later cells can write into them.
for directory in (train_path, grade_path):
    os.makedirs(directory, exist_ok=True)

# Data Pipeline
Preparing the data for feature extraction

## Crop Data
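Each image in the source folder is run through the autocropper and, when cropping succeeds, saved in train/testing/ <br>
The cropped images are stored flat under their original file names (no label subdirectories); images the autocropper cannot crop are copied to train/rejected/testing/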

In [97]:
# Source images, and the directory that receives the successfully cropped copies.
src = os.path.join(root_path, 'training_validation_set_0226')
dst = os.path.join(train_path, 'testing')

# Images for which the autocropper returns None are copied to rejected/testing/.
rej = os.path.join(train_path, 'rejected')
rejsubdir = os.path.join(rej, 'testing')

# Create every output directory before the loop. rejected/testing/ has to exist
# even when no image is rejected, because the following cell lists it
# unconditionally.
os.makedirs(dst, exist_ok=True)
os.makedirs(rej, exist_ok=True)
os.makedirs(rejsubdir, exist_ok=True)

# autocropper configured with a 244 x 244 output size
cropper = Cropper(244, 244)

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
rejected_count = 0

for filename in tqdm(os.listdir(src)):
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue  # not an image file

    src_file = os.path.join(src, filename)
    cropped_array = cropper.crop(src_file)

    if cropped_array is not None:
        # successful crop: save it under the original file name
        Image.fromarray(cropped_array).save(os.path.join(dst, filename))
    else:
        # crop failed: keep a copy of the original in rejected/testing/
        with Image.open(src_file) as img:
            img.save(os.path.join(rejsubdir, filename))
        rejected_count += 1

print(f'Number of rejected images: {rejected_count}')

100%|██████████| 156/156 [00:02<00:00, 71.62it/s]

Number of rejected images: 4





## Rejected Data
Center-crops the rejected images that the autocropper could not process and resizes them to 244 x 244 <br>
The results are saved to train/testing/, with a second copy in train/rejected/testing_cropped/

In [98]:
src = os.path.join(train_path, 'rejected', 'testing')
dst = os.path.join(train_path, 'testing')

cropped = os.path.join(train_path, 'rejected', 'testing_cropped')

# src is created as well so that listing it cannot fail when no image was rejected.
os.makedirs(src, exist_ok=True)
os.makedirs(dst, exist_ok=True)
os.makedirs(cropped, exist_ok=True)

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')


def center_square_box(width, height):
    """Return the (left, top, right, bottom) box of the largest centered square.

    For portrait images (height >= width) this is the full width with the
    excess height trimmed evenly from top and bottom. Landscape images are
    trimmed from the left and right instead, so the box never extends past
    the image bounds.
    """
    side = min(width, height)
    left = (width - side) // 2
    top = (height - side) // 2
    return left, top, left + side, top + side


for file in os.listdir(src):
    if not file.lower().endswith(IMAGE_EXTENSIONS):
        continue  # ignore stray non-image files

    with Image.open(os.path.join(src, file)) as img:
        square = img.crop(center_square_box(*img.size)).resize((244, 244))

        # one copy joins the test set, one is kept next to the rejected originals
        square.save(os.path.join(dst, file))
        square.save(os.path.join(cropped, file))

## Load the Saved PCA, LDA, SVM Models and Scaler

In [99]:
# Directory holding the pickled objects, built from train_path so that it
# follows root_path instead of a hard-coded './train'.
model_dir = os.path.join(train_path, 'dimension_reduced_data')


def load_pickle(name):
    """Unpickle and return the object stored in model_dir under `name`."""
    # NOTE(review): pickle.load can execute arbitrary code -- only load files
    # that come from a trusted source.
    with open(os.path.join(model_dir, name), 'rb') as f:
        return pickle.load(f)


pca = load_pickle('pca.pkl')
lda = load_pickle('lda.pkl')
scaler = load_pickle('standardScaler.pkl')
svm_model = load_pickle('svm.pkl')

In [100]:
import pandas as pd

# Cropped test images (trailing separator kept, as in the original path string).
test_path = os.path.join(train_path, 'testing', '')
resolution = 50  # Ensure this matches what you used in training
initial_transforms = transforms.Compose([
    transforms.Resize((resolution, resolution)),
    transforms.ToTensor(),
])

# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
with open(os.path.join(train_path, 'dimension_reduced_data', 'class_to_idx.pkl'), 'rb') as f:
    class_to_idx = pickle.load(f)

# Invert the dictionary to create an index to class mapping
idx_to_class = {v: k for k, v in class_to_idx.items()}

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
predictions = []

# Process each image in the test dataset; sorted so the CSV row order is
# deterministic rather than whatever order os.listdir happens to return.
for image_name in tqdm(sorted(os.listdir(test_path))):
    if not image_name.lower().endswith(IMAGE_EXTENSIONS):
        continue

    with Image.open(os.path.join(test_path, image_name)) as image:
        # Force 3 channels so every image yields the same number of features.
        # NOTE(review): assumes the training images were loaded as RGB -- confirm.
        tensor = initial_transforms(image.convert('RGB'))

    # Flatten to a single row: (1, channels * height * width)
    features = np.asarray(tensor).reshape(1, -1)

    # Apply the same preprocessing chain as the fitted objects expect
    features = scaler.transform(features)
    features = pca.transform(features)
    features = lda.transform(features)

    # Predict the class index and map it back to the class name
    predicted_class = svm_model.predict(features)
    predictions.append([image_name, idx_to_class[int(predicted_class[0])]])

# Convert the list to a DataFrame
predictions_df = pd.DataFrame(predictions, columns=['FileName', 'PredictedClass'])

# Save the predictions to a CSV file inside grade_path
predictions_df.to_csv(os.path.join(grade_path, 'predictions.csv'), index=False)

  9%|▉         | 14/155 [00:00<00:01, 133.05it/s]

ravijayanthidhanasekar
vennavellirajashekarreddy
amarisian
perambuduruvishnu
vanderlindenilona
lozanoroberto
sivarajusairevanth
zotaharsh
chientingwei
sampagaonrahul
manglaniroshanlakhi
yashasvi
zuluagagonzalezisabel
somaniachal
sivarajusairevanth
zhouchuandi
pereiranerissagodfrey
somaniachal
zhangyuanzhen
ravijayanthidhanasekar
huangjiaoyan
chenziang
zhouchuandi
sampagaonrahul
zhouchuandi
zhangyuanzhen
manglaniroshanlakhi
huangjiaoyan
zuluagagonzalezisabel
liuhongji
sivarajusairevanth
vanderlindenilona
shahmanali
zotaharsh
selinayu


 29%|██▉       | 45/155 [00:00<00:00, 232.45it/s]

liuhongji
upadhyevaishnavi
gowdarachandrashekarappasrivarsha
lishumeng
selinayu
chientingwei
liuhongji
virvadianisargjyotin
zhangyuanzhen
upadhyevaishnavi
sivarajusairevanth
selinayu
manglaniroshanlakhi
upadhyevaishnavi
mendonakshay


 54%|█████▎    | 83/155 [00:00<00:00, 289.98it/s]

chientingwei
perambuduruvishnu
ravijayanthidhanasekar
huangjiaoyan
lozanoroberto
gowdarachandrashekarappasrivarsha
banmingkai
mendonakshay
gowdarachandrashekarappasrivarsha
wukaiyue
sampagaonrahul
wukaiyue
lishumeng
yashasvi
zuluagagonzalezisabel
virvadianisargjyotin
kodipunzulanandini
gowdarachandrashekarappasrivarsha
vanderlindenilona
wukaiyue
negiparth
ravijayanthidhanasekar
ravijayanthidhanasekar
mendonakshay
zuluagagonzalezisabel
perambuduruvishnu
mendonakshay
zuluagagonzalezisabel
virvadianisargjyotin
shahmanali
chenziang
vennavellirajashekarreddy
perambuduruvishnu
virvadianisargjyotin
chenziang
somaniachal
lishumeng
amarisian
liuhongji
vanderlindenilona
kodipunzulanandini
lozanoroberto
chenziang
pereiranerissagodfrey
negiparth
chientingwei
lozanoroberto
negiparth
pereiranerissagodfrey
yashasvi
kodipunzulanandini
sampagaonrahul
lozanoroberto
kodipunzulanandini
zhouchuandi
perambuduruvishnu
upadhyevaishnavi
banmingkai
liuhongji


 80%|████████  | 124/155 [00:00<00:00, 332.67it/s]

pereiranerissagodfrey
lishumeng
virvadianisargjyotin
oraisisaac
banmingkai
mendonakshay
chientingwei
somaniachal
kodipunzulanandini
upadhyevaishnavi
amarisian
virvadianisargjyotin
vennavellirajashekarreddy
zhouchuandi
liuhongji
pereiranerissagodfrey
lishumeng


100%|██████████| 155/155 [00:00<00:00, 294.80it/s]

wukaiyue
gowdarachandrashekarappasrivarsha
zhouchuandi
manglaniroshanlakhi
huangjiaoyan
manglaniroshanlakhi
oraisisaac
selinayu
chientingwei
oraisisaac
negiparth
shahmanali
somaniachal
zotaharsh
zotaharsh
banmingkai
zuluagagonzalezisabel
oraisisaac
negiparth
chenziang
amarisian
virvadianisargjyotin
shahmanali
shahmanali
oraisisaac
zhangyuanzhen
yashasvi
amarisian
perambuduruvishnu





In [101]:
# Path of the cropped test images (trailing separator kept, as in the original string)
test_path = os.path.join(train_path, 'testing', '')

# Same transform as the per-image prediction cell
resolution = 50  # Ensure this matches what you used in training
initial_transforms = transforms.Compose([
    transforms.Resize((resolution, resolution)),
    transforms.ToTensor(),
])

# Collect the test images: case-insensitive extension match, sorted so that
# row i of `predictions` always corresponds to files[i].
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
files = sorted(f for f in os.listdir(test_path) if f.lower().endswith(IMAGE_EXTENSIONS))
if not files:
    raise FileNotFoundError(f'No test images found in {test_path}')

X_test = []
for file in tqdm(files):
    with Image.open(os.path.join(test_path, file)) as image:
        # Force 3 channels so all images stack into one array.
        # NOTE(review): assumes the training images were loaded as RGB -- confirm.
        X_test.append(np.asarray(initial_transforms(image.convert('RGB'))))

# Stack to (n, d1, d2, d3) and flatten each image into one feature row
X_test = np.array(X_test)
n, d1, d2, d3 = X_test.shape
X_test = X_test.reshape((n, d1 * d2 * d3))

# Normalize the data
X_test = scaler.transform(X_test)

# Apply PCA and LDA transformations
X_test = pca.transform(X_test)
X_test = lda.transform(X_test)

# Make predictions with the loaded SVM (the classifier is bound to
# `svm_model`; no variable named `model` is defined in this notebook)
predictions = svm_model.predict(X_test)

# predictions now contains the predicted labels for your test data


100%|██████████| 155/155 [00:00<00:00, 1640.84it/s]
