In [None]:
# basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# image manipulation
from PIL import Image

# sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

#tensorflow
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical


In [None]:
# Load every .jpg / .png file from the image directory, convert it to RGB and
# resize it to a common shape. Produces two parallel lists:
#   resized_images -> resized RGB PIL images
#   image_names    -> file names without their extension
image_dir = os.path.join(os.path.expanduser('~'), 'Downloads', 'images')
target_size = (224, 224)  # (width, height) handed to Image.resize
resized_images = []
image_names = []

# os.listdir returns entries in arbitrary order; sorting makes the order of
# resized_images / image_names reproducible from run to run.
for filename in sorted(os.listdir(image_dir)):
    if not filename.endswith(('.jpg', '.png')):
        continue

    file_path = os.path.join(image_dir, filename)

    # Image.open is lazy and keeps the underlying file open; the context manager
    # closes it as soon as the converted copy has been created.
    with Image.open(file_path) as image:
        # Force 3-channel RGB whatever the source mode is (e.g. RGBA or palette PNGs)
        rgb_image = image.convert('RGB')

    resized_image = rgb_image.resize(target_size)

    resized_images.append(resized_image)  # records resized images
    image_names.append(os.path.splitext(filename)[0])  # records names of resized images

print(f'✅ Resized {len(resized_images)} images in total!')

In [None]:
# Convert each resized PIL image to a numpy array and gather them into one array
images_arr = np.array([np.array(resized) for resized in resized_images])
images_arr.shape

In [None]:
# Load the spreadsheet describing the images.
file_path = os.path.join(os.path.expanduser('~'), 'code', 'jackanichp', 'pill_pic', 'data_collection', 'directory_consumer_grade_images.xlsx')
data = pd.read_excel(file_path)

# Create a dataframe using only the images that were downloaded and resized.
# Image names are matched against the dataframe index, so convert them to int first.
image_names = [int(name) for name in image_names]

# Fail loudly if an image has no matching row: silently dropping it would leave
# the dataframe shorter than the image list built from the same names.
missing_ids = sorted(set(image_names) - set(data.index))
if missing_ids:
    raise KeyError(
        f'{len(missing_ids)} image(s) have no row in the spreadsheet, e.g. {missing_ids[:5]}'
    )

# Select by label list (not by boolean mask) so the rows come back in the SAME ORDER
# as image_names. A boolean mask keeps the spreadsheet order, which does not match
# the directory-listing order of the images and would pair images with wrong labels.
data = data.loc[image_names]
assert len(data) == len(image_names), 'spreadsheet index contains duplicate image ids'
data.tail(3)

In [None]:
# Number of rows (images) recorded for each pill name
images_per_pill = data['Name'].value_counts()
images_per_pill

In [None]:
# Instantiate an ordinal encoder and fit it on the NDC11 column;
# categories_ lists the distinct NDC11 values it learned.
encoder = OrdinalEncoder().fit(data[['NDC11']])
encoder.categories_

In [None]:
# Replace each NDC11 value by its ordinal code and store it as a new column
ndc11_codes = encoder.fit_transform(data[['NDC11']])  # 2D result: one column of codes
data['encoded_NDC11'] = ndc11_codes.ravel()

# How many rows fall into each encoded class
data['encoded_NDC11'].value_counts()

In [None]:
# Inspect the dataframe now that the encoded_NDC11 column has been added
data

In [None]:
# Normalization of pixel values to be between 0 and 1.
# The dtype guard makes this cell idempotent: the raw array built from the PIL
# images is uint8, so once it has been scaled (float) a re-run is a no-op instead
# of silently dividing by 255 a second time.
# float32 is used rather than the default float64 upcast: it halves the memory
# footprint and is the default float type Keras works with.
if images_arr.dtype == np.uint8:
    images_arr = images_arr.astype(np.float32) / 255.0

In [None]:
# Shape of the image array after normalization
np.shape(images_arr)

In [None]:
# Split images / encoded labels into train, validation and test sets:
# 30% is held out for test, then 30% of the remainder for validation.
# A fixed random_state makes both shuffles reproducible across kernel restarts.
split_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    images_arr, data['encoded_NDC11'], test_size=0.3, random_state=split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.3, random_state=split_seed
)

print(
    f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}\n'
    f'X_val shape: {X_val.shape}, y_val shape: {y_val.shape}\n'
    f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}'
)

In [None]:
# One-hot encode the ordinal labels of every split.
# num_classes is pinned to the number of categories the encoder learned. Without it,
# to_categorical infers the width from the largest label present in each split, so
# train / val / test could end up with a different number of columns.
num_classes = len(encoder.categories_[0])  # single encoded feature (NDC11) -> index 0
y_train = to_categorical(y_train.values, num_classes=num_classes)
y_val = to_categorical(y_val.values, num_classes=num_classes)
y_test = to_categorical(y_test.values, num_classes=num_classes)