# Preprocessing Images to Numpy Arrays
### **(Jupyter Notebook)**

- Preprocessing the raw images into NumPy arrays is done locally, because the full set of raw images is too large to upload to Google Colab in practice.

- Here, we preprocess the raw images and save them as a single NumPy array file (`.npy`).

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from tensorflow.keras.preprocessing import image
from PIL import UnidentifiedImageError

In [3]:
BIG_DATA_CSV_FILE_PATH = "../raw_data/big_data.csv"
# Load the metadata CSV, discard its 'Unnamed: 0' column, and keep only the
# first 500 rows for this run.
big_data_df = (
    pd.read_csv(BIG_DATA_CSV_FILE_PATH)
    .drop(columns=['Unnamed: 0'])
    .head(500)
)
# Preview the first row.
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"['Adventure', 'Comedy', 'Fantasy']",barbie suffers a crisis that leads her to ques...


In [4]:
def get_image_array(
    df: pd.DataFrame, 
    image_folder_path: str,
    width=256, 
    height=256
    ):
    """
    Load the image `<image_folder_path>/<imdb_id>.jpg` for every row of `df`.

    Rows whose image file is missing (FileNotFoundError) or cannot be decoded
    (UnidentifiedImageError) are dropped from BOTH outputs, so row k of the
    returned DataFrame always corresponds to image k of the returned array.

    Parameters
    ----------
    df : DataFrame with an `imdb_id` column. It is not modified.
    image_folder_path : folder containing the `<imdb_id>.jpg` files.
    width, height : size every image is resized to.

    Returns
    -------
    (clean_df, clean_image_array)
        clean_df keeps the original index labels of the surviving rows.
        clean_image_array is float32 with shape
        (no. of loaded images, width, height, 3).
    """
    initial_total = df.shape[0]
    image_array = np.zeros((initial_total, width, height, 3), dtype=np.float32)
    # Read ids by position so frames with a non-default index also work.
    imdb_ids = df["imdb_id"].tolist()

    unidentified_count, not_found_count = 0, 0
    kept_positions = []
    for position in tqdm(range(initial_total)):
        image_path = f"{image_folder_path}/{imdb_ids[position]}.jpg"
        try:
            # NOTE: Keras reads target_size as (rows, cols); the order
            # (width, height) is kept so the output shape matches the
            # documented (n, width, height, 3).
            img = image.load_img(image_path, target_size=(width, height))
            image_array[position] = image.img_to_array(img)
        except UnidentifiedImageError:
            unidentified_count += 1
        except FileNotFoundError:
            not_found_count += 1
        else:
            kept_positions.append(position)

    # Filter the frame and the array with the same positions to keep them aligned.
    clean_df = df.iloc[kept_positions]
    clean_image_array = image_array[kept_positions]

    print(f"{unidentified_count} files were unidentified\n{not_found_count} files were not found")
    print(f"{len(clean_image_array)}/{initial_total} processed")
    assert clean_df.shape[0] == len(clean_image_array)
    # Return the filtered array (not the raw buffer that still contains
    # all-zero rows for the images that failed to load).
    return clean_df, clean_image_array

In [6]:
IMAGE_FOLDER_PATH = "../raw_data/big_data_images"
# Load every poster image referenced by big_data_df from the image folder.
clean_big_data_df, clean_image_array = get_image_array(
    df=big_data_df,
    image_folder_path=IMAGE_FOLDER_PATH,
)

100%|██████████| 500/500 [00:01<00:00, 289.34it/s]


0 files were unidentified
56 files were not found
444/500 processed


In [8]:
CLEAN_DATA_FOLDER_PATH = "../raw_data/clean_data"
# Persist the metadata frame as CSV (the DataFrame index is written as well).
clean_big_data_df.to_csv(path_or_buf=f"{CLEAN_DATA_FOLDER_PATH}/clean_big_data.csv")
# np.save appends the ".npy" extension to the given file name.
np.save(file=f"{CLEAN_DATA_FOLDER_PATH}/clean_image_array", arr=clean_image_array)