# Imports

In [None]:
# Mount Google Drive into the Colab VM so that paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from keras.layers import GlobalAveragePooling2D
from keras.models import Model
from PIL import Image

In [None]:
import random
# Fix the seed of Python's built-in `random` module so runs are repeatable.
# NOTE(review): this does not seed NumPy or TensorFlow — confirm whether any
# later step relies on those generators and needs them seeded as well.
random.seed(42)

# Feature Extraction

In [None]:
def load_and_preprocess_image(image_path, target_size=(299, 299)):
    """Load an image from disk and prepare it as InceptionV3 input.

    Args:
        image_path: Path to the image file.
        target_size: ``(height, width)`` the image is resized to while loading,
            or ``None`` to keep the file's native size. The default matches the
            299x299 input the feature-extraction model is built with, so an
            image of any other size no longer fails later at ``predict``.
            Images that already have this size are left untouched.

    Returns:
        A float NumPy array of shape ``(height, width, 3)`` that has been
        passed through InceptionV3's ``preprocess_input``.
    """
    img = load_img(image_path, target_size=target_size)
    img_array = img_to_array(img)  # PIL image -> NumPy array (H, W, C)
    return preprocess_input(img_array)  # InceptionV3-specific pixel scaling

In [None]:
# ImageNet-pretrained InceptionV3 backbone without its classification head.
inceptionv3_model = InceptionV3(
    weights='imagenet',
    include_top=False,
    input_shape=(299, 299, 3),
)

# Freeze every backbone layer: the network is used purely as a fixed extractor.
for backbone_layer in inceptionv3_model.layers:
    backbone_layer.trainable = False

# Collapse the spatial dimensions of the last feature map into one vector per image.
pooled_features = GlobalAveragePooling2D()(inceptionv3_model.output)
feature_extraction_model = Model(
    inputs=inceptionv3_model.input,
    outputs=pooled_features,
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Root of the dataset on the mounted Drive. Layout described by the author:
#   <participant (6)>/<train|test>/<sentence (50)>/<sample>/<word>/<label>.png
# where each .png is an image of a stack.
# NOTE(review): the variable is called `training_dir` but the folder is named
# "Cropped Stacks Test" — confirm this is the intended dataset root.
training_dir = r'/content/drive/My Drive/Cropped Stacks Test'

In [None]:
image_paths = []
labels = []             # one list of per-word integer labels per sample
sentence_features = []  # one list of per-word feature vectors per sample

# Samples with fewer words than this are padded by repeating their last word.
MAX_WORDS = 5

# Walk participant / split / sentence / sample / word folders in sorted order
# so that the saved arrays come out in the same order on every run.
participants = sorted(os.listdir(training_dir))
for participant in participants:
    train_path = os.path.join(training_dir, participant, 'train') # change to 'test' when needed
    for sentence_folder in sorted(os.listdir(train_path)):
        print(sentence_folder)
        sentence_path = os.path.join(train_path, sentence_folder)
        for sample_folder in sorted(os.listdir(sentence_path)):
            print(sample_folder)
            sample_path = os.path.join(sentence_path, sample_folder)
            current_sequence = []
            current_labels = []
            for word_folder in sorted(os.listdir(sample_path)):
                word_folder_path = os.path.join(sample_path, word_folder)
                for image_name in sorted(os.listdir(word_folder_path)):
                    if not image_name.endswith(".png"):
                        continue
                    image_path = os.path.join(word_folder_path, image_name)
                    # The file name (without extension) is the integer class label.
                    label = int(os.path.splitext(image_name)[0])
                    image_array = load_and_preprocess_image(image_path)
                    # The model expects a leading batch axis: (1, H, W, 3).
                    img_array_batch = np.expand_dims(image_array, axis=0)
                    # verbose=0: avoid printing one progress bar per image.
                    features = feature_extraction_model.predict(img_array_batch, verbose=0)
                    current_sequence.append(features.flatten())  # 1D feature vector
                    current_labels.append(label)

            # A sample without any .png has no "last word" to pad with; the
            # original code raised IndexError here. Skip it and say so instead
            # of aborting the whole extraction run.
            if not current_sequence:
                print(f"WARNING: no .png images found under {sample_path}; skipping sample")
                continue

            # Pad short samples to MAX_WORDS by repeating the final word.
            missing = MAX_WORDS - len(current_sequence)
            if missing > 0:
                current_sequence.extend([current_sequence[-1]] * missing)
                current_labels.extend([current_labels[-1]] * missing)

            sentence_features.append(current_sequence)
            labels.append(current_labels)


In [None]:
# Persist the extracted feature sequences and their labels to Drive as .npy files.
# NOTE(review): the file names say "test" while the extraction loop reads the
# 'train' split unless edited — confirm the split and the file names match
# before running, otherwise existing files may be overwritten with the wrong data.
# NOTE(review): np.array(...) presumably needs every sample to have the same
# number of entries to produce a regular array — verify no sample exceeds the padded length.
np.save(r'/content/drive/My Drive/Model Files/X_test_cropped.npy', np.array(sentence_features))
np.save(r'/content/drive/My Drive/Model Files/y_test_cropped.npy', np.array(labels))