In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os
from tqdm import tqdm

In [5]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# ! mkdir -p ~/.kaggle;
# ! cp kaggle.json ~/.kaggle/kaggle.json
# ! chmod 600 ~/.kaggle/kaggle.json

In [4]:
#! kaggle competitions download -c h-and-m-personalized-fashion-recommendations

Downloading h-and-m-personalized-fashion-recommendations.zip to /content
100% 28.7G/28.7G [18:13<00:00, 29.2MB/s]
100% 28.7G/28.7G [18:13<00:00, 28.2MB/s]


In [None]:
#!unzip /content/h-and-m-personalized-fashion-recommendations.zip -d "drive/MyDrive/ds_project_dataset/"

In [7]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Sequential   #, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Rescaling, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
#from tensorflow.keras.initializers import GlorotUniform

In [97]:
def get_filepath(article_id):
    """
    Returns the filepath of the image for the given article_id, e.g. 'samples/10/108775015.jpg'
    article_id: (str) article_id e.g. '108775015'
    """
    filepath = 'drive/MyDrive/ds_project_dataset/images/0' + article_id[:2] + '/' + '0' + article_id + '.jpg'
    return filepath

# Personalized Fashion Recommendations
## Baseline model for image recognition

The images are RGB color coded.

### Load the list of the samples

In [136]:
# Shows how many files in the images folder
def list_files(dir):
    """
    Returns list of filepaths in the given directory (dir) excluding hidden files
    """
    file_list = []
    for root, dirs, files in os.walk(dir):
        for file in files:
            if not file.startswith('.'):
                file_list.append(os.path.join(root, file))
    return file_list
file_lst = list_files('drive/MyDrive/ds_project_dataset/images')
len(file_lst)

105100

In [137]:
with open("filename_product_group_name.txt", "r") as file:
    lines = file.readlines()[1:]
    data = [line.strip().split(",") for line in lines]
    df = pd.DataFrame(data, columns=["article_id", "product_group_name"])

In [138]:
article_id_we_have = [path[-13:-4] for path in file_lst]

In [139]:
# Filter out rows that we have not the correspondent file for them
df = df[df['article_id'].isin(article_id_we_have)]

In [140]:
len(df)

104803

In [141]:
df['article_filepath'] = df['article_id'].apply(get_filepath)

### Preprocess

In [142]:
# The product_group_name to have
categories_to_have = \
['Garment Upper body', 'Garment Lower body', 'Garment Full body', 'Accessories', 'Underwear',\
 'Shoes', 'Swimwear', 'Socks & Tights', 'Nightwear']

le = LabelEncoder()
le.fit(categories_to_have)
labels = df['product_group_name']
labels = pd.Series(data=le.transform(labels), index=labels.index)

In [143]:
X = df['article_filepath']

### Load and preprocess the images

In [144]:
NUMBER_OF_SAMPLES_TO_USE = 300

In [145]:
# List of image paths
#image_paths = ["samples/sample_tshirt.jpg", "samples/sample_tshirt.jpg"]
image_paths = X[:NUMBER_OF_SAMPLES_TO_USE].values
# List to store the loaded images
images = []

In [146]:
TARGET_SIZE = (100, 100)

In [147]:
# TODO: add try - exception if the image is not uploaded

In [148]:
# Load the images and convert them to numpy arrays
for path in tqdm(image_paths):
    image = load_img(path=path, target_size=TARGET_SIZE)
    image = img_to_array(image)
    images.append(image)

# Convert the list of images to a numpy array
images = np.stack(images, axis=0)

100%|██████████| 300/300 [00:12<00:00, 23.60it/s]


In [149]:
labels = labels[:NUMBER_OF_SAMPLES_TO_USE].values

In [150]:
images.shape, labels.shape

((300, 100, 100, 3), (300,))

### Train-test split

In [151]:
images_train, images_test, labels_train, labels_test = train_test_split(images, labels, train_size=0.9)

### Model building

In [152]:
NUMBER_OF_CLASSES = len(le.classes_)
NUMBER_OF_CLASSES

9

In [153]:
# Define the model
baseline_model = Sequential(layers=[
    # Rescale the image in the [0, 255] range to be in the [0, 1] range
    Rescaling(scale=1./255, input_shape=images.shape[1:]),
    
    # The first convolutional layer
    Conv2D(filters=32, kernel_size=(3,3), activation='relu'),
    
    # The max pooling layer
    MaxPooling2D(pool_size=(2, 2)),
    
    # Flatten the output
    Flatten(),
    
    # Fully connected layer
    Dense(units=32, activation='relu'),
    
    # The output layer with units=NUMBER_OF_CLASSES. Sum of outputs equals to 1.
    Dense(NUMBER_OF_CLASSES, activation='softmax')
], name="baseline_model")

In [154]:
# Check if the input and output of the model are correct
assert baseline_model.input_shape[1:] == images.shape[1:]
assert baseline_model.output_shape[1:] == (NUMBER_OF_CLASSES,)

In [155]:
# Compile the model
baseline_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [156]:
# Make Early Stopping and Checkpoints
model_checkpoint = ModelCheckpoint(filepath='best_model_baseline_CNN.h5',
                                   monitor='val_loss',
                                   save_best_only=True,
                                   save_weights_only=False,
                                   mode='min',
                                   verbose=1)
early_stop = EarlyStopping(monitor='val_loss', patience=5, mode='min')

### Model fitting

In [159]:
baseline_model.fit(x=images_train, y=labels_train, batch_size=16, epochs=20, validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f614e2807c0>

### Model evaluation

In [160]:
baseline_model.evaluate(images_test, labels_test)



[2.2205653190612793, 0.6000000238418579]

The benchmark model trained on ~300 samples demonstrates accuracy 0.6 on the test dataset, which is better than the baseline model's accuracy 0.4 (share of the majority class Garment Upper Body)

In [4]:
# !git branch
# !git add .
# !git commit -m "the baseline model for image recognition update"
# !git push origin kuzma

[kuzma 5b5f06b] the baseline model for image recognition update
 1 file changed, 11 insertions(+), 10 deletions(-)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 8 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 578 bytes | 578.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/kuzmatsukanov/kuzma_omri_noa_data_project.git
   552bd95..5b5f06b  kuzma -> kuzma
