In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pymysql
import io
import sys
import pickle

from detection import generate_download_signed_url_v4
from detection import get_similar_products_uri
from detection import query_product

from tensorflow.keras import backend
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow_recommenders as tfrs

from google.cloud import storage
from google.cloud import vision
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from PIL import Image
from typing import Dict, Text
from annoy import AnnoyIndex

In [3]:
#Setup Connection to mysql database
conn = pymysql.connect(
    host='35.221.181.94',
    port=int(3306),
    user="mkhoa",
    passwd='NTMK261194@dng',
    db="project",
    charset='utf8mb4')

cur = conn.cursor()

def query_item2room():
    '''
    Make dataframe for ImageDataGenerator
    
    '''
    query = f'''
    SELECT GROUP_CONCAT(room SEPARATOR ', ') as room, room_path
    FROM project.item2room a
    LEFT JOIN project.Files b ON a.item = b.id
    WHERE b.bucket_path like '%Products%'
    GROUP BY room_path 
    '''
    try:
        cur.execute(query)
    except Exception as err:
        print('ERROR BY SELECT:', err)
    result = cur.fetchall()
    result = pd.DataFrame(result, columns=['RoomID', 'RoomPath'])
    return result

def query_classes():
    '''
    Query for list of distinct room
    
    '''
    query = f'''
    SELECT distinct room
    FROM project.item2room
    '''
    try:
        cur.execute(query)
    except Exception as err:
        print('ERROR BY SELECT:', err)
    result = cur.fetchall()
    result = pd.DataFrame(result, columns=['RoomID'])
    return result

def query_item2item():
    '''
    Make dataframe for Item2Item Matching
    
    '''
    query = f'''
    WITH cte as(
    SELECT id, url, bucket, bucket_path, obj_count
    FROM project.Files
    WHERE bucket_path not like "%Design%" AND bucket_path not like "%Thumbnail%")
    SELECT 
    a.item as item1, 
    c.bucket_path as bucket1, 
    b.item as item2, 
    d.bucket_path as bucket2
    FROM project.item2room a
    INNER JOIN project.item2room b ON a.room = b.room
    INNER JOIN cte c ON a.item = c.id
    INNER JOIN cte d ON b.item = d.id
    WHERE a.item != b.item
    '''
    try:
        cur.execute(query)
    except Exception as err:
        print('ERROR BY SELECT:', err)
    result = cur.fetchall()
    result = pd.DataFrame(result, columns=['item1', 'bucket_path1', 'item2', 'bucket_path2' ])
    return result

In [None]:
def bucket2product(bucket_path):
    '''
    Reverse thumbnail bucket path to query product information
    
    
    '''
    query = f'''
    SELECT a.id, a.product, a.url
    FROM project.ProductHeader a
    LEFT JOIN project.Files b ON a.id = b.id
    WHERE b.bucket_path = {{bucket_path}}
    '''
    try:
        cur.execute(query)
    except Exception as err:
        print('ERROR BY SELECT:', err)
    result = cur.fetchone()
    return result


In [3]:
item2item = query_item2item()

In [4]:
project_id = 'abstract-veld-289612'
bucket_name = 'ftmle'
storage_client = storage.Client.from_service_account_json("./Credentials/abstract-veld-289612-327ddac80eba.json")

In [5]:
item2item

Unnamed: 0,item1,bucket_path1,item2,bucket_path2
0,204369,Images/Products/204369-0-601.963.58.jpg,202349,Images/Products/202349-0-802.017.40.jpg
1,203035,Images/Products/203035-0-902.142.85.jpg,202424,Images/Products/202424-0-198.850.43.jpg
2,202508,Images/Products/202508-0-399.030.41.jpg,202507,Images/Products/202507-0-303.270.68.jpg
3,202507,Images/Products/202507-0-303.270.68.jpg,202508,Images/Products/202508-0-399.030.41.jpg
4,203268,Images/Products/203268-0-602.178.22.jpg,202803,Images/Products/202803-0-802.418.16.jpg
...,...,...,...,...
147,204277,Images/Products/204277-0-302.580.41.jpg,204342,Images/Products/204342-0-202.085.27.jpg
148,202349,Images/Products/202349-0-802.017.40.jpg,204369,Images/Products/204369-0-601.963.58.jpg
149,204320,Images/Products/204320-0-668.040.47.jpg,204384,Images/Products/204384-0-902.874.13.jpg
150,204299,Images/Products/204299-0-103.064.39.jpg,204384,Images/Products/204384-0-902.874.13.jpg


In [2]:
def load_bucket_image(path):
    '''
    Load GCS iamge from bucket
    
    '''
    path = str(path.numpy().decode("utf-8"))
    blob = storage_client.bucket(bucket_name).get_blob(path)
    img = blob.download_as_string()

    return img

def preprocess_image(bucket_path):
    '''
    Preprocess image from bucket path
    
    '''
    img = tf.py_function(load_bucket_image, [bucket_path], tf.string)
    img = tf.image.decode_image(img, channels=3, expand_animations = False)
    img = tf.image.resize(img, (244, 244))
    img = img/255
    img = tf.cast(img, tf.float32)

    return img

# The tuples are unpacked into the positional arguments of the mapped function
def load_and_preprocess_from_path_label(path1, path2):
  return {"item1": preprocess_image(path1), "item2": preprocess_image(path2)}

# The tuples are unpacked into the positional arguments of the mapped function
def load_and_preprocess_candidate(path):
  return preprocess_image(path)

In [7]:
candidate = tf.data.Dataset.from_tensor_slices((item2item['bucket_path2']))
candidate_map = candidate.map(load_and_preprocess_candidate)

In [8]:
df_train, df_test = train_test_split(item2item, test_size=0.1)
train = tf.data.Dataset.from_tensor_slices((df_train['bucket_path1'], df_train['bucket_path2']))
train_map = train.map(load_and_preprocess_from_path_label)
test = tf.data.Dataset.from_tensor_slices((df_test['bucket_path1'],df_test['bucket_path2']))
test_map = test.map(load_and_preprocess_from_path_label)

In [9]:
#Parameter
BATCH_SIZE = 1
AUTOTUNE = tf.data.experimental.AUTOTUNE
train_set = train_map.batch(BATCH_SIZE)
test_set = test_map.batch(BATCH_SIZE)

In [10]:
candidate_set = candidate_map.batch(1)

In [11]:
class PairModel(tfrs.Model):

  def __init__(self, tower1_model, tower2_model):
    super().__init__()
    self.tower1_model: tf.keras.Model = tower1_model #movie_model
    self.tower2_model: tf.keras.Model = tower2_model #user_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    tower1_embeddings = self.tower1_model(features["item1"])
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_tower2_embeddings = self.tower2_model(features["item2"])

    # The task computes the loss and the metrics.
    return self.task(tower1_embeddings, positive_tower2_embeddings)

In [12]:
tower1_model = InceptionV3(include_top=False, input_shape=(244, 244, 3))
for layer in tower1_model.layers:
    layer.trainable = False
flat1 = Flatten()(tower1_model.layers[-1].output)
class1 = Dense(244, activation='relu', kernel_initializer='he_uniform')(flat1)
# define new model
tower1_model = Model(inputs=tower1_model.inputs, outputs=class1)

In [13]:
tower2_model = InceptionV3(include_top=False, input_shape=(244, 244, 3))
for layer in tower2_model.layers:
    layer.trainable = False
flat1 = Flatten()(tower2_model.layers[-1].output)
class1 = Dense(244, activation='relu', kernel_initializer='he_uniform')(flat1)
# define new model
tower2_model = Model(inputs=[tower2_model.inputs], outputs=[class1])

In [14]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=candidate_set.map(tower2_model))

In [15]:
task = tfrs.tasks.Retrieval(
  metrics=metrics)

In [16]:
model = PairModel(tower1_model, tower2_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [None]:
with tf.device('/device:XLA_GPU:0'):
    model.fit(train_set, epochs=20)

Epoch 1/20


In [19]:
index = tfrs.layers.ann.BruteForce(model.tower2_model)
index.index(candidate_set.map(model.tower1_model), candidate)

In [27]:
x = load_and_preprocess_candidate('Images/Products/204369-0-601.963.58.jpg')
x = tf.expand_dims(x, axis=0)

In [28]:
# Get recommendations.
_, titles = index(x)
print(f"Recommendations for Furniture: {titles[0, :10]}")

Recommendations for Furniture: [b'Images/Products/204064-0-802.671.37.jpg'
 b'Images/Products/204064-0-802.671.37.jpg'
 b'Images/Products/204064-0-802.671.37.jpg'
 b'Images/Products/204064-0-802.671.37.jpg'
 b'Images/Products/204064-0-802.671.37.jpg'
 b'Images/Products/203268-0-602.178.22.jpg'
 b'Images/Products/202349-0-802.017.40.jpg'
 b'Images/Products/202424-0-198.850.43.jpg'
 b'Images/Products/202507-0-303.270.68.jpg'
 b'Images/Products/202508-0-399.030.41.jpg']


In [19]:
model.tower1_model.save('Retrieval_tower1.h5')
model.tower2_model.save('Retrieval_tower2.h5')

In [8]:
tower1_model = tf.keras.models.load_model('./Model/Retrieval_tower1.h5')
tower2_model = tf.keras.models.load_model('./Model/Retrieval_tower2.h5')



In [46]:
model.evaluate(test_set, return_dict=True)



{'factorized_top_k': array([0.    , 0.0625, 0.0625, 0.375 , 0.6875], dtype=float32),
 'factorized_top_k/top_1_categorical_accuracy': 0.0,
 'factorized_top_k/top_5_categorical_accuracy': 0.0625,
 'factorized_top_k/top_10_categorical_accuracy': 0.0625,
 'factorized_top_k/top_50_categorical_accuracy': 0.375,
 'factorized_top_k/top_100_categorical_accuracy': 0.6875,
 'loss': 97.43084716796875,
 'regularization_loss': 0,
 'total_loss': 97.43084716796875}

In [9]:
index = AnnoyIndex(244, "dot")

In [26]:
candidate_embeddings = candidate.enumerate().map(lambda idx, bucket_path2: (idx, bucket_path2, model.tower2_model(tf.expand_dims(load_and_preprocess_candidate(bucket_path2), axis=0))))

In [27]:
candidate_to_path = dict((idx, title) for idx, title, _ in candidate_embeddings.as_numpy_iterator())
pickle.dump(candidate_to_path, open("'./Model/candidate_to_path.p", "wb"))

In [18]:
with open('./Model/candidate_to_path.p', 'rb') as handle:
    candidate_to_path = pickle.load(handle)

EOFError: Ran out of input

In [15]:
candidate_to_path

NameError: name 'candidate_to_path' is not defined

In [44]:
# We unbatch the dataset because Annoy accepts only scalar (id, embedding) pairs.
for candidate_id, _, embedding in candidate_embeddings.as_numpy_iterator():
  index.add_item(candidate_id, embedding[0])

In [45]:
index.build(10)

True

In [46]:
index.save('index.ann')

True

In [15]:
index = AnnoyIndex(244, "dot")
index.load('index.ann')

True

In [45]:
x = load_and_preprocess_candidate('Images/Products/204369-0-601.963.58.jpg')
x = tf.expand_dims(x, axis=0)
query_embedding = model.tower1_model(x)
candidates = index.get_nns_by_vector(query_embedding[0], 10)
print(f"Candidates: {[candidate_to_path[x].decode('utf-8') for x in candidates]}.")

Candidates: ['Images/Products/203750-0-202.016.82.jpg', 'Images/Products/203750-0-202.016.82.jpg', 'Images/Products/203046-0-702.068.04.jpg', 'Images/Products/203171-0-502.345.82.jpg', 'Images/Products/203171-0-502.345.82.jpg', 'Images/Products/203649-0-790.462.60.jpg', 'Images/Products/202424-0-198.850.43.jpg', 'Images/Products/203293-0-402.808.43.jpg', 'Images/Products/203293-0-402.808.43.jpg', 'Images/Products/203408-0-702.535.41.jpg'].


In [44]:
item2item[item2item['bucket_path1'] == 'Images/Products/204369-0-601.963.58.jpg']

Unnamed: 0,item1,bucket_path1,item2,bucket_path2
0,204369,Images/Products/204369-0-601.963.58.jpg,202349,Images/Products/202349-0-802.017.40.jpg
