In [838]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os

<h1>LOADING DATASET</h1>

In [839]:
# Product IDs: read with header=None (every line is data), then flattened to a 1-D int array.
products = pd.read_csv("../dataset/productIds.csv", header=None).values.flatten().astype(int)

In [840]:
# User IDs.
# NOTE(review): unlike productIds.csv this is read WITHOUT header=None, so the first line of
# the file becomes the column name. The column name shown when this frame is displayed looks
# like a user ID — confirm whether the file really has a header row; if not, that user is
# dropped from the vocabulary. Fixing it changes the number of users seen downstream.
users = pd.read_csv("../dataset/userIds.csv")

In [841]:
# Training interactions: one (dummyUserId, productId) pair per row.
train = pd.read_parquet("../dataset/train.parquet")

In [868]:
# Preview the first five training rows (dummyUserId is stored as bytes).
train.head()

Unnamed: 0,dummyUserId,productId
0,b'PIXcm7Ru5KmntCy0yA1K',10524048
1,b'd0RILFB1hUzNSINMY4Ow',9137713
2,b'Ebax7lyhnKRm4xeRlWW2',5808602
3,b'vtigDw2h2vxKt0sJpEeU',10548272
4,b'r4GfiEaUGxziyjX0PyU6',10988173


In [842]:
# Validation interactions, same (dummyUserId, productId) layout as the training frame.
valid = pd.read_parquet("../dataset/valid.parquet")

In [869]:
# Preview the first five validation rows.
valid.head()

Unnamed: 0,dummyUserId,productId
0,b'I4Yc5Ztur3UNwY5SdvDh',10093853
1,b'nhWgcxEVY7jQ3MvvNxWL',12306408
2,b'3vriQXKwG095rvR1MSrz',11858310
3,b'MA8KmOxkGd1JQ42GXDGO',10072124
4,b'vax7VgJnswdiC8iHZSCi',10596405


<h1>MAIN RECOMMENDER MODEL</h1>

In [843]:
# Embedding demo: an Embedding layer is a trainable lookup table with one row per index.
# Here the table has 6 rows of 10 values each, randomly initialised.
# Calling the layer with index 2 returns row 2 of that table.

sample1 = tf.keras.layers.Embedding(6,10)
sample1(2)

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([ 0.03672286,  0.01259146, -0.03362755, -0.00697302,  0.03866266,
       -0.01374878, -0.02526289, -0.02193757,  0.04464361, -0.04074328],
      dtype=float32)>

In [844]:
# Full 6 x 10 weight matrix of the demo layer; its row 2 equals the lookup result for index 2.
sample1.get_weights()


[array([[-0.04974989,  0.00224668,  0.02770053,  0.00907525, -0.04447396,
         -0.02569438, -0.00124962,  0.03482344,  0.01099429, -0.01542653],
        [-0.00251231, -0.04839152, -0.04136902,  0.02916792,  0.01381086,
         -0.01527318,  0.02927932,  0.0140107 , -0.01028389, -0.04401486],
        [ 0.03672286,  0.01259146, -0.03362755, -0.00697302,  0.03866266,
         -0.01374878, -0.02526289, -0.02193757,  0.04464361, -0.04074328],
        [ 0.0492471 , -0.00565097, -0.01376202,  0.00548672,  0.01416382,
          0.04934924, -0.03893637,  0.01682074, -0.01168279, -0.02305605],
        [-0.03394373, -0.026026  , -0.04295291, -0.01551647,  0.03146568,
         -0.01396207, -0.00152576,  0.02513451,  0.00069203,  0.00052307],
        [ 0.03335932,  0.0130215 ,  0.02155601,  0.02286348, -0.03159906,
          0.03062787, -0.00088923, -0.00791873,  0.02746013,  0.03151307]],
       dtype=float32)]

In [845]:
# Display the user-ID frame (a single column; pandas truncates the middle rows).
users

Unnamed: 0,pmfkU4BNZhmtLgJQwJ7x
0,UDRRwOlzlWVbu7H8YCCi
1,QHGAef0TI6dhn0wTogvW
2,xkDvstQDkA6uJlOfslX7
3,44dM2SXR9BWX5e0ozkF8
4,mveuXd4mlxb3XxVGVqQJ
...,...
43601,1hsyohz0i37hinx6KX8x
43602,oGSJHmWWvRq8vSbMq2XA
43603,lcORJ5hemOZc1iGo9z7k
43604,5CqDquDAszqJp27P7AL8


In [846]:
# Display the product-ID array (NumPy elides the middle of long arrays).
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [847]:
class SimpleRecommender(tf.keras.Model):
    """Dot-product recommender with one embedding per user and per product.

    Raw user IDs (strings) and product IDs (ints) are mapped to embedding row
    indices through static hash tables; the score of a (user, product) pair is
    the dot product of the two embedding vectors.

    NOTE: both lookup tables use a default value of 0, so an ID that is not in
    the vocabulary silently resolves to embedding row 0 (the first user / first
    product) instead of raising.

    Args:
        dummy_users: 1-D sequence of user ID strings; position defines the
            embedding row index.
        products: 1-D sequence of integer product IDs; position defines the
            embedding row index.
        embeddingLen: Size of each embedding vector.
    """

    def __init__(self, dummy_users, products, embeddingLen):
        super().__init__()
        self.products = tf.constant(products, dtype=tf.int32)
        self.dummy_users = tf.constant(dummy_users, dtype=tf.string)

        # ID -> row-index tables. The second argument (0) is the index returned
        # for unknown keys.
        self.dummy_user_table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(
                self.dummy_users, tf.range(len(dummy_users), dtype=tf.int32)
            ),
            0,
        )
        self.product_table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(
                self.products, tf.range(len(products), dtype=tf.int32)
            ),
            0,
        )

        self.user_embedding = tf.keras.layers.Embedding(len(dummy_users), embeddingLen)
        self.product_embedding = tf.keras.layers.Embedding(len(products), embeddingLen)

        self.dot = tf.keras.layers.Dot(axes=-1)

    def call(self, inputs):
        """Score every candidate product for each user.

        Args:
            inputs: Pair ``[user, product]``. In this notebook ``user`` is a
                string tensor of shape (batch, 1) and ``product`` an int32
                tensor of shape (batch, n_candidates).

        Returns:
            Float tensor of dot-product scores with axis 1 of the Dot output
            squeezed away, i.e. (batch, n_candidates) for the shapes above.
        """
        user = inputs[0]
        product = inputs[1]

        userEmbeddingIndex = self.dummy_user_table.lookup(user)
        productEmbeddingIndex = self.product_table.lookup(product)

        userEmbeddingValue = self.user_embedding(userEmbeddingIndex)
        productEmbeddingValue = self.product_embedding(productEmbeddingIndex)

        return tf.squeeze(self.dot([userEmbeddingValue, productEmbeddingValue]), 1)

    @tf.function
    def call_item_item(self, product, k=10):
        """Return the ``k`` products whose embeddings score highest against ``product``.

        Args:
            product: Scalar int32 tensor holding one product ID.
            k: Python int, number of results to return (default 10). Clamped to
                the number of known products so ``top_k`` cannot fail on a
                small catalogue.

        Returns:
            Tuple ``(top_ids, top_scores)``: product IDs and their dot-product
            scores, highest score first. The query product is not excluded from
            the results.
        """
        product_x = self.product_table.lookup(product)
        pe = tf.expand_dims(self.product_embedding(product_x), 0)

        # Reading `.embeddings` requires the layer to be built; the lookup call
        # just above guarantees that.
        all_pe = tf.expand_dims(self.product_embedding.embeddings, 0)
        scores = tf.reshape(self.dot([pe, all_pe]), [-1])

        k = min(k, int(self.products.shape[0]))
        top_scores, top_indices = tf.math.top_k(scores, k=k)
        top_ids = tf.gather(self.products, top_indices)
        return top_ids, top_scores

In [848]:
# Convert the single-column DataFrame to a NumPy array (2-D, one column).
# Not idempotent: re-running this cell fails because an ndarray has no .to_numpy().
users = users.to_numpy()

In [849]:
# Sanity check: users should now be a NumPy array.
type(users)

numpy.ndarray

In [850]:
# Flatten the (n, 1) array to 1-D without hard-coding the number of users,
# so the cell keeps working when the user file changes size.
users = users.reshape(-1)

In [851]:
# Re-display the product IDs that will be passed to the model.
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

In [852]:
# Throwaway instance with 8-dimensional embeddings, used only for a quick forward-pass check.
sr1 = SimpleRecommender(users, products, 8)


In [853]:
# Display the flattened user-ID array.
users

array(['UDRRwOlzlWVbu7H8YCCi', 'QHGAef0TI6dhn0wTogvW',
       'xkDvstQDkA6uJlOfslX7', ..., 'lcORJ5hemOZc1iGo9z7k',
       '5CqDquDAszqJp27P7AL8', 'SSPNYxJMfuKhoe1dg24m'], dtype=object)

In [854]:
# Forward-pass check: 2 users (shape (2, 1)) x 3 candidate products each (shape (2, 3))
# should give a (2, 3) score matrix. Scores are near zero because the embeddings are untrained.
sr1([tf.constant([['UDRRwOlzlWVbu7H8YCCi'], ['QHGAef0TI6dhn0wTogvW']]), 
     tf.constant([[8650774,  9306139,  9961521], [ 12058614, 12058615, 11927550]])
     ])

<tf.Tensor: shape=(2, 3), dtype=float32, numpy=
array([[ 0.00125801, -0.00333704,  0.00276724],
       [ 0.00122792,  0.00011188,  0.00019821]], dtype=float32)>

<h1>Actual Training of the Model</h1>

In [855]:
# Selecting with a list of columns keeps a trailing axis, so each dataset
# element has shape (1,) rather than being a scalar.
dummy_user_tensor = tf.constant(train[["dummyUserId"]].values, dtype=tf.string)
product_tensor = tf.constant(train[["productId"]].values, dtype=tf.int32)

dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))

# Peek at the first (user, product) element only.
first_user, first_product = next(iter(dataset))
print(first_user)
print(first_product)


tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor([10524048], shape=(1,), dtype=int32)


In [856]:
# Negative sampling: pair each purchased product with randomly drawn products so that training lowers the scores of products the user did not purchase.

class Mapper():
    """Dataset map function that appends random negative products to each positive.

    For every (user, product) element it draws ``num_negative_products`` product
    IDs uniformly at random from ``possible_products`` and returns
    ``((user, candidates), y)`` where ``candidates`` is the purchased product
    followed by the sampled ones and ``y`` is one-hot at position 0.

    Sampling is uniform with replacement and does not exclude the purchased
    product, so a "negative" can occasionally equal the positive.
    """

    def __init__(self, possible_products, num_negative_products):
        self.num_negative_products = num_negative_products
        self.num_possible_products = len(possible_products)
        self.possible_products_tensor = tf.constant(possible_products, dtype=tf.int32)

        # Label vector shared by every element: 1 for the purchased product
        # (always first in the candidate list), 0 for each sampled negative.
        self.y = tf.one_hot(0, num_negative_products + 1)

    def __call__(self, user, product):
        # Draw positions into the product list, then translate them to product IDs.
        sampled_positions = tf.random.uniform(
            (self.num_negative_products,),
            minval=0,
            maxval=self.num_possible_products,
            dtype=tf.int32,
        )
        sampled_products = tf.gather(self.possible_products_tensor, sampled_positions)

        # Purchased product first, negatives after it.
        candidate_products = tf.concat([product, sampled_products], axis=0)
        return (user, candidate_products), self.y

In [857]:
# Smoke test for Mapper: with 10 negatives, the first element should carry
# 11 candidate products and an 11-long one-hot label.
dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor)).map(Mapper(products, 10))

(sample_user, sample_candidates), sample_label = next(iter(dataset))
print(sample_user)
print(sample_candidates)
print(sample_label)

tf.Tensor([b'PIXcm7Ru5KmntCy0yA1K'], shape=(1,), dtype=string)
tf.Tensor(
[10524048 12396055  5029445 10331014 12095610 10837307 10463170 12546171
 10262400 11036777 10164394], shape=(11,), dtype=int32)
tf.Tensor([1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.], shape=(11,), dtype=float32)


In [858]:
# Putting it all together: build a batched dataset with negative samples from an interactions DataFrame.

def get_dataset(df, products, num_negative_products, batch_size=1024):
    """Build a batched tf.data pipeline with negative sampling from an interactions frame.

    Args:
        df: DataFrame with a ``dummyUserId`` column (user IDs) and a
            ``productId`` column (integer product IDs), one interaction per row.
        products: Sequence of all product IDs that negatives are drawn from.
        num_negative_products: Number of random negatives added per interaction.
        batch_size: Number of interactions per batch (default 1024, the value
            that was previously hard-coded).

    Returns:
        A ``tf.data.Dataset`` yielding ``((user, candidates), y)`` batches, where
        ``candidates`` holds the purchased product followed by the negatives and
        ``y`` is one-hot at position 0.
    """
    # Selecting with a list of columns keeps a trailing axis of size 1, which
    # Mapper relies on when concatenating the positive with the negatives.
    dummy_user_tensor = tf.constant(df[["dummyUserId"]].values, dtype=tf.string)
    product_tensor = tf.constant(df[["productId"]].values, dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))
    dataset = dataset.map(Mapper(products, num_negative_products))
    dataset = dataset.batch(batch_size)
    # Overlap input preparation with training; does not change element order or content.
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [859]:
# Number of distinct products, i.e. rows in the product embedding table.
products.shape

(29696,)

In [860]:
# Number of users, i.e. rows in the user embedding table.
users.shape

(43606,)

In [861]:
# The model that is actually trained: 20-dimensional user and product embeddings.
model = SimpleRecommender(users, products, 20)

In [862]:
# Each output row holds raw dot-product scores for [purchased product, negatives...] and the
# target is one-hot at position 0, so this is a softmax cross-entropy over the candidates
# (from_logits=True because the scores are unnormalised).
# NOTE(review): learning_rate=250 is unusually large for SGD — presumably tuned empirically
# for these small, sparsely updated embeddings; confirm before reusing elsewhere.
model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
            optimizer=tf.keras.optimizers.legacy.SGD(learning_rate=250), 
            metrics=[tf.keras.metrics.CategoricalAccuracy()])


In [863]:
# Train for 5 epochs with 100 random negatives per interaction; the validation
# set is built the same way.
train_batches = get_dataset(train, products, 100)
valid_batches = get_dataset(valid, products, 100)
model.fit(train_batches, validation_data=valid_batches, epochs=5)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x293525ab190>

<h1>MANUAL TESTING</h1>

In [866]:
# Product ID to query for similar items (one of the IDs visible in the products array).
test_product = 12058615

In [867]:
# Look up the items whose embeddings score highest against the test product,
# returned as a (product IDs, scores) pair.
item_recs = model.call_item_item(tf.constant(test_product, dtype=tf.int32))
print("Recs for item {}: {}".format(test_product, item_recs))


Recs for item 12058615: (<tf.Tensor: shape=(10,), dtype=int32, numpy=
array([10490457, 10490474, 10577467, 10252928, 10313006,  9097969,
       10125462, 12360399, 11845206, 11028490])>, <tf.Tensor: shape=(10,), dtype=float32, numpy=
array([1.8701911, 1.7465106, 1.7169802, 1.5605607, 1.4877069, 1.4858367,
       1.4789683, 1.4783304, 1.4627651, 1.4563138], dtype=float32)>)
