In [1]:
from pymongo import MongoClient
import pandas as pd
import itertools
from functools import reduce


def create_pair_items(lst_items):
    """Return every unordered pair of entries of ``lst_items`` as 2-tuples.

    Pairs are formed by position and emitted in input order, so repeated
    values yield repeated pairs. Fewer than two entries gives an empty list.
    """
    pool = tuple(lst_items)
    return [
        (first, second)
        for offset, first in enumerate(pool, start=1)
        for second in pool[offset:]
    ]


# MongoClient() is called without arguments, so pymongo's default connection
# settings apply.
mongo = MongoClient()
# Interaction events: only the user and job identifiers are requested.
data = list(mongo.data_4.events.find({}, {"userid": 1, "jobId": 1}))
# Job metadata: every field except "tag" and "_id".
meta_data = list(mongo.data_4.data.find({}, {"tag": 0, "_id": 0}))

df = pd.DataFrame(data)
# One row per user, holding the list of jobs found in that user's events.
df_1 = df.groupby(["userid"]).aggregate({"jobId": list}).reset_index()
df_1["num_items"] = df_1.jobId.str.len()
# Keep only users with at least two jobs. The filter result used to be
# computed and thrown away; single-job users contribute no pairs anyway, so
# the resulting pair list is unchanged.
df_1 = df_1[df_1["num_items"] > 1]

# Flatten the per-user pair lists in a single linear pass. Repeated list
# concatenation is quadratic, and reduce() without an initial value raises
# TypeError when there are no users at all.
item_item_data = list(
    itertools.chain.from_iterable(
        create_pair_items(user_jobs) for user_jobs in df_1.jobId
    )
)
df_2 = pd.DataFrame(item_item_data, columns=["item1", "item2"])

meta_df = pd.DataFrame(meta_data)
# Attach the metadata of both jobs of every pair. After the second merge the
# overlapping metadata columns carry the "_item1" / "_item2" suffixes.
df_2 = df_2.merge(meta_df, how="left", left_on="item1", right_on="jobId")
df_2 = df_2.merge(
    meta_df,
    how="left",
    left_on="item2",
    right_on="jobId",
    suffixes=("_item1", "_item2"),
)

In [3]:
import os
import tempfile

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [4]:
# Columns of the pair frame that are fed to the two towers.
pair_columns = [
    "item1",
    "title_item1",
    "location_item1",
    "level_item1",
    "item2",
    "title_item2",
    "location_item2",
    "level_item2",
]
tensor_slices = {
    column: df_2[column].values.tolist() for column in pair_columns
}

# Candidate catalogue: one element per metadata row, keyed like the
# candidate-side features of a pair.
job_features = {
    "item2": meta_df["jobId"].values.tolist(),
    "location_item2": meta_df["location"].values.tolist(),
    "level_item2": meta_df["level"].values.tolist(),
}
jobs = tf.data.Dataset.from_tensor_slices(job_features)

# Training examples: one element per (item1, item2) pair.
items = tf.data.Dataset.from_tensor_slices(tensor_slices)

2024-02-05 10:42:55.407070: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-05 10:42:55.465191: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-05 10:42:55.465367: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [4]:
# Sorted, de-duplicated vocabularies for the three categorical features,
# taken from the job metadata frame.
unique_items, unique_location, unique_level = (
    np.unique(meta_df[column].values.tolist())
    for column in ("jobId", "location", "level")
)

In [6]:
class Movie1Model(tf.keras.Model):
  """Embeds the first job of a pair from its id, location and level.

  Each feature is mapped through a ``StringLookup`` built from the matching
  module-level vocabulary and then an ``Embedding``. The three embeddings are
  concatenated along axis 1.
  """

  def __init__(self, embedding_dim=32):
    """Builds the three lookup + embedding stacks.

    Args:
      embedding_dim: Output width of each individual feature embedding.
        Defaults to 32, the value that used to be hard-coded.
    """
    super().__init__()

    # NOTE: despite the attribute name, this stack embeds the job id that is
    # passed in under the "item1" key (see call()).
    self.title_embedding = self._lookup_embedding(unique_items, embedding_dim)
    self.location_embedding = self._lookup_embedding(
        unique_location, embedding_dim)
    self.level_embedding = self._lookup_embedding(unique_level, embedding_dim)

  @staticmethod
  def _lookup_embedding(vocabulary, embedding_dim):
    """Returns a StringLookup -> Embedding stack for one vocabulary."""
    return tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=vocabulary, mask_token=None),
      # One row more than the vocabulary size, for the index the lookup
      # assigns to out-of-vocabulary strings.
      tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dim)
    ])

  def call(self, features):
    """Concatenates the id, location and level embeddings of ``features``.

    Args:
      features: Mapping with the keys "item1", "location_item1" and
        "level_item1".

    Returns:
      The three embeddings concatenated along axis 1.
    """
    return tf.concat([
        self.title_embedding(features["item1"]),
        self.location_embedding(features["location_item1"]),
        self.level_embedding(features["level_item1"])
    ], axis=1)


class Movie2Model(tf.keras.Model):
  """Embeds the second job of a pair from its id, location and level.

  Each feature is mapped through a ``StringLookup`` built from the matching
  module-level vocabulary and then an ``Embedding``. The three embeddings are
  concatenated along axis 1.
  """

  def __init__(self, embedding_dim=32):
    """Builds the three lookup + embedding stacks.

    Args:
      embedding_dim: Output width of each individual feature embedding.
        Defaults to 32, the value that used to be hard-coded.
    """
    super().__init__()

    # NOTE: despite the attribute name, this stack embeds the job id that is
    # passed in under the "item2" key (see call()).
    self.title_embedding = self._lookup_embedding(unique_items, embedding_dim)
    self.location_embedding = self._lookup_embedding(
        unique_location, embedding_dim)
    self.level_embedding = self._lookup_embedding(unique_level, embedding_dim)

  @staticmethod
  def _lookup_embedding(vocabulary, embedding_dim):
    """Returns a StringLookup -> Embedding stack for one vocabulary."""
    return tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=vocabulary, mask_token=None),
      # One row more than the vocabulary size, for the index the lookup
      # assigns to out-of-vocabulary strings.
      tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dim)
    ])

  def call(self, features):
    """Concatenates the id, location and level embeddings of ``features``.

    Args:
      features: Mapping with the keys "item2", "location_item2" and
        "level_item2".

    Returns:
      The three embeddings concatenated along axis 1.
    """
    return tf.concat([
        self.title_embedding(features["item2"]),
        self.location_embedding(features["location_item2"]),
        self.level_embedding(features["level_item2"])
    ], axis=1)

In [7]:
class QueryModel(tf.keras.Model):
  """Query tower: ``Movie1Model`` embeddings followed by a dense stack."""

  def __init__(self, layer_sizes):
    """Creates the embedding model and the dense stack.

    Args:
      layer_sizes:
        A list of integers where the i-th entry is the number of units of the
        i-th dense layer. Every layer except the last uses ReLU; the last one
        has no activation.
    """
    super().__init__()

    self.embedding_model = Movie1Model()

    self.dense_layers = tf.keras.Sequential()

    # Walk the sizes once and decide the activation from the position.
    final_position = len(layer_sizes) - 1
    for position, units in enumerate(layer_sizes):
      activation = None if position == final_position else "relu"
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Embeds ``inputs`` and passes the result through the dense stack."""
    return self.dense_layers(self.embedding_model(inputs))


class CandidateModel(tf.keras.Model):
  """Candidate tower: ``Movie2Model`` embeddings followed by a dense stack."""

  def __init__(self, layer_sizes):
    """Creates the embedding model and the dense stack.

    Args:
      layer_sizes:
        A list of integers where the i-th entry is the number of units of the
        i-th dense layer. Every layer except the last uses ReLU; the last one
        has no activation.
    """
    super().__init__()

    self.embedding_model = Movie2Model()

    self.dense_layers = tf.keras.Sequential()

    # ReLU for every layer but the final one, which stays linear. With an
    # empty ``layer_sizes`` the zip below yields nothing and no layer is added.
    activations = ["relu"] * (len(layer_sizes) - 1) + [None]
    for units, activation in zip(layer_sizes, activations):
      self.dense_layers.add(
          tf.keras.layers.Dense(units, activation=activation))

  def call(self, inputs):
    """Embeds ``inputs`` and passes the result through the dense stack."""
    embedded = self.embedding_model(inputs)
    return self.dense_layers(embedded)

In [12]:
class CFModel(tfrs.models.Model):
  """Two-tower retrieval model trained on (item1, item2) job pairs."""

  def __init__(self, layer_sizes):
    """Builds both towers and the retrieval task.

    Args:
      layer_sizes: Dense layer sizes, forwarded unchanged to both
        ``QueryModel`` and ``CandidateModel``.
    """
    super().__init__()
    self.query_model = QueryModel(layer_sizes)
    self.candidate_model = CandidateModel(layer_sizes)

    # The top-K metric ranks against the module-level ``jobs`` dataset,
    # encoded by the candidate tower in batches of 128.
    candidate_embeddings = jobs.batch(128).map(self.candidate_model)
    top_k_metrics = tfrs.metrics.FactorizedTopK(
        candidates=candidate_embeddings)
    self.task = tfrs.tasks.Retrieval(metrics=top_k_metrics)

  def compute_loss(self, features, training=False):
    """Returns the retrieval task's result for one batch of job pairs.

    Args:
      features: Mapping holding the "item1"/"location_item1"/"level_item1"
        and "item2"/"location_item2"/"level_item2" entries of the batch.
      training: When true, metric computation is skipped.
    """
    # Hand each tower only the keys it reads, so its input structure does not
    # depend on whatever else the batch dictionary contains.
    query_keys = ("item1", "location_item1", "level_item1")
    candidate_keys = ("item2", "location_item2", "level_item2")

    query_embeddings = self.query_model(
        {key: features[key] for key in query_keys})
    movie_embeddings = self.candidate_model(
        {key: features[key] for key in candidate_keys})

    evaluating = not training
    return self.task(
        query_embeddings, movie_embeddings, compute_metrics=evaluating)

In [13]:
tf.random.set_seed(42)

# Derive every size from the dataset itself. The previous fixed values
# (buffer 100_000, take 60_000, skip 80_000 / take 20_000) left the test set
# empty for fewer than 80k pairs, never used 20k examples, and shuffled only
# within a 100k window on larger data.
TRAIN_FRACTION = 0.8
num_pairs = int(items.cardinality().numpy())
train_size = int(num_pairs * TRAIN_FRACTION)

# shuffle() rejects a buffer size of zero, hence the max(..., 1) guards.
# reshuffle_each_iteration=False keeps the order stable across iterations so
# take() and skip() below select disjoint examples.
shuffled = items.shuffle(
    max(num_pairs, 1), seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_size)
test = shuffled.skip(train_size)

# NOTE: the name is historical; only the test pipeline is actually cached.
cached_train = train.shuffle(max(train_size, 1)).batch(2048)
cached_test = test.batch(4096).cache()

In [14]:
num_epochs = 300

# Three dense layers per tower, trained with Adagrad.
model = CFModel([256, 128, 64])
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=optimizer)

# Validation runs every 5th epoch; progress output is silenced.
fit_options = dict(
    validation_data=cached_test,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0,
)
one_layer_history = model.fit(cached_train, **fit_options)

2023-03-24 13:57:38.362363: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x21da2060 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-03-24 13:57:38.362380: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2023-03-24 13:57:38.365376: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.
2023-03-24 13:57:38.432304: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You ma


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p

In [16]:
def _id_with_embedding(job_batch):
  """Pairs each job id in the batch with its candidate-tower embedding."""
  return job_batch["item2"], model.candidate_model(job_batch)


# Brute-force top-20 retrieval: queries are encoded by the query tower and
# matched against the embeddings of every job in ``jobs``.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model, k=20)
index.index_from_dataset(jobs.batch(128).map(_id_with_embedding))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f487459f580>

In [33]:
# Query the index with one job. The call returns two values: the first was
# previously bound to a name that was a pasted job id, and is now ``scores``;
# the second holds the retrieved job ids shown below.
query_features = {
    "item1": np.array(["Job_Junior_Data_Analyst_DN_Junior_NE83IQ"]),
    "location_item1": np.array(["DN"]),
    "level_item1": np.array(["Junior"]),
}
scores, pred = index(query_features)
pred.numpy()

array([[b'Job_Junior_Data_Analyst_DN_Junior_NE83IQ',
        b'Job_Junior_Data_Analyst_DN_Junior_BVRE4S',
        b'Job_Junior_Data_Analyst_DN_Junior_KBFFEL',
        b'Job_Fresher_Data_Analyst_DN_Fresher_7BZI9S',
        b'Job_Junior_Data_Analyst_other_Junior_5NGTDA',
        b'Job_Junior_Data_Analyst_other_Junior_667A55',
        b'Job_Fresher_Data_Analyst_DN_Fresher_D6F3AE',
        b'Job_Junior_Data_Analyst_other_Junior_TPULME',
        b'Job_Junior_Data_Analyst_other_Junior_3XPCDF',
        b'Job_Fresher_Data_Analyst_DN_Fresher_RL9RQH',
        b'Job_Fresher_Data_Analyst_other_Fresher_MPTE37',
        b'Job_Junior_Data_Analyst_HN_Junior_D5PMAT',
        b'Job_Junior_Data_Scientist_DN_Junior_39L9V4',
        b'Job_Middle_Data_Analyst_other_Middle_3EUHOP',
        b'Job_Middle_Data_Analyst_other_Middle_ZC51JS',
        b'Job_Fresher_Data_Analyst_other_Fresher_50NE1S',
        b'Job_Fresher_Data_Analyst_other_Fresher_V1E0HP',
        b'Job_Fresher_Data_Analyst_other_Fresher_GQEHVS',


In [29]:
# Preview only the first entries; printing the whole vocabulary floods the
# notebook output.
unique_items[:20]

array(['Job_Fresher_Backend_Developer_HCM_Fresher_76GM1G',
       'Job_Fresher_Backend_Developer_HCM_Fresher_EIC94O',
       'Job_Fresher_Backend_Developer_HCM_Fresher_FZGHQV',
       'Job_Fresher_Backend_Developer_HCM_Fresher_XNP1C4',
       'Job_Fresher_Backend_Developer_HCM_Fresher_YU59N5',
       'Job_Fresher_Backend_Developer_HN_Fresher_2P3ALT',
       'Job_Fresher_Backend_Developer_HN_Fresher_CC8VJN',
       'Job_Fresher_Backend_Developer_HN_Fresher_L22NC0',
       'Job_Fresher_Backend_Developer_other_Fresher_3GRSYJ',
       'Job_Fresher_Backend_Developer_other_Fresher_BRARX9',
       'Job_Fresher_Backend_Developer_other_Fresher_IBM1QQ',
       'Job_Fresher_Backend_Developer_other_Fresher_OW9IFH',
       'Job_Fresher_Backend_Developer_other_Fresher_SRLBO5',
       'Job_Fresher_Big_Data_Engineer_HCM_Fresher_19SU0T',
       'Job_Fresher_Big_Data_Engineer_HCM_Fresher_1RJXI1',
       'Job_Fresher_Big_Data_Engineer_HCM_Fresher_LWFD89',
       'Job_Fresher_Big_Data_Engineer_HCM_Fresher