In [1]:
import os
import tempfile
import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

plt.style.use('seaborn-whitegrid')

2023-04-21 18:14:44.234380: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  plt.style.use('seaborn-whitegrid')


In [2]:
import os
import glob
from pymongo import MongoClient
import pandas as pd

database = "vnw_job"
col = "items"

mongo = MongoClient("mongodb://10.122.6.17:27017")
df_meta = pd.DataFrame(list(mongo[database][col].find()))
df_meta['jobTitle'] = df_meta.categoricalProps.apply(lambda x: x["jobTitle"][0])
df_meta['skills'] = df_meta.categoricalProps.apply(lambda x: ','.join(x["skills"]))
df_meta['industries'] = df_meta.categoricalProps.apply(lambda x: ','.join(x["industries"]))
df_meta['locations'] = df_meta.categoricalProps.apply(lambda x: ','.join(x["locations"]))
df_meta['jobLevel'] = df_meta.categoricalProps.apply(lambda x: ','.join(x["jobLevel"]))
df_meta['availableDate'] = df_meta.dateProps.apply(lambda x: x["availableDate"])
df = pd.read_parquet(glob.glob("/home/spark/ylv/data/navidata/*.parquet"))

In [3]:
ratings = df.merge(df_meta, left_on="targetEntityId", right_on="_id", how="left")

In [4]:
ratings = ratings.fillna('UNK')

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
rating_1st, rating_2nd = train_test_split(ratings, test_size=0.2)

In [7]:
rating_1st.shape, rating_2nd.shape

((8032554, 13), (2008139, 13))

In [8]:
%%time

movies = tf.data.Dataset.from_tensor_slices({
    "job_id": df_meta._id.values.tolist(),
    "category": df_meta.industries.values.tolist(),
    "location": df_meta.locations.values.tolist(),
    "level": df_meta.jobLevel.values.tolist(),
    "job_title": df_meta.jobTitle.values.tolist(),
    "skill_text": df_meta.skills.values.tolist()
})

tensor_slices = {
    "user_id": rating_1st.entityId.values.tolist(),
    "job_id": rating_1st.targetEntityId.values.tolist(),
    "location": rating_1st.locations.values.tolist(),
    "category": rating_1st.industries.values.tolist(),
    "level": rating_1st.jobLevel.values.tolist(),
    "job_title": rating_1st.jobTitle.values.tolist(),
    "skill_text": rating_1st.skills.values.tolist()
}

2023-04-21 18:15:44.495905: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 18:15:44.517191: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 18:15:44.517337: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 18:15:44.517705: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

CPU times: user 788 ms, sys: 343 ms, total: 1.13 s
Wall time: 1.15 s


In [9]:
%%time

ds_rating_1st = tf.data.Dataset.from_tensor_slices(tensor_slices)

CPU times: user 53.1 s, sys: 1.89 s, total: 55 s
Wall time: 55 s


In [10]:
%%time

from functools import reduce

unique_job_titles = np.unique(df_meta.jobTitle.values.tolist())
unique_job_ids = np.unique(df_meta._id.values.tolist())
unique_user_ids = np.unique(rating_1st.entityId.values.tolist())
unique_locations = np.unique(reduce(lambda x, y: x + "," + y, df_meta.locations.values.tolist()).split(","))
unique_skills = np.unique(reduce(lambda x, y: x + "," + y, df_meta.skills.values.tolist()).split(","))
# job_title_vocabs = np.unique(reduce(lambda x, y: x + y, df_meta.title.str.split(" ").tolist()))
unique_category = np.unique(reduce(lambda x, y: x + "," + y, df_meta.industries.values.tolist()).split(","))

CPU times: user 6.74 s, sys: 1.21 s, total: 7.95 s
Wall time: 7.95 s


In [11]:
%%time

unique_levels = np.unique(reduce(lambda x, y: x + "," + y, df_meta.jobLevel.values.tolist()).split(","))

CPU times: user 136 ms, sys: 613 µs, total: 136 ms
Wall time: 138 ms


In [12]:
class UserModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None, name='user_stringlookup_layer'),
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32, name='user_embedding_layer'),
    ], name='user_embedding_model')

  def call(self, inputs):
    # Take the input dictionary, pass it through each input layer,
    # and concatenate the result.
    return tf.concat([
        self.user_embedding(inputs["user_id"])
    ], axis=1)

In [13]:
class QueryModel(tf.keras.Model):
  """Model for encoding user queries."""

  def __init__(self, layer_sizes):
    """Model for encoding user queries.

    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()

    # We first use the user model for generating embeddings.
    self.embedding_model = UserModel()

    # Then construct the layers.
    self.dense_layers = tf.keras.Sequential()

    # Use the ReLU activation for all but the last layer.
    for layer_size in layer_sizes[:-1]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

    # No activation for the last layer.
    for layer_size in layer_sizes[-1:]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size))

  def call(self, inputs):
    feature_embedding = self.embedding_model(inputs)
    print(feature_embedding)
    return self.dense_layers(feature_embedding)

In [14]:
def tokenization(t):
    return tf.strings.split(t, ',')

def tokenization_1(t):
    return tf.strings.split(t, ' ')


class MovieModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    self.job_id_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_job_ids,mask_token=None),
      tf.keras.layers.Embedding(len(unique_job_ids) + 1, 32)
    ])
    self.location_embedding = tf.keras.Sequential([
      tf.keras.layers.TextVectorization(
          max_tokens=100,
          standardize=None,
          split=tokenization,
          vocabulary=unique_locations,
          pad_to_max_tokens=True
      ),
      tf.keras.layers.Embedding(len(unique_locations) + 1, 32),
      tf.keras.layers.GlobalAvgPool1D()
    ])
    self.level_embedding = tf.keras.Sequential([
      tf.keras.layers.TextVectorization(
          max_tokens=100,
          standardize=None,
          split=tokenization,
          vocabulary=unique_levels,
          pad_to_max_tokens=True
      ),
      tf.keras.layers.Embedding(len(unique_levels) + 1, 32),
      tf.keras.layers.GlobalAvgPool1D()
    ])
    self.category_embedding = tf.keras.Sequential([
      tf.keras.layers.TextVectorization(
          max_tokens=100,
          standardize=None,
          split=tokenization,
          vocabulary=unique_category,
          pad_to_max_tokens=True
      ),
      tf.keras.layers.Embedding(len(unique_category) + 1, 64),
      tf.keras.layers.GlobalAvgPool1D()
    ])
#     self.skill_embedding = tf.keras.Sequential([
#       tf.keras.layers.TextVectorization(
#           max_tokens=100,
#           standardize=None,
#           split=tokenization,
#           vocabulary=unique_skills,
#           pad_to_max_tokens=True
#       ),
#       tf.keras.layers.Embedding(len(unique_skills) + 1, 32),
#       tf.keras.layers.GlobalAvgPool1D()
#     ])

  def call(self, features):
    return tf.concat([
        self.job_id_embedding(features["job_id"]),
        self.category_embedding(features["category"]),
        self.location_embedding(features["location"]),
        self.level_embedding(features["level"]),
#         self.skill_embedding(features["skill_text"])
    ], axis=1)

In [15]:
class CandidateModel(tf.keras.Model):
  """Model for encoding movies."""

  def __init__(self, layer_sizes):
    """Model for encoding movies.

    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()

    self.embedding_model = MovieModel()

    # Then construct the layers.
    self.dense_layers = tf.keras.Sequential()

    # Use the ReLU activation for all but the last layer.
    for layer_size in layer_sizes[:-1]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

    # No activation for the last layer.
    for layer_size in layer_sizes[-1:]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size))

  def call(self, inputs):
    feature_embedding = self.embedding_model(inputs)
    return self.dense_layers(feature_embedding)

In [16]:
class MovielensModel(tfrs.models.Model):

  def __init__(self, layer_sizes):
    super().__init__()
    self.query_model = QueryModel(layer_sizes)
    self.candidate_model = CandidateModel(layer_sizes)
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    # We only pass the user id and timestamp features into the query model. This
    # is to ensure that the training inputs would have the same keys as the
    # query inputs. Otherwise the discrepancy in input structure would cause an
    # error when loading the query model after saving it.
    query_embeddings = self.query_model({
        "user_id": features["user_id"]
    })
    movie_embeddings = self.candidate_model({
        "job_id": features["job_id"],
        "job_title": features["job_title"],
        "category": features["category"],
        "location": features["location"],
        "skill_text": features["skill_text"],
        "level": features["level"],
    })

    return self.task(
        query_embeddings, movie_embeddings, compute_metrics=not training)

In [17]:
cached_train_1st = ds_rating_1st.batch(4096)

In [18]:
%%time

num_epochs = 50

model = MovielensModel([128, 64, 32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

one_layer_history = model.fit(
    cached_train_1st,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0)

Tensor("query_model/user_model/concat/concat:0", shape=(None, 32), dtype=float32)
Tensor("query_model/user_model/concat/concat:0", shape=(None, 32), dtype=float32)


2023-04-21 18:16:57.468444: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f8fb7186210 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-04-21 18:16:57.468474: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2023-04-21 18:16:57.470777: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.
2023-04-21 18:16:57.535611: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

Yo


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p

CPU times: user 35min 56s, sys: 1min 5s, total: 37min 1s
Wall time: 14min 36s


In [19]:
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model, k=10)
index.index_from_dataset(
    movies.batch(500).map(lambda x: (x["job_id"], model.candidate_model(x))))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f94520174c0>

In [20]:
import random

u = random.choice(unique_user_ids)
print("========================HISTORY==============================")
rating_1st[rating_1st.entityId==u][["targetEntityId","jobTitle", "industries", "jobLevel", "locations", "skills"]]
# score, pred = index({"user_id": np.array(["5749316"])})
# list(map(lambda x: x.decode('ascii'), pred.numpy()[0]))
# df_meta[df_meta._id.isin(list(map(lambda x: x.decode('ascii'), pred.numpy()[0])))][["jobTitle", "industries", "locations", "jobLevel", "skills"]]



Unnamed: 0,targetEntityId,jobTitle,industries,jobLevel,locations,skills
4634827,1617861,UNK,UNK,UNK,UNK,UNK


In [21]:
score, pred = index({"user_id": np.array(["14.186.104.233, 10.122.10.10"])})
# list(map(lambda x: x.decode('ascii'), pred.numpy()[0]))
df_meta[df_meta._id.isin(list(map(lambda x: x.decode('ascii'), pred.numpy()[0])))][["jobTitle", "industries", "locations", "jobLevel", "skills"]]

tf.Tensor(
[[ 0.25018522 -0.04359851  0.38761485 -0.22241382 -0.18109947  0.26559716
  -0.25351247  0.34905553 -0.0225444  -0.23030001  0.01781677 -0.00725579
  -0.08491569 -0.673024   -0.24497876  0.0909945   0.30467156  0.08764698
  -0.11209524 -0.09641353 -0.08389925 -0.26434812  0.33045956 -0.12258724
  -0.36470428 -0.18804297 -0.34601757  0.02455912  0.1951107   0.33963367
  -0.04576489  0.23378743]], shape=(1, 32), dtype=float32)


Unnamed: 0,jobTitle,industries,locations,jobLevel,skills
6978,account associate,"Customer Service,Marketing,Sales",Ha Noi,Fresher/Entry level,"tiep can khach hang,gioi thieu san pham,tu van..."
8166,tro ly du an project assistant muc luong tu 9 ...,"Administrative/Clerical,Civil/Construction,Pla...",Ha Noi,Experienced (non-manager),"ho so dau thau,theo doi bao cao,thanh quyet to..."
9233,nhan vien phong sales admin,"Administrative/Clerical,Marketing,Sales",Ha Noi,Experienced (non-manager),"system admin,phan tich,english skill,administr..."
9452,nhan vien hanh chinh va xu ly ho so ha noi nha...,"Customer Service,Education/Training","Ho Chi Minh,Ha Noi",Fresher/Entry level,"communication,english"
9624,can bo ngan hang chi nhanh thanh xuan,"Banking,Finance/Investment",Ha Noi,Experienced (non-manager),"teller operations,credit,credit operations,sal..."
9690,admin officer nhan vien hanh chinh bo phan pt,"Administrative/Clerical,Customer Service,Entry...",Ha Noi,Fresher/Entry level,"admin,administrative management,entry level,va..."
10972,sales admin,"Customer Service,Export-Import,Production/Process",Ha Noi,Experienced (non-manager),"sales admin,customer service,cham soc khach ha..."
11892,chuyen vien ho tro kinh doanh,"Administrative/Clerical,Legal/Contracts,Financ...",Ha Noi,Experienced (non-manager),"luat doanh nghiep,quan ly ho so,english,phan t..."
12484,ho tro phong kinh doanh du an,"Customer Service,Marketing,Planning/Projects",Ha Noi,Experienced (non-manager),"giai phap cong nghe,lap ke hoach du an,tieng a..."
13191,ky su phat trien ung dung application developer,"Production/Process,IT - Software,Planning/Proj...",Ha Noi,Experienced (non-manager),"english,object oriented,ms sql server,oracle,p..."


In [22]:
"6814502" in unique_user_ids, rating_2nd.shape

(True, (2008139, 13))

In [23]:
user_ids_2nd = np.unique(rating_2nd.entityId.values.tolist())

In [24]:
new_unique_userids = set(user_ids_2nd) - set(unique_user_ids)
len(new_unique_userids)

49928

In [25]:
new_unique_userids

{'27.75.108.13, 10.122.10.10',
 '116.105.170.32, 10.122.10.10',
 '14.185.171.90, 10.122.10.10',
 '14.242.153.15, 10.122.10.10',
 '3301573',
 '14.241.199.101, 10.122.10.10',
 '123.19.228.2, 10.122.10.10',
 '114.119.143.177, 10.122.10.10',
 '14.237.160.185, 10.122.10.10',
 '171.253.4.188, 10.122.10.10',
 '113.187.245.75, 10.122.10.10',
 '27.70.247.211, 10.122.10.10',
 '6867486',
 '14.191.57.185, 10.122.10.10',
 '117.5.141.207, 10.122.10.10',
 '116.96.120.80, 10.122.10.10',
 '165.225.230.112, 10.122.10.10',
 '14.167.171.135, 10.122.10.10',
 '116.109.185.198, 10.122.10.10',
 '171.251.234.53, 10.122.10.10',
 '171.247.79.73, 10.122.10.10',
 '1.55.42.235, 10.122.10.10',
 '113.163.72.143, 10.122.10.10',
 '7149361',
 '14.187.236.243, 10.122.10.10',
 '1.52.99.184, 10.122.10.10',
 '123.24.189.215, 10.122.10.10',
 '116.110.243.143, 10.122.10.10',
 '123.20.253.213, 10.122.10.10',
 '113.160.185.101, 10.122.10.10',
 '14.191.101.230, 10.122.10.10',
 '14.244.191.12, 10.122.10.10',
 '171.236.135.223, 10

In [26]:
score, pred = index({"user_id": np.array(["6937452"])})
list(map(lambda x: x.decode('ascii'), pred.numpy()[0]))
df_meta[df_meta._id.isin(list(map(lambda x: x.decode('ascii'), pred.numpy()[0])))][["jobTitle", "industries", "locations", "jobLevel", "skills"]]

tf.Tensor(
[[ 0.00507442  0.5163779  -0.5457415   0.5571508  -0.15455909  0.11636562
   0.00845364  0.21496324  0.01792824  0.14123039  0.39737588  0.19248845
  -0.45298576 -0.00458293  0.4227991  -0.4689922   0.13474075  0.0117634
  -0.35706902  0.05359838  0.2914499  -0.08387545 -0.38715577  0.4530252
  -0.46546844  0.4231716   0.4784828   0.46473625 -0.10187411  0.52279186
  -0.36248457 -0.12246305]], shape=(1, 32), dtype=float32)


Unnamed: 0,jobTitle,industries,locations,jobLevel,skills
172,pho phong tai chinh,"Accounting,Auditing,Finance/Investment",Ho Chi Minh,Manager,"tai chinh ngan hang,thanh toan quoc te,phan me..."
681,finance assistant manager pho phong tai chinh,"Accounting,Banking,Finance/Investment",Ho Chi Minh,Experienced (non-manager),"finance,finance analysis,accounting,management..."
1176,assistant financial controller,"Accounting,Finance/Investment,Fintech",Ho Chi Minh,Manager,"ke toan,tai chinh,financial report,financial c..."
1497,finance and accounting manager,"Accounting,Auditing,Finance/Investment",Ho Chi Minh,Manager,"financial management,finance management,accoun..."
1500,ke toan truong,"Accounting,Auditing,Finance/Investment",Ho Chi Minh,Manager,"ke toan quan ly,lap chung tu ke toan,ke toan s..."
2328,sage 100 contractor consultant accountant book...,"Accounting,Civil/Construction,Consulting",Ho Chi Minh,Experienced (non-manager),"english,sage 100 erp,accounting,finance,constr..."
4137,assistant manager financial reporting tax,"Accounting,Administrative/Clerical,Finance/Inv...",Ho Chi Minh,Manager,"analyzing statistics,financial databases,finan..."
4936,financial controller,"Accounting,Auditing,Finance/Investment",Ho Chi Minh,Manager,"finance controling,cpa,accounting,finance,kiem..."
5326,chief accountant manufacturing garment industry,"Accounting,Auditing,Finance/Investment",Ho Chi Minh,Manager,"chief accountant certificate,manufacturing,gar..."
6440,chief accountant,"Accounting,Real Estate,Finance/Investment",Ho Chi Minh,Manager,"chief accountant certificate,auditing,financin..."


In [30]:
embedding_weights_base = model.query_model.get_layer("user_model").get_layer("user_embedding_model").get_layer('user_embedding_layer').get_weights()[0]

In [31]:
new_unique_users = np.unique(list(new_unique_userids))

In [32]:
# Generate the updated embedding matrix
updated_embedding = tf.keras.utils.warmstart_embedding_matrix(
    base_vocabulary=unique_user_ids,
    new_vocabulary=new_unique_users,
    base_embeddings=embedding_weights_base,
    new_embeddings_initializer="uniform",
)

# Update the model variable
updated_embedding_variable = tf.Variable(updated_embedding)

In [None]:
class UpdatedUserModel(tf.keras.Model):

  def __init__(self, new_user, embeded_user):
    super().__init__()
    user_embedding_layer_new = tf.keras.layers.Embedding(
        len(new_user), 32
    )
    user_embedding_layer_new.build(input_shape=[None])
    user_embedding_layer_new.embeddings.assign(updated_embedding)
    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None, name='user_stringlookup_layer'),
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, 32, name='user_embedding_layer'),
    ], name='user_embedding_model')

  def call(self, inputs):
    # Take the input dictionary, pass it through each input layer,
    # and concatenate the result.
    return tf.concat([
        self.user_embedding(inputs["user_id"])
    ], axis=1)

In [63]:
embedding_dimension = 32

user_embedding_layer_new = tf.keras.layers.Embedding(
    len(new_unique_users), embedding_dimension
)

user_embedding_layer_new.build(input_shape=[None])
user_embedding_layer_new.embeddings.assign(updated_embedding)
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=new_unique_users, mask_token=None),
    user_embedding_layer_new
])

In [64]:
model.query_model.embedding_model.user_embedding = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=new_unique_users, mask_token=None),
    user_embedding_layer_new
])

In [65]:
#6937452
model.query_model({"user_id": np.array(["6937452"])}) == model.query_model({"user_id": np.array(["2989560"])})

tf.Tensor(
[[-0.0234861   0.01408077 -0.00506867 -0.01083786 -0.03551828 -0.03978927
  -0.01406171  0.01756055  0.00213837  0.03844894 -0.04808125  0.02579955
   0.02994614 -0.04239151 -0.00475564  0.01889436  0.01769005  0.00497582
   0.04829449 -0.03534007 -0.02135521 -0.01391004  0.02308876  0.00114753
  -0.00017706  0.01722101 -0.0351498   0.02637174 -0.00686129 -0.01457911
  -0.0268388  -0.00599152]], shape=(1, 32), dtype=float32)
tf.Tensor(
[[-0.0234861   0.01408077 -0.00506867 -0.01083786 -0.03551828 -0.03978927
  -0.01406171  0.01756055  0.00213837  0.03844894 -0.04808125  0.02579955
   0.02994614 -0.04239151 -0.00475564  0.01889436  0.01769005  0.00497582
   0.04829449 -0.03534007 -0.02135521 -0.01391004  0.02308876  0.00114753
  -0.00017706  0.01722101 -0.0351498   0.02637174 -0.00686129 -0.01457911
  -0.0268388  -0.00599152]], shape=(1, 32), dtype=float32)


<tf.Tensor: shape=(1, 32), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True]])>

In [66]:
%%time

tensor_slices_2nd = {
    "user_id": rating_2nd.entityId.values.tolist(),
    "job_id": rating_2nd.targetEntityId.values.tolist(),
    "location": rating_2nd.locations.values.tolist(),
    "category": rating_2nd.industries.values.tolist(),
    "level": rating_2nd.jobLevel.values.tolist(),
    "job_title": rating_2nd.jobTitle.values.tolist(),
    "skill_text": rating_2nd.skills.values.tolist()
}

ds_rating_2nd = tf.data.Dataset.from_tensor_slices(tensor_slices_2nd)
cached_train_2nd = ds_rating_2nd.batch(4096)

CPU times: user 13.5 s, sys: 376 ms, total: 13.8 s
Wall time: 13.8 s


In [91]:
ds = ds_rating_1st.concatenate(ds_rating_2nd)

In [94]:
len(ds)

10040693

In [97]:
%%time
num_epochs = 50

model = MovielensModel([128, 64, 32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

one_layer_history = model.fit(
    ds,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0)

Tensor("query_model_3/user_model_3/concat/concat:0", shape=(32,), dtype=float32)


ValueError: in user code:

    File "/home/spark/miniconda3/envs/recommend/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/home/spark/miniconda3/envs/recommend/lib/python3.9/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/spark/miniconda3/envs/recommend/lib/python3.9/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/home/spark/miniconda3/envs/recommend/lib/python3.9/site-packages/tensorflow_recommenders/models/base.py", line 68, in train_step
        loss = self.compute_loss(inputs, training=True)
    File "/tmp/ipykernel_1972309/95081500.py", line 18, in compute_loss
        query_embeddings = self.query_model({
    File "/home/spark/miniconda3/envs/recommend/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_file1p27ljn2.py", line 14, in tf__call
        retval_ = ag__.converted_call(ag__.ld(self).dense_layers, (ag__.ld(feature_embedding),), None, fscope)

    ValueError: Exception encountered when calling layer 'query_model_3' (type QueryModel).
    
    in user code:
    
        File "/tmp/ipykernel_1972309/2011177071.py", line 31, in call  *
            return self.dense_layers(feature_embedding)
        File "/home/spark/miniconda3/envs/recommend/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/home/spark/miniconda3/envs/recommend/lib/python3.9/site-packages/keras/engine/input_spec.py", line 250, in assert_input_compatibility
            raise ValueError(
    
        ValueError: Exception encountered when calling layer 'sequential_24' (type Sequential).
        
        Input 0 of layer "dense_18" is incompatible with the layer: expected min_ndim=2, found ndim=1. Full shape received: (32,)
        
        Call arguments received by layer 'sequential_24' (type Sequential):
          • inputs=tf.Tensor(shape=(32,), dtype=float32)
          • training=None
          • mask=None
    
    
    Call arguments received by layer 'query_model_3' (type QueryModel):
      • inputs={'user_id': 'tf.Tensor(shape=(), dtype=string)'}


In [77]:
index2nd = tfrs.layers.factorized_top_k.BruteForce(model.query_model, k=10)
index2nd.index_from_dataset(
    movies.batch(500).map(lambda x: (x["job_id"], model.candidate_model(x))))

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f2c0d9b6c10>

In [85]:
import random

u_n = random.choice(unique_user_ids)
print("========================HISTORY==============================")
rating_2nd[rating_2nd.entityId==u_n][["targetEntityId","jobTitle", "industries", "jobLevel", "locations", "skills"]]
# score, pred = index({"user_id": np.array(["5749316"])})
# list(map(lambda x: x.decode('ascii'), pred.numpy()[0]))
# df_meta[df_meta._id.isin(list(map(lambda x: x.decode('ascii'), pred.numpy()[0])))][["jobTitle", "industries", "locations", "jobLevel", "skills"]]



Unnamed: 0,targetEntityId,jobTitle,industries,jobLevel,locations,skills
3203100,1627729,can bo phu trach ho so thau nghi t7 cn nu,"Civil/Construction,Legal/Contracts,Planning/Pr...",Experienced (non-manager),Ha Noi,"ho so du thau,xay dung hop dong,chuan bi ho so..."


In [86]:
score, pred = index2nd({"user_id": np.array([u_n])})
# list(map(lambda x: x.decode('ascii'), pred.numpy()[0]))
df_meta[df_meta._id.isin(list(map(lambda x: x.decode('ascii'), pred.numpy()[0])))][["jobTitle", "industries", "locations", "jobLevel", "skills"]]

tf.Tensor(
[[-0.0234861   0.01408077 -0.00506867 -0.01083786 -0.03551828 -0.03978927
  -0.01406171  0.01756055  0.00213837  0.03844894 -0.04808125  0.02579955
   0.02994614 -0.04239151 -0.00475564  0.01889436  0.01769005  0.00497582
   0.04829449 -0.03534007 -0.02135521 -0.01391004  0.02308876  0.00114753
  -0.00017706  0.01722101 -0.0351498   0.02637174 -0.00686129 -0.01457911
  -0.0268388  -0.00599152]], shape=(1, 32), dtype=float32)


Unnamed: 0,jobTitle,industries,locations,jobLevel,skills
3818,kien truc su thiet ke 3d lumion hue va da lat,"Architecture/Interior Design,Civil/Constructio...","Lam Dong,Thua Thien Hue",Director and above,"dien hoa 3d,lumion,phoi canh 3d noi that,3ds m..."
6372,service engineer can tho,"Electrical/Electronics,Mechanical,Maintenance",Can Tho,Experienced (non-manager),"english,repairs mechanical,operating system,tr..."
7913,software engineer,"Sales,Banking,IT - Hardware/Networking",Ho Chi Minh,Experienced (non-manager),"english,python,selenium webdriver"
8303,truong nhom thu hoi no qua dien thoai,"Legal/Contracts,Banking,Fintech",Ho Chi Minh,Experienced (non-manager),"thu hoi no phap ly,thu hoi no"
10689,chuyen vien tu van tai chinh kenh lien doanh n...,"Consulting,Banking,Finance/Investment","Binh Dinh,Khanh Hoa,Lam Dong",Experienced (non-manager),"bao hiem,ngan hang,tu van tai chinh,bat dong s..."
10764,ky su lap trinh c c,"IT - Software,Telecommunications,IT - Hardware...",Ha Noi,Experienced (non-manager),"c,linux,software development,dien tu vien thon..."
10847,chuyen vien xu ly no nam duoi 32t,"Legal/Contracts,Banking,Finance/Investment","Binh Phuoc,Ho Chi Minh",Experienced (non-manager),"xu ly no xau,tin chap,luat,thu hoi no,quan li no"
11006,luat su,"Administrative/Clerical,Consulting,Legal/Contr...",Da Nang,Experienced (non-manager),"legal,tu van luat,ho so phap ly,soan thao hop ..."
13104,php developer,"IT - Software,IT - Hardware/Networking,Plannin...",Ho Chi Minh,Experienced (non-manager),"php,javascript,postgresql,my sql,css framework"
13324,officer,Other,Other,Experienced (non-manager),test


In [84]:
score, pred = index({"user_id": np.array([u_n])})
# list(map(lambda x: x.decode('ascii'), pred.numpy()[0]))
df_meta[df_meta._id.isin(list(map(lambda x: x.decode('ascii'), pred.numpy()[0])))][["jobTitle", "industries", "locations", "jobLevel", "skills"]]

tf.Tensor(
[[-0.04995194  0.02714846 -0.01379492  0.0141131  -0.04381491  0.03947688
   0.02812408 -0.03205021 -0.00940702 -0.01702154 -0.00846279 -0.03094795
   0.02805856  0.02528908 -0.02155509 -0.00801569 -0.03114945  0.02834776
  -0.03922702 -0.02906051 -0.02550958  0.00732602  0.01184895  0.00498318
   0.021412   -0.02221696  0.02597881 -0.02779089 -0.03435098 -0.02546695
   0.04897121 -0.00072067]], shape=(1, 32), dtype=float32)


Unnamed: 0,jobTitle,industries,locations,jobLevel,skills
3818,kien truc su thiet ke 3d lumion hue va da lat,"Architecture/Interior Design,Civil/Constructio...","Lam Dong,Thua Thien Hue",Director and above,"dien hoa 3d,lumion,phoi canh 3d noi that,3ds m..."
6265,qc ct1,IT - Software,Ho Chi Minh,Experienced (non-manager),qc
7090,giang vien cong nghe thong tin fpl tay nguyen,"Education/Training,IT - Software",Dak Lak,Experienced (non-manager),"giao duc,cong nghe thong tin phan mem,ngon ngu..."
9243,lead java developer,IT - Software,Ho Chi Minh,Manager,"java,spring framework,react,troubleshooting an..."
9699,nhan vien van hanh nha may dien gio mat troi,"Electrical/Electronics,Mechanical,Maintenance","Soc Trang,Binh Dinh,Tay Ninh",Fresher/Entry level,"tu dong hoa,dien mat troi,dien gio,du an nang ..."
10023,hcm 3000 4300 lead java engineer,"IT - Software,IT - Hardware/Networking",Ho Chi Minh,Experienced (non-manager),"java,reactjs,microservice,domain banking"
10764,ky su lap trinh c c,"IT - Software,Telecommunications,IT - Hardware...",Ha Noi,Experienced (non-manager),"c,linux,software development,dien tu vien thon..."
12433,fresher qa engineer,"IT - Software,IT - Hardware/Networking",Ho Chi Minh,Experienced (non-manager),"php,java,c#,software development,lap trinh he ..."
13104,php developer,"IT - Software,IT - Hardware/Networking,Plannin...",Ho Chi Minh,Experienced (non-manager),"php,javascript,postgresql,my sql,css framework"
13324,officer,Other,Other,Experienced (non-manager),test


In [87]:
u_n

'14.186.104.233, 10.122.10.10'