In [1]:
import os
import tempfile
import matplotlib.pyplot as plt
import glob
from pymongo import MongoClient, InsertOne
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

# Presumably a MongoDB database name. NOTE(review): not referenced in the
# visible cells, which read from "data_8" instead — confirm it is still needed.
database = "vnw_job"
# Collection name used when loading job metadata (mongo["data_8"][col]).
col = "items"

2023-05-08 16:08:42.366458: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class QueryModel(tf.keras.Model):
    """User tower: maps a user-id string to a 32-dimensional embedding."""

    def __init__(self, layer_sizes):
        """Build the user-id lookup and embedding layers.

        Args:
          layer_sizes:
            A list of integers where the i-th entry represents the number of units
            the i-th layer contains. Currently unused: this tower stacks no
            dense layers on top of the embedding.
        """
        super().__init__()

        # The vocabulary is the notebook-level `unique_user_ids`. StringLookup
        # (with no mask token) adds one out-of-vocabulary index, hence the
        # `+ 1` on the embedding table size.
        id_lookup = tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids,
            mask_token=None,
            name='user_stringlookup_layer',
        )
        id_embedding = tf.keras.layers.Embedding(
            len(unique_user_ids) + 1,
            32,
            name='user_embedding_layer',
        )
        self.embedding_model = tf.keras.Sequential(
            [id_lookup, id_embedding], name='user_embedding_model')

    def call(self, inputs):
        """Return the embedding looked up for `inputs["user_id"]`."""
        user_ids = inputs["user_id"]
        return self.embedding_model(user_ids)

In [3]:
@tf.keras.utils.register_keras_serializable()
def tokenization(t):
    """Split comma-separated string tensors into tokens via `tf.strings.split`."""
    return tf.strings.split(t, sep=',')

In [5]:
class CandidateModel(tf.keras.Model):
  """Job (candidate) tower: embeds job features and projects them with dense layers."""

  def __init__(self, layer_sizes):
    """Build the per-feature embedding sub-models and the dense stack.

    The vocabularies (`unique_job_ids`, `unique_locations`, `unique_levels`,
    `unique_category`, `unique_skills`) are read from notebook-level variables.

    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()

    self.job_id_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_job_ids, mask_token=None),
      # +1 for the single out-of-vocabulary index StringLookup adds.
      tf.keras.layers.Embedding(len(unique_job_ids) + 1, 32)
    ])
    self.location_embedding = self._multi_value_embedding(
        unique_locations, max_tokens=100, embedding_dim=32)
    self.level_embedding = self._multi_value_embedding(
        unique_levels, max_tokens=100, embedding_dim=32)
    self.category_embedding = self._multi_value_embedding(
        unique_category, max_tokens=100, embedding_dim=64)
    self.skill_embedding = self._multi_value_embedding(
        unique_skills, max_tokens=11000, embedding_dim=64)

    # Then construct the layers.
    self.dense_layers = tf.keras.Sequential()

    # Use the ReLU activation for all but the last layer.
    for layer_size in layer_sizes[:-1]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

    # No activation for the last layer.
    for layer_size in layer_sizes[-1:]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size))

  @staticmethod
  def _multi_value_embedding(vocabulary, max_tokens, embedding_dim):
    """Embed a comma-separated multi-value string feature and average its tokens."""
    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens,
        standardize=None,
        split=tokenization,
        vocabulary=vocabulary,
        pad_to_max_tokens=True
    )
    return tf.keras.Sequential([
      vectorizer,
      # TextVectorization reserves index 0 for padding and index 1 for OOV
      # ahead of the vocabulary terms, so the largest id it emits is
      # len(vocabulary) + 1. Sizing the table from vocabulary_size() keeps
      # the last vocabulary term in range (len(vocabulary) + 1 rows did not).
      tf.keras.layers.Embedding(vectorizer.vocabulary_size(), embedding_dim),
      tf.keras.layers.GlobalAvgPool1D()
    ])

  def call(self, inputs):
    """Concatenate the per-feature embeddings and run them through the dense stack.

    Args:
      inputs: Mapping with the keys "job_id", "category", "location", "level"
        and "skill_text". Any other key (e.g. "job_title") is ignored.

    Returns:
      The output of the dense stack applied to the concatenated embeddings.
    """
    feature_embedding = tf.concat([
        self.job_id_embedding(inputs["job_id"]),
        self.category_embedding(inputs["category"]),
        self.location_embedding(inputs["location"]),
        self.level_embedding(inputs["level"]),
        self.skill_embedding(inputs["skill_text"])
    ], axis=1)
    return self.dense_layers(feature_embedding)

In [6]:
class MovielensModel(tfrs.Model):
  """Two-tower retrieval model pairing a query tower with a candidate tower."""

  def __init__(self, user_model, movie_model):
    """Store both towers and configure the retrieval task.

    Args:
      user_model: Keras model used as the query tower.
      movie_model: Keras model used as the candidate tower.
    """
    super().__init__()
    self.query_model: tf.keras.Model = user_model
    self.candidate_model: tf.keras.Model = movie_model

    # Top-K metrics are evaluated against the notebook-level `movies` dataset,
    # embedded by the candidate tower in batches of 128. `movies` is looked up
    # when the model is instantiated.
    candidate_embeddings = movies.batch(128).map(self.candidate_model)
    topk_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
    self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(metrics=topk_metrics)

  def compute_loss(self, features, training=False):
    """Embed one batch with both towers and return the retrieval task's result.

    Metrics are only computed when `training` is False.
    """
    # Only the user id is forwarded to the query tower so that training inputs
    # carry the same keys as serving inputs; a mismatch in input structure
    # would cause an error when loading the query model after saving it.
    query_inputs = {"user_id": features["user_id"]}
    query_embeddings = self.query_model(query_inputs)

    candidate_keys = (
        "job_id", "job_title", "category", "location", "skill_text", "level")
    candidate_inputs = {key: features[key] for key in candidate_keys}
    movie_embeddings = self.candidate_model(candidate_inputs)

    return self.task(
        query_embeddings, movie_embeddings, compute_metrics=not training)

In [7]:
# Load every job document and flatten the nested property dicts into columns.
mongo = MongoClient()
df_meta = pd.DataFrame(list(mongo["data_8"][col].find()))
df_meta = df_meta.assign(
    # Only the first job title is kept.
    jobTitle=df_meta.categoricalProps.apply(lambda props: props["jobTitle"][0]),
    # Multi-valued properties are stored as one comma-separated string each.
    **{
        field: df_meta.categoricalProps.apply(
            lambda props, field=field: ','.join(props[field]))
        for field in ("skills", "industries", "locations", "jobLevel")
    },
    availableDate=df_meta.dateProps.apply(lambda props: props["availableDate"]),
)

In [8]:
# Read the interaction events; the file list is sorted so the row order (and
# therefore the seeded split below) does not depend on directory listing order.
df = pd.read_parquet(sorted(glob.glob("/home/spark/ylv/data/navidata/*.parquet")))
# Attach job metadata to each event and drop rows with any missing value
# after the left join.
ratings = df.merge(df_meta, left_on="targetEntityId", right_on="_id", how="left")
ratings = ratings.dropna()
# Fixed seed: the 5% hold-out is persisted and reused by later cells, so it
# must be the same partition on every run.
rating_1st, rating_2nd = train_test_split(ratings, test_size=0.05, random_state=42)

In [9]:
# Persist the hold-out (entityId, targetEntityId) pairs to data_8.events_2.
# NOTE: every run inserts the rows again; nothing here clears the collection.
m = MongoClient()
# 'records' is the full orient name; the abbreviated 'record' is deprecated
# and rejected by newer pandas releases.
data_m = rating_2nd[["entityId", "targetEntityId"]].to_dict(orient='records')
bulk_data = [InsertOne(record) for record in data_m]
m['data_8']['events_2'].bulk_write(bulk_data)

  data_m = rating_2nd[["entityId", "targetEntityId"]].to_dict(orient='record')


<pymongo.results.BulkWriteResult at 0x7f7224040740>

In [10]:
# Load data into tf.Dataset: one element per row of `df_meta`, keyed by the
# feature names the candidate tower reads.
movies = tf.data.Dataset.from_tensor_slices({
    feature: df_meta[column].values.tolist()
    for feature, column in (
        ("job_id", "_id"),
        ("category", "industries"),
        ("location", "locations"),
        ("level", "jobLevel"),
        ("job_title", "jobTitle"),
        ("skill_text", "skills"),
    )
})

2023-05-08 16:11:49.710915: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-08 16:11:49.732433: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-08 16:11:49.732596: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-08 16:11:49.733052: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [11]:
# Training features: one list per feature, taken column-wise from `rating_1st`.
tensor_slices = {
    feature: rating_1st[column].values.tolist()
    for feature, column in (
        ("user_id", "entityId"),
        ("job_id", "targetEntityId"),
        ("location", "locations"),
        ("category", "industries"),
        ("level", "jobLevel"),
        ("job_title", "jobTitle"),
        ("skill_text", "skills"),
    )
}

In [12]:
# Wrap the training feature lists in a tf.data.Dataset (sliced along the first axis).
ds_rating_1st = tf.data.Dataset.from_tensor_slices(tensor_slices)

In [13]:
# Create Vocab
def _unique_tokens(values, sep=","):
    """Return the sorted unique non-empty tokens found in `sep`-separated strings."""
    # A single join is linear; folding with `x + sep + y` re-copies the
    # accumulated string on every step.
    tokens = np.unique(sep.join(values).split(sep))
    # Drop empty tokens explicitly instead of assuming "" sorts first.
    return tokens[tokens != ""]


unique_job_titles = np.unique(df_meta.jobTitle.values.tolist())
unique_job_ids = np.unique(df_meta._id.values.tolist())
unique_user_ids = np.unique(rating_1st.entityId.values.tolist())
unique_locations = _unique_tokens(df_meta.locations.values.tolist())
unique_skills = _unique_tokens(df_meta.skills.values.tolist())
unique_category = _unique_tokens(df_meta.industries.values.tolist())
unique_levels = _unique_tokens(df_meta.jobLevel.values.tolist())

In [15]:
# Batch the training interactions into groups of 4096.
# NOTE(review): despite the "cached" name, no `.cache()` (or `.shuffle()`) is
# applied here — confirm whether that is intended.
cached_train_1st = ds_rating_1st.batch(4096)

In [18]:
# Number of epochs handed to `model.fit` in the training cell.
num_epochs = 100

In [20]:
# Build both towers and train the retrieval model.
# NOTE: QueryModel accepts layer_sizes but does not use it (it defines no
# dense layers), so the list passed here has no effect.
user_model = QueryModel([128, 64, 32])
item_model = CandidateModel([256, 128, 64, 32])
model = MovielensModel(user_model, item_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
# NOTE(review): validation_freq is passed without any validation_data, so no
# validation pass appears to be configured — confirm whether one was intended.
history = model.fit(
        cached_train_1st,
        validation_freq=5,
        epochs=num_epochs,
        verbose=0)

2023-05-08 16:17:38.296988: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f6d9d11c680 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-05-08 16:17:38.297005: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2023-05-08 16:17:38.299422: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.
2023-05-08 16:17:38.364948: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

Yo


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p

In [21]:
# Export each tower separately, to directories relative to the working directory.
model.query_model.save('navi_user_model_3')
model.candidate_model.save('navi_item_model_3')

INFO:tensorflow:Assets written to: navi_user_model_3/assets
INFO:tensorflow:Assets written to: navi_item_model_3/assets


In [22]:
class QueryModelUpdated(tf.keras.Model):
    """User tower whose embedding table is initialised from supplied weights."""

    def __init__(self, vocab_new, updated_embedding):
        """Build the lookup + embedding pipeline and load the given weights.

        Args:
          vocab_new:
            Vocabulary handed to the StringLookup layer; its length also sets
            the number of rows of the embedding table.
          updated_embedding:
            Values assigned to the embedding table; must be assignable to a
            variable of shape (len(vocab_new), 32).
        """
        super().__init__()

        # NOTE(review): the table has len(vocab_new) rows, with no extra row
        # for an out-of-vocabulary index. That is only in range if vocab_new
        # already contains the OOV token — confirm against the caller.
        embedding_layer = tf.keras.layers.Embedding(
            len(vocab_new), 32, name="new_user_embedding"
        )
        self.user_embedding_layer_new = embedding_layer
        # The layer must be built before its weights exist and can be overwritten.
        embedding_layer.build(input_shape=[None])
        embedding_layer.embeddings.assign(updated_embedding)

        id_lookup = tf.keras.layers.StringLookup(
            vocabulary=vocab_new,
            mask_token=None,
            name='user_stringlookup_layer_1',
        )
        self.embedding_model = tf.keras.Sequential(
            [id_lookup, embedding_layer], name='user_embedding_model_1')

    def call(self, inputs):
        """Return the embedding looked up for `inputs["user_id"]`."""
        user_ids = inputs["user_id"]
        return self.embedding_model(user_ids)

In [23]:
class MovielensModel(tfrs.Model):
    """Two-tower retrieval model combining a query tower and a candidate tower.

    The loss and metrics come from a ``tfrs.tasks.Retrieval`` task whose
    ``FactorizedTopK`` metric is evaluated against a dataset of candidates
    embedded by the candidate tower.
    """

    def __init__(self, user_model, movie_model, candidates=None):
        """Wire the two towers to a retrieval task.

        Args:
          user_model: Keras model used as the query tower; it is called with a
            dict holding the ``"user_id"`` feature.
          movie_model: Keras model used as the candidate tower; it is called
            with a dict of the job features listed in ``compute_loss``.
          candidates: Optional unbatched ``tf.data.Dataset`` of candidate
            feature dicts used for the top-K metric. When omitted, the
            notebook-level ``movies`` dataset is used, which keeps existing
            two-argument calls working.
        """
        super().__init__()
        self.query_model: tf.keras.Model = user_model
        self.candidate_model: tf.keras.Model = movie_model

        if candidates is None:
            # Fall back to the global dataset; it is looked up when the model
            # is instantiated, so it must exist by then.
            candidates = movies

        self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidates.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
        """Compute the retrieval loss for one batch.

        Args:
          features: Dict of batched features containing ``"user_id"`` plus the
            candidate features ``"job_id"``, ``"job_title"``, ``"category"``,
            ``"location"``, ``"skill_text"`` and ``"level"``.
          training: Whether the call happens during training; the task's
            metrics are only computed when this is False.

        Returns:
          The value returned by the retrieval task for this batch.
        """
        # Only the user id is passed into the query model, so that the training
        # inputs have the same keys as the inputs used at query time. Otherwise
        # the discrepancy in input structure would cause an error when loading
        # the query model after saving it.
        query_embeddings = self.query_model({
            "user_id": features["user_id"]
        })
        movie_embeddings = self.candidate_model({
            "job_id": features["job_id"],
            "job_title": features["job_title"],
            "category": features["category"],
            "location": features["location"],
            "skill_text": features["skill_text"],
            "level": features["level"],
        })

        return self.task(
            query_embeddings, movie_embeddings, compute_metrics=not training)

In [24]:
# Load events data from Mongo: every document of data_8.events_2, with the
# `_id` field excluded by the projection.
df = pd.DataFrame(
    list(
        mongo["data_8"]["events_2"].find({}, {"_id": 0})
    )
)

In [25]:
# Attach job metadata to each event (left join on the job id), then discard
# every merged row that still contains a missing value.
ratings_1 = df.merge(
    df_meta,
    how="left",
    left_on="targetEntityId",
    right_on="_id",
).dropna()

# Candidate dataset built from the df_meta columns, keyed by the feature names
# the candidate model is called with.
movies = tf.data.Dataset.from_tensor_slices({
    feature_name: df_meta[column_name].values.tolist()
    for feature_name, column_name in (
        ("job_id", "_id"),
        ("category", "industries"),
        ("location", "locations"),
        ("level", "jobLevel"),
        ("job_title", "jobTitle"),
        ("skill_text", "skills"),
    )
})

In [26]:
%%time

# user_model = tf.keras.models.load_model('/home/spark/ylv/workplace/rec_online_training/navi_user_model_1')
# item_model = tf.keras.models.load_model('/home/spark/ylv/workplace/rec_online_training/navi_item_model_1')
vocab_base = model.query_model.get_layer("user_embedding_model").get_layer("user_stringlookup_layer").get_vocabulary()
embedding_weights_base = \
model.query_model.get_layer("user_embedding_model").get_layer("user_embedding_layer").get_weights()[0]
new_users = list(set(ratings.entityId.unique()) - set(vocab_base))
if len(new_users) > 0:
    vocab_new = vocab_base + new_users
    updated_embedding = tf.keras.utils.warmstart_embedding_matrix(
        base_vocabulary=vocab_base,
        new_vocabulary=vocab_new,
        base_embeddings=embedding_weights_base,
        new_embeddings_initializer="uniform",
    )
    updated_embedding_variable = tf.Variable(updated_embedding)
    user_model_21 = QueryModelUpdated(vocab_new, updated_embedding)

CPU times: user 9.44 s, sys: 236 ms, total: 9.68 s
Wall time: 9.67 s


In [29]:
# Feature columns of the merged events frame, keyed by the feature names the
# two towers read in compute_loss.
tensor_slices_2nd = {
    feature_name: ratings_1[column_name].values.tolist()
    for feature_name, column_name in (
        ("user_id", "entityId"),
        ("job_id", "targetEntityId"),
        ("location", "locations"),
        ("category", "industries"),
        ("level", "jobLevel"),
        ("job_title", "jobTitle"),
        ("skill_text", "skills"),
    )
}

In [30]:
# Turn the dict of feature lists into a tf.data.Dataset of per-row feature dicts.
ds_rating_2nd = tf.data.Dataset.from_tensor_slices(tensor_slices_2nd)

In [139]:
# Number of epochs passed to the fit() call that trains `model1`.
num_epochs = 50

In [140]:
# Group the second-stage examples into batches of 4096 for training.
# NOTE(review): despite the "cached" prefix, no .cache() (or shuffle) is applied here.
cached_train_2nd = ds_rating_2nd.batch(4096)

In [141]:
%%time
model1 = MovielensModel(user_model_21, model.candidate_model)
model1.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
history1 = model1.fit(
    cached_train_2nd,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0)

CPU times: user 3min 9s, sys: 3.95 s, total: 3min 13s
Wall time: 1min 47s


In [142]:
# Brute-force top-20 retrieval index: queries are embedded with the trained
# user tower, candidates are (job_id, candidate embedding) pairs.
index3 = tfrs.layers.factorized_top_k.BruteForce(query_model=model1.query_model, k=20)
index3.index_from_dataset(
    movies.batch(500).map(
        lambda job_batch: (job_batch["job_id"], model1.candidate_model(job_batch))
    )
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f72261224f0>

In [196]:
# Draw one of the newly added users at random and display the job columns of
# every row recorded for that user.
u = np.random.choice(new_users)
print("========================HISTORY==============================")
ratings.loc[
    ratings.entityId == u,
    ["targetEntityId", "jobTitle", "industries", "jobLevel", "locations", "skills"],
]



Unnamed: 0,targetEntityId,jobTitle,industries,jobLevel,locations,skills
764440,1627803,can bo ke toan truong dai hoc fpt,"Accounting,Education/Training,Finance/Investment",Experienced (non-manager),Ha Noi,"ke toan,ke toan mua hang,ke toan chi phi,ke to..."


In [188]:
# Query the index for user `u` and list the metadata of the returned jobs in
# ranking order.
score_3, pred_3 = index3({"user_id": np.array([u])})

# The returned identifiers are bytes; decode them once and reuse the list both
# for filtering and for ranking.
pre_items_3 = [job_id.decode('ascii') for job_id in pred_3.numpy()[0]]

# `.assign` returns a new frame, so the `priority` column is never written
# into a filtered slice of df_meta (the pattern that triggers pandas'
# SettingWithCopyWarning).
check_df_3 = df_meta.loc[
    df_meta["_id"].isin(pre_items_3),
    ["_id", "jobTitle", "industries", "locations", "jobLevel", "skills"],
].assign(
    # Position of each job in the prediction list (0 = first returned).
    priority=lambda frame: frame["_id"].apply(lambda job_id: pre_items_3.index(job_id))
)
check_df_3.sort_values(by=["priority"], ascending=True)

Unnamed: 0,_id,jobTitle,industries,locations,jobLevel,skills,priority
3664,1624360,hse executive quang nam,"Environment/Waste Services,Textiles/Garments/F...",Quang Nam,Experienced (non-manager),"english,footwear industry,safety,an toan lao d...",0
8120,1629945,phu quoc thu kho xay dung,"Civil/Construction,Warehouse",Kien Giang,Experienced (non-manager),thu kho xay dung,1
10956,1633539,tpm manager,"Production/Process,Electrical/Electronics",Ha Noi,Manager,"engineering,electronics",2
11779,1634472,truong nhom an toan,"Electrical/Electronics,Auto/Automotive,HSE",Ha Noi,Experienced (non-manager),"tieng nhat n3,phong chay chua chay,autocad,qua...",3
3939,1624783,restaurant supervisor,"Food & Beverage,Restaurant/Hotel",Ho Chi Minh,Experienced (non-manager),"quan ly nha hang,nha hang khach san,giam sat n...",4
11576,1634266,overseas sales representative for america nort...,"Customer Service,Sales,Industrial Products",Ho Chi Minh,Experienced (non-manager),"english,b2b sales,sales management,business de...",5
1581,1621759,vinpearljsc j1615 food safety auditor,"Customer Service,Airlines/Tourism,QA/QC","Other,Ha Noi,Ho Chi Minh",Experienced (non-manager),"food chemistry,english,qa,food technology,qual...",6
9295,1631458,chuyen vien r d,"Production/Process,Food & Beverage,FMCG",Ha Noi,Experienced (non-manager),"nha hang,khach san,bep,bep banh ngot,phat trie...",7
10054,1632336,senior accountant work in phu yen,"Accounting,Auditing,Finance/Investment","Ho Chi Minh,Phu Yen,Khanh Hoa",Experienced (non-manager),"accounting,financial report,financial report a...",8
1073,1619994,operation coordinator in khanh hoa,"Administrative/Clerical,Environment/Waste Serv...",Khanh Hoa,Experienced (non-manager),"operations coordinating,operational process an...",9


In [1]:
# Shell escape: print the current working directory.
!pwd

/bin/bash: /home/spark/miniconda3/envs/recommend/lib/libtinfo.so.6: no version information available (required by /bin/bash)
/home/spark/ylv/workplace
