In [1]:
import os
import tempfile
import matplotlib.pyplot as plt
import glob
from pymongo import MongoClient
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import random

# Mongo database / collection names.
# NOTE(review): neither name is referenced by the visible cells, which read
# from mongo['data_8'] directly — confirm whether these are still needed.
database = "vnw_job"
col = "items"

2023-05-08 11:13:41.542156: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
@tf.keras.utils.register_keras_serializable()
def tokenization(t):
    """Split every string in `t` on commas and return the resulting tokens.

    The decorator registers this function with Keras' serialization registry.
    """
    tokens = tf.strings.split(t, sep=',')
    return tokens

In [3]:
class QueryModelUpdated(tf.keras.Model):
    """Query tower that embeds user ids using a pre-computed embedding table."""

    def __init__(self, vocab_new, updated_embedding):
        """Build the string-lookup + embedding pipeline for `vocab_new`.

        Args:
          vocab_new:
            List of user-id strings handed to the StringLookup layer. Its
            length is used as the number of rows of the embedding table.
          updated_embedding:
            2-D array-like with one row per entry of `vocab_new`; it is
            assigned as the initial weights of the embedding layer, and its
            last dimension determines the embedding width.
        """
        super().__init__()

        # Take the embedding width from the supplied matrix instead of
        # hard-coding it, so the table always matches the weights assigned below.
        embedding_dim = int(np.shape(updated_embedding)[-1])

        # NOTE(review): the table has exactly len(vocab_new) rows, so vocab_new
        # presumably already contains the OOV token (as the output of
        # StringLookup.get_vocabulary() does) — confirm against the caller.
        self.user_embedding_layer_new = tf.keras.layers.Embedding(
            len(vocab_new), embedding_dim, name="new_user_embedding"
        )
        # Build explicitly so the weight variable exists before it is overwritten.
        self.user_embedding_layer_new.build(input_shape=[None])
        self.user_embedding_layer_new.embeddings.assign(updated_embedding)
        self.embedding_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=vocab_new, mask_token=None, name='user_stringlookup_layer_1'),
            self.user_embedding_layer_new
        ], name='user_embedding_model_1')

    def call(self, inputs):
        """Return the embeddings looked up for `inputs["user_id"]`."""
        return self.embedding_model(inputs["user_id"])

In [4]:
class MovielensModel(tfrs.Model):
    """Two-tower retrieval model pairing a query tower with a candidate tower."""

    def __init__(self, user_model, movie_model, candidates=None):
        """Wire the two towers into a TFRS retrieval task.

        Args:
          user_model: Model used as the query tower; called with a dict that
            has a "user_id" entry.
          movie_model: Model used as the candidate tower; called with a dict
            of job features.
          candidates: Optional unbatched dataset of candidate features used
            for the FactorizedTopK metric. When omitted, the notebook-level
            `movies` dataset is used, which is what this class always did.
        """
        super().__init__()
        self.query_model: tf.keras.Model = user_model
        self.candidate_model: tf.keras.Model = movie_model
        # Keep the historical behaviour (read the global `movies`) as the
        # default, but let callers pass the candidate set explicitly.
        if candidates is None:
            candidates = movies
        self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=candidates.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
        """Compute the retrieval loss for one batch of interactions.

        Args:
          features: Dict with "user_id" plus the job features "job_id",
            "job_title", "category", "location", "skill_text" and "level".
          training: Whether the call happens during training; metrics are
            only computed when it is False.

        Returns:
          The value returned by the retrieval task.
        """
        # Only the user id is passed into the query model, so that the
        # training inputs have the same keys as the inputs used at query time.
        query_embeddings = self.query_model({
            "user_id": features["user_id"]
        })
        movie_embeddings = self.candidate_model({
            "job_id": features["job_id"],
            "job_title": features["job_title"],
            "category": features["category"],
            "location": features["location"],
            "skill_text": features["skill_text"],
            "level": features["level"],
        })

        return self.task(
            query_embeddings, movie_embeddings, compute_metrics=not training)

In [5]:
# Load data and metadata.
mongo = MongoClient()

# Load the job metadata from Mongo and flatten the nested properties into
# plain columns.
df_meta = pd.DataFrame(list(mongo['data_8']['items'].find()))
categorical = df_meta.categoricalProps

# jobTitle keeps only the first entry of its list.
df_meta['jobTitle'] = categorical.apply(lambda props: props["jobTitle"][0])

# The other list-valued properties are collapsed into comma-joined strings.
for target_column, prop_key in (
    ('skills', "skills"),
    ('industries', "industries"),
    ('locations', "locations"),
    ('jobLevel', "jobLevel"),
):
    df_meta[target_column] = categorical.apply(
        lambda props, prop_key=prop_key: ','.join(props[prop_key])
    )

df_meta['availableDate'] = df_meta.dateProps.apply(lambda props: props["availableDate"])
# df = pd.read_parquet(glob.glob("/home/spark/ylv/data/navidata/*.parquet"))

In [6]:
# Load the events data from Mongo; the projection excludes the `_id` field.
events_collection = mongo['data_8']['events_1']
df = pd.DataFrame(list(events_collection.find({}, {"_id": 0})))

In [7]:
# Attach the job metadata to every event, then drop every row that still has a
# missing value in any column.
ratings = df.merge(
    df_meta, left_on="targetEntityId", right_on="_id", how="left"
).dropna()

# Candidate dataset: one element per job in df_meta, keyed by feature name.
_movie_feature_columns = {
    "job_id": "_id",
    "category": "industries",
    "location": "locations",
    "level": "jobLevel",
    "job_title": "jobTitle",
    "skill_text": "skills",
}
movies = tf.data.Dataset.from_tensor_slices({
    feature: df_meta[column].values.tolist()
    for feature, column in _movie_feature_columns.items()
})

2023-05-08 11:13:50.123144: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-08 11:13:50.144193: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-08 11:13:50.144365: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-08 11:13:50.144789: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [8]:
%%time

# Load the previously trained towers.
user_model = tf.keras.models.load_model('/home/spark/ylv/workplace/rec_online_training/navi_user_model_1')
item_model = tf.keras.models.load_model('/home/spark/ylv/workplace/rec_online_training/navi_item_model_1')

# Vocabulary and embedding table of the base user tower.
base_embedding_model = user_model.get_layer("user_embedding_model")
vocab_base = base_embedding_model.get_layer("user_stringlookup_layer").get_vocabulary()
embedding_weights_base = base_embedding_model.get_layer("user_embedding_layer").get_weights()[0]

# Users present in the new interactions but unknown to the base vocabulary.
# Sorted so the index given to each new user does not depend on set iteration
# order and is the same on every run.
new_users = sorted(set(ratings.entityId.unique()) - set(vocab_base))

# Always build the extended model, even when there are no new users: the later
# cells reference `user_model1` unconditionally. With no new users the
# vocabulary is simply the base vocabulary.
vocab_new = vocab_base + new_users
updated_embedding = tf.keras.utils.warmstart_embedding_matrix(
    base_vocabulary=vocab_base,
    new_vocabulary=vocab_new,
    base_embeddings=embedding_weights_base,
    new_embeddings_initializer="uniform",
)
# NOTE(review): not referenced by any visible cell; kept in case other code reads it.
updated_embedding_variable = tf.Variable(updated_embedding)
user_model1 = QueryModelUpdated(vocab_new, updated_embedding)

CPU times: user 10 s, sys: 396 ms, total: 10.4 s
Wall time: 10.5 s


In [9]:
# Spot-check a single entry of the base vocabulary.
vocab_base[100]

'1.200.254.13, 10.122.10.10'

In [10]:
# Index that the base model's lookup layer assigns to a sample user id.
base_lookup_layer = user_model.get_layer("user_embedding_model").get_layer("user_stringlookup_layer")
base_lookup_layer(np.array(["1.10.11.111"]))

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([3])>

In [11]:
# Index that the extended model's lookup layer assigns to the same sample user id.
new_lookup_layer = user_model1.get_layer("user_embedding_model_1").get_layer("user_stringlookup_layer_1")
new_lookup_layer(np.array(["1.10.11.111"]))

<tf.Tensor: shape=(1,), dtype=int64, numpy=array([3])>

In [12]:
# Element-wise comparison of the embedding both models produce for the same
# user-id string.
probe_user = np.array(["unknown, 27.71.118.24, 10.122.10.10"])
user_model.embedding_model(probe_user) == user_model1.embedding_model(probe_user)

<tf.Tensor: shape=(1, 32), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True]])>

In [13]:
# Training features: the user id plus the job features, one list entry per
# row of `ratings`.
_rating_feature_columns = {
    "user_id": "entityId",
    "job_id": "targetEntityId",
    "location": "locations",
    "category": "industries",
    "level": "jobLevel",
    "job_title": "jobTitle",
    "skill_text": "skills",
}
tensor_slices_2nd = {
    feature: ratings[column].values.tolist()
    for feature, column in _rating_feature_columns.items()
}

In [14]:
# Dataset built from the feature lists in `tensor_slices_2nd`.
ds_rating_2nd = tf.data.Dataset.from_tensor_slices(tensor_slices_2nd)

In [15]:
# Training batches of 4096 elements.
# NOTE(review): despite the name, no .cache() is applied here.
cached_train_2nd = ds_rating_2nd.batch(4096)

In [17]:
# Number of training epochs; read by the model1.fit call below.
num_epochs = 10

In [18]:
%%time
# Fine-tune the retrieval model on the new interactions, using the extended
# user tower together with the loaded item tower.
model1 = MovielensModel(user_model1, item_model)
adagrad = tf.keras.optimizers.Adagrad(0.1)
model1.compile(optimizer=adagrad)

# NOTE(review): no validation data is passed to fit, so validation_freq
# presumably has no effect here — confirm.
fit_options = dict(validation_freq=5, epochs=num_epochs, verbose=0)
history1 = model1.fit(cached_train_2nd, **fit_options)

2023-05-08 11:14:36.006604: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x3dc42bc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-05-08 11:14:36.006621: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2023-05-08 11:14:36.009079: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.
2023-05-08 11:14:36.073645: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You ma


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p

CPU times: user 54.4 s, sys: 1.56 s, total: 56 s
Wall time: 44.7 s


In [19]:
def _job_id_with_embedding(features):
    """Pair each job id with the candidate-tower embedding of its features."""
    return features["job_id"], model1.candidate_model(features)


# Brute-force top-20 index over all jobs, queried through the trained user tower.
index1 = tfrs.layers.factorized_top_k.BruteForce(model1.query_model, k=20)
index1.index_from_dataset(movies.batch(500).map(_job_id_with_embedding))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7ff1ef5a4ca0>

In [22]:
# u = random.choice(new_users)
u = "user_test_15"
print("========================HISTORY==============================")
# Jobs this user interacted with, restricted to the descriptive columns.
history_columns = ["targetEntityId", "jobTitle", "industries", "jobLevel", "locations", "skills"]
ratings.loc[ratings.entityId == u, history_columns]
# score, pred = index({"user_id": np.array(["5749316"])})



Unnamed: 0,targetEntityId,jobTitle,industries,jobLevel,locations,skills
366973,1618942,chuyen vien chinh quan tri co so du lieu phong...,"IT - Software,Banking,IT - Hardware/Networking",Experienced (non-manager),Ha Noi,"ngan hang,quan tri co so du lieu,devops,linux,..."


In [23]:
# u = '183.80.135.240, 10.122.10.10'
# Top-k job ids predicted for user `u`, in the order returned by the index.
score, pred = index1({"user_id": np.array([u])})
pre_items = [job_id.decode('ascii') for job_id in pred.numpy()[0]]

# Look up the metadata of the predicted jobs and rank them by their position in
# `pre_items`. The decoded ids are reused instead of being decoded a second
# time, and .loc + .assign builds a new frame rather than writing a column into
# a filtered slice of df_meta (which triggers SettingWithCopyWarning).
check_df = df_meta.loc[
    df_meta._id.isin(pre_items),
    ["_id", "jobTitle", "industries", "locations", "jobLevel", "skills"],
].assign(priority=lambda frame: frame._id.apply(lambda job_id: pre_items.index(job_id)))
check_df.sort_values(by=["priority"], ascending=True)

Unnamed: 0,_id,jobTitle,industries,locations,jobLevel,skills,priority
1004,1618942,chuyen vien chinh quan tri co so du lieu phong...,"IT - Software,Banking,IT - Hardware/Networking",Ha Noi,Experienced (non-manager),"ngan hang,quan tri co so du lieu,devops,linux,...",0
244,1610787,chuyen vien van hanh an ninh khoi cntt,"IT - Software,Banking,Finance/Investment",Ha Noi,Experienced (non-manager),"cong nghe thong tin,an ninh thong tin,bao mat ...",1
5168,1626358,chuyen vien cao cap giam sat an toan thong tin...,"IT - Software,IT - Hardware/Networking",Ha Noi,Manager,"an toan thong tin mang,an toan thong tin",2
242,1610794,chuyen vien giam sat va ung cuu su co an ninh ...,"IT - Software,Banking,Finance/Investment",Ha Noi,Experienced (non-manager),"cong nghe thong tin,an toan thong tin mang,bao...",3
3708,1624502,devops engineer,"IT - Software,IT - Hardware/Networking",Ha Noi,Experienced (non-manager),"devops,.net,sql,database queries,azure",4
1005,1618985,chuyen vien chinh phong an ninh he thong khoi ...,"IT - Software,Banking,IT - Hardware/Networking",Ha Noi,Experienced (non-manager),"ngan hang,an ninh mang,bao mat du lieu,an toan...",5
2370,1622753,chuyen gia trien khai va van hanh he thong sys...,"IT - Software,IT - Hardware/Networking",Ha Noi,Experienced (non-manager),"van hanh he thong,english,giam sat he thong,da...",6
15851,1639703,chuyen vien an ninh thong tin,"IT - Software,Banking,IT - Hardware/Networking",Ha Noi,Experienced (non-manager),"it security,cong nghe thong tin,an toan thong ...",7
3424,1624176,onsite cto cio bank up to 18000 net salary,"Executive management,IT - Software,Banking",International,Director and above,"computer science,it system,analytical skill,pr...",8
1038,1619593,giam doc ban quan tri va dieu phoi du an cntt ...,"IT - Software,IT - Hardware/Networking,Fintech",Ha Noi,Manager,"du an,agile,agile software development,pmp",9


In [21]:
# Element-wise comparison between the base model's embedding and the trained
# query model's embedding for the same user-id string.
probe_user = np.array(["1.200.254.13, 10.122.10.10"])
user_model.embedding_model(probe_user) == model1.query_model.embedding_model(probe_user)

<tf.Tensor: shape=(1, 32), dtype=bool, numpy=
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True]])>

In [22]:
# Five user ids sampled (with replacement) from the new users.
us = random.choices(new_users, k=5)
# Predictions for the unique user ids from position 150000 onwards.
tail_user_ids = df.entityId.unique().tolist()[150000:]
score, preds = index1({"user_id": np.array(tail_user_ids)})

In [23]:
# Number of distinct user ids in the events frame.
len(df.entityId.unique())

168275

In [24]:
# Preview the batch sizes that the prediction loop will process.
updated_users = df.entityId.unique().tolist()
batch_predict = 50000
# Iterate over start offsets: `len // batch + 1` iterations would add an empty
# trailing batch whenever the user count is an exact multiple of the batch size.
for start in range(0, len(updated_users), batch_predict):
    print(len(updated_users[start:start + batch_predict]))

50000
50000
50000
18275


In [25]:
from pymongo import UpdateOne

def scale_5(arr):
    """Rescale scores so that the largest one maps to 5.0.

    Every score x becomes 5 * (x - min + 1) / (max - min + 1). This is the
    closed form of the original two-step normalisation, so results are the
    same for inputs with distinct values, but it no longer divides by zero
    (yielding NaN) when all scores are equal: such inputs map to 5.0.

    Args:
        arr: 1-D array-like of numeric scores.

    Returns:
        A list of Python floats, one per input score; an empty list for an
        empty input.
    """
    values = np.asarray(arr, dtype=float)
    if values.size == 0:
        return []
    lowest = values.min()
    span = values.max() - lowest
    # span + 1 is always >= 1, so the division is safe even for constant input.
    scaled = 5.0 * (values - lowest + 1.0) / (span + 1.0)
    return [float(value) for value in scaled]


def update_item_recs(user, recs, score):
    """Build the upsert operation that stores one user's recommendations.

    Args:
        user: Value used as the `_id` of the target document.
        recs: Iterable of ASCII-encoded byte strings naming the recommended items.
        score: Scores matching `recs`; they are rescaled with `scale_5`.

    Returns:
        An `UpdateOne` that sets the document's "items" field to a mapping
        from decoded item id to rescaled score, inserting the document when it
        does not exist.
    """
    item_ids = [rec.decode("ascii") for rec in recs]
    item_scores = dict(zip(item_ids, scale_5(score)))
    return UpdateOne({"_id": user}, {"$set": {"items": item_scores}}, upsert=True)

In [26]:
# Compute recommendations for every user in batches and upsert them into Mongo.
updated_users = df.entityId.unique().tolist()
batch_predict = 50000
# Iterate over start offsets so that no empty trailing batch is produced when
# the user count is an exact multiple of the batch size; an empty batch would
# be sent to the index and then to bulk_write as an empty request list.
for start in range(0, len(updated_users), batch_predict):
    predict_users = updated_users[start:start + batch_predict]
    score, preds = index1({"user_id": np.array(predict_users)})
    operations = [
        update_item_recs(user, recs, user_scores)
        for user, recs, user_scores in zip(predict_users, preds.numpy(), score.numpy())
    ]
    mongo['data_8'].User_Recs.bulk_write(operations)
#     print(list(map(lambda u, r, s: update_item_recs(u, r, s), predict_users, preds.numpy(), score.numpy()))[:1])
    

In [104]:
def _location_with_embedding(features):
    """Pair each location string with its location embedding."""
    location = features["location"]
    return location, model1.candidate_model.location_embedding(location)


# Brute-force top-10 index whose candidates are location embeddings, identified
# by the location strings.
index2 = tfrs.layers.factorized_top_k.BruteForce(model1.query_model, k=10)
index2.index_from_dataset(movies.batch(500).map(_location_with_embedding))

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f47b18e5f10>

In [105]:
# Pick one of the new users at random and show the jobs they interacted with.
u = random.choice(new_users)
# u = "user_test_002"
print("========================HISTORY==============================")
history_columns = ["targetEntityId", "jobTitle", "industries", "jobLevel", "locations", "skills"]
ratings.loc[ratings.entityId == u, history_columns]
# score, pred = index({"user_id": np.array(["5749316"])})



Unnamed: 0,targetEntityId,jobTitle,industries,jobLevel,locations,skills
327490,1631935,nhan vien kinh doanh tour du lich,"Customer Service,Sales,Airlines/Tourism",Experienced (non-manager),Ha Noi,"du lich,sale online,sale du lich,kinh doanh du..."


In [36]:
def _location_embedding_pair(features):
    """Return the location strings together with their location embeddings."""
    location = features["location"]
    return location, model1.candidate_model.location_embedding(location)


# Batch size of 1000000000 elements, mapped to (location, embedding) pairs.
a = movies.batch(1000000000).map(_location_embedding_pair)

In [142]:
# Take the first batch of `a`: the location strings and their embeddings.
idx, embs = next(iter(a))

In [118]:
from sklearn.metrics.pairwise import cosine_similarity

In [143]:
# u_emb = model1.query_model({"user_id": np.array(u)})
# # cosine_similarity(u_emb.numpy(), embs.numpy()

# pre_items = [idx[i].numpy().decode("ascii") for i in list(reversed(np.argsort(cosine_similarity(tf.reshape(u_emb, shape=(1, -1)).numpy(), embs.numpy())[0])))[:20]]
# check_df = df_meta[df_meta._id.isin(pre_items)][["_id","jobTitle", "industries", "locations", "jobLevel", "skills"]]
# check_df["priority"] = check_df._id.apply(lambda x: pre_items.index(x))
# check_df.sort_values(by=["priority"], ascending=True)

In [145]:
# Display the location strings and the matching embedding matrix.
idx, embs

(<tf.Tensor: shape=(19236,), dtype=string, numpy=
 array([b'Ha Noi,Hung Yen,Ha Nam', b'Ha Noi', b'Ho Chi Minh', ...,
        b'Ha Noi', b'Dong Nai', b'Ha Noi'], dtype=object)>,
 <tf.Tensor: shape=(19236, 32), dtype=float32, numpy=
 array([[ 0.39815533, -0.0886539 ,  0.03352622, ...,  0.6786445 ,
         -0.11176276, -0.42520228],
        [ 0.40533033,  0.00991735, -0.02473093, ...,  0.43977395,
          0.00222277, -0.05495538],
        [ 0.12366021,  0.09146322, -0.04761661, ...,  0.03488928,
         -0.01461918,  0.430252  ],
        ...,
        [ 0.40533033,  0.00991735, -0.02473093, ...,  0.43977395,
          0.00222277, -0.05495538],
        [ 0.12142243,  0.0199055 ,  0.4148923 , ..., -0.12958623,
         -0.35030916,  0.14047426],
        [ 0.40533033,  0.00991735, -0.02473093, ...,  0.43977395,
          0.00222277, -0.05495538]], dtype=float32)>)