In [1]:
from pymongo import MongoClient
import pandas as pd

mongo = MongoClient()

data = list(mongo.data_4.events.find({},{"userid": 1, "jobId": 1}))
meta_data = list(mongo.data_4.data.find({},{"tag": 0, "_id": 0}))
df = pd.DataFrame(data)
df_1 = df.groupby(["userid"]).aggregate({"jobId": list}).reset_index()
df_1["num_items"] = df_1.jobId.str.len()

In [2]:
import itertools
from functools import reduce


def create_pair_items(lst_items):
    return list(itertools.combinations(lst_items, 2))

In [3]:
item_item_data = reduce(lambda x, y: x + y, list([create_pair_items(i) for i in df_1.jobId.values.tolist()]))

In [4]:
df_2 = pd.DataFrame(item_item_data, columns=["item1", "item2"])

In [5]:
meta_df = pd.DataFrame(meta_data)

In [6]:
def create_skill_text(lst_skill):
    return ','.join(lst_skill)

meta_df["skill_text"] = meta_df.skill.apply(create_skill_text)
meta_df.head()

Unnamed: 0,jobId,title,category,location,skill,salary,level,image,skill_text
0,Job_Leader_Machine_Learning_HN_Leader_ILCOCE,Leader Machine Learning Engineer,Machine Learning,HN,"[NLP, Python, Python, Scala, Scala]",4260,Leader,https://www.vietnamworks.com/_next/image?url=h...,"NLP,Python,Python,Scala,Scala"
1,Job_Senior_Python_Developer_HCM_Senior_ULLFC8,"Senior Fullstack Developer (Python, Django)",Python Backend,HCM,"[Backend, API, Django, SQL, Flask]",2187,Senior,https://www.vietnamworks.com/_next/image?url=h...,"Backend,API,Django,SQL,Flask"
2,Job_Junior_Data_Analyst_HCM_Junior_WB0N2U,Junior Data Analystics,Data Analytic,HCM,"[SQL, SQL, Data Analytic, Python, Data Analytic]",1511,Junior,https://www.vietnamworks.com/_next/image?url=h...,"SQL,SQL,Data Analytic,Python,Data Analytic"
3,Job_Leader_Fullstack_Developer_DN_Leader_788V3W,Leader Fullstack Developer (for AI project),Fullstack,DN,"[ReactJS, NodeJS, Backend (familar AI, chatbot...",3578,Leader,https://www.vietnamworks.com/_next/image?url=h...,"ReactJS,NodeJS,Backend (familar AI, chatbot, c..."
4,Job_Leader_NodeJS_Developer_HCM_Leader_KGZA40,"Leader Fullstack Developer (NodeJS, Express)",NodeJS Backend,HCM,"[NodeJS, Backend, Backend, JavaScript, ReactJS]",3399,Leader,https://www.vietnamworks.com/_next/image?url=h...,"NodeJS,Backend,Backend,JavaScript,ReactJS"


In [7]:
df_2 = df_2.merge(meta_df, how="left", left_on="item1", right_on="jobId")
df_2 = df_2.merge(meta_df, how="left", left_on="item2", right_on="jobId", suffixes=("_item1", "_item2"))

In [9]:
import os
import tempfile

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

# plt.style.use('seaborn-whitegrid')

In [10]:
tensor_slices = {
    "item1": df_2.item1.values.tolist(),
    "category_item1": df_2.category_item1.values.tolist(),
    "title_item1": df_2.title_item1.values.tolist(),
    "location_item1": df_2.location_item1.values.tolist(),
    "level_item1": df_2.level_item1.values.tolist(),
    "skill_text_item1": df_2.skill_text_item1.values.tolist(),
    "item2": df_2.item2.values.tolist(),
    "category_item2": df_2.category_item2.values.tolist(),
    "title_item2": df_2.title_item2.values.tolist(),
    "location_item2": df_2.location_item2.values.tolist(),
    "level_item2": df_2.level_item2.values.tolist(),
    "skill_text_item2": df_2.skill_text_item2.values.tolist()
}

jobs = tf.data.Dataset.from_tensor_slices({
    "item2": meta_df.jobId.values.tolist(),
    "category_item2": meta_df.category.values.tolist(),
    "location_item2": meta_df.location.values.tolist(),
    "level_item2": meta_df.level.values.tolist(),
    "skill_text_item2": meta_df.skill_text.values.tolist()
})

items = tf.data.Dataset.from_tensor_slices(tensor_slices)

2024-02-05 10:44:25.172668: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-05 10:44:25.230992: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-05 10:44:25.231163: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [11]:
unique_items = np.unique(meta_df.jobId.values.tolist())
unique_category = np.unique(meta_df.category.values.tolist())
unique_location = np.unique(meta_df.location.values.tolist())
unique_level = np.unique(meta_df.level.values.tolist())
unique_skill = np.unique(reduce(lambda x, y: x + y, meta_df.skill.values.tolist()))

In [12]:
def tokenization(t):
    return tf.strings.split(t, ',')

tokenization("Backend,ReactJS,NodeJS,JavaScript,JavaScript")

<tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'Backend', b'ReactJS', b'NodeJS', b'JavaScript', b'JavaScript'],
      dtype=object)>

In [13]:
class Movie1Model(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    self.title_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_items,mask_token=None),
      tf.keras.layers.Embedding(len(unique_items) + 1, 32)
    ])
    self.category_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_category,mask_token=None),
      tf.keras.layers.Embedding(len(unique_location) + 1, 32)
    ])
    self.location_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_location,mask_token=None),
      tf.keras.layers.Embedding(len(unique_location) + 1, 32)
    ])
    self.level_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_level,mask_token=None),
      tf.keras.layers.Embedding(len(unique_level) + 1, 32)
    ])
    self.skill_embedding = tf.keras.Sequential([
      tf.keras.layers.TextVectorization(
          max_tokens=50,
          vocabulary=unique_skill,
          standardize=None,
          split=tokenization,
          pad_to_max_tokens=True
      ),
      tf.keras.layers.Embedding(len(unique_skill) + 1, 32),
      tf.keras.layers.GlobalAvgPool1D()
    ])

  def call(self, features):
    return tf.concat([
        self.title_embedding(features["item1"]),
        self.category_embedding(features["category_item1"]),
        self.location_embedding(features["location_item1"]),
        self.level_embedding(features["level_item1"]),
        self.skill_embedding(features["skill_text_item1"])
    ], axis=1)

class Movie2Model(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    self.title_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_items,mask_token=None),
      tf.keras.layers.Embedding(len(unique_items) + 1, 32)
    ])
    self.category_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_category,mask_token=None),
      tf.keras.layers.Embedding(len(unique_location) + 1, 32)
    ])
    self.location_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_location,mask_token=None),
      tf.keras.layers.Embedding(len(unique_location) + 1, 32)
    ])
    self.level_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_level,mask_token=None),
      tf.keras.layers.Embedding(len(unique_level) + 1, 32)
    ])
    self.skill_embedding = tf.keras.Sequential([
      tf.keras.layers.TextVectorization(
          max_tokens=50,
          vocabulary=unique_skill,
          standardize=None,
          split=tokenization,
          pad_to_max_tokens=True
      ),
      tf.keras.layers.Embedding(len(unique_skill) + 1, 32),
      tf.keras.layers.GlobalAvgPool1D()
    ])

  def call(self, features):
    return tf.concat([
        self.title_embedding(features["item2"]),
        self.category_embedding(features["category_item2"]),
        self.location_embedding(features["location_item2"]),
        self.level_embedding(features["level_item2"]),
        self.skill_embedding(features["skill_text_item2"])
    ], axis=1)

class QueryModel(tf.keras.Model):
  """Model for encoding user queries."""

  def __init__(self, layer_sizes):
    """Model for encoding user queries.

    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()

    # We first use the user model for generating embeddings.
    self.embedding_model = Movie1Model()

    # Then construct the layers.
    self.dense_layers = tf.keras.Sequential()

    # Use the ReLU activation for all but the last layer.
    for layer_size in layer_sizes[:-1]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

    # No activation for the last layer.
    for layer_size in layer_sizes[-1:]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size))

  def call(self, inputs):
    feature_embedding = self.embedding_model(inputs)
    return self.dense_layers(feature_embedding)

class CandidateModel(tf.keras.Model):
  """Model for encoding movies."""

  def __init__(self, layer_sizes):
    """Model for encoding movies.

    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()

    self.embedding_model = Movie2Model()

    # Then construct the layers.
    self.dense_layers = tf.keras.Sequential()

    # Use the ReLU activation for all but the last layer.
    for layer_size in layer_sizes[:-1]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

    # No activation for the last layer.
    for layer_size in layer_sizes[-1:]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size))

  def call(self, inputs):
    feature_embedding = self.embedding_model(inputs)
    return self.dense_layers(feature_embedding)

In [14]:
QueryModel([32])({"item1": np.array(["Job_Fresher_Data_Scientist_DN_Fresher_PUF8GY"]),
               "category_item1": np.array(["Backend Developer"]),
               "location_item1": np.array(["DN"]),
               "level_item1": np.array(["Fresher"]),
               "skill_text_item1": np.array(["Backend,ReactJS,NodeJS,JavaScript,JavaScript"])
              })

ValueError: could not convert string to float: 'Backend Developer'

In [22]:
CandidateModel([32])({"item2": np.array(["Job_Fresher_Data_Scientist_DN_Fresher_PUF8GY"]),
               "category_item2": np.array(["Backend Developer"]),
               "location_item2": np.array(["DN"]),
               "level_item2": np.array(["Fresher"]),
               "skill_text_item2": np.array(["Backend,ReactJS,NodeJS,JavaScript,JavaScript"])
              })

<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 4.22667637e-02,  1.04957540e-02, -4.12077941e-02,
        -5.35202660e-02,  3.98054533e-03,  4.18344885e-02,
        -1.73059311e-02, -3.40664387e-02,  5.58805093e-02,
        -5.15691712e-02,  1.78748835e-02,  1.46133658e-02,
         4.15474474e-02,  1.42521597e-02,  3.01606916e-02,
         8.59376322e-03, -2.35634670e-03, -7.19649717e-02,
        -5.07200025e-02,  2.01971550e-02, -2.39986256e-02,
         2.38101557e-02, -2.29243897e-02, -5.30655347e-02,
        -2.98945233e-05,  5.61026437e-03,  3.77673917e-02,
         1.68422461e-02,  1.51903005e-02,  1.47948898e-02,
        -4.60456237e-02, -5.25935888e-02]], dtype=float32)>

In [28]:
class ItemItemModel(tfrs.models.Model):

  def __init__(self, layer_sizes):
    super().__init__()
    self.query_model = QueryModel(layer_sizes)
    self.candidate_model = CandidateModel(layer_sizes)
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=jobs.batch(128).map(self.candidate_model),
        ),
    )

  def compute_loss(self, features, training=False):
    # We only pass the user id and timestamp features into the query model. This
    # is to ensure that the training inputs would have the same keys as the
    # query inputs. Otherwise the discrepancy in input structure would cause an
    # error when loading the query model after saving it.
    query_embeddings = self.query_model({
        "item1": features["item1"],
        "category_item1": features["category_item1"],
        "location_item1": features["location_item1"],
        "level_item1": features["level_item1"],
        "skill_text_item1": features["skill_text_item1"]
    })
    movie_embeddings = self.candidate_model({
       "item2": features["item2"],
        "category_item2": features["category_item2"],
        "location_item2": features["location_item2"],
        "level_item2": features["level_item2"],
        "skill_text_item2": features["skill_text_item2"]
    })

    return self.task(
        query_embeddings, movie_embeddings, compute_metrics=not training)

In [15]:
# model = MovielensModel()
# model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [25]:
tf.random.set_seed(42)
shuffled = items.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

cached_train = train.shuffle(100_000).batch(2048)
cached_test = test.batch(4096).cache()

In [29]:
num_epochs = 300

model = ItemItemModel([128, 64, 32])
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

one_layer_history = model.fit(
    cached_train,
    validation_data=cached_test,
    validation_freq=5,
    epochs=num_epochs,
    verbose=0)

2023-03-28 16:04:08.502859: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f93b000a180 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-03-28 16:04:08.502877: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2023-03-28 16:04:08.506010: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.
2023-03-28 16:04:08.572572: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

Yo


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p


You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-picking the ptxas binary is often sufficient.

You may not need to update to CUDA 11.1; cherry-p

In [30]:
cached_test.take(1)

<TakeDataset element_spec={'item1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'category_item1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'title_item1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'location_item1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'level_item1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'skill_text_item1': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'item2': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'category_item2': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'title_item2': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'location_item2': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'level_item2': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'skill_text_item2': TensorSpec(shape=(None,), dtype=tf.string, name=None)}>

In [35]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model, k=40)
# recommends movies out of the entire movies dataset.
index.index_from_dataset(
  jobs.batch(100).map(lambda x: (x["item2"], model.candidate_model(x)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f93e21494f0>

In [36]:
_, pred = index({"item1": np.array(["Job_Senior_Data_Scientist_HCM_Senior_UAJVNT"]),
                 "category_item1": np.array(["Data Scientist"]),
                 "location_item1": np.array(["HCM"]),
                 "level_item1": np.array(["Senior"]),
                 "skill_text_item1": np.array(["Python,Deep Learning,Big-data,Machine Learning,Python"])
                })
pred.numpy()

array([[b'Job_Senior_Data_Scientist_HCM_Senior_WNOM7T',
        b'Job_Leader_Data_Scientist_HCM_Leader_Z3GXQC',
        b'Job_Senior_Data_Scientist_other_Senior_B79RUZ',
        b'Job_Senior_Machine_Learning_HCM_Senior_OSX7S9',
        b'Job_Senior_Data_Scientist_other_Senior_THHWSM',
        b'Job_Senior_Data_Scientist_HCM_Senior_UAJVNT',
        b'Job_Senior_Data_Scientist_HCM_Senior_ICRJ9A',
        b'Job_Senior_Machine_Learning_other_Senior_0BSTQD',
        b'Job_Senior_Machine_Learning_HCM_Senior_MVKE00',
        b'Job_Leader_Data_Scientist_HCM_Leader_CIUR6J',
        b'Job_Senior_Machine_Learning_other_Senior_BSQJBC',
        b'Job_Senior_Machine_Learning_HCM_Senior_QZGBK9',
        b'Job_Senior_Data_Scientist_other_Senior_G4IZM2',
        b'Job_Leader_Data_Analyst_HCM_Leader_3TSV4J',
        b'Job_Senior_Data_Scientist_HCM_Senior_OA3KH0',
        b'Job_Leader_Data_Analyst_other_Leader_ZX81LZ',
        b'Job_Leader_Data_Scientist_other_Leader_Q1GEHW',
        b'Job_Senior_Machine

In [94]:
meta_df[["jobId", "location", "level", "skill_text"]]
n , _ = meta_df.shape
score, pred = index({"item1": np.array(meta_df.jobId.values.tolist()).reshape(n, 1),
                 "location_item1": np.array(meta_df.location.values.tolist()).reshape(n, 1),
                 "level_item1": np.array(meta_df.level.values.tolist()).reshape(n, 1),
                 "skill_text_item1": np.array(meta_df.skill_text.values.tolist()).reshape(n, 1)
                })
score

<tf.Tensor: shape=(500, 20), dtype=float32, numpy=
array([[7.812184 , 7.6138816, 7.4300327, ..., 6.705778 , 6.681527 ,
        6.673678 ],
       [8.494712 , 8.468593 , 8.221682 , ..., 7.722277 , 7.7072144,
        7.6292567],
       [7.7892895, 7.7132106, 7.5902157, ..., 6.4960356, 6.479126 ,
        6.4579606],
       ...,
       [7.3493648, 7.215054 , 6.921525 , ..., 6.2424927, 6.225397 ,
        6.195697 ],
       [7.5333076, 7.296815 , 7.2764544, ..., 6.4780903, 6.378552 ,
        6.2939086],
       [7.4792747, 7.1970086, 7.003552 , ..., 5.8118157, 5.743824 ,
        5.7421017]], dtype=float32)>

In [95]:
def scale_5(arr):
    return list(map(float, arr*5/max(arr)))


scale_5(score.numpy()[0])

[5.0,
 4.873081684112549,
 4.755413055419922,
 4.70578670501709,
 4.643866539001465,
 4.60400915145874,
 4.54772424697876,
 4.54767370223999,
 4.4978556632995605,
 4.43584680557251,
 4.430275917053223,
 4.409862041473389,
 4.407322883605957,
 4.370481014251709,
 4.369513034820557,
 4.363524436950684,
 4.359341144561768,
 4.291872024536133,
 4.276350498199463,
 4.271326541900635]

In [111]:
from pymongo import UpdateOne


def update_item_recs(item, recs, score):
    return UpdateOne({"_id": item}, {
        "$set": {
            "items": dict(zip(list(map(lambda x: x.decode("ascii"), recs)), scale_5(score)))
        }
    }, upsert=True)

In [113]:
# mongo["data_1"].test.bulk_write([update_item_recs(meta_df.jobId.values.tolist()[0], list(map(lambda x: x.decode('ascii'), pred.numpy()[0])), scale_5(score.numpy()[0]))])


mongo["data_3"].Item_Recs.bulk_write(list(map(lambda i, r, s: update_item_recs(i, r, s), meta_df.jobId.values.tolist(), pred.numpy(), score.numpy())))

# pred.numpy()[0]

<pymongo.results.BulkWriteResult at 0x7fb8b2e0aa90>

In [102]:
# mongo.data_3.test.update_one({"_id": "abc"}, {"$set": {"item": 1}}, upsert=True)


mongo.data_3.test.bulk_write([UpdateOne({"_id": "bdhd"}, {"$set": {"abc": 1}}, upsert=True)])

<pymongo.results.BulkWriteResult at 0x7fb8b3181880>

In [20]:
paired_item1 = df_2[["item1", "item2"]][df_2["item1"]=="Job_Fresher_Backend_Engineer_DN_Fresher_FL62TZ"]["item2"].values.tolist()
paired_item2 = df_2[df_2["item2"].isin(paired_item1)]
paired_item2

Unnamed: 0,item1,item2,jobId_item1,title_item1,location_item1,skill_item1,salary_item1,level_item1,skill_text_item1,jobId_item2,title_item2,location_item2,skill_item2,salary_item2,level_item2,skill_text_item2
0,Job_Fresher_Backend_Engineer_DN_Fresher_FL62TZ,Job_Fresher_Backend_Engineer_other_Fresher_F6SGNV,Job_Fresher_Backend_Engineer_DN_Fresher_FL62TZ,Fresher Backend Engineer,DN,"[Backend, System, PHP, ReactJS, Java]",919,Fresher,"Backend,System,PHP,ReactJS,Java",Job_Fresher_Backend_Engineer_other_Fresher_F6SGNV,Fresher Backend Engineer,other,"[API, Backend, PHP, System, Java]",952,Fresher,"API,Backend,PHP,System,Java"
1,Job_Fresher_Backend_Engineer_DN_Fresher_FL62TZ,Job_Fresher_Backend_Engineer_other_Fresher_U96NGQ,Job_Fresher_Backend_Engineer_DN_Fresher_FL62TZ,Fresher Backend Engineer,DN,"[Backend, System, PHP, ReactJS, Java]",919,Fresher,"Backend,System,PHP,ReactJS,Java",Job_Fresher_Backend_Engineer_other_Fresher_U96NGQ,Fresher Backend Engineer,other,"[Golang, Java, English, Golang, English]",972,Fresher,"Golang,Java,English,Golang,English"
2,Job_Fresher_Backend_Engineer_other_Fresher_F6SGNV,Job_Fresher_Backend_Engineer_other_Fresher_U96NGQ,Job_Fresher_Backend_Engineer_other_Fresher_F6SGNV,Fresher Backend Engineer,other,"[API, Backend, PHP, System, Java]",952,Fresher,"API,Backend,PHP,System,Java",Job_Fresher_Backend_Engineer_other_Fresher_U96NGQ,Fresher Backend Engineer,other,"[Golang, Java, English, Golang, English]",972,Fresher,"Golang,Java,English,Golang,English"
4,Job_Fresher_Backend_Engineer_other_Fresher_U96NGQ,Job_Fresher_Backend_Developer_DN_Fresher_3JKXHO,Job_Fresher_Backend_Engineer_other_Fresher_U96NGQ,Fresher Backend Engineer,other,"[Golang, Java, English, Golang, English]",972,Fresher,"Golang,Java,English,Golang,English",Job_Fresher_Backend_Developer_DN_Fresher_3JKXHO,Fresher Backend Developer,DN,"[Backend, ReactJS, ReactJS, ReactJS, ReactJS]",1164,Fresher,"Backend,ReactJS,ReactJS,ReactJS,ReactJS"
5,Job_Fresher_Backend_Engineer_other_Fresher_QME5O3,Job_Fresher_Backend_Developer_other_Fresher_5B...,Job_Fresher_Backend_Engineer_other_Fresher_QME5O3,Fresher Backend Engineer,other,"[NodeJS, NodeJS, System, JavaScript, Backend]",564,Fresher,"NodeJS,NodeJS,System,JavaScript,Backend",Job_Fresher_Backend_Developer_other_Fresher_5B...,Fresher Backend Developer,other,"[API, .NET, NodeJS, NodeJS, System]",1211,Fresher,"API,.NET,NodeJS,NodeJS,System"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193888,Job_Senior_Devops_other_Senior_IRDBVY,Job_Junior_Devops_other_Junior_FL31SL,Job_Senior_Devops_other_Senior_IRDBVY,Senior Devops,other,"[Perl, DevOps, Perl, Perl, Aws]",2699,Senior,"Perl,DevOps,Perl,Perl,Aws",Job_Junior_Devops_other_Junior_FL31SL,Junior Devops,other,"[Aws, Perl, DevOps, DevOps, Linux]",1488,Junior,"Aws,Perl,DevOps,DevOps,Linux"
193902,Job_Senior_Devops_HCM_Senior_UGYO5G,Job_Junior_Devops_other_Junior_MO28SB,Job_Senior_Devops_HCM_Senior_UGYO5G,Senior Devops,HCM,"[Python, Aws, Python, DevOps, DevOps]",2224,Senior,"Python,Aws,Python,DevOps,DevOps",Job_Junior_Devops_other_Junior_MO28SB,Junior Devops,other,"[Azure, Aws, Linux, DevOps, Perl]",1320,Junior,"Azure,Aws,Linux,DevOps,Perl"
193903,Job_Leader_Devops_other_Leader_IX7PTI,Job_Junior_Devops_other_Junior_MO28SB,Job_Leader_Devops_other_Leader_IX7PTI,Leader Devops,other,"[Perl, Python, Linux, Azure, Perl]",3471,Leader,"Perl,Python,Linux,Azure,Perl",Job_Junior_Devops_other_Junior_MO28SB,Junior Devops,other,"[Azure, Aws, Linux, DevOps, Perl]",1320,Junior,"Azure,Aws,Linux,DevOps,Perl"
193912,Job_Middle_Devops_HCM_Middle_9XKSQB,Job_Junior_Devops_other_Junior_NNHXSZ,Job_Middle_Devops_HCM_Middle_9XKSQB,Middle Devops,HCM,"[Azure, Aws, Azure, Linux, DevOps]",2295,Middle,"Azure,Aws,Azure,Linux,DevOps",Job_Junior_Devops_other_Junior_NNHXSZ,Junior Devops,other,"[Aws, Perl, Python, Perl, Linux]",1537,Junior,"Aws,Perl,Python,Perl,Linux"


In [21]:
meta_df[meta_df["jobId"]=="Job_Fresher_Backend_Engineer_DN_Fresher_FL62TZ"]

Unnamed: 0,jobId,title,location,skill,salary,level,skill_text
146,Job_Fresher_Backend_Engineer_DN_Fresher_FL62TZ,Fresher Backend Engineer,DN,"[Backend, System, PHP, ReactJS, Java]",919,Fresher,"Backend,System,PHP,ReactJS,Java"
