In [1]:
# connect drive
from google.colab import drive
drive.flush_and_unmount()
drive.mount('/content/drive')

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [2]:
dataset_path = '/content/drive/MyDrive/NLP/NLP/job_applicant_dataset.csv'

In [3]:
import pandas as pd

df = pd.read_csv(dataset_path)

In [4]:
df.head()

Unnamed: 0,Job Applicant Name,Age,Gender,Race,Ethnicity,Resume,Job Roles,Job Description,Best Match
0,Daisuke Mori,29,Male,Mongoloid/Asian,Vietnamese,"Proficient in Injury Prevention, Motivation, N...",Fitness Coach,A Fitness Coach is responsible for helping cl...,0
1,Taichi Shimizu,31,Male,Mongoloid/Asian,Filipino,"Proficient in Healthcare, Pharmacology, Medica...",Physician,"Diagnose and treat illnesses, prescribe medica...",0
2,Sarah Martin,46,Female,White/Caucasian,Dutch,"Proficient in Forecasting, Financial Modelling...",Financial Analyst,"As a Financial Analyst, you will be responsibl...",0
3,Keith Hughes,43,Male,Negroid/Black,Caribbean,"Proficient in Budgeting, Supply Chain Optimiza...",Supply Chain Manager,A Supply Chain Manager oversees the entire sup...,1
4,James Davis,49,Male,White/Caucasian,English,"Proficient in Logistics, Negotiation, Procurem...",Supply Chain Manager,A Supply Chain Manager oversees the entire sup...,1


In [5]:
resume_data = df['Resume'][8000:].to_list()
job_description_data = df['Job Description'][8000:].to_list()

In [6]:
type(resume_data), type(job_description_data)

(list, list)

In [7]:
# def DenseInteractionClassifier(d_model):
#     input_1 = layers.Input(shape=(d_model,), name="embedding_1")
#     input_2 = layers.Input(shape=(d_model,), name="embedding_2")

#     # Use a Lambda layer to wrap TensorFlow operations
#     interaction_layer = layers.Lambda(lambda inputs: tf.concat([
#         inputs[0],  # embedding 1
#         inputs[1],  # embedding 2
#         tf.math.abs(inputs[0] - inputs[1]),  # absolute difference
#         inputs[0] * inputs[1]  # element-wise product
#     ], axis=-1))([input_1, input_2])

#     x = layers.Dense(256, activation='relu')(interaction_layer)
#     # x = layers.Dropout(0.3)(x)
#     x = layers.Dense(128, activation='relu')(x)
#     # x = layers.Dropout(0.2)(x)
#     output = layers.Dense(1, activation='sigmoid')(x)  # Binary classification

#     return tf.keras.Model(inputs=[input_1, input_2], outputs=output, name="DenseInteractionClassifier")


def DenseInteractionClassifier(d_model):

    input_1 = layers.Input(shape=(d_model,), name="embedding_1")
    input_2 = layers.Input(shape=(d_model,), name="embedding_2")

    # Compute absolute difference and element-wise product using Keras layers
    abs_diff = layers.Subtract()([input_1, input_2])
    abs_diff = layers.Activation('relu')(abs_diff)  # mimic abs() via ReLU + subtract
    prod = layers.Multiply()([input_1, input_2])

    # Concatenate all features
    combined = layers.Concatenate()([input_1, input_2, abs_diff, prod])

    # Feed-forward layers
    x = layers.Dense(256, activation='relu')(combined)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.2)(x)

    output = layers.Dense(1, activation='sigmoid')(x)  # Binary classification

    model = models.Model(inputs=[input_1, input_2], outputs=output, name="DenseInteractionClassifier")
    return model

In [8]:
import pickle

def load_variable(pkl_variable):
  # Open the file in binary mode
  with open(pkl_variable, 'rb') as file:
    myvar = pickle.load(file)
  return myvar

In [9]:
# Getting final embeddings

# 1. Path of the final embeddings
job_description_embeddings_path = '/content/drive/MyDrive/NLP/NLP/NLP_Embeddings/NEW_COMBINED_job_description_embedding.pkl'
resume_text_embeddings_path = '/content/drive/MyDrive/NLP/NLP/NLP_Embeddings/NEW_COMBINED_resume_text_embedding.pkl'
labels_path = '/content/drive/MyDrive/NLP/NLP/labels.pkl'

# 2. Final Embeddings
total_job_description_embeddings = load_variable(job_description_embeddings_path)
total_resume_text_embeddings = load_variable(resume_text_embeddings_path)
labels = load_variable(labels_path)

In [10]:
total_job_description_embeddings[0].shape, total_resume_text_embeddings[0].shape, len(labels)

((256,), (256,), 10000)

In [11]:
import tensorflow as tf
from tensorflow.keras import models, layers

In [12]:
resume_emb_final = []
job_description_emb_final = []

for i in range(len(total_resume_text_embeddings)):
  resume_emb_final.append(tf.constant(total_resume_text_embeddings[i]))
  job_description_emb_final.append(tf.constant(total_job_description_embeddings[i]))

In [13]:
resume_emb_final[0].shape, len(resume_emb_final)

(TensorShape([256]), 10000)

In [14]:
resume_emb_final = []
job_description_emb_final = []

for i in range(len(total_resume_text_embeddings)):
  resume_emb_final.append(tf.constant(total_resume_text_embeddings[i]))
  job_description_emb_final.append(tf.constant(total_job_description_embeddings[i]))

In [15]:
len(resume_emb_final), resume_emb_final[0].shape, len(resume_emb_final)

(10000, TensorShape([256]), 10000)

In [16]:
def _shuffle(list_1, list_2, labels):
    import numpy
    from numpy import random
    combined = list(zip(list_1, list_2, labels))
    random.shuffle(combined)
    list_1= []
    list_2= []
    labels= []
    for i in combined:
        list_1.append(i[0])
        list_2.append(i[1])
        labels.append(i[2])
    return list_1, list_2, labels

In [17]:
check_1, check_2, check_3 = _shuffle(resume_emb_final, job_description_emb_final, labels)

In [18]:
import pandas as pd
check_3 = pd.Series(check_3)
print(type(check_3))

<class 'pandas.core.series.Series'>


In [19]:
# Creating train test split

train_resume_emb = check_1[:8000]
train_job_description_emb = check_2[:8000]
train_labels = check_3[:8000]

test_resume_emb = check_1[8000:]
test_job_description_emb = check_2[8000:]
test_labels = labels[8000:]

In [20]:
train_resume_emb[0].shape[0]

256

In [21]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [22]:
def train_model_1(emb_1, emb_2, total_size, d_model_val, labels):

  # print(total_size, d_model_val)
  # return

  # Stack the embeddings to create single tensors
  emb_1 = tf.stack(emb_1, axis= 0)
  emb_2 = tf.stack(emb_2, axis= 0)
  emb_1 = tf.reshape(emb_1, [total_size, d_model_val])
  emb_2 = tf.reshape(emb_2, [total_size, d_model_val])

  # print(emb_1.shape, emb_2.shape)
  # return

  model = DenseInteractionClassifier(d_model=d_model_val)
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  # print(emb_1.shape, emb_2.shape, seq_len_1, seq_len_2, d_model)

  # return
  # Use the entire labels list for training
  model.fit([emb_1, emb_2], labels, epochs=10)
  return model

In [23]:
# print(train_resume_emb[0].shape[0])
model = train_model_1(train_resume_emb, train_job_description_emb, len(train_resume_emb), train_resume_emb[0].shape[0], train_labels)

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.5044 - loss: 0.7379
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5622 - loss: 0.6839
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6328 - loss: 0.6390
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.7225 - loss: 0.5531
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8187 - loss: 0.4121
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8989 - loss: 0.2641
Epoch 7/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9397 - loss: 0.1672
Epoch 8/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9547 - loss: 0.1237
Epoch 9/10
[1m250/250[0m [32m━━━━━━━━

In [24]:
def stack_tensors(tensor_list):
    stacked_tensor = tf.stack(tensor_list)
    return stacked_tensor

In [25]:
def test(emb1, emb2, total_size, d_model_val):
  emb_1 = stack_tensors(emb1)
  emb_2 = stack_tensors(emb2)
  emb_1 = tf.reshape(emb_1, [total_size, d_model_val])
  emb_2 = tf.reshape(emb_2, [total_size, d_model_val])
  model.evaluate([emb_1, emb_2], test_labels)
  return emb_1, emb_2

In [26]:
x, y = test(test_resume_emb, test_job_description_emb, len(test_resume_emb), test_resume_emb[0].shape[0])
print(type(x), type(y))
similarity = model.predict([x, y])
pred_labels = []
for i in similarity:
  if i > 0.5:
    pred_labels.append(1)
  else:
    pred_labels.append(0)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5018 - loss: 1.6466
<class 'tensorflow.python.framework.ops.EagerTensor'> <class 'tensorflow.python.framework.ops.EagerTensor'>
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [46]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test_labels, pred_labels)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.52


In [29]:
def save_mode(model, path):
  model.save(path)

In [27]:
model_path = '/content/drive/MyDrive/NLP/NLP/model_10_v1.keras'
save_mode(model, model_path)

In [28]:
check_model = tf.keras.models.load_model(model_path, safe_mode= False)

In [29]:
model.summary()

In [43]:
def cosine_similarity_tf(embedding1, embedding2):
    embedding1 = tf.convert_to_tensor(embedding1, dtype=tf.float32)
    embedding2 = tf.convert_to_tensor(embedding2, dtype=tf.float32)

    norm1 = tf.norm(embedding1)
    norm2 = tf.norm(embedding2)

    similarity = tf.reduce_sum(embedding1 * embedding2) / (norm1 * norm2 + 1e-8)
    return similarity.numpy()

In [44]:
similarity_list = []
for i in range(len(test_resume_emb)):
  score = (cosine_similarity_tf(test_resume_emb[i], test_job_description_emb[i]))
  if score > 0.5:
    similarity_list.append(1)
  else:
    similarity_list.append(0)

In [45]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(similarity_list, pred_labels)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.53


In [33]:
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

def pretrained_model(sentence):
  # Load the pre-trained BERT model and tokenizer
  tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
  model = TFBertModel.from_pretrained("bert-base-uncased")

  inputs = tokenizer(sentence, return_tensors="tf")
  outputs = model(**inputs)
  embeddings = outputs.last_hidden_state[:,0,:]  # shape: (1, seq_len, 768)
  return embeddings

In [34]:
similarity_list = []
for i in range(len(resume_data)):
  emb_1 = pretrained_model(resume_data[i])
  emb_2 = pretrained_model(job_description_data[i])
  score = (cosine_similarity_tf(emb_1, emb_2))
  if score > 0.5:
    similarity_list.append(1)
  else:
    similarity_list.append(0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [41]:
bert_similarity = similarity_list

In [53]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(bert_similarity, pred_labels)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.47
