In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Single place to change when the project folder moves; every input path is derived from it.
DATA_DIR = '/content/drive/MyDrive/NLP'

# Pickled inputs read by this notebook (resulting values are identical to the
# previously hard-coded absolute paths).
resume_text_path = f'{DATA_DIR}/lemmatized_resume_text_token_ids.pkl'
job_description_path = f'{DATA_DIR}/lemmatized_job_description_token_ids.pkl'
vocab_path = f'{DATA_DIR}/vocab.pkl'

In [None]:
import pickle

def load_variable(pkl_variable):
  """Unpickle and return the object stored in the file at path `pkl_variable`.

  NOTE: `pickle.load` can execute arbitrary code from the file, so only use
  this on files you created yourself.
  """
  # Pickle streams are binary, hence the 'rb' mode.
  with open(pkl_variable, 'rb') as pkl_file:
    return pickle.load(pkl_file)

In [None]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
# Load the three pickled inputs from Drive via load_variable.
# The first two are indexed and sliced per sample further down; `vocab` is only
# used for its length and passed through to the embedding helpers.
lemmatized_resume_text_token_ids = load_variable(resume_text_path)
lemmatized_job_description_token_ids = load_variable(job_description_path)
vocab = load_variable(vocab_path)

In [None]:
# Report the size of each loaded object, one per line, in load order.
for loaded_object in (
    lemmatized_resume_text_token_ids,
    lemmatized_job_description_token_ids,
    vocab,
):
    print(len(loaded_object))

10000
10000
1315


In [None]:
# Creating the transformer encoder
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
# class PositionalEncoding(layers.Layer):
#     def __init__(self, max_len, d_model):
#         super().__init__()
#         pos = tf.range(max_len)[:, tf.newaxis]
#         i = tf.range(d_model)[tf.newaxis, :]
#         # Cast 'i' to tf.float32 to match the dtype of tf.pow result
#         angle_rates = 1 / tf.pow(10000., (2 * (tf.cast(i, tf.float32) // 2)) / tf.cast(d_model, tf.float32))
#         angle_rads = tf.cast(pos, tf.float32) * angle_rates

#         sines = tf.math.sin(angle_rads[:, 0::2])
#         cosines = tf.math.cos(angle_rads[:, 1::2])
#         self.pos_encoding = tf.concat([sines, cosines], axis=-1)
#         self.pos_encoding = self.pos_encoding[tf.newaxis, ...]

#     def call(self, x):
#         return x + self.pos_encoding[:, :tf.shape(x)[1], :]

class PositionalEncoding(layers.Layer):
    """Adds a fixed sinusoidal positional encoding to its input.

    The encoding table is computed once in the constructor with shape
    (1, max_len, d_model): even feature indices hold sin(pos * rate) and odd
    feature indices hold cos(pos * rate), where
    rate = 1 / 10000 ** (2 * (i // 2) / d_model).

    Args:
        max_len: Number of positions to precompute. Inputs longer than this
            cannot be encoded because the table is only sliced, never extended.
        d_model: Width of the encoding; it is added to the input, so it has to
            be broadcast-compatible with the input's last dimension.
        **kwargs: Forwarded to `layers.Layer` (e.g. `name`, `dtype`), which also
            lets Keras rebuild the layer from `get_config()`.
    """

    def __init__(self, max_len, d_model, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can describe how to rebuild this layer.
        self.max_len = max_len
        self.d_model = d_model

        # Column vector of positions and row vector of feature indices, so the
        # product below broadcasts to a (max_len, d_model) table.
        pos = tf.range(max_len, dtype=tf.float32)[:, tf.newaxis]
        i = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]

        angle_rates = 1 / tf.pow(10000.0, (2 * (i // 2)) / tf.cast(d_model, tf.float32))
        angle_rads = pos * angle_rates

        # Apply sin to even feature indices and cos to odd feature indices.
        pos_encoding = tf.where(i % 2 == 0, tf.sin(angle_rads), tf.cos(angle_rads))
        # Leading axis of size 1 so the table broadcasts over the batch.
        self.pos_encoding = pos_encoding[tf.newaxis, ...]

    def call(self, x):
        """Return `x` plus the encoding for the first `x.shape[1]` positions."""
        seq_len = tf.shape(x)[1]
        return x + self.pos_encoding[:, :seq_len, :]

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized or cloned."""
        config = super().get_config()
        config.update({"max_len": int(self.max_len), "d_model": int(self.d_model)})
        return config

In [None]:
# def TransformerEncoderClassifier(vocab_size, d_model, num_heads, d_ff, num_layers, num_classes, max_len):
#     inputs = layers.Input(shape=(None,))
#     x = layers.Embedding(vocab_size, d_model)(inputs)
#     x = PositionalEncoding(max_len, d_model)(x)

#     for _ in range(num_layers):
#         x = layers.MultiHeadAttention(num_heads, d_model // num_heads)(x, x)
#         x = layers.LayerNormalization()(x)
#         x_ff = layers.Dense(d_ff, activation='relu')(x)
#         x = layers.Dense(d_model)(x_ff) + x
#         x = layers.LayerNormalization()(x)

#     encoder_output = x
#     x = layers.GlobalAveragePooling1D()(x)
#     logits = layers.Dense(num_classes)(x)

#     return models.Model(inputs=inputs, outputs=[logits, encoder_output])

def TransformerEncoderClassifier(vocab_size, d_model, num_heads, d_ff, num_layers, num_classes, max_len):
    """Build a Keras functional model: embedding, encoder blocks, pooling, classifier.

    Each of the `num_layers` blocks applies self-attention and a two-layer
    feed-forward network; each sub-layer is followed by a residual add and
    then layer normalization.

    Args:
        vocab_size: Size of the token embedding table.
        d_model: Embedding / hidden width.
        num_heads: Number of attention heads; each head uses
            `d_model // num_heads` as its key dimension.
        d_ff: Hidden width of the feed-forward sub-layer.
        num_layers: Number of encoder blocks.
        num_classes: Number of output logits.
        max_len: Number of positions given to `PositionalEncoding`.

    Returns:
        A `models.Model` with two outputs, in this order: the classifier logits
        and the per-token encoder output.
    """
    token_ids = layers.Input(shape=(None,), name="input_tokens")
    hidden = layers.Embedding(vocab_size, d_model, name="token_embedding")(token_ids)
    hidden = PositionalEncoding(max_len, d_model)(hidden)

    for block_idx in range(num_layers):
        # Self-attention sub-layer (query and value are both the current hidden state).
        self_attention = layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=d_model // num_heads,
            name=f"mha_{block_idx}",
        )
        attended = self_attention(hidden, hidden)
        hidden = layers.Add()([hidden, attended])
        hidden = layers.LayerNormalization(name=f"attn_norm_{block_idx}")(hidden)

        # Feed-forward sub-layer: expand to d_ff with ReLU, project back to d_model.
        expanded = layers.Dense(d_ff, activation='relu', name=f"ffn_{block_idx}_1")(hidden)
        projected = layers.Dense(d_model, name=f"ffn_{block_idx}_2")(expanded)
        hidden = layers.Add()([hidden, projected])
        hidden = layers.LayerNormalization(name=f"ffn_norm_{block_idx}")(hidden)

    # Expose the per-token states as a second output next to the logits.
    encoder_output = hidden
    sequence_mean = layers.GlobalAveragePooling1D(name="global_avg_pool")(encoder_output)
    logits = layers.Dense(num_classes, name="classifier_logits")(sequence_mean)

    return models.Model(
        inputs=token_ids,
        outputs=[logits, encoder_output],
        name="TransformerEncoderClassifier",
    )


In [None]:
# Per-token embedding models keyed by (vocab_size, max_len, seed); built lazily, then reused.
_SEQUENCE_EMBEDDING_MODELS = {}


def get_embeddings(vocab, token_id, seed=42):
    """Return the per-token encoder output for a batch of token ids.

    Previously a brand-new, randomly initialized model was built on every call,
    so two calls produced embeddings from unrelated weights and every call paid
    the full model-construction cost. The model is now built once per
    configuration and reused, and the global seed is fixed right before each
    build so that every configuration starts from the same weights.

    NOTE(review): no training or weight loading is visible in this notebook, so
    these are embeddings from an untrained encoder -- confirm that is intended.

    Args:
        vocab: Vocabulary; only `len(vocab)` is used (embedding table size is
            `2 * len(vocab)`).
        token_id: 2-D array of token ids; `token_id.shape[1]` is read to size
            the positional encoding (`max_len = 5 * token_id.shape[1]`).
        seed: Seed applied via `tf.keras.utils.set_random_seed` before a model
            is built. This resets the global Python/NumPy/TensorFlow seeds, and
            only happens when a new configuration is first seen.

    Returns:
        The result of `predict` on the encoder-output head of the model.
    """
    vocab_size = 2 * len(vocab)
    max_len = 5 * token_id.shape[1]
    cache_key = (vocab_size, max_len, seed)

    embedding_model = _SEQUENCE_EMBEDDING_MODELS.get(cache_key)
    if embedding_model is None:
        # Same seed before every build => identical initial weights for every
        # cached model (the weights do not depend on max_len).
        tf.keras.utils.set_random_seed(seed)
        model = TransformerEncoderClassifier(vocab_size=vocab_size, d_model=256, num_heads=4, d_ff=128, num_layers=2, num_classes=2, max_len=max_len)
        embedding_model = tf.keras.Model(
            inputs=model.input,
            outputs=model.output[1]  # Only encoder output
        )
        _SEQUENCE_EMBEDDING_MODELS[cache_key] = embedding_model

    return embedding_model.predict(token_id)

In [None]:
# Mean-pooled embedding models keyed by (max_len, seed); built lazily, then reused.
_POOLED_EMBEDDING_MODELS = {}


def get_embeddings_1(vocab, token_id, seed=42):
    """Return the mean-pooled encoder embedding for a batch of token ids.

    Previously a brand-new, randomly initialized model was built on every call,
    so the resume and job-description embeddings of a pair came from unrelated
    weights and every call paid the full model-construction cost. The model is
    now built once per configuration and reused, and the global seed is fixed
    right before each build so that every configuration starts from the same
    weights.

    NOTE(review): no training or weight loading is visible in this notebook, so
    these are embeddings from an untrained encoder -- confirm that is intended.

    Args:
        vocab: Unused. The embedding table size is hard-coded to 2000.
            NOTE(review): token ids >= 2000 would fall outside that table --
            confirm the vocabulary never exceeds it.
        token_id: 2-D array of token ids; `token_id.shape[1]` is read to size
            the positional encoding (`max_len = 5 * token_id.shape[1]`).
        seed: Seed applied via `tf.keras.utils.set_random_seed` before a model
            is built. This resets the global Python/NumPy/TensorFlow seeds, and
            only happens when a new configuration is first seen.

    Returns:
        The result of `predict` on the "global_avg_pool" layer output.
    """
    max_len = 5 * token_id.shape[1]
    cache_key = (max_len, seed)

    embedding_model = _POOLED_EMBEDDING_MODELS.get(cache_key)
    if embedding_model is None:
        # Same seed before every build => identical initial weights for every
        # cached model (the weights do not depend on max_len).
        tf.keras.utils.set_random_seed(seed)
        model = TransformerEncoderClassifier(
            vocab_size=2000,
            d_model=256,
            num_heads=4,
            d_ff=128,
            num_layers=4,
            num_classes=2,
            max_len=max_len
        )
        embedding_model = tf.keras.Model(
            inputs=model.input,
            outputs=model.get_layer("global_avg_pool").output  # or encoder_output if you want full sequence
        )
        _POOLED_EMBEDDING_MODELS[cache_key] = embedding_model

    return embedding_model.predict(token_id)

In [None]:
# Smoke-test both embedding helpers on the first resume sample.
first_resume_token_ids = lemmatized_resume_text_token_ids[0]
check_1 = get_embeddings(vocab, first_resume_token_ids)
check_2 = get_embeddings_1(vocab, first_resume_token_ids)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 342ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 455ms/step


In [None]:
# check_1[0], check_2

In [None]:
def embeddings(token_list_1, token_list_2):
    """Embed two parallel sequences of token-id samples and pair the results.

    Each sample is embedded with `get_embeddings_1`; `get_embeddings` is the
    alternative helper that returns per-token outputs instead. The
    module-level `vocab` is passed through to the helper.

    Args:
        token_list_1: Sequence of samples (e.g. resumes).
        token_list_2: Sequence of samples (e.g. job descriptions), paired with
            `token_list_1` by position.

    Returns:
        A list of `(embedding_1, embedding_2)` tuples, one per position.

    Raises:
        ValueError: If the two inputs differ in length. Checked up front so a
            mismatch neither drops trailing samples silently nor fails partway
            through an expensive run.
    """
    if len(token_list_1) != len(token_list_2):
        raise ValueError(
            f"token_list_1 and token_list_2 must have the same length, "
            f"got {len(token_list_1)} and {len(token_list_2)}"
        )

    return [
        (get_embeddings_1(vocab, tokens_1), get_embeddings_1(vocab, tokens_2))
        for tokens_1, tokens_2 in zip(token_list_1, token_list_2)
    ]

In [None]:
check = lemmatized_resume_text_token_ids[0:10] # First 10 samples: indices 0-9 (the slice end is exclusive)

In [None]:
# Sanity check: a [0:10] slice should contain 10 samples.
len(check)

10

In [None]:
# Generate embeddings for the first 2500 resume / job-description pairs.
# The slice [0:2500] includes index 0 and excludes index 2500, i.e. indices 0-2499.
coupled_embeddings = embeddings(lemmatized_resume_text_token_ids[0:2500], lemmatized_job_description_token_ids[0:2500])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 240ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 236ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 243ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 250ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 238ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 246ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[

In [None]:
# Number of (resume, job description) embedding pairs produced.
print(len(coupled_embeddings))

2500


In [None]:
# Saving the variable
import pickle

def save_variable(to_be_saved, file_name):
    """Pickle `to_be_saved` into the file at `file_name`, replacing any existing file."""
    # Pickle streams are binary, hence the 'wb' mode.
    with open(file_name, 'wb') as pkl_file:
        pickle.dump(to_be_saved, pkl_file)

In [None]:
def generate_embeddings_and_save(start_index, end_index):
    """Embed samples `start_index` (inclusive) to `end_index` (exclusive) and pickle them.

    Reads the module-level resume and job-description token-id sequences,
    embeds the selected slice with `embeddings`, and writes the result with
    `save_variable`. The file name records the inclusive index range, hence
    the `end_index-1` in it.
    """
    resume_chunk = lemmatized_resume_text_token_ids[start_index:end_index]
    job_description_chunk = lemmatized_job_description_token_ids[start_index:end_index]
    chunk_embeddings = embeddings(resume_chunk, job_description_chunk)

    output_path = f'/content/drive/MyDrive/NLP/NEW_coupled_embeddings_from_{start_index}_{end_index-1}.pkl'
    save_variable(chunk_embeddings, output_path)

In [None]:
# Process the first NUM_CHUNKS * CHUNK_SIZE samples in fixed-size chunks, one pickle per chunk.
CHUNK_SIZE = 500
NUM_CHUNKS = 5
for chunk_start in range(0, NUM_CHUNKS * CHUNK_SIZE, CHUNK_SIZE):
    generate_embeddings_and_save(chunk_start, chunk_start + CHUNK_SIZE)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 468ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 509ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 458ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 461ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 469ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 463ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 439ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 443ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 433ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 441ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 435ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 428ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 424ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 906ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
# save_variable(coupled_embeddings, '/content/drive/MyDrive/NLP/coupled_embeddings_from_0_2499.pkl')

In [None]:
# save_variable(coupled_embeddings, '/content/drive/MyDrive/NLP/coupled_embeddings_from_0_2499.pkl')