In [1]:
import random
import pandas as pd
import numpy as np
from transformers import BertModel, AutoTokenizer
import transformers
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import math

## Read Data

In [2]:
# read the data frame with two columns: title, positive_skill
df = pd.read_csv("v1401_1_4_train.csv")
df[['InputConcat']] = df[['InputConcat']].astype(str)
df_split = df['InputConcat'].str.split('<h>', expand=True)
dfIC = pd.concat([df, df_split], axis=1)
df_ = dfIC[[0, 2]]
df_raw = df_.rename(columns={0:'job_title',2:'positive_skill'})
df_raw = df_raw[~df_raw.apply(lambda x: x.str.strip() == '', axis=1).any(axis=1)]
df_raw.sample()

Unnamed: 0,job_title,positive_skill
7638,insurance risk consultant,"advise on risk management, analyse financial ..."


In [343]:
df_raw.merge(df_raw['positive_skill'].str.strip().str.split(', ').explode(), left_index=True, right_index=True).head(10)

Unnamed: 0,job_title,positive_skill_x,positive_skill_y
0,picture editor,"meet deadlines, adapt to type of media, perfo...",meet deadlines
0,picture editor,"meet deadlines, adapt to type of media, perfo...",adapt to type of media
0,picture editor,"meet deadlines, adapt to type of media, perfo...",perform image editing
0,picture editor,"meet deadlines, adapt to type of media, perfo...",edit photographs
0,picture editor,"meet deadlines, adapt to type of media, perfo...",negotiate exploitation rights
0,picture editor,"meet deadlines, adapt to type of media, perfo...",follow ethical code of conduct of journalists
0,picture editor,"meet deadlines, adapt to type of media, perfo...",supervise staff
0,picture editor,"meet deadlines, adapt to type of media, perfo...",select photos
0,picture editor,"meet deadlines, adapt to type of media, perfo...",consult with editor
0,picture editor,"meet deadlines, adapt to type of media, perfo...",edit negatives


## Dataset Clas

In [337]:
class JobTitleDataset(Dataset):
    def __init__(self, data_frame, tokenizer, max_token_len, K):
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        self.K = K
        self.training_pairs = self._prepare_training_data(data_frame)
    
    def __len__(self):
        return len(self.training_pairs)
    
    def __getitem__(self, index):
        job_title, positive_skill = self.training_pairs[index]['job_title'], self.training_pairs[index]['positive_skill']

        job_title_encoding = self.tokenizer.encode_plus(
          job_title,
          add_special_tokens=False,
          max_length=self.max_token_len,
          return_token_type_ids=False,
          padding="max_length",
          truncation=True,
          return_attention_mask=True,
          return_tensors='pt',
        )

        positive_skill_IDs = self.skill_to_id[positive_skill]

        random_negatives = self.skill_frequencies.sample(n=self.K, weights=self.skill_frequencies.values).index.tolist()
        negative_skills_IDs = [self.skill_to_id[item] for item in random_negatives]

        return dict(
          input_job_title=job_title_encoding["input_ids"].flatten(),
          positive_skill=torch.tensor(positive_skill_IDs),#.unsqueeze(0),
          negative_skills=torch.tensor(negative_skills_IDs),#.unsqueeze(0)
                  )

    def _prepare_training_data(self, df):
        all_skills_series = df['positive_skill'].str.strip().str.split(', ').explode()
        all_skills_series = all_skills_series.apply(lambda x: x.strip())
        all_unique_skills = all_skills_series.unique().tolist()
        self.skill_frequencies = all_skills_series.value_counts(normalize=True) ** 0.75
        self.num_of_unique_skills = len(all_unique_skills)
        self.skill_to_id = {skill: i for i, skill in enumerate(all_unique_skills)}

        df_ = df.merge(all_skills_series, left_index=True, right_index=True)
        df_new = df_.rename(columns={'positive_skill_y':'positive_skill'})
        df_new.drop(columns=['positive_skill_x'], inplace=True)
        df_new = df_new.sample(frac=1).reset_index(drop=True)
        data_dict = df_new[['job_title', 'positive_skill']].to_dict('records')
        return data_dict

train, validate = np.split(df_raw.sample(frac=1, random_state=42), [int(.8*len(df_raw))])
train.shape, validate.shape


((41449, 2), (10363, 2))

In [318]:
max([len(item['input_job_title'].nonzero()) for item in ds])

KeyboardInterrupt: 

In [4]:
# all_skills_series = df['positive_skill'].str.strip().str.split(', ').explode()
# all_skills_series = all_skills_series.apply(lambda x: x.strip())
# all_unique_skills = all_skills_series.unique().tolist()
# skill_frequencies = all_skills_series.value_counts(normalize=True) ** 0.75

# skill_to_id = {skill: i for i, skill in enumerate(all_unique_skills)}

# df_ = df.merge(all_skills_series, left_index=True, right_index=True)
# df_new = df_.rename(columns={'positive_skill_y':'positive_skill'})
# df_new.drop(columns=['positive_skill_x'], inplace=True)
# df_new = df_new.sample(frac=1).reset_index(drop=True)
# df_new['positive_skill_IDs'] = df_new['positive_skill'].apply(lambda x: skill_to_id[x])
# data_dict = df_new[['job_title', 'positive_skill_IDs']].to_dict('records')

In [372]:
training_pairs, validation_pairs, all_skills_series = get_training_pairs(df_raw, 100)
print(f'Train Size = {len(training_pairs)}, Validation Size = {len(validation_pairs)}')

Train Size = 100, Validation Size = 20


In [7]:
ds = JobTitleDataset(df_raw, tokenizer, max_token_len=16, K=5)
train_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True)

## Model Creation

In [370]:
class Jobbert_neg_sampling(nn.Module):

    def __init__(self, embedding_size, vocab_size):
        super(Jobbert_neg_sampling, self).__init__()

        self.bert_model = BertModel.from_pretrained("bert-base-uncased")
        self.embeddings_context = nn.Embedding(vocab_size, embedding_size) # torch zeros

        # # Initialize the gating layer
        self.gating = nn.Linear(self.bert_model.config.hidden_size, 1)

        # # Initialize the layers   
        self.mlp_layers = nn.Sequential(
                        nn.Linear(self.bert_model.config.hidden_size, 768),
                        nn.ReLU(),
                        nn.Linear(768, 300)
                        )
        
        self.criteron = nn.NLLLoss()

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            if module.weight.shape[1] == 1:
                nn.init.uniform_(module.weight, -math.sqrt(1/768), math.sqrt(1/768))
                nn.init.uniform_(module.bias, -1, 1)
            else:
                nn.init.uniform_(module.weight, -math.sqrt(1/768), math.sqrt(1/768))
                nn.init.uniform_(module.bias, -math.sqrt(1/300), math.sqrt(1/300))


    def forward(self, data):
        debug =  not True

        t = data['input_job_title']
        u_j = self.embeddings_context(data['positive_skill'])
        u_k = self.embeddings_context(data['negative_skills'])

        if debug:
            print('u_j.shape: ', u_j.shape)
            print('u_k.shape: ', u_k.shape) 

        # # Get the BERT embeddings
        bert_output = self.bert_model(t)['last_hidden_state']
        # # Get the gating scores
        x = self.gating(bert_output).sigmoid()
        # # Get the averaged sum of the gated embeddings
        det = x.sum(dim=1)
        gating_scores = x / det.unsqueeze(dim=-1)
        # # Get the input embeddings
        input_embs = gating_scores * bert_output
        input_embs = input_embs.sum(dim=1)
        input_embs = self.mlp_layers(input_embs)
        if debug:print('input_embs.shape: ', input_embs.shape)
       
       # Positive samples
        emb_context = torch.mul(u_j, input_embs)
        if debug:print('pos_matx.shape: ', emb_context.shape)

        emb_product = torch.sum(emb_context, dim=1)          # bs
        if debug:print('emb_product.shape: ', emb_product.shape)

        out_loss = F.logsigmoid(emb_product)                      # bs
        if debug:print('out_loss.shape: ', out_loss.shape)

        # Negative samples
        if debug:print('input_embs.unsqueeze(2).shape: ', input_embs.unsqueeze(2).shape)
        emb_product_neg_samples = torch.bmm(u_k.neg(), input_embs.unsqueeze(2))
        if debug:print('emb_product_neg_samples.shape: ', emb_product_neg_samples.shape)
        noise_loss = F.logsigmoid(emb_product_neg_samples).squeeze(2).sum(1)
        if debug:print('noise_loss.shape: ', noise_loss.shape)

        if debug:print('*'*100)

        total_loss = out_loss + noise_loss
        if debug:print('total_loss: ', total_loss)

        mean_loss = -(out_loss + noise_loss).mean()
        if debug:print('mean_loss: ', mean_loss)

        # with loss function ???
        #nll_loss = self.criteron(emb_product, emb_product_neg_samples.squeeze(2).sum(1).long())
        #if debug:print("nll_loss: ", nll_loss)

        return mean_loss
    

model = Jobbert_neg_sampling(embedding_size=300, vocab_size=num_of_unique_skills)
data_row = next(iter(train_loader))
model(data_row)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor(5.1834, grad_fn=<NegBackward0>)

In [373]:
import torch
import torch.optim as optim
batch_size = 8
num_epochs = 3

# get training parameters
num_of_unique_skills = len(all_skills_series.unique().tolist())

# Initialize the model
model = Jobbert_neg_sampling(embedding_size=300, vocab_size=num_of_unique_skills)


# Initialize the data loader
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

df_train, df_validate = np.split(df_raw.sample(frac=1, random_state=42), [int(.8*len(df_raw))])


train_dataset = JobTitleDataset(df_train, tokenizer, max_token_len=16, K=5)
train_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True)

val_dataset = JobTitleDataset(df_validate, tokenizer, max_token_len=16, K=5)
val_loader = torch.utils.data.DataLoader(ds, batch_size=batch_size, shuffle=True)


# Initialize the optimizers
optimizer_gating_mlp = optim.SGD(model.mlp_layers.parameters(), lr=0.05)
optimizer_context_matrix = optim.Adagrad(model.embeddings_context.parameters(), lr=0.01)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [377]:
# define your training loop
for epoch in range(num_epochs):
    total_loss = 0
    for i, input in enumerate(train_loader):
        # clear gradients
        optimizer_gating_mlp.zero_grad()
        optimizer_context_matrix.zero_grad()

        # forward pass
        loss = model(input)
        print('J_T loss: ', loss)

        # backward pass
        loss.backward()

        # update weights
        optimizer_gating_mlp.step()
        optimizer_context_matrix.step()

        total_loss += loss.item()

        #EVALUATION
        # res = J_T.mean(dim=1) @ skill_emb.float().T
        # res.shape



    print(f"Epoch {epoch}: loss={total_loss}")

J_T loss:  tensor(21.9575, grad_fn=<NegBackward0>)
J_T loss:  tensor(7.5741, grad_fn=<NegBackward0>)
J_T loss:  tensor(7.0884, grad_fn=<NegBackward0>)
J_T loss:  tensor(5.4644, grad_fn=<NegBackward0>)
J_T loss:  tensor(4.6789, grad_fn=<NegBackward0>)
J_T loss:  tensor(4.2801, grad_fn=<NegBackward0>)
J_T loss:  tensor(4.4234, grad_fn=<NegBackward0>)
J_T loss:  tensor(5.6373, grad_fn=<NegBackward0>)


KeyboardInterrupt: 

torch.Size([8, 8])

In [149]:
bert_model = BertModel.from_pretrained("bert-base-uncased")
embeddings_context = nn.Embedding(300, num_of_unique_skills)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [33]:
data_row['positive_skill']


tensor([[ 3268],
        [ 7508],
        [20356],
        [10456],
        [10693],
        [ 1047],
        [ 2726],
        [20504]])

In [16]:
 predictions = model(next(iter(train_loader)))
 predictions

tensor([[[-1.3088, -1.4271, -1.3817,  ..., -1.3544, -1.3891, -1.3470],
         [-1.3101, -1.4231, -1.3818,  ..., -1.3575, -1.3892, -1.3482],
         [-1.3091, -1.4254, -1.3815,  ..., -1.3578, -1.3890, -1.3499],
         ...,
         [-1.3070, -1.4166, -1.3815,  ..., -1.3611, -1.3890, -1.3477],
         [-1.3082, -1.4170, -1.3813,  ..., -1.3596, -1.3891, -1.3478],
         [-1.3073, -1.4164, -1.3814,  ..., -1.3576, -1.3891, -1.3490]],

        [[-1.4138, -1.3668, -1.4478,  ..., -1.3653, -1.3901, -1.3493],
         [-1.4146, -1.3692, -1.4492,  ..., -1.3683, -1.3901, -1.3508],
         [-1.4150, -1.3720, -1.4478,  ..., -1.3697, -1.3898, -1.3491],
         ...,
         [-1.4146, -1.3720, -1.4509,  ..., -1.3701, -1.3899, -1.3497],
         [-1.4142, -1.3718, -1.4526,  ..., -1.3689, -1.3900, -1.3496],
         [-1.4146, -1.3719, -1.4521,  ..., -1.3677, -1.3901, -1.3508]],

        [[-1.3986, -1.3286, -1.3754,  ..., -1.3879, -1.3751, -1.3770],
         [-1.3990, -1.3391, -1.3754,  ..., -1

In [45]:
 predictions = model(next(iter(data_loader)))

# Compute the MRR across all queries
mrr = 0
num_queries = 0
for query in predictions:
    print(query.shape)
    # Sort the predicted scores for this query in descending order
    ranked_scores, ranked_indices = torch.sort(query, descending=True)

torch.Size([16, 300])
torch.Size([16, 300])
torch.Size([16, 300])
torch.Size([16, 300])
torch.Size([16, 300])
torch.Size([16, 300])
torch.Size([16, 300])
torch.Size([16, 300])


In [38]:
predictions.shape

torch.Size([8, 16, 300])

In [73]:
def compute_loss(model, validation_set):
    # Evaluate the model on the validation set and get the predictions
    predictions = model(validation_set)

    # Compute the MRR across all queries
    mrr = 0
    num_queries = 0
    for query in predictions:
        # Sort the predicted scores for this query in descending order
        ranked_scores, ranked_indices = torch.sort(query, descending=True)

        # Find the rank of the first correct prediction (i.e., the reciprocal rank)
        correct_index = ranked_indices == 0
        reciprocal_rank = 1 / (correct_index.nonzero(as_tuple=True)[0][0] + 1)

        # Add the reciprocal rank to the total MRR and increment the query count
        mrr += reciprocal_rank
        num_queries += 1

    # Compute the mean MRR across all queries
    mean_mrr = mrr / num_queries

    # Return the negative mean MRR as the loss
    return -mean_mrr


In [74]:
 for i, input in enumerate(val_loader):
    calc_mrr = compute_loss(model, input)
    print(calc_mrr)

tensor(-1.)
tensor(-1.)
tensor(-1.)
tensor(-1.)
tensor(-1.)
tensor(-1.)
tensor(-1.)
tensor(-1.)


KeyboardInterrupt: 

In [205]:
for epoch in range(5):
    for batch in data_loader:
        # Zero the gradients for all parameters
        optimizer_gating_mechanism.zero_grad()
        optimizer_mlp_weights.zero_grad()
        optimizer_context_matrix.zero_grad()

        # Forward pass
        output = model(batch)
        loss = compute_loss(output)

        # Backward pass
        loss.backward()

        # Update parameters
        optimizer_gating_mechanism.step()
        optimizer_mlp_weights.step()
        optimizer_context_matrix.step()


data.keys():  dict_keys(['input_job_title', 'positive_skill', 'negative_skills'])
input_embs.shape:  torch.Size([2, 16, 300])
u_j.shape:  torch.Size([2, 1, 300])
u_k.shape:  torch.Size([2, 1, 5, 300])


NameError: name 'compute_loss' is not defined

In [274]:
input_embs = torch.rand(32, 16, 300)
u_j = torch.rand(32, 1, 300)
u_k = torch.rand(32, 1, 5, 300)

pos_mult = torch.mul(u_j, input_embs)

aux_embs = input_embs.unsqueeze(2).expand(-1, -1, 5, -1)
a = (u_k * aux_embs).sum(dim=2).neg()

J_T = F.logsigmoid(pos_mult) + F.logsigmoid(a)
J_T.shape

torch.Size([32, 16, 300])

In [267]:
neg_mul.shape

torch.Size([2, 16, 300])

In [271]:
a = (u_k * aux_embs).sum(dim=2).neg()
a.shape

torch.Size([32, 16, 300])

### Get DataLoader

In [13]:
## Example Usage
neg_samples = torch.multinomial(noise_dist, K * len(df), replacement=True)
print(neg_samples.shape)
neg_samples = neg_samples.view(len(df), K)
print(neg_samples.shape)
neg_samples

torch.Size([5000])
torch.Size([1000, 5])


tensor([[ 902,   68, 6990, 6700, 2353],
        [ 365, 3593,  860,  102, 6111],
        [3620,  479, 5013, 1882,  853],
        ...,
        [2831,   91, 5666, 5576, 1020],
        [  44, 3050,  850,  930, 1240],
        [ 199, 2204, 3603, 5050, 1339]])

In [51]:
def convert_skills_to_ids(all_unique_skills):
    """
    Convert the skills to ids.
    """
    skill_to_id = {skill: i for i, skill in enumerate(all_unique_skills)}
    id_to_skill = {i: skill for i, skill in enumerate(all_unique_skills)}
    return skill_to_id, id_to_skill

skill_to_id, id_to_skill = convert_skills_to_ids(all_unique_skills)

In [56]:
df_ = df.merge(all_skills_series, left_index=True, right_index=True)
df_new = df_.rename(columns={'positive_skill_y':'positive_skill'})
df_new.drop(columns=['positive_skill_x'], inplace=True)
df_new = df_new.sample(frac=1).reset_index(drop=True)

df_new['positive_skill_IDs'] = df_new['positive_skill'].apply(lambda x: skill_to_id[x])
df_new

Unnamed: 0,job_title,positive_skill,positive_skill_IDs
0,building cleaner,maintain cleaning equipment,658
1,power plant manager,manage budgets,90
2,Branch Manager,effective sales,2847
3,energy systems engineer,determine appropriate heating and cooling system,4551
4,Brand Manager,Organización de eventos,1007
...,...,...,...
14083,Project Engineer,CANoe,2790
14084,Web Developer,then we would love to hear from you. \n \n Jo...,1860
14085,Sales Engineer,VPN,1296
14086,integrated circuit design engineer,electronic equipment standards,2515


In [72]:
skill_frequencies = all_skills_series.value_counts(normalize=True) ** 0.75
random_negatives = skill_frequencies.sample(n=K, weights=skill_frequencies.values).index.tolist()
random_negatives

[skill_to_id[item] for item in random_negatives]

[2668, 1519, 190, 4273, 2238]