In [3]:
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from plot_confustion_matrix import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from collections import Counter
import torch
from scipy.spatial.distance import cosine
from transformers import BertModel, BertTokenizer
from transformers import BertConfig, BertPreTrainedModel


In [10]:
path2data = r"C:\Users\samer\Documents\university\anfänger_praktikum\harambee\ej_formal_shareable.csv"
df = pd.read_csv(path2data)
df = df.drop("Unnamed: 0", axis=1)
df = df[df["job_category"].isin(["Unclassified"]) == False]
df['job_title'] = df.job_title.apply(lambda x: str.lower(x))
bag_of_word_to_remove = ['gsa', 'gsa ','deli ', 'ict ', '2020 ', 'none ', 'none', 'p40', 'p40 ', 'ecd', 'ict']
df = df[df['job_title'].isin(bag_of_word_to_remove) == False]
df.shape

(58984, 3)

In [6]:
config = BertConfig.from_pretrained('bert-base-cased')
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

In [149]:
import torch.nn as nn
HIDDEN_SIZE_BERT = 768
EMBED_SIZE_WORD = 128

class MyModel(BertPreTrainedModel):
    def __init__(self, config, **kwargs):
        """ Using Bert, define custom Model. 
        [*]: check important parts. """
        super(MyModel, self).__init__(config)
        self.bert = BertModel(config)
        
        # recursively load into the BERT submodule the first time you call pre-trained weights. [*]
        self.init_weights()
        
        # customized layer - these layers' wieghts are not initialized.
        self.linear = nn.Linear(kwargs['hidden_size_bert'], kwargs['embed_size_word'])

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """ forward step of BERT and pass customed layers.
        input_ids: prerequesite: """
        # pdb.set_trace() # debugging
        #hiddens, pooled, hiddens_all
        output = self.bert(input_ids, 
                            attention_mask=attention_mask
                            ) #token_type_ids=token_type_ids
        out = self.linear(output[0])
        return out#out, hiddens, hiddens_all # [B, T, D]

BERT_MODEL_NAME = 'bert-base-cased'
model = MyModel.from_pretrained(
    pretrained_model_name_or_path=BERT_MODEL_NAME, 
    config=config, 
    hidden_size_bert=HIDDEN_SIZE_BERT,
    embed_size_word=EMBED_SIZE_WORD)

Some weights of the model checkpoint at bert-base-cased were not used when initializing MyModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing MyModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MyModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MyModel were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['linear.weight', 'linear.bias']
You should pr

In [39]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME, do_lower_case=True)

#tk_jobs = []
input_ids = []
attention_mask = []
jobs = df.job_title.tolist()
for j in jobs:
    encode = tokenizer.encode_plus(
                        j,
                        None,
                        add_special_tokens=True,
                        max_length= 128,
                        padding = 'max_length',
                        return_token_type_ids= False,
                        return_attention_mask= True,
                        truncation=True,
                        return_tensors = 'pt'      
                                       )
    
    input_ids.append(torch.tensor(encode['input_ids']))
    attention_mask.append(torch.tensor(encode['attention_mask']))

  input_ids.append(torch.tensor(encode['input_ids']))
  attention_mask.append(torch.tensor(encode['attention_mask']))


In [46]:
encode

{'input_ids': tensor([[  101, 12102,  5052,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [40]:
type(attention_mask[0]), type(input_ids[0])

(torch.Tensor, torch.Tensor)

In [29]:
# Define a new example sentence with multiple meanings of the word "bank"
text = "After stealing money from the bank vault, the bank robber was seen " \
       "fishing on the Mississippi river bank."

# Map the token strings to their vocabulary indeces.
tokens = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
input_ids_exp = tokenizer.build_inputs_with_special_tokens(tokens)

# optional - discriminate sentence A or B.
token_type_ids = tokenizer.create_token_type_ids_from_sequences(tokens)
assert len(token_type_ids) == len(input_ids_exp), "single sentence token tpye ids does not matched."

# Convert inputs to PyTorch tensors
input_ids_exp = torch.tensor([input_ids_exp])
token_type_ids_exp = torch.tensor([token_type_ids]) + 1 # becomes all 1
# input_ids, token_type_ids

In [54]:
torch.tensor([[0]*128]).shape, attention_mask[0].shape

(torch.Size([1, 128]), torch.Size([1, 128]))

In [36]:
input_ids[0]

101

In [44]:
input_ids_exp.shape#,
input_ids[0].shape

torch.Size([1, 27])

In [17]:
input_ids[0].shape , attention_mask[0].shape 

(torch.Size([1, 128]), torch.Size([1, 128]))

In [150]:
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
model.eval()
embeddings = []
ss1 = []
for job in range(100):
    with torch.no_grad():

        output = model(input_ids=input_ids[job]) #, attention_mask=attention_mask[job], token_type_ids=torch.tensor([[0]*128])

        # Evaluating the model will return a different number of objects based on 
        # how it's  configured in the `from_pretrained` call earlier. In this case, 
        # becase we set `output_hidden_states = True`, the third item will be the 
        # hidden states from all layers. See the documentation for more details:
        # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
#        hidden_state = output[0]
#        embeddings.append(embed)
        ss1.append(output)

In [133]:
premuted_token_embeddings = ss[0].permute(1,0,2)

embeddings_array = []
for token in premuted_token_embeddings:    
    sum_vec = torch.sum(token[-1:], dim=0)
    embeddings_array.append(sum_vec)## embedding

NameError: name 'premuted_token_embeddings' is not defined

In [100]:
arr = np.array(ss[0])
arr.shape
#arr = np.array([])
for em in ss[1:]:
    arr = np.vstack((arr,em))
arr.shape
arr = torch.tensor(arr).permute(1,0,2)

# Stores the token vectors, with shape [22 x 3,072]
embed_words = []
for embed in arr:
    # option 1
#     vec = torch.cat([x for x in embed[-4:]], dim=0)
#     embed_words.append(vec)
    # option 2
    vec = torch.sum(embed[-4:], dim=0)
    embed_words.append(vec)

(1, 128, 300)

In [115]:
arr.shape

torch.Size([128, 100, 300])

In [128]:
v_temp = []
permuted = ss[0].permute(1,0,2)[1:]
for token in ss[0]:    
    sum_vec = torch.sum(token[-1:], dim=0)
    v_temp.append(sum_vec)

In [141]:
ss[0].shape
permute_ = ss[0].permute(1,0,2)
embeddings_array = []
for token in permute_:    
    sum_vec = torch.sum(token[-1:], dim=0)
    embeddings_array.append(sum_vec)

In [152]:
jobs[4]
ss1[4].shape


torch.Size([1, 128, 128])

In [113]:
for word in text:
    words = tokenizer.tokenize(text)
    print(f"tokenized text: {words}")

    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs[0]
    premuted_token_embeddings = last_hidden_states.permute(1,0,2)

    embeddings_array = []
    for token in premuted_token_embeddings:    
        sum_vec = torch.sum(token[-1:], dim=0)
        embeddings_array.append(sum_vec)

tensor([ 1.0942, -0.7470, -0.4808, -1.1342, -0.3187,  0.5898, -1.1278, -0.5733,
        -2.9823, -0.3715,  0.1594, -0.8888, -1.3297,  1.5315, -0.5492,  0.1243,
        -0.6405, -0.8386, -0.1274, -0.3215,  0.8914, -0.0868, -1.1806,  1.9957,
        -0.6228, -1.0396, -2.4755, -0.8926,  0.1639,  2.0686,  0.9769,  1.7476,
        -1.9696,  0.0380, -0.3319,  1.1865, -2.0985, -2.3394, -1.1130, -0.4222,
        -1.7908,  1.6347,  1.3268,  0.8771,  0.0572, -0.6786,  0.4012, -0.8841,
         0.4252,  1.0750, -0.1496,  0.4367, -0.3810,  1.4738,  0.0243, -1.2522,
        -1.4610,  0.6421, -0.0965, -0.7510, -2.0862, -0.0526, -1.8559,  2.9231,
         2.2449, -0.0099, -1.4532, -1.4022, -1.4259,  1.4055, -1.7303, -0.0753,
        -0.0910, -1.1743,  1.7392, -1.1135,  0.2755,  1.9611, -0.8392,  1.1992,
         1.5793, -1.2127,  1.7677, -2.4366,  0.1138,  0.5713,  1.1973, -0.8328,
         0.4585,  1.9672,  1.7817,  1.3759,  2.3485,  0.6467, -0.3440, -1.1367,
         1.5754, -1.2732,  0.1950, -0.27

In [None]:
list_of_text = [] 
with open(filename) as file:
    for line in file:
        list_of_text.append(line.rstrip())