### Loading Bert fine-tuned Model

In [15]:
from pathlib import Path
import os
import torch
from transformers import pipeline, BertTokenizerFast, BertForTokenClassification, BertModel

os.environ["TOKENIZERS_PARALLELISM"] = "true"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_PATH = str(Path("./out_new3").absolute())
TOKENIZER = BertTokenizerFast.from_pretrained(MODEL_PATH)
MODEL = BertForTokenClassification.from_pretrained(MODEL_PATH)
MODEL.eval()
MODEL_BERT = BertModel.from_pretrained(MODEL_PATH, output_hidden_states = True)
MODEL_BERT.eval()
pipe = pipeline("token-classification", model=MODEL, tokenizer=TOKENIZER)

LABELS_MAP = dict(LABEL_0='B', LABEL_1='I', LABEL_2='O')

Some weights of the model checkpoint at C:\Users\karam\PycharmProjects\GermanJobSkillExtractor\out_new3 were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\karam\PycharmProjects\GermanJobSkillExtractor\out_new3 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Calc ESCO embeddings

In [16]:
def get_embedding(text: str):
    encoded = TOKENIZER.encode_plus(
                text,
                return_tensors="pt",
                truncation=True,
                ).to(DEVICE)
    with torch.no_grad():
        output = MODEL_BERT(**encoded)
        return torch.mean(output.last_hidden_state.squeeze(), dim=0).numpy()

In [17]:
import joblib
from tqdm import tqdm
import numpy as np

JOBS_EMB_FILE = Path("SKILLS_DE_EMD.joblib")

if JOBS_EMB_FILE.exists():
    JOBS_EMB = joblib.load(JOBS_EMB_FILE)
else:
    JOBS_EMB = dict()
    with open("skills_de.txt", "rt", encoding="utf8") as f:
        for line in tqdm(list(f)[:100]):
            span = line.split("\t")[0]
            emb = get_embedding(span) 
            JOBS_EMB[span] = emb
    joblib.dump(JOBS_EMB, JOBS_EMB_FILE, compress=5)

ESCO_JOB_SPANS = list(JOBS_EMB.keys())
ESCO_JOB_SPANS_EMB = np.array(list(JOBS_EMB.values()))

### Load Files to label

In [20]:
SEEN_CACHE = dict()

def get_models_span_embedding(text: str):
    key = text.lower()
    if key in SEEN_CACHE:
        return SEEN_CACHE[key]
    emb = get_embedding(text)
    SEEN_CACHE[key] = emb
    return emb

In [21]:
from itertools import groupby
import re
from operator import itemgetter, attrgetter
from sklearn.metrics.pairwise import cosine_similarity

# Map label 0 and 1 to same label for grouping 
LABELS_MAP = dict(LABEL_0='J', LABEL_1='J', LABEL_2='O')
ESCO_LABEL_SIM_THR = .8

DOCS_SPANS = []

files = Path("./tagedFiles").glob("*.txt")
for file in tqdm(list(files)):
    spans = []
    lines = file.read_text().split("\n")
    # remove labels
    lines = [l.replace("<<*>>", "") for l in lines]
    for line in tqdm(lines, leave=False):
        line = re.sub(r"\s+", line, " ", flags=re.DOTALL | re.IGNORECASE | re.MULTILINE)
        # check exact match of esco
        for esco_span in ESCO_JOB_SPANS:
            if esco_span in line:
                spans.append(esco_span)
        
        # use model to find job spans
        labels = ({**la, "entity":LABELS_MAP[la["entity"]]} for la in pipe(line))
        for l, tokens in groupby(labels, key=itemgetter("entity")):
            if l == "O": continue
            tokens = list(tokens)
            start = tokens[0]["start"]
            end = tokens[-1]["end"]

            span = line[start:end]

            emb = get_models_span_embedding(span)
            sims = cosine_similarity([emb], ESCO_JOB_SPANS_EMB).squeeze()
            max_idx = np.argmax(sims)
            
            if sims[max_idx] >= ESCO_LABEL_SIM_THR:
                # print("esco replace:", span, " ==> ", ESCO_JOB_SPANS[max_idx])
                normed_span = ESCO_JOB_SPANS[max_idx]
                spans.append(normed_span)
        
    DOCS_SPANS.append(spans)


0it [00:00, ?it/s]


#### Select subset of ESCO

In [22]:
from itertools import chain

SELECTED_ESCO_SPANS = set(chain.from_iterable(DOCS_SPANS))

with open("./esco_skills_de.txt", "rt") as f:
    SELECTED_ESCO_SPANS = SELECTED_ESCO_SPANS | set(l.strip() for l in f.readlines())
    
SELECTED_ESCO_SPANS = list(SELECTED_ESCO_SPANS)
len(SELECTED_ESCO_SPANS)

13891

### Step 2

In [23]:
from collections import defaultdict

SPANS_DOCS_COUNT = defaultdict(int)
for doc in DOCS_SPANS:
    for span in set(doc):
        SPANS_DOCS_COUNT[span] += 1

In [24]:
from sklearn.metrics import pairwise_distances

EPS = np.finfo(np.float64).eps

def calc_cbm_score(t1_idx, t2_idx) -> float:
    t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
    t2 = SELECTED_ESCO_SPANS[int(t2_idx)]

    t1_count = SPANS_DOCS_COUNT[t1]
    t2_count = SPANS_DOCS_COUNT[t2]
    t1t2_count = 0
    for doc in DOCS_SPANS:
        if t1 in doc and t2 in doc:
            t1t2_count += 1
    
    if t1_count == 0 or t2_count == 0:
        return 0.
    elif t1_count == t2_count and t1_count == t1t2_count:
        return 1.
    else:
        # print(t1_count, t2_count, t1t2_count, np.log10(t1_count + 1), np.log10(t2_count + 1), np.power(np.log10(t1t2_count) + EPS, 2))
        # np.power(np.log10(t1t2_count), 2)
        return np.power(np.log10(t1t2_count + 1), 2)/(np.log10(t1_count + 1.) * np.log10(t2_count + 1.))

CBM_SCORES = pairwise_distances(np.arange(len(SELECTED_ESCO_SPANS)).reshape((-1, 1)), metric=calc_cbm_score, n_jobs=-1)
CBM_SCORES
# calc_cbm_score(0, 1)

  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]


In [26]:
from sklearn.metrics import pairwise_distances
import numpy as np
from tqdm import tqdm
import os

EPS = np.finfo(np.float64).eps

def calc_cbm_score(t1_idx, t2_idx) -> float:
    t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
    t2 = SELECTED_ESCO_SPANS[int(t2_idx)]

    t1_count = SPANS_DOCS_COUNT[t1]
    t2_count = SPANS_DOCS_COUNT[t2]
    t1t2_count = 0
    for doc in DOCS_SPANS:
        if t1 in doc and t2 in doc:
            t1t2_count += 1
    
    if t1_count == 0 or t2_count == 0:
        return 0.
    elif t1_count == t2_count and t1_count == t1t2_count:
        return 1.
    else:
        return np.power(np.log10(t1t2_count + 1), 2) / (np.log10(t1_count + 1.) * np.log10(t2_count + 1.))

# Define chunk size
CHUNK_SIZE = 100
N = len(SELECTED_ESCO_SPANS)
NUM_CHUNKS = (N + CHUNK_SIZE - 1) // CHUNK_SIZE

# Counter to keep track of progress
counter = 0

# Check if counter exists and load it
if os.path.exists('counter.npy'):
    counter = np.load('counter.npy')

# Compute in chunks
for idx in tqdm(range(counter, NUM_CHUNKS)):
    start_col = idx * CHUNK_SIZE
    end_col = min(start_col + CHUNK_SIZE, N)
    
    CBM_SCORES_chunk = pairwise_distances(
        np.arange(N).reshape((-1, 1)),
        np.arange(start_col, end_col).reshape((-1, 1)),
        metric=calc_cbm_score,
        n_jobs=32
    )
    
    # Save the chunk
    np.save(f'CBM_SCORES_chunk_{idx}.npy', CBM_SCORES_chunk)
    
    # Update the counter
    counter = idx + 1
    np.save('counter.npy', counter)

# Concatenate all the chunks to form the full matrix
CBM_SCORES = np.concatenate([np.load(f'CBM_SCORES_chunk_{i}.npy') for i in range(NUM_CHUNKS)], axis=1)


  t2 = SELECTED_ESCO_SPANS[int(t2_idx)]
  t1 = SELECTED_ESCO_SPANS[int(t1_idx)]
0it [00:00, ?it/s]


In [243]:
from sklearn.metrics.pairwise import cosine_similarity

COS_SIMS = cosine_similarity(list(get_embedding(span) for span in SELECTED_ESCO_SPANS))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import os

# Function to get the embedding of a span
# You should already have this function defined in your code
# def get_embedding(span):
#     ...

CHUNK_SIZE = 100
N = len(SELECTED_ESCO_SPANS)
NUM_CHUNKS = (N + CHUNK_SIZE - 1) // CHUNK_SIZE

# Counter to keep track of progress
counter = 0

# Check if counter exists and load it
if os.path.exists('counter.npy'):
    counter = np.load('counter.npy')

# Precompute all embeddings
all_embeddings = [get_embedding(span) for span in SELECTED_ESCO_SPANS]

# Compute in chunks
for idx in tqdm(range(counter, NUM_CHUNKS)):
    start_col = idx * CHUNK_SIZE
    end_col = min(start_col + CHUNK_SIZE, N)
    
    COS_SIMS_chunk = cosine_similarity(all_embeddings, all_embeddings[start_col:end_col])
    
    # Save the chunk
    np.save(f'COS_SIMS_chunk_{idx}.npy', COS_SIMS_chunk)
    
    # Update the counter
    counter = idx + 1
    np.save('counter.npy', counter)

# Concatenate all the chunks to form the full matrix
COS_SIMS = np.concatenate([np.load(f'COS_SIMS_chunk_{i}.npy') for i in range(NUM_CHUNKS)], axis=1)


In [31]:
CBM_SCORES = np.load('.\CBM_Score.npy')
COS_SIMS = np.load('.\CosineSimilarityMatrix.npy')


ValueError: operands could not be broadcast together with shapes (13999,13999) (13891,13891) 

In [34]:
print('CBM',CBM_SCORES.shape)
print('Cos_sim',COS_SIMS.shape)

a = 0.4
# COMPARE_SCORE = a*CBM_SCORES + (1-a)COS_SIMS
# CBM_SCORES+COS_SIMS

CBM (13999, 13999)
Cos_sim (13891, 13891)


In [None]:
import pickle, gzip

with gzip.open("searchModel.pickle.gz", "wb", compresslevel=5) as zf:
    pickle.dump({
        "scores": COMPARE_SCORE,
        "spans": SELECTED_ESCO_SPANS
    }, zf)

### Step 4

In [132]:
input_text = "Monitoring"

In [244]:
import Levenshtein

MIN_LEVEN_SIM_SCORE = .1

if input_text in SELECTED_ESCO_SPANS:
    # Exact match:
    pass
else:
    most_similar_esco_spans = [
        (idx, Levenshtein.ratio(input_text, span, processor=lambda t: t.lower(), score_cutoff=MIN_LEVEN_SIM_SCORE))
        for idx, span in enumerate(SELECTED_ESCO_SPANS)
    ]
    most_similar_esco_spans = filter(lambda p: p[1] > 0.0, most_similar_esco_spans)
    most_similar_esco_spans = list(sorted(most_similar_esco_spans, key=itemgetter(1), reverse=True))[:5]


selected_span_near_to_input = SELECTED_ESCO_SPANS[most_similar_esco_spans[0][0]]
selected_span_near_to_input, most_similar_esco_spans

('colour grading',
 [(32, 0.5),
  (11, 0.4),
  (33, 0.3783783783783784),
  (8, 0.36363636363636365),
  (5, 0.33333333333333337)])

In [261]:
COMPARE_SCORE = CBM_SCORES + COS_SIMS

for idx, _ in most_similar_esco_spans:
    sims = COMPARE_SCORE[idx]
    most_similar = np.argsort(sims)[-2]
    print(SELECTED_ESCO_SPANS[idx], " ==> ", SELECTED_ESCO_SPANS[most_similar])
    

colour grading  ==>  add colour
colour ranges of roasting  ==>  colours of roasted malt
collect tourist information  ==>  combination of flavours
colour glass  ==>  add colour
collect visitor fees  ==>  collect rental fees
