In [76]:
import sys
import time
import re
import string
import pickle
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util
from collections import OrderedDict
from typing import Union, List
from pandas import ExcelWriter


##################################################################

def load_corpus_as_sentlist(res_pklfile,jd_pklfile):
    import pickle
    #load cleaned sentence lists for jd and res
    with open(res_pklfile, "rb") as handle:
        res_pkg=pickle.load(handle)
        handle.close()
        r=res_pkg['sentences']
    with open(jd_pklfile, "rb") as handle:
        jd_pkg=pickle.load(handle)
        handle.close()
        j=jd_pkg['sentences']
    return(r,j)

##################################################################
def load_and_embed_sent_transformer(pretrained_modelname):
    model = SentenceTransformer(pretrained_modelname)
    model.max_seq_length = 256
    
    
    #Compute embedding for both lists
    s_emb_r = model.encode(sent_r, convert_to_tensor=True)
    s_emb_j = model.encode(sent_j, convert_to_tensor=True)
    w_emb_r = model.encode(sent_r, output_value='token_embeddings', convert_to_tensor=True)
    w_emb_j = model.encode(sent_j,output_value='token_embeddings', convert_to_tensor=True)


    #Compute cosine-similarities for sentences
    cos_s = util.pytorch_cos_sim(s_emb_r, s_emb_j)


    tf=model.add_tokens([])
    voc=tf[0].get_vocab()

    with open('tf.pickle','wb') as handle:
        pickle.dump(tf[0], handle,protocol=pickle.HIGHEST_PROTOCOL)

    # # loading
    with open('tf.pickle', 'rb') as handle:
        tf_adpt = pickle.load(handle)
        handle.close
    return({'s_r':s_emb_r,'s_j':s_emb_j,'w_r':w_emb_r,'w_j':w_emb_j},voc, model,cos_s)

##################################################################

# converting a list of sentences into a list of words
def wordlist_from_sentencelist(corp):
    word_list=[]
    for s in corp:
        new_s=re.sub('[^A-Za-z0-9äöü]+', ' ', s)
        word_list.extend(new_s.split())
        word_list=list(set(word_list)-set(['','.','-','a','an','in','at','the','up','my','(',')'])) # removing duplicates and '' here
        word_list.sort()
    return word_list

##################################################################

# finding all unknown words from word_list in vocabulary
def find_ukn_words(word_list,vocab):
    ukn_word_list=[]
    char_not_allowed=['',',','(',')']
    for w in word_list:
        for c in char_not_allowed:
            w=w.replace(c,'')
            if w not in ['','.'] and w not in vocab:
                ukn_word_list.append(w)
    return ukn_word_list # list of words not found in vocab

##################################################################

# add the words in word_list to the vocab and resize the model
def add_words_to_vocab_and_save(word_list):
    tf=model.add_tokens(word_list)
    
    return tf

##################################################################

def remove_duplicates(wordlist):
    final_list = list(dict.fromkeys(wordlist))
    return final_list

##################################################################

#generates wordlist from list of sentences, cleanS special characters and duplicates and extract words not in vocab
def get_clean_wordlist(corp,vocab):
    wl=wordlist_from_sentencelist(corp)
    unknown_wl=find_ukn_words(wl,vocab)
    l=remove_duplicates(unknown_wl)
    return l   

##################################################################
def get_words_with_tensors(word_emb):
    #input_ids
    wlist=[]
    i_id=word_emb['input_ids']
    t_emb=word_emb['token_embeddings']
    for i in range(0,len(i_id)-1):
        for j in range(0,len(i_id[i])-1):
            w=list(adpt_voc)[int(i_id[i][j])]
            idx=int(i_id[i][j])
            v=t_emb[i][j]
            if (idx >200):
                wlist.append((idx,w,v))
    return(wlist)

##################################################################

def cosim_words(w_r,w_j):
    cosim_r=[]
    cosim_j=[]
    start=time.time()
    for r in range(0,len(w_r)):
        for j in range(0,len(w_j)):
            cosim_j.append(float(util.pytorch_cos_sim(w_r[r][2],w_j[j][2])))
        cosim_r.append(cosim_j)
        cosim_j=[]
    end=time.time()
    #print('finished in ', end-start, ' seconds')
    return cosim_r
    
##################################################################    
def make_word_panda(w_dict_r,w_dict_j,cosim_r):
    res_words=[]
    jd_words=[]
    for w in w_dict_j:
        jd_words.append(w[1])
    for x in w_dict_r:
        res_words.append(x[1])
    PATH='Z:/FILES/OUTPUT/'
    df=pd.DataFrame(cosim_r, columns=jd_words, index=res_words)
    fname=PATH+'word_sim_'+str(time.time())+'.xlsx'
    with pd.ExcelWriter(fname) as writer:  
        df.to_excel(writer, sheet_name='CosSim Words')
    return df

def make_sent_pairs(sent_r,sent_j,cos_s):
    #Find the pairs with the highest cosine similarity scores
    pairs = []
    no=0
    for i in range(len(cos_s)):
        for k in range(i+1, len(cos_s)):
            pairs.append({'pair_no': 0,'index': [i, k], 'score': cos_s[i][k].item(), 'sentence RES':sent_r[i], 'sentence JD':sent_j[k]})

    #Sort scores in decreasing order
    pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)
    for j in range(0,len(pairs)-1):
        pairs[j]['pair_no']=no
        no += 1
    
    PATH='Z:/FILES/OUTPUT/'
    df_spairs=pd.DataFrame(pairs)
    fname=PATH+'sent_sim_'+str(time.time())+'.xlsx'
    with pd.ExcelWriter(fname) as writer:  
        df_spairs.to_excel(writer, sheet_name='CosSim Sent')   
    return pairs,df_spairs
##################################################################


In [77]:
start=time.time()
# loading corpus as clean lists of sentetnces from pickle files
sent_r,sent_j=load_corpus_as_sentlist('res_slist.pkl','jd_slist.pkl')
# getting the embeddings for sentences and words and the vocabulary
emb,voc, model,cos_s=load_and_embed_sent_transformer("paraphrase-MiniLM-L12-v2")

#add unkn words to vocab from both docs
clean=get_clean_wordlist(sent_j,voc)
adpt_tokenizer,adpt_model,adpt_voc=model.add_tokens(clean)
clean=get_clean_wordlist(sent_r,adpt_voc)
adpt_tokenizer,adpt_model,adpt_voc=model.add_tokens(clean)

#embed words again with extended vocabulary
w_emb_r = model.encode(sent_r, output_value='token_embeddings', convert_to_tensor=True)
#get a dict with idx, words and tensors
w_dict_r=get_words_with_tensors(model.tokenized_list)

#the same 2 steps for jd
w_emb_j = model.encode(sent_j,output_value='token_embeddings', convert_to_tensor=True)
w_dict_j=get_words_with_tensors(model.tokenized_list)

w_sim=cosim_words(w_dict_r,w_dict_j)
df=make_word_panda(w_dict_r,w_dict_j,cosim_r)

pairs,df_spairs=make_sent_pairs(sent_r,sent_j,cos_s)
end=time.time()
print("main finished after ", end-start, " seconds")


You try to use a model that was created with version 1.2.0, however, your version is 1.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





# of tokens BEFORE:  30522
We have added 0 tokens
# of tokens AFTER:  30522
# of tokens BEFORE:  30522
We have added 21 tokens
# of tokens AFTER:  30543
# of tokens BEFORE:  30543
We have added 8 tokens
# of tokens AFTER:  30551
main finished after  22.186150312423706  seconds


In [75]:
df


Unnamed: 0,we,offer,an,interesting,challenge,and,attractive,employment,conditions,in,...,from,eur,deliver,service,enhancements,key,responsibilities,:,itil,certificate
conducted,0.290504,0.217180,0.021576,0.037805,0.014925,0.043740,0.026555,0.025199,0.081655,0.197000,...,0.099950,0.147496,0.270335,0.153928,0.125402,0.058871,0.126191,0.111210,0.140330,0.074244
an,0.255133,-0.162681,0.656741,-0.022993,-0.016443,0.452546,-0.048022,0.033652,-0.150795,0.007098,...,0.220735,0.215888,0.164734,0.235080,0.206386,0.031140,0.045120,0.381500,0.260966,0.081294
effort,0.045881,0.320336,-0.068085,0.270919,0.421351,-0.080165,0.278598,0.153157,0.262735,0.216755,...,-0.137024,-0.067426,0.088918,0.134098,-0.040824,0.049422,0.111341,-0.044989,-0.127493,-0.059873
to,0.166413,0.048974,0.286482,0.082190,0.151251,0.270185,0.101064,0.036500,0.073792,0.330357,...,0.129247,0.106327,0.282467,0.307498,0.220401,0.106019,0.186611,0.272176,0.123253,0.111432
migrate,0.010095,0.069037,-0.004819,0.080557,0.142928,-0.019021,0.156295,0.088632,0.111894,0.184071,...,0.001143,0.048100,0.252025,0.026736,0.133403,0.022892,-0.047117,-0.085377,-0.002894,0.056167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
austria,-0.028889,0.060457,-0.004254,0.084854,-0.009719,0.013197,0.111491,0.068423,0.084457,0.065421,...,0.040543,0.080879,-0.068190,-0.093890,0.011948,0.124280,0.040517,0.005090,0.103760,0.013448
.,0.187649,-0.171762,0.516886,-0.096962,-0.046474,0.419285,-0.119960,-0.032790,-0.199589,-0.060601,...,0.143213,0.124577,0.119516,0.113384,0.169266,-0.017603,0.079484,0.366957,0.269492,0.211395
professional,0.056774,-0.034594,0.261446,0.028801,0.022207,0.224271,0.064881,0.149080,-0.046366,0.119094,...,0.298342,0.068160,0.254476,0.268654,0.194699,0.175701,0.346341,0.430766,0.220726,0.402097
experience,0.045517,0.144092,0.182948,0.139601,0.190277,0.258731,0.149759,0.154029,0.180029,0.272516,...,0.197197,0.069972,0.172443,0.256890,0.136585,0.104162,0.361729,0.399619,0.131578,0.298068


In [74]:
df1

Unnamed: 0,pair_no,index,score,sentence RES,sentence JD
0,0,"[6, 7]",0.796949,set up the company service desk and assess tea...,"sets up and maintain service desk, including m..."
1,1,"[6, 20]",0.750100,set up the company service desk and assess tea...,experience in setting up and running a service...
2,2,"[1, 19]",0.730559,experienced it service lead and global it cons...,to years of it service management work experi...
3,3,"[0, 4]",0.721961,it service lead and consultant,teamlead it service management .
4,4,"[0, 19]",0.663157,it service lead and consultant,to years of it service management work experi...
...,...,...,...,...,...
491,491,"[16, 25]",-0.023737,transitioning an in-sourced business unit from...,language skills: english fluent
492,492,"[5, 25]",-0.025321,"modis, vienna, austria .",language skills: english fluent
493,493,"[7, 29]",-0.045077,in charge of developing and delivering strateg...,gross monthly salary: from eur
494,494,"[5, 13]",-0.052839,"modis, vienna, austria .",monitors department issues and client complain...
