In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
import gensim.downloader as api
import nltk
from nltk.tokenize import word_tokenize
import spacy
from hashlib import md5

In [29]:
# Load the spaCy pipeline 'fr_core_news_lg' once; tokenize() reads this global
# to lemmatize every text field.
nlp = spacy.load('fr_core_news_lg')

In [30]:
# Job offers as read from job_data.csv; the 'Title' column of this frame is used
# below to label the rows and columns of the similarity matrix.
data_org = pd.read_csv("job_data.csv")

In [60]:
# Titles from data_org label both axes of the (initially empty) pairwise
# similarity matrix; the cells are filled in by the pairwise comparison loop.
text_elements = data_org['Title'].tolist()
similarty_matrix = pd.DataFrame(
    columns=text_elements,
    index=text_elements,
)

In [32]:
class Token:
    """A distinct term together with its hash vector and occurrence weight.

    Attributes:
        hash_list: per-bit values for the term; calc_weights() stores the
            +1/-1 vector produced by binconv() here.
        weight: numeric weight of the term; calc_weights() starts it at 0 and
            increments it once per occurrence.
    """

    def __init__(self, hash_list, weight):
        self.hash_list, self.weight = hash_list, weight

In [33]:
# Frame read from final_jobs_data.csv; this is the input that gets transposed,
# tokenized and hashed in the cells below.
data_hash = pd.read_csv("final_jobs_data.csv")

In [34]:
def md5Hash(token):
    """Return the MD5 digest of ``token`` as a string of binary digits.

    Args:
        token: bytes-like object to hash (it is passed straight to ``md5``).

    Returns:
        The digest written in base 2 without the ``0b`` prefix. Leading zero
        bits are not included, so the string may be shorter than 128
        characters; callers that need a fixed width must pad it themselves.
    """
    digest_value = int.from_bytes(md5(token).digest(), 'big')
    return format(digest_value, 'b')

In [35]:
def hash_threshold(token_dict, fp_len):
    """Collapse weighted token hash vectors into a 0/1 fingerprint.

    Args:
        token_dict: mapping whose values expose ``weight`` and ``hash_list``.
        fp_len: number of positions in the running sum before any token is
            added.

    Returns:
        A list with 1 where the weighted column sum is strictly positive and
        0 elsewhere (a sum of exactly zero maps to 0).
    """
    totals = [0] * fp_len
    for entry in token_dict.values():
        # zip() stops at the shorter sequence, so the running sum is truncated
        # to the length of a shorter hash_list.
        totals = [
            running + entry.weight * component
            for running, component in zip(totals, entry.hash_list)
        ]

    # apply binary threshold
    return [1 if total > 0 else 0 for total in totals]


In [36]:
def binconv(fp, fp_len):
    """Map a string of binary digits to a vector of +1/-1 values.

    Args:
        fp: sequence of characters; every '0' becomes -1.
        fp_len: length of the returned vector.

    Returns:
        A list of ``fp_len`` values, -1 at the positions where ``fp`` holds '0'
        and 1 everywhere else (including positions beyond the end of ``fp``).
    """
    signs = [1] * fp_len
    zero_positions = (pos for pos, digit in enumerate(fp) if digit == '0')
    for pos in zero_positions:
        signs[pos] = -1
    return signs

In [37]:
def calc_weights(terms, fp_len):
    """Count term occurrences and attach a hash vector to each distinct term.

    Args:
        terms: iterable of string terms.
        fp_len: width the binary MD5 string is left-padded to with zeros, and
            the length passed to binconv().

    Returns:
        A dict mapping each distinct term to a Token whose ``hash_list`` is the
        +1/-1 vector of the term's MD5 hash and whose ``weight`` is the number
        of times the term occurs in ``terms``.
    """
    weighted = {}
    for word in terms:
        entry = weighted.get(word)
        if entry is None:
            # First sighting: hash the UTF-8 bytes and pad to the fingerprint width.
            encoded = word.encode(encoding='UTF-8', errors='strict')
            bit_string = md5Hash(encoded).zfill(fp_len)
            entry = Token(binconv(bit_string, fp_len), 0)
            weighted[word] = entry
        entry.weight += 1
    return weighted

In [38]:
def simhash(doc, fp_len):
    """Compute a fingerprint string for ``doc``.

    The input is tokenized, the tokens are weighted by occurrence count, and
    the weighted hash vectors are thresholded into 0/1 values.

    Args:
        doc: forwarded unchanged to tokenize(), which iterates over it.
        fp_len: fingerprint length passed to calc_weights() and hash_threshold().

    Returns:
        The fingerprint as a string of '0'/'1' characters.
    """
    weighted_terms = calc_weights(tokenize(doc), fp_len)
    bits = hash_threshold(weighted_terms, fp_len)
    return ''.join(map(str, bits))

In [39]:
def tokenize(data_in):
    """Lemmatize one text or a collection of texts with the global spaCy pipeline.

    Args:
        data_in: a single string, or an iterable of strings (for example one
            column of the transposed job frame).

    Returns:
        A flat list with the lemma of every token, in input order.
    """
    # A bare string is itself iterable; without this guard every single
    # character would be sent through the pipeline as its own document.
    if isinstance(data_in, str):
        data_in = [data_in]

    lemmas = []
    for text in data_in:
        lemmas.extend(token.lemma_ for token in nlp(text))
    return lemmas

In [40]:
# Fingerprint length in bits. md5Hash() yields at most 128 binary digits, which
# calc_weights() left-pads with zeros to this width.
fp_len = 128

In [41]:
# Transpose so that each job offer becomes a column; data_trans[i] is then the
# Series holding every field of offer i.
data_trans = data_hash.T
data_trans[1]

Title                                      area manager casablanca maroc
Description            communication publicité rp marketing secteur d...
Experience                                                        3 5 an
School level                                   bac plus 5 minimum master
Contract Type                                                        cdi
Teleworking                                                          non
Deadline                                                      18 09 2023
Entreprise             entreprise 20 an orchestrer spécialiste mode e...
Post information       poste chercher compte un area manager mission ...
Profile information    profil rechercher bacplus 3 plus 5 marketing m...
Personality            organisation flexibilité ambition extraversion...
Name: 1, dtype: object

In [42]:
# Lemmatize every job offer. Iterating over the columns of the transposed frame
# replaces the hard-coded row count (1231), so the cell keeps working when the
# input file grows or shrinks.
tokens = [tokenize(data_trans[offer]) for offer in data_trans.columns]

In [43]:
# One {term: Token} mapping per job offer, in the same order as `tokens`.
token_dict = [calc_weights(offer_tokens, fp_len) for offer_tokens in tokens]

In [44]:
# Threshold the weighted hash vectors of every offer into a 0/1 fingerprint.
# The length is taken from token_dict instead of the hard-coded 1231.
fp_hash_list = [hash_threshold(weights, fp_len) for weights in token_dict]

In [45]:
# Render each 0/1 fingerprint as a string of binary digits. The length is taken
# from fp_hash_list instead of the hard-coded 1231.
fp_hast_str_list = [''.join(str(bit) for bit in bits) for bits in fp_hash_list]

In [46]:
# Preview a handful of fingerprints instead of printing the whole list, which
# floods the notebook with one long string per job offer.
fp_hast_str_list[:5]

['11101101111010111000010101101010000011100000010111100111100111100110010011001011000111000111001011100010001101010110001010111100', '11000110100010110101000101011110101011010000100001100000110110011111011010110001011010111101000111100110000101010001101001110000', '11010110110010011100010100101100001001010000001000100000111110010110001001010001000011110000110111101101001100111101010101110100', '11110011111010001000100101011110100011000000010001000110001100110111000010101000101110110001000011100011111001010010010010010100', '11010000100110000100100100001100101011000001010001100001111100100110111000111101011111110110000111100100010101010100011000101000', '10110110011100100000110001101101000011111111001101101101100111110111111110001001100101010101000001101111001100010001101100101000', '10110101100010101100000110101100000011010001000000100000011010111111011110110110001100010101111111001001101011010100111110101101', '01010111100010010100110101011100001011010001000001100000100110100110011110

In [47]:
# Inspect the lemma lists of the first ten job offers.
tokens[:10]

[['développeur',
  'full',
  'stack',
  'php',
  'casablanca',
  'maroc',
  'informatique',
  'electroniqu',
  'télécom',
  'réseau',
  'secteur',
  'informatique',
  'service',
  '1',
  'an',
  '1',
  '3',
  'an',
  'bac',
  'plus',
  '2',
  'minimum',
  'master',
  'ecole',
  'ingénieur',
  'cdi',
  'hybride',
  '18',
  '09',
  '2023',
  'entreprise',
  'heure',
  'f',
  'associat',
  '10',
  'important',
  'cabinet',
  'recrutement',
  'maroc',
  'recruter',
  'partenaire',
  '10',
  'an',
  'heure',
  'f',
  'associate',
  'accompagner',
  'partenair',
  'recrutement',
  'recherche',
  'meilleur',
  'profil',
  'jusqu',
  "'",
  'intégration',
  'candidat',
  'choisir',
  'cabinet',
  'recrutement',
  'généraliste',
  'intervenir',
  'secteur',
  'activité',
  'typ',
  'poste',
  'auprès',
  'entreprise',
  'taille',
  'grand',
  'groupe',
  'international',
  'eti',
  'pme',
  'local',
  'qualité',
  'exprimer',
  'travers',
  'méthodologie',
  'résultat',
  'tal',
  'confirmation

In [48]:
def compare_simhashes(simhash1, simhash2):
    """Count the differing bits between two fingerprint strings.

    Both strings are parsed as base-16 integers. For strings made only of '0'
    and '1' characters this still counts differing positions correctly: every
    character occupies its own 4-bit group whose value is 0 or 1, so the XOR
    has exactly one set bit per differing character.

    Args:
        simhash1: first fingerprint string.
        simhash2: second fingerprint string.

    Returns:
        The number of set bits in the XOR of the two parsed values.
    """
    differing_bits = int(simhash1, 16) ^ int(simhash2, 16)
    return format(differing_bits, 'b').count('1')

In [63]:
# Compare every fingerprint with every other one and record the results both in
# long form (five parallel lists) and in the square similarity matrix.
jobs1_list = []
jobs2_list = []
distances = []
hast_str1 = []
hast_str2 = []
n_offers = len(fp_hast_str_list)
for i in range(n_offers):
    # Values that depend only on the outer index are looked up once per row.
    title_i = data_trans[i]['Title']
    hash_i = fp_hast_str_list[i]
    for j in range(n_offers):
        hash_j = fp_hast_str_list[j]
        dist = compare_simhashes(hash_i, hash_j)
        distances.append(dist)
        # Write by position, not by title: with label-based .loc, offers that
        # share a title address several rows/columns at once and overwrite each
        # other's cells.
        similarty_matrix.iloc[i, j] = 1 - dist / fp_len
        jobs1_list.append(title_i)
        hast_str1.append(hash_i)
        jobs2_list.append(data_trans[j]['Title'])
        hast_str2.append(hash_j)

In [50]:
# Wrap each parallel list in a one-column frame so they can be concatenated
# side by side.
jobs1_list_df = pd.DataFrame({'Job offer 1': jobs1_list})
jobs2_list_df = pd.DataFrame({'Job offer 2': jobs2_list})
hast_str1_df = pd.DataFrame({'Simhash For Job offer 1': hast_str1})
hast_str2_df = pd.DataFrame({'Simhash For Job offer 2': hast_str2})
distances_df = pd.DataFrame({'Distances': distances})

In [51]:
# Keep only the distances below 10.
temp = [d for d in distances if d < 10]

In [52]:
# Number of pairs whose distance is below 10 (this includes each offer compared
# with itself, since the loop above visits every (i, j) pair).
len(temp)

1773

In [54]:
# Assemble the long-form table: one row per ordered pair of job offers.
simhash_columns = [
    jobs1_list_df,
    jobs2_list_df,
    hast_str1_df,
    hast_str2_df,
    distances_df,
]
Simhash_Sim_df = pd.concat(simhash_columns, axis=1)
Simhash_Sim_df

Unnamed: 0,Job offer 1,Job offer 2,Simhash For Job offer 1,Simhash For Job offer 2,Distances
0,développeur full stack php casablanca maroc,développeur full stack php casablanca maroc,1110110111101011100001010110101000001110000001...,1110110111101011100001010110101000001110000001...,0
1,développeur full stack php casablanca maroc,area manager casablanca maroc,1110110111101011100001010110101000001110000001...,1100011010001011010100010101111010101101000010...,56
2,développeur full stack php casablanca maroc,stagiaire marketing casablanca maroc,1110110111101011100001010110101000001110000001...,1101011011001001110001010010110000100101000000...,59
3,développeur full stack php casablanca maroc,support it sénior plus 5 an expérience rabat m...,1110110111101011100001010110101000001110000001...,1111001111101000100010010101111010001100000001...,45
4,développeur full stack php casablanca maroc,agent logistique casablanca maroc,1110110111101011100001010110101000001110000001...,1101000010011000010010010000110010101100000101...,54
...,...,...,...,...,...
1515356,directeur qualité formation groupe call center...,ingénieur étude développement java sénior heur...,1111011101001000110011010100100110001101000000...,1010001101011011100111010000110100001101000000...,47
1515357,directeur qualité formation groupe call center...,ingénieur production casablanca maroc,1111011101001000110011010100100110001101000000...,1111111101101001000010000110011100011100110011...,49
1515358,directeur qualité formation groupe call center...,ingénieur virtualisation casablanca maroc,1111011101001000110011010100100110001101000000...,1010010101111010000010000110010110011100110000...,56
1515359,directeur qualité formation groupe call center...,stagiaire recrutement rabat maroc,1111011101001000110011010100100110001101000000...,1111001011000011100111110100111010001100010000...,43


In [1]:
# Turn the distance into a similarity by dividing by 128, the value of fp_len.
# This cell needs Simhash_Sim_df from the cells above; run on a fresh kernel it
# raises the NameError recorded in the output.
Simhash_Sim_df["Similarity"] = 1 - Simhash_Sim_df["Distances"]/128

NameError: name 'Simhash_Sim_df' is not defined

In [None]:
# Display the long-form table of pairwise comparisons.
Simhash_Sim_df

In [57]:
# Pairs whose fingerprints differ by fewer than 10.
is_close = Simhash_Sim_df['Distances'] < 10
similar_df = Simhash_Sim_df[is_close]

In [58]:
# Display the near-duplicate pairs selected above.
similar_df

Unnamed: 0,Job offer 1,Job offer 2,Simhash For Job offer 1,Simhash For Job offer 2,Distances,Similarity
0,développeur full stack php casablanca maroc,développeur full stack php casablanca maroc,1110110111101011100001010110101000001110000001...,1110110111101011100001010110101000001110000001...,0,1.0
1232,area manager casablanca maroc,area manager casablanca maroc,1100011010001011010100010101111010101101000010...,1100011010001011010100010101111010101101000010...,0,1.0
2464,stagiaire marketing casablanca maroc,stagiaire marketing casablanca maroc,1101011011001001110001010010110000100101000000...,1101011011001001110001010010110000100101000000...,0,1.0
3696,support it sénior plus 5 an expérience rabat m...,support it sénior plus 5 an expérience rabat m...,1111001111101000100010010101111010001100000001...,1111001111101000100010010101111010001100000001...,0,1.0
4928,agent logistique casablanca maroc,agent logistique casablanca maroc,1101000010011000010010010000110010101100000101...,1101000010011000010010010000110010101100000101...,0,1.0
...,...,...,...,...,...,...
1510432,ingénieur étude développement java sénior heur...,ingénieur étude développement java sénior heur...,1010001101011011100111010000110100001101000000...,1010001101011011100111010000110100001101000000...,0,1.0
1511664,ingénieur production casablanca maroc,ingénieur production casablanca maroc,1111111101101001000010000110011100011100110011...,1111111101101001000010000110011100011100110011...,0,1.0
1512896,ingénieur virtualisation casablanca maroc,ingénieur virtualisation casablanca maroc,1010010101111010000010000110010110011100110000...,1010010101111010000010000110010110011100110000...,0,1.0
1514128,stagiaire recrutement rabat maroc,stagiaire recrutement rabat maroc,1111001011000011100111110100111010001100010000...,1111001011000011100111110100111010001100010000...,0,1.0


In [64]:
# Replace any cell that was never assigned with 0, then display the matrix.
similarty_matrix = similarty_matrix.fillna(0)
similarty_matrix

Unnamed: 0,Développeur Full Stack PHP | Casablanca (Maroc),AREA MANAGER | CASABLANCA (Maroc),STAGIAIRE EN MARKETING | CASABLANCA (Maroc),Support IT sénior ( +5 ans d'expérience) | Rabat (Maroc),AGENT LOGISTIQUE | CASABLANCA (Maroc),"Devenez Consultant en rejoignant Leyton ! (Offre ouvertes aux ingénieurs, docteurs et doctorants en informatique) | Casablanca (Maroc)",Expert.e en Finance Digital (FinTech) | Rabat (Maroc),Responsable Magasin | Kenitra (Maroc),Comptable H/F | Casablanca (Maroc),Développeur Fullstack PHP Laravel (Freelance ou CDI) | Casablanca (Maroc),...,Administrateur Système SharePoint et O365 avancés | Technopolis (Maroc),Chargé(e) de Communication | Salé (Technopolis) (Maroc),Full stack PHP/Javascript senior 5 ans d'expérience ( full remote) | Casablanca (Maroc),Ingénieur études et développement JAVA Confirmé (H/F) | Tanger/ Tétouan (Maroc),Développeur PHP | Rabat (Maroc),Ingénieur études et développement JAVA Sénior (H/F) | Tanger/ Tétouan (Maroc),Ingénieur de Production | Casablanca (Maroc),Ingénieur Virtualisation | Casablanca (Maroc),Stagiaire Recrutement | Rabat (Maroc),Directeur Qualité Formation Groupe ( Call center) | Casablanca (Maroc)
Développeur Full Stack PHP | Casablanca (Maroc),1.000000,0.562500,0.539062,0.648438,0.578125,0.609375,0.539062,0.585938,0.578125,0.640625,...,0.609375,0.593750,0.726562,0.609375,0.664062,0.578125,0.671875,0.617188,0.562500,0.632812
AREA MANAGER | CASABLANCA (Maroc),0.562500,1.000000,0.664062,0.601562,0.687500,0.546875,0.585938,0.789062,0.593750,0.500000,...,0.546875,0.546875,0.632812,0.546875,0.554688,0.531250,0.531250,0.539062,0.609375,0.554688
STAGIAIRE EN MARKETING | CASABLANCA (Maroc),0.539062,0.664062,1.000000,0.546875,0.632812,0.539062,0.609375,0.687500,0.617188,0.554688,...,0.570312,0.539062,0.546875,0.570312,0.625000,0.554688,0.414062,0.421875,0.648438,0.593750
Support IT sénior ( +5 ans d'expérience) | Rabat (Maroc),0.648438,0.601562,0.546875,1.000000,0.617188,0.523438,0.578125,0.546875,0.539062,0.648438,...,0.617188,0.632812,0.781250,0.554688,0.593750,0.539062,0.539062,0.593750,0.679688,0.656250
AGENT LOGISTIQUE | CASABLANCA (Maroc),0.578125,0.687500,0.632812,0.617188,1.000000,0.546875,0.601562,0.742188,0.640625,0.609375,...,0.593750,0.593750,0.570312,0.546875,0.585938,0.515625,0.515625,0.570312,0.578125,0.601562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ingénieur études et développement JAVA Sénior (H/F) | Tanger/ Tétouan (Maroc),0.578125,0.531250,0.554688,0.539062,0.515625,0.593750,0.601562,0.554688,0.578125,0.593750,...,0.546875,0.546875,0.617188,0.937500,0.601562,1.000000,0.609375,0.648438,0.593750,0.632812
Ingénieur de Production | Casablanca (Maroc),0.671875,0.531250,0.414062,0.539062,0.515625,0.640625,0.507812,0.554688,0.546875,0.625000,...,0.500000,0.500000,0.601562,0.609375,0.570312,0.609375,1.000000,0.710938,0.531250,0.617188
Ingénieur Virtualisation | Casablanca (Maroc),0.617188,0.539062,0.421875,0.593750,0.570312,0.664062,0.546875,0.531250,0.507812,0.632812,...,0.554688,0.585938,0.546875,0.664062,0.546875,0.648438,0.710938,1.000000,0.492188,0.562500
Stagiaire Recrutement | Rabat (Maroc),0.562500,0.609375,0.648438,0.679688,0.578125,0.562500,0.554688,0.601562,0.562500,0.609375,...,0.562500,0.625000,0.632812,0.593750,0.617188,0.593750,0.531250,0.492188,1.000000,0.664062


In [65]:
# Persist the similarity matrix. index=False drops the row labels, so only the
# column headers carry the job titles in the written file.
similarty_matrix.to_csv("Simhash_similarity_matrix.csv", mode = 'w', index=False)

In [69]:
# Persist the long-form pairwise table (without the integer row index).
Simhash_Sim_df.to_csv("Simhash_similarity.csv", mode = 'w', index=False)

In [3]:
# Reload the long-form table written above.
# NOTE(review): this rebinds the name `simhash`, which earlier in the notebook
# is the simhash(doc, fp_len) function; after this cell the function is no
# longer reachable under that name.
simhash = pd.read_csv("Simhash_similarity.csv")

In [6]:
# NOTE(review): the divisor here is 64, whereas the similarity matrix and the
# earlier "Similarity" column divide by 128 (fp_len). With 64 the value turns
# negative once the distance exceeds 64 — confirm which scale is intended.
simhash["Similarity"] = 1 - simhash["Distances"]/64

In [8]:
# Overwrite Simhash_similarity.csv with the recomputed "Similarity" column.
simhash.to_csv("Simhash_similarity.csv", mode = 'w', index=False)