# Competencies Extraction Service

In [1]:
# pip install spacy
# python -m spacy download de_core_news_lg
# pip install keybert
# pip install spacy-universal-sentence-encoder

## Tools and Libraries
pandas: a software library written for the Python programming language for data manipulation and analysis

spacy: an open-source software library for advanced natural language processing

de_core_news_lg: a trained spaCy pipeline for the German language

In [2]:
import pandas as pd
import spacy
# Load the German spaCy pipeline `de_core_news_lg` once; `sp` is reused below to tokenize labels and course texts.
sp = spacy.load('de_core_news_lg')

2022-06-28 22:56:05.080543: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-28 22:56:05.080584: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Load Processed Data

In [3]:
# Processed skill labels: keep only the `processedLabel` column as a Series.
labels_path = "../data/labels_processed.csv"
labels_processed = pd.read_csv(labels_path)['processedLabel']
labels_processed

0                                  musikpersonal verwalten
1                      strafvollzugsverfahr beaufsichtigen
2                     nicht unterdrückend praktik anwenden
3        einhaltung von vorschrift von eisenbahnfahrzeu...
4                               verfügbar dienst ermitteln
                               ...                        
13886    beruflich leistungsfähigkeit von nutzer nutzer...
13887               beleuchtung in transportgerät einbauen
13888                       verarbeitung natürlich sprache
13889                               bauarbeit koordinieren
13890           absturzsicherung und bordbretter anbringen
Name: processedLabel, Length: 13891, dtype: object

In [4]:
# Processed skill descriptions: keep only the `skill_info_processed` column as a Series.
skills_info_path = "../data/skills_info_processed.csv"
skills_info_processed = pd.read_csv(skills_info_path)['skill_info_processed']
skills_info_processed

0        musikpersonal verwalten -- zuweisen verwalten ...
1        strafvollzugsverfahr beaufsichtigen -- überwac...
2        unterdrückend praktik anwenden -- ermitteln re...
3        einhaltung vorschrift eisenbahnfahrzeuge überp...
4        verfügbar dienst ermitteln -- ermitteln versch...
                               ...                        
13886    beruflich leistungsfähigkeit nutzer -- nutzeri...
13887    beleuchtung transportgerät einbauen -- einbau ...
13888    verarbeitung natürlich sprache -- technologie ...
13889    bauarbeit koordinieren -- koordinierung tätigk...
13890    absturzsicherung bordbretter anbringen -- anbr...
Name: skill_info_processed, Length: 13891, dtype: object

In [5]:
# Processed course descriptions: keep only the `course_info_processed` column as a Series.
courses_info_path = "../data/courses_info_processed.csv"
courses_info_processed = pd.read_csv(courses_info_path)['course_info_processed']
courses_info_processed

0        -- schwierig -- klient -- -- patient -- angehö...
1        aktuelles arbeitsrecht 2022 -- kurzbeschreibun...
2        ambulant pflege -- rechtssicher handeln haftun...
3        aufgabe gesetzlich betreuer -- reform betreuun...
4        basisqualifikation ungelernt pflegekräft -- ze...
                               ...                        
16847    5 monat weiterbildung -- organisation & führun...
16848    conversion usability experte -- ziel maßnahme ...
16849    digital transformation management -- ziel maßn...
16850    e-commerce geschäftsmodell -- ziel maßnahme te...
16851    experte digital content creation -- teilnehmer...
Name: course_info_processed, Length: 16852, dtype: object

## NLP Algorithms

### 1. Modified Ontology-based Entity Recognition

`termStore:  {controlled vocabulary (vocabularies in label): URI}`

In [None]:
# Build the controlled vocabulary: every distinct token that occurs in any
# processed label is assigned the next free integer URI, in order of first
# appearance.
termStore = {}
URI = 0
for label_doc in map(sp, labels_processed):
    for token in label_doc:
        term = str(token)
        if term in termStore:
            continue  # already registered; keep its first URI
        termStore[term] = URI
        URI += 1

In [None]:
# Tabular view of termStore: one row per controlled-vocabulary term and its URI.
term_rows = list(termStore.items())
pd.DataFrame(term_rows, columns=['controlledVocabulary', 'URI'])

`sequenceStore: {URIs : (index, sequence consisting of controlled vocabularies (label))}`

In [None]:
# Map each label's URI sequence (a tuple, so it can be used as a dict key) to
# the label's row index and its text. Labels that tokenize to the same URI
# sequence overwrite each other, so the last such label wins.
sequenceStore = {}
for label_index, label_text in enumerate(labels_processed):
    label_doc = sp(label_text)
    uri_sequence = tuple(termStore[str(token)] for token in label_doc)
    sequenceStore[uri_sequence] = (label_index, str(label_doc))

In [None]:
# Tabular view of sequenceStore: URI tuple next to its (index, label) pair.
sequence_rows = list(sequenceStore.items())
pd.DataFrame(sequence_rows, columns=['URIs', '(index, label)'])

The algorithm scans the tokenized course information from the beginning until a word contained in the `termStore` is reached. Starting from this word, a lookahead is performed to collect the longest run of consecutive words that are all contained in the `termStore`. As soon as a subsequent term is not included in the `termStore`, the `check_candidates` function is called to find every sub-sequence of the collected run that is contained in the `sequenceStore`, looked up by its URIs.

In [None]:
def get_relations_ER(index_start, index_end):
    """Link courses to skill labels via exact controlled-vocabulary sequence matches.

    Every course text in ``courses_info_processed[index_start:index_end]`` is
    tokenized with ``sp``. Consecutive tokens found in ``termStore`` (the ``'--'``
    separator never counts) are collected as a run of URIs. When the run is
    interrupted, or the course text ends, ``check_candidates`` records every
    sub-sequence of the run that is a key of ``sequenceStore``.

    Args:
        index_start: Position of the first course to process (inclusive).
        index_end: Position one past the last course to process (exclusive).

    Returns:
        A list of unique ``(course_index, label_index)`` tuples, where
        ``course_index`` is ``index_start`` plus the course's offset in the slice
        and ``label_index`` is the index stored in ``sequenceStore``. The list is
        empty when the slice contains no courses.
    """
    relations = []
    courses_info_processed_subset = courses_info_processed[index_start:index_end]
    for offset, course_info_processed in enumerate(courses_info_processed_subset):
        index_course = index_start + offset
        # Start every course with an empty run so that a run left open at the
        # end of the previous course can never be attributed to this one.
        URIs_candidates = []
        for token in sp(course_info_processed):
            word = str(token)
            # NOTE: approximate (prefix) matching against termStore keys was
            # tried at this point and did not give good results; only exact
            # matches are used.
            if word != '--' and word in termStore:
                URIs_candidates.append(termStore[word])
            elif URIs_candidates:
                # Run interrupted: resolve it and start over with an empty run.
                URIs_candidates, relations = check_candidates(URIs_candidates, index_course, relations)
        # Flush a run that extends to the very end of this course's text.
        if URIs_candidates:
            URIs_candidates, relations = check_candidates(URIs_candidates, index_course, relations)
    return relations

        
def check_candidates(URIs_candidates, index_course, relations):
    """Record every known label sequence contained in a run of candidate URIs.

    All contiguous sub-sequences of ``URIs_candidates`` are looked up in
    ``sequenceStore``. For each hit, the pair ``(index_course, label_index)`` is
    appended to ``relations`` unless it is already present.

    Args:
        URIs_candidates: List of URIs forming one uninterrupted run.
        index_course: Index of the course the run was found in.
        relations: List of ``(course_index, label_index)`` tuples; extended in place.

    Returns:
        A tuple ``(new_candidates, relations)`` where ``new_candidates`` is a fresh
        empty list (the run has been consumed) and ``relations`` is the same list
        object that was passed in.
    """
    run_length = len(URIs_candidates)
    for begin in range(run_length):
        for stop in range(begin + 1, run_length + 1):
            window = tuple(URIs_candidates[begin:stop])
            if window not in sequenceStore:
                continue
            # The first element of the stored value is the label's row index.
            pair = (index_course, sequenceStore[window][0])
            if pair not in relations:
                relations.append(pair)
    return [], relations

### 2. Universal Sentence Encoder

In [8]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tensorflow_text
# Load the multilingual Universal Sentence Encoder (large, version 3) from TF Hub; `embed` is called on text below to obtain embeddings.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

#### Calculate skill embeddings

In [189]:
# Embed the skill descriptions in fixed-size batches instead of one call for
# the whole Series, then stack the per-skill vectors into a single array.
batch_size = 1000
embedding_rows = []
for batch_start in range(0, len(skills_info_processed), batch_size):
    skill_batch = skills_info_processed[batch_start: batch_start + batch_size]
    embedding_rows.extend(embed(skill_batch))
skill_info_processed_embedings = np.array(embedding_rows)

#### Find course with related skills

In [244]:
def get_relations_NN(index_start, index_end, threshold=0.45, top_n=5):
    """Link courses to their most similar skills using sentence embeddings.

    Each course text in ``courses_info_processed[index_start:index_end]`` is
    embedded with ``embed`` and compared, via inner product, with every row of
    ``skill_info_processed_embedings``. Skills whose similarity reaches
    ``threshold`` are ranked by similarity and the best ``top_n`` are kept.

    Args:
        index_start: Position of the first course to process (inclusive).
        index_end: Position one past the last course to process (exclusive).
        threshold: Minimum similarity a skill needs to be considered related.
        top_n: Maximum number of related skills kept per course.

    Returns:
        A list of ``(course_index, skill_index)`` tuples, grouped by course and
        ordered by decreasing similarity within each course.
    """
    relations = []
    courses_info_processed_subset = courses_info_processed[index_start:index_end]
    for offset, course_info_processed in enumerate(courses_info_processed_subset):
        course_i = index_start + offset
        course_info_processed_embeding = embed(course_info_processed)
        # The inner product has one row for the single course; take that row
        # to get one similarity value per skill.
        similarities = np.inner(course_info_processed_embeding, skill_info_processed_embedings)[0]
        candidate_i = np.flatnonzero(similarities >= threshold)
        # A stable sort on the negated scores ranks by decreasing similarity
        # and keeps the lower skill index first when scores tie.
        ranking = np.argsort(-similarities[candidate_i], kind="stable")[:top_n]
        relations.extend((course_i, skill_i) for skill_i in candidate_i[ranking])
    return relations

### Calculate id relations

In [255]:
# Choose the extraction algorithm ('ER' or 'NN') and the range of courses to process.
model = 'NN'
start, end = 0, len(courses_info_processed)

if model == 'ER':
    # Modified Ontology-based Entity Recognition
    relations = get_relations_ER(start, end)
    file_name = 'all_relations_ER.csv'
elif model == 'NN':
    # Universal Sentence Encoder
    relations = get_relations_NN(start, end)
    file_name = 'all_relations_NN.csv'
else:
    raise ValueError("Please set model to ER or NN")

# Load the id lookup tables in this cell: they are needed right below, so the
# notebook must not rely on a later cell having been executed first.
courses = pd.read_csv('../data/all_courses.csv')[['course_id','course_name']]
skills = pd.read_csv('../data/skills_de.csv')[['conceptUri','preferredLabel']]

# Translate (course row, skill row) index pairs into (course_id, conceptUri) pairs.
course_ids = courses['course_id']
concept_uris = skills['conceptUri']
graph = [
    (course_ids[course_index], concept_uris[skill_index])
    for course_index, skill_index in relations
]

In [256]:
# Persist the id relations. Column names are passed to the constructor so that
# an empty `graph` still yields a frame with both columns instead of raising a
# length-mismatch error. The row index is written as the first CSV column on
# purpose: the mapping cell below skips it with `.iloc[:, 1:3]`.
graphp_df = pd.DataFrame(graph, columns=['course_id', 'concept_uri'])
graphp_df.to_csv("../data/{}".format(file_name))

### Map id relations to name relations

In [257]:
# Lookup tables: course_id -> course_name and conceptUri -> preferredLabel.
courses = pd.read_csv('../data/all_courses.csv')[['course_id','course_name']]
skills = pd.read_csv('../data/skills_de.csv')[['conceptUri','preferredLabel']]
course_dict = courses.set_index('course_id')['course_name'].to_dict()
skill_dict = skills.set_index('conceptUri')['preferredLabel'].to_dict()

# Re-read the saved id relations; the first CSV column is skipped and the two
# remaining columns are renamed.
id_relations = pd.read_csv("../data/{}".format(file_name)).iloc[:,1:3]
id_relations.columns = ['course_id','conceptUri']

# Replace both ids with their human-readable names.
name_relations = pd.DataFrame({
    'course_name': id_relations['course_id'].map(course_dict),
    'skill_label': id_relations['conceptUri'].map(skill_dict),
})

In [258]:
# Show the resulting course-name / skill-label pairs.
name_relations

Unnamed: 0,course_name,skill_label
0,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",in der Fachkrankenpflege kommunizieren
1,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",Feedback zum Kommunikationsstil von Patienten/...
2,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",auf Vorstellungsgespräch vorbereiten
3,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",Fachkrankenpflege
4,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",Gespräche in den sozialen Diensten führen
...,...,...
44857,Digital Transformation Management,zu Überlegungen von Programmgestaltern/Program...
44858,Digital Transformation Management,beim digitalen Wandel industrieller Prozesse a...
44859,Digital Transformation Management,innovative Mobilitätslösungen entwickeln
44860,Experte im Digital Content Creation,zu Überlegungen von Programmgestaltern/Program...
