This is the NLP part of the project, in which two different NLP algorithms are used to extract competencies.

# Competencies Extraction Service

In [5]:
# pip install spacy
# python -m spacy download de_core_news_lg
# pip install spacy-universal-sentence-encoder

In [7]:
import pandas as pd
import spacy
# spaCy pipeline used in this notebook via `sp(text)` to split label and course
# texts into tokens (only each token's `.text` is read by the visible cells).
# The `de_core_news_lg` model must already be installed for this load to succeed.
sp = spacy.load('de_core_news_lg')

## Load Processed Data

In [8]:
# Processed skill labels: the `processed_label` column of the labels CSV, as a Series.
labels_processed = pd.read_csv("../data/labels_processed.csv").loc[:, 'processed_label']
labels_processed

0                                  musikpersonal verwalten
1                      strafvollzugsverfahr beaufsichtigen
2                           unterdrückend praktik anwenden
3        einhaltung vorschrift eisenbahnfahrzeuge überp...
4                               verfügbar dienst ermitteln
                               ...                        
13886    beruflich leistungsfähigkeit nutzer nutzerinn ...
13887                  beleuchtung transportgerät einbauen
13888                       verarbeitung natürlich sprache
13889                               bauarbeit koordinieren
13890               absturzsicherung bordbretter anbringen
Name: processed_label, Length: 13891, dtype: object

In [9]:
# Processed skill descriptions: the `skill_info_processed` column of the skills-info CSV.
skills_info_processed = pd.read_csv("../data/skills_info_processed.csv").loc[:, 'skill_info_processed']
skills_info_processed

0        musikpersonal verwalten zuweise verwalten aufg...
1        strafvollzugsverfahr beaufsichtigen überwachen...
2        unterdrückend praktik anwenden ermittel repres...
3        einhaltung vorschrift eisenbahnfahrzeuge überp...
4        verfügbar dienst ermitteln ermitteln verschied...
                               ...                        
13886    beruflich leistungsfähigkeit nutzer nutzerinn ...
13887    beleuchtung transportgerät einbauen einbau bel...
13888    verarbeitung natürlich sprache technologie ikt...
13889    bauarbeit koordinieren koordinierung tätigkeit...
13890    absturzsicherung bordbretter anbringen anbring...
Name: skill_info_processed, Length: 13891, dtype: object

In [10]:
# Processed course descriptions: the `course_info_processed` column of the courses-info CSV.
courses_info_processed = pd.read_csv("../data/courses_info_processed.csv").loc[:, 'course_info_processed']
courses_info_processed

0     experte prozessmanagement aktuell vorherrschen...
1     experte cloud computing cloud computing bedarf...
2     experte investition finanzierung aktuell vorhe...
3     experte unternehmensaufbau organisation kurs z...
4     experte unternehmensführung aktuell vorherrsch...
5     itprojektmanagement aktuell vorherrschend situ...
6     php programmierer datenbankentwickler php hype...
7     programmierung php framework laravel symfony z...
8     shopsystem administrator ziel maßnahme teilneh...
9     socialmedia manager ziel maßnahme teilnehmer s...
10    starte heranführung selbstständig tätigkeit he...
11    vermarktungsstrategie online offline aktuell v...
12    webentwicklung 2.0 html5 css3 wordpress ziel m...
13    zukunftswert mensch tiefgreifendwirksame indiv...
14    weiterbildung wildnispädagogik freuen intensiv...
15    qualifizierung anerkennung erzieher*inn iq net...
16    digital spielbasiert lernen einsatzmöglichkeit...
17    digital spielbasiert lernen einsatzmöglich

## NLP Algorithms

### 1. Modified Ontology-based Entity Recognition

<img src="../images/ER.png" align="left" width="800">

`termStore:  {controlled vocabulary (vocabularies in label): URI}`

In [11]:
# Build the controlled vocabulary: every distinct token that occurs in any
# processed label is assigned the next free integer URI, in first-seen order.
termStore = {}
for label_text in labels_processed:
    for token in sp(label_text):
        # len(termStore) is the next unused URI; setdefault leaves tokens
        # that were already registered untouched.
        termStore.setdefault(token.text, len(termStore))
URI = len(termStore)  # next unused URI (equals the number of distinct terms)

In [12]:
# Tabular view of the term store: one row per distinct token with its integer URI.
pd.DataFrame(list(termStore.items()), columns=['controlledVocabulary', 'URI'])

Unnamed: 0,controlledVocabulary,URI
0,musikpersonal,0
1,verwalten,1
2,strafvollzugsverfahr,2
3,beaufsichtigen,3
4,unterdrückend,4
...,...,...
12122,scala,12122
12123,bodentragfähigkeit,12123
12124,bibliotheksartikel,12124
12125,absturzsicherung,12125


`sequenceStore: {URIs : (index, sequence consisting of controlled vocabularies (label))}`

In [13]:
# Map each label's tuple of token URIs to (label index, label text).
# Labels that tokenize to the same URI tuple share one key, so a later label
# overwrites an earlier one.
sequenceStore = {}
for label_index, label_text in enumerate(labels_processed):
    label_doc = sp(label_text)
    uri_key = tuple(termStore[token.text] for token in label_doc)
    sequenceStore[uri_key] = (label_index, label_doc.text)

In [14]:
# Tabular view of the sequence store: URI tuple -> (label index, label text).
pd.DataFrame(list(sequenceStore.items()), columns=['URIs', '(index, label)'])

Unnamed: 0,URIs,"(index, label)"
0,"(0, 1)","(0, musikpersonal verwalten)"
1,"(2, 3)","(1, strafvollzugsverfahr beaufsichtigen)"
2,"(4, 5, 6)","(2, unterdrückend praktik anwenden)"
3,"(7, 8, 9, 10)","(3, einhaltung vorschrift eisenbahnfahrzeuge ü..."
4,"(11, 12, 13)","(4, verfügbar dienst ermitteln)"
...,...,...
13877,"(1802, 6820, 501, 502, 705, 1335)","(13886, beruflich leistungsfähigkeit nutzer nu..."
13878,"(2206, 1899, 289)","(13887, beleuchtung transportgerät einbauen)"
13879,"(1743, 1332, 2355)","(13888, verarbeitung natürlich sprache)"
13880,"(3594, 478)","(13889, bauarbeit koordinieren)"


The algorithm scans each tokenized course description from the beginning until it reaches a word contained in the `termStore`. Starting from this word, a lookahead collects the longest run of consecutive words that are all contained in the `termStore`. As soon as a subsequent word is not included in the `termStore`, the `check_candidates` function is called to find every contiguous sub-sequence of that run whose URI tuple is a key of the `sequenceStore`.

In [15]:
def get_relations_ER(index_start, index_end):
    """Link courses to skill labels with ontology-based entity recognition.

    Every course text in ``courses_info_processed[index_start:index_end]`` is
    tokenized with ``sp``. Consecutive tokens that are keys of ``termStore``
    form a run of candidate URIs. When the run is broken by a token that is
    not in ``termStore`` (or the course text ends), the run is passed to
    ``check_candidates``, which records each contiguous sub-sequence that is a
    key of ``sequenceStore``.

    Parameters
    ----------
    index_start, index_end : int
        Slice bounds into ``courses_info_processed``.

    Returns
    -------
    list of (int, int)
        ``(course index, label index)`` pairs, where the course index is
        ``index_start`` plus the position inside the slice. An empty slice
        yields an empty list.
    """
    relations = []
    courses_info_processed_subset = courses_info_processed[index_start:index_end]
    for i, course_info_processed in enumerate(courses_info_processed_subset):
        index_course = index_start + i
        # Start a fresh run for every course so that terms at the end of one
        # course can never be combined with, or attributed to, the next one.
        URIs_candidates = []
        for token in sp(course_info_processed):
            word = token.text
            if word != '--' and word in termStore:
                URIs_candidates.append(termStore[word])
            elif URIs_candidates:
                # The run of known terms just ended: resolve it and reset.
                URIs_candidates, relations = check_candidates(URIs_candidates, index_course, relations)
            # Note: approximate (prefix) matching against the termStore keys
            # was tried here as well, but the results were not good.

        # Resolve a run that reaches the end of this course's text.
        if URIs_candidates:
            URIs_candidates, relations = check_candidates(URIs_candidates, index_course, relations)
    return relations

def check_candidates(URIs_candidates, index_course, relations):
    """Record every known label contained in a run of candidate URIs.

    Each contiguous sub-sequence of ``URIs_candidates`` is looked up in
    ``sequenceStore``. For every hit, the pair ``(index_course, label index)``
    is appended to ``relations`` unless it is already present.

    Returns
    -------
    tuple
        ``([], relations)``: a fresh empty candidate list for the caller to
        continue with, and the same ``relations`` list (mutated in place).
    """
    total = len(URIs_candidates)
    for first in range(total):
        for stop in range(first + 1, total + 1):
            match = sequenceStore.get(tuple(URIs_candidates[first:stop]))
            if match is None:
                continue
            pair = (index_course, match[0])
            if pair not in relations:
                relations.append(pair)
    return [], relations

### 2. Universal Sentence Encoder

<img src="../images/NN.png" align="left">

In [16]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
# NOTE(review): tensorflow_text is not referenced by name in the visible cells;
# presumably it is imported for a side effect the encoder below depends on — confirm.
import tensorflow_text
# Load the multilingual Universal Sentence Encoder (large, version 3) from TF Hub.
# `embed` is the callable the later cells use to turn texts into embedding vectors.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2022-07-28 16:25:13.874154: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-28 16:25:13.874544: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-28 16:25:13.874634: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-07-28 16:25:13.874674: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-07-28 16:25:13.874705: W tensorflow/stream_executor/platform/default/dso_loader.cc:6

#### Calculate skill embeddings

**Problem:** There is not enough memory to directly calculate the embedding and inner product of all course information and all skill information at one time, and the running time is too long.

**Implementation tricks:** compute one batch of embeddings at a time, and use numpy matrix operations to speed up (parallelizable operations)

In [17]:
# Embed the skill descriptions in batches so that only `batch_size` texts are
# passed to the encoder at a time, then stack all rows into one array.
batch_size = 1000
embedding_rows = []
for batch_start in range(0, len(skills_info_processed), batch_size):
    batch_texts = skills_info_processed[batch_start: batch_start + batch_size]
    embedding_rows.extend(embed(batch_texts))
skill_info_processed_embedings = np.array(embedding_rows)

In [18]:
# Sanity check: print the embedding matrix shape, then display the matrix itself.
print(np.shape(skill_info_processed_embedings))
skill_info_processed_embedings

(13891, 512)


array([[ 0.02038876, -0.00954446, -0.03040175, ...,  0.04920845,
         0.07373227, -0.04932465],
       [ 0.04407404,  0.0756475 ,  0.02994844, ...,  0.00876743,
         0.02266827, -0.06977132],
       [ 0.08902572,  0.00449151, -0.05065555, ...,  0.04441226,
        -0.06821536,  0.0327868 ],
       ...,
       [ 0.02244141,  0.02618527, -0.03633612, ..., -0.04239103,
         0.01929423, -0.04794347],
       [ 0.09077223,  0.05756629,  0.00070541, ...,  0.02687717,
        -0.0475247 , -0.0974102 ],
       [ 0.07323549,  0.00218259, -0.02024678, ..., -0.02401678,
        -0.05560207, -0.00270667]], dtype=float32)

#### Find course with related skills

Two hyperparameters:

- `threshold` (similarity threshold): skills whose similarity to a course is at or above this value are considered course-related skills.

- `top_n`: the maximum number of skills returned per course, taken as those with the highest similarity among the skills that pass the threshold.

In [19]:
# threshold: minimum similarity for a skill to count as related to a course.
# top_n: maximum number of related skills kept per course.
threshold, top_n = 0.45, 15

In [20]:
def get_relations_NN(index_start, index_end):
    """Link courses to skills by sentence-embedding similarity.

    Each course text in ``courses_info_processed[index_start:index_end]`` is
    embedded with ``embed`` and compared, via inner product, with every row of
    ``skill_info_processed_embedings``. Skills whose similarity is at least
    ``threshold`` are ranked by descending similarity and the first ``top_n``
    are kept.

    Returns
    -------
    list of (int, int)
        ``(course index, skill index)`` pairs, grouped by course in slice
        order and, within a course, ordered from most to least similar.
    """
    relations = []
    for offset, course_text in enumerate(courses_info_processed[index_start:index_end]):
        course_i = index_start + offset
        course_vector = embed(course_text)
        # The inner product has one row per embedded text; only one text was embedded.
        similarities = np.inner(course_vector, skill_info_processed_embedings)[0]
        passes = similarities >= threshold
        candidates = zip(np.where(passes)[0], similarities[passes])
        ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
        relations.extend((course_i, skill_i) for skill_i, _ in ranked[:top_n])
    return relations

### Calculate id relations

In [21]:
def store_relations(start, end, model):
    """Compute course-skill relations and store them as id and name tables.

    Parameters
    ----------
    start, end : int
        Slice bounds of the courses to process; passed on unchanged to the
        selected relation extractor.
    model : str
        ``'ER'`` for the modified ontology-based entity recognition or
        ``'NN'`` for the Universal Sentence Encoder.

    Returns
    -------
    pandas.DataFrame
        One row per relation with the columns ``course_name`` and
        ``skill_label``.

    Raises
    ------
    ValueError
        If ``model`` is neither ``'ER'`` nor ``'NN'``.

    Side effects
    ------------
    Writes ``../data/all_relations_<model>.csv`` (course_id, concept_uri) and
    ``../data/name_relations_<model>.csv`` (course_name, skill_label).
    """
    # Modified Ontology-based Entity Recognition
    if model == 'ER':
        relations = get_relations_ER(start, end)
    # Universal Sentence Encoder
    elif model == 'NN':
        relations = get_relations_NN(start, end)
    else:
        raise ValueError("Please set model to ER or NN")

    # Read each lookup table once and reuse it for both the id and name mapping.
    courses = pd.read_csv("../data/all_courses.csv")
    skills = pd.read_csv("../data/all_skills.csv")
    course_ids = courses['course_id']
    skill_uris = skills['concept_uri']

    # Translate (course index, skill index) pairs into (course_id, concept_uri).
    graph = [(course_ids[course_i], skill_uris[skill_i]) for course_i, skill_i in relations]

    # Passing `columns` to the constructor keeps both columns even when no
    # relation was found; renaming the columns of an empty frame would raise.
    id_relations = pd.DataFrame(graph, columns=['course_id', 'concept_uri'])
    id_relations.to_csv("../data/all_relations_{}.csv".format(model))

    # Map id relations to name relations
    course_dict = courses.set_index('course_id')['course_name'].to_dict()
    skill_dict = skills.set_index('concept_uri')['preferred_label'].to_dict()
    name_relations = pd.DataFrame({
        'course_name': id_relations['course_id'].map(course_dict),
        'skill_label': id_relations['concept_uri'].map(skill_dict),
    })
    name_relations.to_csv('../data/name_relations_{}.csv'.format(model))

    return name_relations

In [22]:
# Process every course: from the first position up to the number of course texts.
start = 0
end = len(courses_info_processed)

In [23]:
# Extract, store and display the relations found by the entity-recognition approach.
store_relations(start, end, model='ER')

Unnamed: 0,course_name,skill_label
0,Experte im Prozessmanagement,Arbeitsmarkt
1,Experte im Prozessmanagement,sich selbst darstellen
2,Experte im Prozessmanagement,andere führen
3,Experte in Investition und Finanzierung,Arbeitsmarkt
4,Experte in Investition und Finanzierung,sich selbst darstellen
5,Experte in Investition und Finanzierung,Finanzmärkte
6,Experte in Unternehmensaufbau und Organisation,Arbeitsmarkt
7,Experte in Unternehmensaufbau und Organisation,Marktanalyse
8,Experte in Unternehmensführung,Arbeitsmarkt
9,Experte in Unternehmensführung,sich selbst darstellen


In [25]:
# Extract, store and display the relations found by the sentence-encoder approach.
store_relations(start, end, model='NN')

Unnamed: 0,course_name,skill_label
0,Experte in Cloud Computing,Cloud-Daten und -Speicher verwalten
1,Experte in Cloud Computing,Cloud-Architektur konzipieren
2,Experte in Cloud Computing,Datenspeicherung
3,Experte in Cloud Computing,Cloud-Ressourcen implementieren
4,Experte in Cloud Computing,Cloud-Technologien
5,Experte in Cloud Computing,Cloud-Netzwerke konzipieren
6,Experte in Cloud Computing,Migration in die Cloud planen
7,Experte in Cloud Computing,Cloud-Aufgaben automatisieren
8,Experte in Cloud Computing,mit Cloud-Diensten entwickeln
9,Experte in Cloud Computing,auf Vorfälle in der Cloud reagieren


# Grid Search

A grid search can be used to find the optimal hyperparameters. It can be skipped, since the optimum has already been found: (threshold, top_n) = (0.45, 15). It is worth re-running when new data is introduced.

In [30]:
# for threshold in [0.4, 0.42, 0.44, 0.46]:
#     for top_n in [5, 10, 15, 20]:
        
#         start, end = 0, len(courses_info_processed)

#         # Universal Sentence Encoder
#         relations = get_relations_NN(start, end)
#         file_name = 'all_relations_NN_{}_{}.csv'.format(threshold, top_n)


#         course_ids = pd.read_csv("../data/all_courses.csv")['course_id']
#         skill_uris = pd.read_csv("../data/all_skills.csv")['concept_uri']
#         graph = []
#         for relation in relations:
#             graph.append((course_ids[relation[0]],skill_uris[relation[1]]))

#         graphp_df = pd.DataFrame(graph)
#         graphp_df.columns =['course_id', 'concept_uri']
#         graphp_df.to_csv("../data/{}".format(file_name))

#         courses = pd.read_csv('../data/all_courses.csv')[['course_id','course_name']]
#         skills = pd.read_csv('../data/all_skills.csv')[['concept_uri','preferred_label']]
#         id_relations = pd.read_csv("../data/{}".format(file_name)).iloc[:,1:3]
#         skill_dict = skills.set_index('concept_uri').to_dict()['preferred_label']
#         course_dict = courses.set_index('course_id').to_dict()['course_name']
#         name_relations = pd.DataFrame(columns=['course_name','skill_label'])
#         name_relations['course_name'] = id_relations['course_id'].map(course_dict)
#         name_relations['skill_label'] = id_relations['concept_uri'].map(skill_dict)
#         name_relations.to_csv('../data/name_relations_{}_{}.csv'.format(threshold, top_n))