this is the NLP part, where we used two different NLP algorithms to extract competencies

# Competencies Extraction Service

In [19]:
import pandas as pd
import spacy
from tqdm import tqdm
sp = spacy.load('de_core_news_lg')

## Load Data

In [22]:
labels_processed = pd.read_csv("../data/labels_processed.csv")['processed_label']
labels_processed

0                                  musikpersonal verwalten
1                      strafvollzugsverfahr beaufsichtigen
2                           unterdrückend praktik anwenden
3        einhaltung vorschrift eisenbahnfahrzeuge überp...
4                               verfügbar dienst ermitteln
                               ...                        
13886    beruflich leistungsfähigkeit nutzer nutzerinn ...
13887                  beleuchtung transportgerät einbauen
13888                       verarbeitung natürlich sprache
13889                               bauarbeit koordinieren
13890               absturzsicherung bordbretter anbringen
Name: processed_label, Length: 13891, dtype: object

In [23]:
skills_info = pd.read_csv("../data/skills_info.csv")['info']
skills_info

0        Musikpersonal verwalten, Zuweisen und Verwalte...
1        Strafvollzugsverfahren beaufsichtigen, Überwac...
2        nicht unterdrückende Praktiken anwenden, Ermit...
3        Einhaltung von Vorschriften von Eisenbahnfahrz...
4        verfügbare Dienste ermitteln, Ermitteln der ve...
                               ...                        
13886    berufliche Leistungsfähigkeit von Nutzern/Nutz...
13887    Beleuchtung in Transportgeräten einbauen, Einb...
13888    Verarbeitung natürlicher Sprache, Technologien...
13889    Bauarbeiten koordinieren, Koordinierung der Tä...
13890    Absturzsicherungen und Bordbretter anbringen, ...
Name: info, Length: 13891, dtype: object

In [24]:
courses_info_ER = pd.read_csv("../data/courses_info_ER.csv")['course_info_ER']
courses_info_ER

0        schwierig klient patient angehörige kollege cl...
1        aktuelles arbeitsrecht 2022. kurzbeschreibung ...
2        ambulant pflege rechtssicher handeln haftungsr...
3        aufgabe gesetzlich betreuer reform betreuungsr...
4        basisqualifikation ungelernt pflegekräft zerti...
                               ...                        
16847    monat weiterbildung organisation & führung lea...
16848    conversion usability experte ziel maßnahme tei...
16849    digital transformation management ziel maßnahm...
16850    ecommerce geschäftsmodell ziel maßnahme teilne...
16851    experte digital content creation teilnehmer di...
Name: course_info_ER, Length: 16852, dtype: object

In [25]:
courses_info_NN = pd.read_csv("../data/courses_info_NN.csv")['course_info_NN']
courses_info_NN

0        . Schwierige . Klienten . . Mit Patienten , An...
1        Aktuelles Arbeitsrecht 2022. Kurzbeschreibung ...
2        Ambulante Pflege . Rechtssicher Handeln und Ha...
3        Aufgaben des gesetzlichen Betreuers . Zur Refo...
4        Basisqualifikation für ungelernte Pflegekräfte...
                               ...                        
16847    5 Monate Weiterbildung . Organisation . Führun...
16848    Conversion und Usability Experte . Ziel der Ma...
16849    Digital Transformation Management . Ziel der M...
16850    E-Commerce Geschäftsmodelle . Ziel der Maßnahm...
16851    Experte im Digital Content Creation . Die Teil...
Name: course_info_NN, Length: 16852, dtype: object

## NLP Algorithms

### 1. Modified Ontology-based Entity Recognition

<img src="../images/ER.png" align="left" width="800">

`termStore:  {controlled vocabulary (vocabularies in label): URI}`

In [26]:
termStore = {}
URI = 0
for label_processed in tqdm(labels_processed):
    label_processed = sp(label_processed)
    for word in label_processed:
        word = word.text
        if word not in termStore:
            termStore[word] = URI
            URI += 1

100%|████████████████████████████████████████████████████████████████████████████| 13891/13891 [00:42<00:00, 330.45it/s]


In [27]:
pd.DataFrame(termStore.items(), columns = ['controlledVocabulary', 'URI'])

Unnamed: 0,controlledVocabulary,URI
0,musikpersonal,0
1,verwalten,1
2,strafvollzugsverfahr,2
3,beaufsichtigen,3
4,unterdrückend,4
...,...,...
12122,scala,12122
12123,bodentragfähigkeit,12123
12124,bibliotheksartikel,12124
12125,absturzsicherung,12125


`sequenceStore: {URIs : (index, sequence consisted of controlled vocabularies (label))}`

In [28]:
sequenceStore = {}
for i, label_processed in enumerate(tqdm(labels_processed)):
    URIs = []
    label_processed = sp(label_processed)
    for word in label_processed:
        URIs.append(termStore[word.text])
    sequenceStore[tuple(URIs)] = (i,label_processed.text)

100%|████████████████████████████████████████████████████████████████████████████| 13891/13891 [00:40<00:00, 339.60it/s]


In [29]:
pd.DataFrame(sequenceStore.items(), columns = ['URIs', '(index, label)'])

Unnamed: 0,URIs,"(index, label)"
0,"(0, 1)","(0, musikpersonal verwalten)"
1,"(2, 3)","(1, strafvollzugsverfahr beaufsichtigen)"
2,"(4, 5, 6)","(2, unterdrückend praktik anwenden)"
3,"(7, 8, 9, 10)","(3, einhaltung vorschrift eisenbahnfahrzeuge ü..."
4,"(11, 12, 13)","(4, verfügbar dienst ermitteln)"
...,...,...
13877,"(1802, 6820, 501, 502, 705, 1335)","(13886, beruflich leistungsfähigkeit nutzer nu..."
13878,"(2206, 1899, 289)","(13887, beleuchtung transportgerät einbauen)"
13879,"(1743, 1332, 2355)","(13888, verarbeitung natürlich sprache)"
13880,"(3594, 478)","(13889, bauarbeit koordinieren)"


The algorithm scans the tokenized courses information from the beginning until a word contained in the `termStore` is reached. Starting from this word a lookahead is performed searching for the longest sequence of words, which are contained in the `termStore`. As soon as a subsequent term is not included in the `termStore`, the `check_candidates` method to find all sequence still contained in the `sequenceStore` by using URIs. 

In [40]:
def get_relations_ER(index_start, index_end):
    URIs_candidates = []
    word_candidates = []
    relations = []
    courses_info_ER_subset = courses_info_ER[index_start:index_end]
    for i, course_info_ER in enumerate(tqdm(courses_info_ER_subset)):
        index_course = index_start + i
        for word in sp(course_info_ER):
            word = word.text
            if word != '--' and word in termStore:
                word_candidates.append(word)
                URIs_candidates.append(termStore[word])
            else:
                if URIs_candidates != []:
                    URIs_candidates, relations = check_candidates(URIs_candidates, index_course, relations)
                word_candidates = []
                
#             # approximate matching, results not good 
#             for key in termStore.keys():
#                 if str(word) == key or key.startswith(str(word)):
#                     word_candidates.append(key)
#                     URIs_candidates.append(termStore[key])
#                 else:
#                     if URIs_candidates != []:
#                         URIs_candidates, relations = check_candidates(URIs_candidates, index_course, relations)
#                     word_candidates = []
                    
    URIs_candidates, relations = check_candidates(URIs_candidates, index_course, relations)
    return relations

def check_candidates(URIs_candidates, index_course, relations):
    n = len(URIs_candidates)
    for i in range(n):
        for j in range(i+1, n+1):
            URIs = tuple(URIs_candidates[i:j])
            if URIs in sequenceStore:
                index_label = sequenceStore[URIs][0]
                if (index_course, index_label) not in relations:
                    relations.append((index_course, index_label))
    URIs_candidates = []
    return URIs_candidates, relations

### 2. Universal Sentence Encoder

<img src="../images/NN.png" align="left">

In [33]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tensorflow_text
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

#### Calculate skill embedings

**Problem:** There is not enough memory to directly calculate the embedding and inner product of all course information and all skill information at one time, and the running time is too long.

**Implementation tricks:** compute one batch of embeddings at a time, and use numpy matrix operations to speed up (parallelizable operations)

In [34]:
skill_info_embedings = []
batch_size = 500
for i in tqdm(range(0, len(skills_info), batch_size)):
    skill_info_embedings.extend(embed(skills_info[i: i + batch_size]))
skill_info_embedings = np.array(skill_info_embedings)

100%|███████████████████████████████████████████████████████████████████████████████████| 28/28 [03:01<00:00,  6.47s/it]


In [13]:
print(skill_info_embedings.shape)
skill_info_embedings

(13891, 512)


array([[ 0.05706281,  0.01265647, -0.06141534, ...,  0.03054258,
         0.05152629, -0.03009639],
       [ 0.03778337,  0.10214293,  0.0881092 , ...,  0.02205016,
         0.0346716 , -0.07374028],
       [ 0.04217117,  0.03572333,  0.01779626, ...,  0.00945657,
        -0.06897715,  0.03907941],
       ...,
       [-0.00797085,  0.03730636,  0.00821302, ..., -0.0592103 ,
         0.02274473, -0.04066873],
       [ 0.04129963,  0.02172587,  0.01060316, ...,  0.0187686 ,
        -0.04019865, -0.08722804],
       [ 0.04214134,  0.03009738,  0.00235354, ..., -0.03543701,
        -0.02515756,  0.03830037]], dtype=float32)

#### Find course with related skills

Three hyperparameters:

1. skill_threshold: for similarities between sentences in courses info and skills info, if it has value with above this threshold will be considered as course-related skills
1. course_threshold: for similarities between skills info and courses info, if it has value with above this threshold will be considered as course-related skills, It was introduced to filter out skills that are similar to sentences in the course info but do not match the whole course info context.
1. top_n: the number of skills with the highest similarities returned among skills with a similarity above the threshold (for each sentence in a course title and description)

In [36]:
skill_threshold, course_threshold, top_n = 0.4, 0.2, 10

In [45]:
def get_relations_NN(index_start, index_end):
    relations = []
    courses_info_subset = courses_info_NN[index_start:index_end]
    for i, course_info in enumerate(tqdm(courses_info_subset)):
        course_i = index_start + i
        course_info_sentences = course_info.split('.')
        # embedding of the whole course info
        course_info_embedding = embed(course_info)
        # embedding of the each sentence in the course info
        course_info_sentence_embeddings = embed(course_info_sentences)
        for course_info_sentence_embedding in course_info_sentence_embeddings:
            # similarities between sentences in courses info and skills info
            similarities = np.inner(course_info_sentence_embedding, skill_info_embedings)
            # get index of skills whose similarities bigger than the skill_threshold
            top_i = np.where(similarities >= skill_threshold)[0]
            if len(top_i) > 0:
                # get value of similarities bigger than the skill_threshold
                top_similarities = similarities[similarities >= skill_threshold]
                # get pairs of top_i and top_similarities
                related_skills_i_similarity = list(zip(top_i, top_similarities))
                # sort related_skills_i_similarity by value of similarities and get top_n from them
                top_related_skills_i_similarity = sorted(related_skills_i_similarity, 
                                                         key = lambda x: x[1], reverse=True)[:top_n]
                # get index of top related skills from last step 
                top_related_skills_i = list(map(lambda x: x[0], top_related_skills_i_similarity))
                top_related_skills_i = np.array(top_related_skills_i)
                # caculate similarities of these skills to course info to make sure that these skills not only match one of 
                # sentences in this piece of course info but also match the whole course info context
                similarities_with_course_info = np.inner(skill_info_embedings[top_related_skills_i],course_info_embedding)
                # filter out such skills by using course_threshold mentioned above (three hyperparameters)
                related_skills_i_with_course_info = np.where(similarities_with_course_info > course_threshold)[0]
                # create (index of course, index of skill) pairs
                if len(related_skills_i_with_course_info) > 0:
                    top_related_skills_i = top_related_skills_i[related_skills_i_with_course_info]
                    relations.extend(list(zip([course_i]*len(top_related_skills_i), top_related_skills_i)))
        relations = list(dict.fromkeys(relations))
    return relations

### Calculate id relations

In [41]:
def store_relations (start, end, model):
    
    # Modified Ontology-based Entity Recognition
    if model == 'ER': relations = get_relations_ER(start, end)
        
    # Universal Sentence Encoder   
    elif model == 'NN': relations = get_relations_NN(start, end)

    else: raise Exception("Please set model to ER or NN")
        
    course_ids = pd.read_csv("../data/all_courses.csv")['course_id']
    skill_uris = pd.read_csv("../data/all_skills.csv")['concept_uri']
    
    graph = []
    for relation in relations:
        graph.append((course_ids[relation[0]],skill_uris[relation[1]]))
    
    graphp_df = pd.DataFrame(graph)
    graphp_df.columns =['course_id', 'concept_uri']
    graphp_df.to_csv("../data/all_relations_{}.csv".format(model))
    
    # Map id relations to name relations
    courses = pd.read_csv('../data/all_courses.csv')[['course_id','course_name']]
    skills = pd.read_csv('../data/all_skills.csv')[['concept_uri','preferred_label']]
    id_relations = pd.read_csv("../data/all_relations_{}.csv".format(model)).iloc[:,1:3]
    skill_dict = skills.set_index('concept_uri').to_dict()['preferred_label']
    course_dict = courses.set_index('course_id').to_dict()['course_name']
    name_relations = pd.DataFrame(columns=['course_name','skill_label'])
    name_relations['course_name'] = id_relations['course_id'].map(course_dict)
    name_relations['skill_label'] = id_relations['concept_uri'].map(skill_dict)
    name_relations.to_csv('../data/name_relations_{}.csv'.format(model))
    
    return name_relations

In [42]:
start, end = 0, len(courses_info_ER)

In [43]:
with pd.option_context("display.max_rows", 1000):
    display(store_relations(start, end, 'ER'))

100%|█████████████████████████████████████████████████████████████████████████████| 16852/16852 [03:50<00:00, 72.99it/s]


Unnamed: 0,course_name,skill_label
0,Aktuelles Arbeitsrecht 2022,Arbeitsrecht
1,Ambulante Pflege - Rechtssicher Handeln und Ha...,Risikomanagement
2,Ambulante Pflege - Rechtssicher Handeln und Ha...,Datenschutz
3,Aufgaben des gesetzlichen Betreuers - Zur Refo...,planen
4,Aufgaben des gesetzlichen Betreuers - Zur Refo...,sich selbst darstellen
...,...,...
30117,Digital Transformation Management,Design Thinking
30118,E-Commerce Geschäftsmodelle,Geschäftsmodell
30119,Experte im Digital Content Creation,Psychologie
30120,Experte im Digital Content Creation,Waren sichern


In [46]:
with pd.option_context("display.max_rows", 1000):
    display(store_relations(start, end, 'NN'))

100%|███████████████████████████████████████████████████████████████████████████| 16852/16852 [1:16:45<00:00,  3.66it/s]


Unnamed: 0,course_name,skill_label
0,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",Kunden und Kundinnen zufriedenstellen
1,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",bei Kommunikationsstörungen beraten
2,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",mit Krankenpflegekräften kommunizieren
3,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",mit Nutzern/Nutzerinnen des Gesundheitssystems...
4,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",mit den Netzwerken von Nutzern/Nutzerinnen des...
...,...,...
214011,Experte im Digital Content Creation,Medienquellen studieren
214012,Experte im Digital Content Creation,Medienplan erstellen
214013,Experte im Digital Content Creation,Herstellungsprozess in der Optik
214014,Experte im Digital Content Creation,Inhalt und Form angleichen


# Grid Search

grid search to find optimal hyperparameters.

In [20]:
# for threshold in [0.4, 0.42, 0.44, 0.46]:
#     for top_n in [3, 5, 10, 15, 20]:
        
#         start, end = 0, len(courses_info_NN)

#         # Universal Sentence Encoder
#         relations = get_relations_NN(start, end)
#         file_name = 'all_relations_NN_{}_{}.csv'.format(threshold, top_n)


#         course_ids = pd.read_csv("../data/all_courses.csv")['course_id']
#         skill_uris = pd.read_csv("../data/all_skills.csv")['concept_uri']
#         graph = []
#         for relation in relations:
#             graph.append((course_ids[relation[0]],skill_uris[relation[1]]))

#         graphp_df = pd.DataFrame(graph)
#         graphp_df.columns =['course_id', 'concept_uri']
#         graphp_df.to_csv("../data/{}".format(file_name))

#         courses = pd.read_csv('../data/all_courses.csv')[['course_id','course_name']]
#         skills = pd.read_csv('../data/all_skills.csv')[['concept_uri','preferred_label']]
#         id_relations = pd.read_csv("../data/{}".format(file_name)).iloc[:,1:3]
#         skill_dict = skills.set_index('concept_uri').to_dict()['preferred_label']
#         course_dict = courses.set_index('course_id').to_dict()['course_name']
#         name_relations = pd.DataFrame(columns=['course_name','skill_label'])
#         name_relations['course_name'] = id_relations['course_id'].map(course_dict)
#         name_relations['skill_label'] = id_relations['concept_uri'].map(skill_dict)
#         name_relations.to_csv('../data/name_relations_{}_{}.csv'.format(threshold, top_n))