# Competencies Extraction Service

In [34]:
# pip install spacy
# python -m spacy download de_core_news_lg
# pip install spacy-universal-sentence-encoder

## Tools and Libraries
pandas: a software library written for the Python programming language for data manipulation and analysis

spacy: an open-source software library for advanced natural language processing

de_core_news_lg: a trained spaCy pipeline for the German language

In [35]:
import pandas as pd
import spacy
sp = spacy.load('de_core_news_lg')

## Load Processed Data

In [36]:
# Pre-processed skill labels, one string per skill. The cells below tokenize
# these strings to build `termStore` and `sequenceStore`.
labels_processed = pd.read_csv("../data/labels_processed.csv")['processedLabel']
labels_processed

0                                  musikpersonal verwalten
1                      strafvollzugsverfahr beaufsichtigen
2                           unterdrückend praktik anwenden
3        einhaltung vorschrift eisenbahnfahrzeuge überp...
4                               verfügbar dienst ermitteln
                               ...                        
13886    beruflich leistungsfähigkeit nutzer nutzerinn ...
13887                  beleuchtung transportgerät einbauen
13888                       verarbeitung natürlich sprache
13889                               bauarbeit koordinieren
13890               absturzsicherung bordbretter anbringen
Name: processedLabel, Length: 13891, dtype: object

In [37]:
# Pre-processed skill texts, one string per skill. These are the strings that
# get embedded in the "Calculate skill embedings" cell further down.
skills_info_processed = pd.read_csv("../data/skills_info_processed.csv")['skill_info_processed']
skills_info_processed

0        musikpersonal verwalten zuweise verwalten aufg...
1        strafvollzugsverfahr beaufsichtigen überwachen...
2        unterdrückend praktik anwenden ermittel repres...
3        einhaltung vorschrift eisenbahnfahrzeuge überp...
4        verfügbar dienst ermitteln ermitteln verschied...
                               ...                        
13886    beruflich leistungsfähigkeit nutzer nutzerinn ...
13887    beleuchtung transportgerät einbauen einbau bel...
13888    verarbeitung natürlich sprache technologie ikt...
13889    bauarbeit koordinieren koordinierung tätigkeit...
13890    absturzsicherung bordbretter anbringen anbring...
Name: skill_info_processed, Length: 13891, dtype: object

In [38]:
# Pre-processed course texts, one string per course. Both get_relations_ER and
# get_relations_NN read this module-level Series and slice it by position.
courses_info_processed = pd.read_csv("../data/courses_info_processed.csv")['course_info_processed']
courses_info_processed

0        schwierig klient patient angehörige kollege cl...
1        aktuelles arbeitsrecht kurzbeschreibung  arbei...
2        ambulant pflege rechtssicher handeln haftungsr...
3        aufgabe gesetzlich betreuer reform betreuungsr...
4        basisqualifikation ungelernt pflegekräft zerti...
                               ...                        
16847    monat weiterbildung organisation & führung lea...
16848    conversion usability experte ziel maßnahme tei...
16849    digital transformation management ziel maßnahm...
16850    ecommerce geschäftsmodell ziel maßnahme teilne...
16851    experte digital content creation teilnehmer di...
Name: course_info_processed, Length: 16852, dtype: object

## NLP Algorithms

### 1. Modified Ontology-based Entity Recognition

`termStore: {controlled vocabulary term (a word occurring in a label): URI}`

In [39]:
# Build the term store: every distinct token that occurs in any processed
# label is assigned the next free integer URI, in order of first appearance.
termStore = {}
URI = 0
for label_doc in map(sp, labels_processed):
    for token in label_doc:
        term = token.text
        if term in termStore:
            continue  # already registered by an earlier label
        termStore[term] = URI
        URI += 1

In [40]:
# Tabular preview of the term -> URI mapping built in the previous cell.
pd.DataFrame(termStore.items(), columns = ['controlledVocabulary', 'URI'])

Unnamed: 0,controlledVocabulary,URI
0,musikpersonal,0
1,verwalten,1
2,strafvollzugsverfahr,2
3,beaufsichtigen,3
4,unterdrückend,4
...,...,...
12122,scala,12122
12123,bodentragfähigkeit,12123
12124,bibliotheksartikel,12124
12125,absturzsicherung,12125


`sequenceStore: {URIs : (index, sequence consisting of controlled vocabularies (label))}`

In [41]:
# Build the sequence store: each label becomes the tuple of its tokens' URIs,
# mapped to (row index of the label, label text).
# NOTE(review): two labels that tokenize to the same URI tuple share one key,
# so the later label overwrites the earlier one and only its index is kept.
sequenceStore = {}
for i, label_text in enumerate(labels_processed):
    label_doc = sp(label_text)
    uri_key = tuple(termStore[token.text] for token in label_doc)
    sequenceStore[uri_key] = (i, label_doc.text)

In [42]:
# Tabular preview of the URI-tuple -> (label index, label text) mapping.
pd.DataFrame(sequenceStore.items(), columns = ['URIs', '(index, label)'])

Unnamed: 0,URIs,"(index, label)"
0,"(0, 1)","(0, musikpersonal verwalten)"
1,"(2, 3)","(1, strafvollzugsverfahr beaufsichtigen)"
2,"(4, 5, 6)","(2, unterdrückend praktik anwenden)"
3,"(7, 8, 9, 10)","(3, einhaltung vorschrift eisenbahnfahrzeuge ü..."
4,"(11, 12, 13)","(4, verfügbar dienst ermitteln)"
...,...,...
13877,"(1802, 6820, 501, 502, 705, 1335)","(13886, beruflich leistungsfähigkeit nutzer nu..."
13878,"(2206, 1899, 289)","(13887, beleuchtung transportgerät einbauen)"
13879,"(1743, 1332, 2355)","(13888, verarbeitung natürlich sprache)"
13880,"(3594, 478)","(13889, bauarbeit koordinieren)"


The algorithm scans the tokenized course information from the beginning until it reaches a word contained in the `termStore`. Starting from this word, a lookahead collects the longest run of consecutive words that are all contained in the `termStore`. As soon as a subsequent word is not included in the `termStore`, the `check_candidates` function is called to find every sub-sequence of that run that is contained in the `sequenceStore`, using the words' URIs as the lookup key.

In [43]:
def get_relations_ER(index_start, index_end):
    """Link courses to skill labels by exact term-sequence matching.

    Each course text in ``courses_info_processed[index_start:index_end]`` is
    tokenized with ``sp``. Consecutive tokens that are keys of ``termStore``
    are collected as a run of URIs; whenever the run is interrupted (or the
    course text ends) the run is handed to ``check_candidates``, which records
    every sub-sequence found in ``sequenceStore``.

    Runs are kept strictly per course: the candidate buffer is reset at the
    start of every course and flushed at its end, so a run can never span two
    courses and is always attributed to the course it was found in.

    An approximate (prefix) matching variant was tried here previously and
    dropped because its results were not good.

    Args:
        index_start: Position of the first course to process (inclusive).
        index_end: Position one past the last course to process (exclusive).

    Returns:
        A list of ``(course_index, label_index)`` tuples; empty when the
        requested slice contains no courses.
    """
    relations = []
    courses_info_processed_subset = courses_info_processed[index_start:index_end]
    for i, course_info_processed in enumerate(courses_info_processed_subset):
        index_course = index_start + i
        # Fresh buffer per course so terms never leak across course boundaries.
        URIs_candidates = []
        for word in sp(course_info_processed):
            word = word.text
            if word != '--' and word in termStore:
                URIs_candidates.append(termStore[word])
            elif URIs_candidates:
                # The run was interrupted: resolve it against sequenceStore.
                URIs_candidates, relations = check_candidates(URIs_candidates, index_course, relations)
        # Flush a run that reaches the end of this course's text.
        if URIs_candidates:
            URIs_candidates, relations = check_candidates(URIs_candidates, index_course, relations)
    return relations

        
def check_candidates(URIs_candidates, index_course, relations):
    """Record every known label sequence contained in a run of URIs.

    All contiguous sub-sequences of ``URIs_candidates`` are looked up in
    ``sequenceStore``. For each hit, ``(index_course, label_index)`` is
    appended to ``relations`` unless that pair is already present.

    Args:
        URIs_candidates: List of term URIs forming one uninterrupted run.
        index_course: Index of the course the run was found in.
        relations: List of ``(course_index, label_index)`` pairs; extended
            in place.

    Returns:
        A tuple ``(empty_list, relations)``: a new empty candidate buffer and
        the same ``relations`` list that was passed in.
    """
    run_length = len(URIs_candidates)
    # Every window [start:stop) with at least one element, shortest first
    # for each start position.
    windows = (
        tuple(URIs_candidates[start:stop])
        for start in range(run_length)
        for stop in range(start + 1, run_length + 1)
    )
    for window in windows:
        entry = sequenceStore.get(window)
        if entry is None:
            continue
        pair = (index_course, entry[0])
        if pair not in relations:
            relations.append(pair)
    return [], relations

### 2. Universal Sentence Encoder

In [44]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tensorflow_text
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

2022-07-02 10:23:24.606852: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:961] could not open file to read NUMA node: /sys/bus/pci/devices/0000:26:00.0/numa_node
Your kernel may have been built without NUMA support.
2022-07-02 10:23:24.607032: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-02 10:23:24.607079: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-07-02 10:23:24.607163: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-07-02 10:23:24.607202: W tensorflow/stream_executor/platform/default/dso_loader.cc:6

#### Calculate skill embeddings

In [45]:
# Embed the skill texts in slices of `batch_size` and collect the per-skill
# vectors into one array (one row per skill).
batch_size = 1000
embedded_rows = []
for batch_start in range(0, len(skills_info_processed), batch_size):
    batch = skills_info_processed[batch_start: batch_start + batch_size]
    embedded_rows.extend(embed(batch))
skill_info_processed_embedings = np.array(embedded_rows)

#### Find related skills for each course

In [61]:
# Module-level parameters read by get_relations_NN:
# minimum similarity score a skill must reach to be related to a course
threshold = 0.45
# maximum number of related skills kept per course
top_n = 15

def get_relations_NN(index_start, index_end):
    """Link courses to skills by embedding similarity.

    Each course text in ``courses_info_processed[index_start:index_end]`` is
    embedded with ``embed`` and compared (inner product) against every row of
    ``skill_info_processed_embedings``. Skills whose score is at least the
    module-level ``threshold`` are ranked by descending score and the best
    ``top_n`` (also module-level) are kept.

    Args:
        index_start: Position of the first course to process (inclusive).
        index_end: Position one past the last course to process (exclusive).

    Returns:
        A list of ``(course_index, skill_index)`` tuples, grouped by course
        and ordered by descending similarity within each course.
    """
    relations = []
    subset = courses_info_processed[index_start:index_end]
    for offset, course_text in enumerate(subset):
        course_i = index_start + offset
        course_vector = embed(course_text)
        similarities = np.inner(course_vector, skill_info_processed_embedings)[0]
        # Keep only skills that reach the threshold, remembering their indices.
        passing = similarities >= threshold
        scored_skills = zip(np.where(passing)[0], similarities[passing])
        ranked = sorted(scored_skills, key=lambda pair: pair[1], reverse=True)
        for skill_i, _score in ranked[:top_n]:
            relations.append((course_i, skill_i))
    return relations

### Calculate id relations

In [56]:
# Choose the extraction algorithm and run it over every course.
model = 'NN'
start, end = 0, len(courses_info_processed)

if model == 'ER':
    # Modified Ontology-based Entity Recognition
    extract_relations, file_name = get_relations_ER, 'all_relations_ER.csv'
elif model == 'NN':
    # Universal Sentence Encoder
    extract_relations, file_name = get_relations_NN, 'all_relations_NN.csv'
else:
    raise Exception("Please set model to ER or NN")

relations = extract_relations(start, end)

# Translate (course index, skill index) pairs into (course_id, concept_uri).
course_ids = pd.read_csv("../data/all_courses.csv")['course_id']
skill_uris = pd.read_csv("../data/all_skills.csv")['concept_uri']
graph = [
    (course_ids[course_index], skill_uris[skill_index])
    for course_index, skill_index in relations
]

In [57]:
# Persist the (course_id, concept_uri) pairs.
# The column names are passed to the constructor so that an empty `graph`
# still yields a well-formed two-column frame instead of a length-mismatch
# error when the names are assigned afterwards.
graphp_df = pd.DataFrame(graph, columns=['course_id', 'concept_uri'])
# The index column is written on purpose: the mapping cell below reads this
# file back and selects the data columns by position with `.iloc[:, 1:3]`.
graphp_df.to_csv("../data/{}".format(file_name))

### Map id relations to name relations

In [58]:
# Load id -> name lookup tables and the id relations written above.
courses = pd.read_csv('../data/all_courses.csv')[['course_id','course_name']]
skills = pd.read_csv('../data/all_skills.csv')[['concept_uri','preferred_label']]
# Columns 1 and 2 are the data columns; column 0 is the saved index.
id_relations = pd.read_csv("../data/{}".format(file_name)).iloc[:, 1:3]

course_dict = courses.set_index('course_id')['course_name'].to_dict()
skill_dict = skills.set_index('concept_uri')['preferred_label'].to_dict()

# Replace each id by its human-readable name, keeping the row order.
name_relations = pd.DataFrame({
    'course_name': id_relations['course_id'].map(course_dict),
    'skill_label': id_relations['concept_uri'].map(skill_dict),
})

In [59]:
# Write the course-name / skill-label pairs to disk, then display them.
name_relations.to_csv('../data/name_relations_.csv')
name_relations

Unnamed: 0,course_name,skill_label
0,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",in der Fachkrankenpflege kommunizieren
1,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",Fachkrankenpflege
2,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",Feedback zum Kommunikationsstil von Patienten/...
3,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",bei Kommunikationsstörungen beraten
4,"""Schwierige"" Klienten? - Mit Patienten, Angehö...",Patienten/Patientinnen zu familiären Themen be...
...,...,...
42587,"Staatlich geprüfte/r Betriebswirt/in, Studiens...",elektronische Logistik für Soundanlagen managen
42588,"Staatlich geprüfte/r Betriebswirt/in, Studiens...",Vorkehrungen für logistische Anforderungen an ...
42589,Digital Transformation Management,ein Team leiten
42590,Digital Transformation Management,innovative Mobilitätslösungen entwickeln


# Test

In [69]:
# Parameter sweep: write one id-relation file and one name-relation file for
# every (threshold, top_n) combination.

# Inputs that do not depend on the swept parameters are loaded once, outside
# the loops, instead of being re-read from disk for every combination.
courses = pd.read_csv('../data/all_courses.csv')[['course_id','course_name']]
skills = pd.read_csv('../data/all_skills.csv')[['concept_uri','preferred_label']]
course_ids = courses['course_id']
skill_uris = skills['concept_uri']
course_dict = courses.set_index('course_id')['course_name'].to_dict()
skill_dict = skills.set_index('concept_uri')['preferred_label'].to_dict()
start, end = 0, len(courses_info_processed)

# NOTE: the loop variables must be named `threshold` and `top_n`.
# get_relations_NN reads these module-level names, so rebinding them here is
# what applies each combination. They keep their last values after the loop.
# Every combination re-embeds all courses inside get_relations_NN.
for threshold in [0.4, 0.42, 0.44, 0.46]:
    for top_n in [5, 10, 15, 20]:

        # Universal Sentence Encoder
        relations = get_relations_NN(start, end)
        file_name = 'all_relations_NN_{}_{}.csv'.format(threshold, top_n)

        graph = [
            (course_ids[course_index], skill_uris[skill_index])
            for course_index, skill_index in relations
        ]

        # Passing the column names to the constructor keeps an empty result
        # (no skill reached the threshold) from raising a length mismatch.
        graphp_df = pd.DataFrame(graph, columns=['course_id', 'concept_uri'])
        graphp_df.to_csv("../data/{}".format(file_name))

        # Map ids to names directly from the in-memory frame; reading the
        # file back that was just written is unnecessary.
        name_relations = pd.DataFrame({
            'course_name': graphp_df['course_id'].map(course_dict),
            'skill_label': graphp_df['concept_uri'].map(skill_dict),
        })
        name_relations.to_csv('../data/name_relations_{}_{}.csv'.format(threshold, top_n))

In [85]:
# Reload the full courses table (this rebinds `courses`, which earlier cells
# set to a two-column subset) and the skills' preferred labels.
courses = pd.read_csv("../data/all_courses.csv")
labels = pd.read_csv("../data/all_skills.csv")['preferred_label']

In [86]:
# Lemmatise and lower-case every preferred label, dropping '--' and
# empty / single-space / non-breaking-space tokens, and join the remaining
# lemmas with single spaces.
# NOTE: this rebinds `labels_processed`, which was loaded from CSV at the top
# of the notebook, to a plain list of strings.
labels_processed = []
for label in labels:
    kept_lemmas = []
    for token in sp(label):
        lemma = str(token.lemma_.lower())
        if lemma not in ('--', '', ' ', '\xa0'):
            kept_lemmas.append(lemma)
    labels_processed.append(' '.join(kept_lemmas))

In [89]:
# Embed all processed labels in a single call (no batching, unlike the
# skill-text embedding cell above).
label_processed_embeddings = embed(labels_processed)

In [111]:
# Count how often each lower-cased lemma occurs across all course
# descriptions, ignoring stop words and non-alphabetic tokens.
frequencies = {}
course_descriptions = courses['course_description']
for description_doc in map(sp, course_descriptions):
    for token in description_doc:
        if token.is_stop or not token.is_alpha:
            continue
        lemma = token.lemma_.lower()
        frequencies[lemma] = frequencies.get(lemma, 0) + 1

In [112]:
# Re-order the counts from most to least frequent; lemmas with equal counts
# keep their first-seen order.
frequencies = {
    lemma: count
    for lemma, count in sorted(frequencies.items(), key=lambda item: item[1], reverse=True)
}

In [51]:
# threshold = 0.8
# top_n = 5
# sentences = sp(course_description).sents
# for sentence in sentences:
#     sentence_processed = []
#     for word in sentence:
#         if "/-" in str(word): word = sp(str(word).split("/-")[0])[0]
        
#         if not word.is_stop and word.is_alpha:
#             word = word.lemma_.lower()
#             sentence_processed.append(word)
#     if len(sentence_processed) != 0: 
#         sentence_processed_embedding = embed(sentence_processed)
#         similarities = np.inner(sentence_processed_embedding, label_processed_embeddings)[0]
#         top_i = np.where(similarities >= threshold)[0]
#         top_similarities = similarities[similarities >= threshold]
#         related_skills_i_similarity = list(zip(top_i, top_similarities))
#         top_related_skills_i_similarity = sorted(related_skills_i_similarity, key = lambda x: x[1], reverse=True)[:top_n]
#         top_related_skills_i = list(map(lambda x: x[0], top_related_skills_i_similarity))
#         top_related_skills = list(map(lambda x: skill_labels[x], top_related_skills_i))
#         print(top_related_skills)