In [1]:
# Import the libraries used throughout this notebook and build the SkillNer SkillExtractor.
# FutureWarning and DeprecationWarning are built-in warning categories (not libraries); they are filtered out here so that these warnings do not clutter the cell output. The SkillExtractor instance is used later to extract skills from a given text.


import warnings

# Install the warning filters first so they are active while the libraries below are imported.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

import ast

import numpy as np
import pandas as pd
import spacy
from pandas import json_normalize  # easy JSON -> pd.DataFrame conversion
from spacy.matcher import PhraseMatcher
from tqdm import tqdm_notebook

# SkillNer: default skills data base and the skill extractor class
from skillNer.general_params import SKILL_DB
from skillNer.skill_extractor_class import SkillExtractor

# spaCy pipeline handed to the skill extractor
nlp = spacy.load("en_core_web_lg")

# Build the skill extractor from the pipeline, the skills data base and the matcher class
skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher)

loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [3]:
# This code reads in an Excel file from a specified location
# and stores it in a pandas DataFrame called "df"
# The DataFrame is then sorted by the "text_id" column

# read_excel(): pandas function for reading in Excel files
# sort_values(): pandas function for sorting DataFrames


# Load the geocoded base data and order the rows by text_id in one chained expression.
df = pd.read_excel(
    'C:/Users/Aleksej Aikov/Desktop/Enablement/Master/04_Analysis/04_01_custom_analysis/04_01_02_tableau_data/01_tableau_base_data_geocoded.xlsx'
).sort_values(by=['text_id'])
df.head(n=5)

Unnamed: 0,text_id,jobTitle,role,level,jobDescription,Company,Industry,city,lat,long,state,state_code,country,country_code
0,1,department head data management data warehouse...,data engineer,head,location is flexible job purpose the role is f...,dfv deutsche familienversicherung ag,versicherungen,Frankfurt,50.110644,8.682092,Bayern,BY,Deutschland,de
1,2,accumulation data analyst in accumulation and ...,data analyst,mid,data engineer join us let care for tomorrow at...,allianz global corporate & specialty,versicherungen,München,48.137108,11.575382,Bayern,BY,Deutschland,de
2,3,accumulation data engineer in accumulation and...,data engineer,mid,join us let care for tomorrow at allianz globa...,allianz global corporate & specialty,versicherungen,München,48.137108,11.575382,Bayern,BY,Deutschland,de
3,4,actuarial data scientist,data scientist,mid,we take responsibility for the sustainable man...,verti versicherung ag,finanzdienstleister,Teltow,52.401646,13.264453,Brandenburg,BB,Deutschland,de
4,5,actuarial data scientist actuar,data scientist,mid,hirschau central full time about us conrad con...,concordia versicherungsgesellschaft ag,versicherungen,Hannover,52.374478,9.738553,Niedersachsen,NI,Deutschland,de


In [175]:
# Collect the 'jobDescription' column of df into a plain Python list (one entry per row of df, in df's current row order).

job_descriptions_list_full = df['jobDescription'].tolist()


In [194]:
# process_job_description() runs the SkillNer extractor over a list of job descriptions.
# Every text in the list is passed to skill_extractor.annotate(), the returned annotation is
# flattened into a DataFrame, and the DataFrames are returned as a list in input order.
# A tqdm progress bar is shown while the texts are processed.


def process_job_description(your_list):
    """Annotate each job description and flatten the annotation into a DataFrame.

    Parameters
    ----------
    your_list : iterable of str
        Job description texts to annotate.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per input text, built with ``pd.json_normalize`` from the
        value returned by ``skill_extractor.annotate``; same order as the input.
    """
    data = []
    # Iterate over the argument itself: the previous version ignored `your_list`
    # and always read the global `job_descriptions_list_full`.
    for job_description in tqdm_notebook(your_list):
        annotations = skill_extractor.annotate(job_description)
        # pd.json_normalize is the public entry point; the old
        # pd.io.json.json_normalize path is deprecated.
        data.append(pd.json_normalize(annotations))
    return data

# Annotate the job descriptions; data_full is a list holding one DataFrame per description.
data_full = process_job_description(job_descriptions_list_full)

In [177]:
# Stack the list of dataframes "data_full" into one dataframe called "df_exatracted_skills".

# The new dataframe is then written to the Parquet file "04_03_01_skillner_extracted_skills_final.parquet" using the fastparquet engine.


# ignore_index=True gives every row its own label 0..n-1. Without it each row keeps the
# label it had inside its source frame, so labels repeat - and later cells use the row
# label as the id of the job description (df_text_df['id'], id_text), which must be unique.
df_exatracted_skills = pd.concat(data_full, ignore_index=True)
df_exatracted_skills.to_parquet('C:/Users/Aleksej Aikov/Desktop/Enablement/Master/04_Analysis/04_03_extracting_skills_with_skillner_ner/04_03_01_skillner_extracted_skills_final.parquet', engine='fastparquet')

In [6]:
# Read the extracted-skills Parquet file (the same path df_exatracted_skills was written to) into normalizing_df_full and display the first 5 rows.

normalizing_df_full = pd.read_parquet('C:/Users/Aleksej Aikov/Desktop/Enablement/Master/04_Analysis/04_03_extracting_skills_with_skillner_ner/04_03_01_skillner_extracted_skills_final.parquet')

normalizing_df_full.head(5)

Unnamed: 0,text,results.full_matches,results.ngram_scored
0,a good idea was the origin a successful concep...,"[{'skill_id': 'KS440HK72NCTJJWMWRRS', 'doc_nod...","[{'skill_id': 'ES76F4C57A88877D6D64', 'doc_nod..."
1,put the insurance industry upside down with us...,"[{'skill_id': 'KS441PL6JPXW200W0GRQ', 'doc_nod...","[{'skill_id': 'KS4DOAH8RJ88A8BZ4EY6', 'doc_nod..."
2,applicant information for further information ...,"[{'skill_id': 'KS125ZB6BWF5RY40BH1B', 'doc_nod...","[{'skill_id': 'ES6DBB51045D6B212C2E', 'doc_nod..."
3,applicant information for further information ...,"[{'skill_id': 'KS125ZB6BWF5RY40BH1B', 'doc_nod...","[{'skill_id': 'ES6DBB51045D6B212C2E', 'doc_nod..."
4,responsible development optimization and maint...,"[{'skill_id': 'ES2933FAEF72CE4E2840', 'doc_nod...","[{'skill_id': 'KS126LM6SPRP5ND9301W', 'doc_nod..."


In [179]:
# Split normalizing_df_full into three dataframes:
#   df_full_match  - every column except 'results.ngram_scored'
#   df_ngram_match - every column except 'results.full_matches'
#   df_text_df     - neither of the two match columns, plus a new column 'id' holding
#                    the row labels (index values) of normalizing_df_full

df_full_match = normalizing_df_full.drop('results.ngram_scored', axis=1)
df_ngram_match = normalizing_df_full.drop('results.full_matches', axis=1)
df_text_df = (
    normalizing_df_full
    .drop(['results.full_matches', 'results.ngram_scored'], axis=1)
    .assign(id=normalizing_df_full.index)
)


In [180]:
# union() turns the two match columns into one long table with one row per detected skill.
#
# For each input dataframe the match column ("results.full_matches" in df1,
# "results.ngram_scored" in df2) is parsed from its string form into a list of dicts,
# every dict becomes one row, and only rows with a score of at least min_score
# (0.8 by default) are kept. The helper columns are dropped and the remaining columns
# are named "id_text" (row label of the source row), "skill_id" and "skill_name".
#
# In the rows coming from df1 the text "datum" is replaced by "data" in "skill_name".
#
# Finally the rows from df1 and df2 are stacked and returned. The input dataframes
# are not modified.


def _explode_matches(frame, match_column, min_score, extra_columns=()):
    """Expand one column of stringified match lists into one row per match.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding `match_column`; each non-null cell is a string that
        ``ast.literal_eval`` turns into a list of match dicts.
    match_column : str
        Name of the column to expand.
    min_score : float
        Matches whose ``score`` is below this value are discarded.
    extra_columns : iterable of str
        Additional per-match columns to drop from the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``id_text`` (row label of `frame`), then the remaining match
        keys with ``doc_node_value`` renamed to ``skill_name``.
    """
    # Frames already processed by the earlier, in-place version of union()
    # carry the column under the name 'match'; keep accepting them.
    source = match_column if match_column in frame.columns else 'match'
    parsed = frame[source].dropna().apply(ast.literal_eval)

    # keys= tags every per-row frame with the row label it came from; names=
    # fixes the names of the two index levels so reset_index() yields
    # predictable column names.
    exploded = pd.concat(
        [pd.DataFrame(matches) for matches in parsed],
        keys=parsed.index,
        names=['id_text', 'match_position'],
    ).reset_index()

    # Scores may arrive as strings; compare them numerically.
    exploded['score'] = exploded['score'].astype(float)
    kept = exploded.loc[exploded['score'] >= min_score]

    return (
        kept
        .drop(columns=['score', 'doc_node_id', 'match_position', *extra_columns])
        .rename(columns={'doc_node_value': 'skill_name'})
    )


def union(df1, df2, min_score=0.8):
    """Combine full matches and n-gram matches into one long skill table.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with a ``results.full_matches`` column of stringified lists of
        dicts with the keys ``skill_id``, ``doc_node_value``, ``score`` and
        ``doc_node_id``.
    df2 : pandas.DataFrame
        Frame with a ``results.ngram_scored`` column of stringified lists of
        dicts with the same keys plus ``type`` and ``len``.
    min_score : float, default 0.8
        Minimum score a match needs to be kept.

    Returns
    -------
    pandas.DataFrame
        Rows from `df1` followed by rows from `df2`, with a fresh 0..n-1 index
        and the columns ``id_text``, ``skill_id`` and ``skill_name``.
    """
    full_matches = _explode_matches(df1, 'results.full_matches', min_score)
    # Only the full matches get the "datum" -> "data" correction (literal replace).
    full_matches = full_matches.assign(
        skill_name=full_matches['skill_name'].str.replace('datum', 'data', regex=False)
    )

    ngram_matches = _explode_matches(
        df2, 'results.ngram_scored', min_score, extra_columns=('type', 'len')
    )

    return pd.concat([full_matches, ngram_matches], ignore_index=True)

In [181]:
# This code passes `df_full_match` and `df_ngram_match` to union() and stores the result in a new dataframe called `normalized_df_full`.
# 
# union() expands the match column of each dataframe into one row per skill and stacks the results by rows: first the rows derived from `df_full_match`, then the rows derived from `df_ngram_match`.


normalized_df_full = union(df_full_match, df_ngram_match)
normalized_df_full


Unnamed: 0,id_text,skill_id,skill_name
0,2,KS121Z26S4VJLQ1WXN21,customer service
1,1,KS120PL63HT8CXQ6LPDR,asset managers
2,1,KS120PL63HT8CXQ6LPDR,asset manager
3,2,KS120XP636CB5432F5TP,telecommunications
4,2,KS122556LMQ829GZCCRV,communications
5,2,KS1216L6SNBF22SP0V84,broadband


In [182]:

# The function takes 2 dataframes, df and df_text.

# It converts the 'id' column of df_text to int on a copy, so the dataframe passed in is left unchanged.

# Then, it merges df and df_text on the 'id_text' column of df and the 'id' column of df_text.

# The how='left' parameter keeps all the rows from df; rows of df without a match in df_text get NaN in the df_text columns.

# The function then drops the 'id' column from the merged dataframe and returns the new dataframe.

# Finally, the cell assigns the new dataframe to normalized_df_full.


def rejoin_df(df, df_text):
    """Attach the columns of `df_text` to `df` by matching id_text to id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``id_text`` column.
    df_text : pandas.DataFrame
        Frame with an ``id`` column whose values can be converted to int.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Left join of `df` with `df_text` (every row of `df` is kept), without
        the ``id`` column.
    """
    # .assign returns a new frame, so the caller's df_text keeps its original 'id' column.
    texts = df_text.assign(id=df_text['id'].astype(int))
    merged = pd.merge(df, texts, left_on='id_text', right_on='id', how='left')
    return merged.drop(columns=['id'])

# Attach the source text to every skill row.
# NOTE: this overwrites normalized_df_full with the merged result; running the cell a second time
# would merge the text column in again (pandas then names the clashing columns text_x / text_y).
normalized_df_full = rejoin_df(normalized_df_full,df_text_df)
normalized_df_full

Unnamed: 0,id_text,skill_id,skill_name,text
0,2,KS121Z26S4VJLQ1WXN21,customer service,as a listed company with around million custom...
1,1,KS120PL63HT8CXQ6LPDR,asset managers,swiss life asset managers is the most successf...
2,1,KS120PL63HT8CXQ6LPDR,asset manager,swiss life asset managers is the most successf...
3,2,KS120XP636CB5432F5TP,telecommunications,as a listed company with around million custom...
4,2,KS122556LMQ829GZCCRV,communications,as a listed company with around million custom...
5,2,KS1216L6SNBF22SP0V84,broadband,as a listed company with around million custom...


In [183]:
# 1. Write the dataframe 'normalized_df_full' to the Parquet file '04_03_02_normalized_df_full_final.parquet' (fastparquet engine)

# 2. Read that Parquet file back in and assign it to a variable called 'found_skills'

# 3. Read the CSV file '04_02_ensi_skills_db.csv' and assign it to a variable called 'enski_skills_db'


normalized_df_full.to_parquet(
    'C:/Users/Aleksej Aikov/Desktop/Enablement/Master/04_Analysis/04_03_extracting_skills_with_skillner_ner/04_03_02_normalized_df_full_final.parquet',
    engine='fastparquet',
)
found_skills = pd.read_parquet(
    'C:/Users/Aleksej Aikov/Desktop/Enablement/Master/04_Analysis/04_03_extracting_skills_with_skillner_ner/04_03_02_normalized_df_full_final.parquet'
)
enski_skills_db = pd.read_csv(
    'C:/Users/Aleksej Aikov/Desktop/Enablement/Master/04_Analysis/04_02_ensi_skills_extraction/04_02_ensi_skills_db.csv'
)

In [184]:
# Display the column labels of the enski_skills_db DataFrame

enski_skills_db.columns

Index(['category', 'description', 'descriptionSource', 'id', 'infoUrl',
       'isLanguage', 'isSoftware', 'name', 'removedDescription', 'subcategory',
       'tags', 'type.id', 'type.name', 'category.id', 'category.name',
       'subcategory.id', 'subcategory.name'],
      dtype='object')

In [185]:
# Display the first 5 rows of the enski_skills_db DataFrame

enski_skills_db.head(5)

Unnamed: 0,category,description,descriptionSource,id,infoUrl,isLanguage,isSoftware,name,removedDescription,subcategory,tags,type.id,type.name,category.id,category.name,subcategory.id,subcategory.name
0,,,,KS120P86XDXZJT3B7KVJ,https://skills.emsidata.com/skills/KS120P86XDX...,False,False,(American Society For Quality) ASQ Certified,,,[],ST3,Certification,,,,
1,,\nDefined by Microsoft for use in recent versi...,https://en.wikipedia.org/wiki/.NET_assemblies,KS126XS6CQCFGC3NG79X,https://skills.emsidata.com/skills/KS126XS6CQC...,False,True,.NET Assemblies,,,"[{'key': 'wikipediaExtract', 'value': '\nDefin...",ST1,Specialized Skill,17.0,Information Technology,442.0,Microsoft Development Tools
2,,,,ES50D03AC9CFC1A0BC93,https://skills.emsidata.com/skills/ES50D03AC9C...,False,False,.NET Development,,,"[{'key': 'wikipediaExtract', 'value': 'The .NE...",ST1,Specialized Skill,17.0,Information Technology,442.0,Microsoft Development Tools
3,,The .NET Framework is a software framework dev...,https://en.wikipedia.org/wiki/.NET_Framework,KS1200B62W5ZF38RJ7TD,https://skills.emsidata.com/skills/KS1200B62W5...,False,True,.NET Framework,,,"[{'key': 'wikipediaExtract', 'value': 'The .NE...",ST1,Specialized Skill,17.0,Information Technology,442.0,Microsoft Development Tools
4,,Microsoft started development on the .NET Fram...,https://en.wikipedia.org/wiki/.NET_Framework_1.0,KS126XW78QJCF4TRV2X7,https://skills.emsidata.com/skills/KS126XW78QJ...,False,True,.NET Framework 1,,,"[{'key': 'wikipediaExtract', 'value': 'Microso...",ST1,Specialized Skill,17.0,Information Technology,442.0,Microsoft Development Tools


In [186]:
# Rename the columns of 'enski_skills_db' that are used later on:
# 'id' -> 'skill_id', 'name' -> 'skill_name', 'description' -> 'skill_description',
# 'category.name' -> 'skill_category_name', 'subcategory.name' -> 'skill_subcategory_name',
# 'type.name' -> 'skill_type_name'.

# Afterwards the first five rows of the dataframe are displayed.

enski_skills_db = enski_skills_db.rename(
    columns={
        'id': 'skill_id',
        'name': 'skill_name',
        'description': 'skill_description',
        'category.name': 'skill_category_name',
        'subcategory.name': 'skill_subcategory_name',
        'type.name': 'skill_type_name',
    }
)
enski_skills_db.head(5)

Unnamed: 0,category,skill_description,descriptionSource,skill_id,infoUrl,isLanguage,isSoftware,skill_name,removedDescription,subcategory,tags,type.id,skill_type_name,category.id,skill_category_name,subcategory.id,skill_subcategory_name
0,,,,KS120P86XDXZJT3B7KVJ,https://skills.emsidata.com/skills/KS120P86XDX...,False,False,(American Society For Quality) ASQ Certified,,,[],ST3,Certification,,,,
1,,\nDefined by Microsoft for use in recent versi...,https://en.wikipedia.org/wiki/.NET_assemblies,KS126XS6CQCFGC3NG79X,https://skills.emsidata.com/skills/KS126XS6CQC...,False,True,.NET Assemblies,,,"[{'key': 'wikipediaExtract', 'value': '\nDefin...",ST1,Specialized Skill,17.0,Information Technology,442.0,Microsoft Development Tools
2,,,,ES50D03AC9CFC1A0BC93,https://skills.emsidata.com/skills/ES50D03AC9C...,False,False,.NET Development,,,"[{'key': 'wikipediaExtract', 'value': 'The .NE...",ST1,Specialized Skill,17.0,Information Technology,442.0,Microsoft Development Tools
3,,The .NET Framework is a software framework dev...,https://en.wikipedia.org/wiki/.NET_Framework,KS1200B62W5ZF38RJ7TD,https://skills.emsidata.com/skills/KS1200B62W5...,False,True,.NET Framework,,,"[{'key': 'wikipediaExtract', 'value': 'The .NE...",ST1,Specialized Skill,17.0,Information Technology,442.0,Microsoft Development Tools
4,,Microsoft started development on the .NET Fram...,https://en.wikipedia.org/wiki/.NET_Framework_1.0,KS126XW78QJCF4TRV2X7,https://skills.emsidata.com/skills/KS126XW78QJ...,False,True,.NET Framework 1,,,"[{'key': 'wikipediaExtract', 'value': 'Microso...",ST1,Specialized Skill,17.0,Information Technology,442.0,Microsoft Development Tools


In [187]:

#1. The function 'drop_columns_ensi_skills' takes in a dataframe as input and drops a fixed list of columns from it.
#2. It then removes all rows where the 'skill_category_name' column is null.
#3. Finally, it returns the filtered dataframe.
#4. The variable 'enski_skills_db' is assigned the output of the 'drop_columns_ensi_skills' function applied to the 'enski_skills_db' dataframe.
#5. The first five rows of the resulting dataframe are then displayed.


def drop_columns_ensi_skills(df):
    """Drop the unused skills-database columns and the rows without a category.

    Parameters
    ----------
    df : pandas.DataFrame
        Skills database with the renamed columns; must contain every column
        listed below as well as 'skill_category_name'.

    Returns
    -------
    pandas.DataFrame
        New frame without the listed columns and without the rows whose
        'skill_category_name' is null. The original row labels are kept.
    """
    unused_columns = [
        'category',
        'descriptionSource',
        'removedDescription',
        'tags',
        'subcategory',
        'category.id',
        'subcategory.id',
        'type.id',
    ]
    trimmed = df.drop(columns=unused_columns)
    return trimmed.dropna(subset=['skill_category_name'])
# Replace enski_skills_db with its trimmed version and display the first five rows.
# NOTE: this cell is not re-runnable - on a second run the columns are already gone, so drop() raises a KeyError.
enski_skills_db = drop_columns_ensi_skills(enski_skills_db)
enski_skills_db.head(5)

Unnamed: 0,skill_description,skill_id,infoUrl,isLanguage,isSoftware,skill_name,skill_type_name,skill_category_name,skill_subcategory_name
1,\nDefined by Microsoft for use in recent versi...,KS126XS6CQCFGC3NG79X,https://skills.emsidata.com/skills/KS126XS6CQC...,False,True,.NET Assemblies,Specialized Skill,Information Technology,Microsoft Development Tools
2,,ES50D03AC9CFC1A0BC93,https://skills.emsidata.com/skills/ES50D03AC9C...,False,False,.NET Development,Specialized Skill,Information Technology,Microsoft Development Tools
3,The .NET Framework is a software framework dev...,KS1200B62W5ZF38RJ7TD,https://skills.emsidata.com/skills/KS1200B62W5...,False,True,.NET Framework,Specialized Skill,Information Technology,Microsoft Development Tools
4,Microsoft started development on the .NET Fram...,KS126XW78QJCF4TRV2X7,https://skills.emsidata.com/skills/KS126XW78QJ...,False,True,.NET Framework 1,Specialized Skill,Information Technology,Microsoft Development Tools
5,The .NET Framework is a software framework dev...,KS126XY68BNKXSBSLPYS,https://skills.emsidata.com/skills/KS126XY68BN...,False,True,.NET Framework 3,Specialized Skill,Information Technology,Microsoft Development Tools


In [188]:
found_skills.head(5) # Display the first 5 rows of the found_skills dataframe

Unnamed: 0,id_text,skill_id,skill_name,text
0,2,KS121Z26S4VJLQ1WXN21,customer service,as a listed company with around million custom...
1,1,KS120PL63HT8CXQ6LPDR,asset managers,swiss life asset managers is the most successf...
2,1,KS120PL63HT8CXQ6LPDR,asset manager,swiss life asset managers is the most successf...
3,2,KS120XP636CB5432F5TP,telecommunications,as a listed company with around million custom...
4,2,KS122556LMQ829GZCCRV,communications,as a listed company with around million custom...


In [189]:

# This code merges two dataframes based on the common column 'skill_id'.
# The dataframes are:
# 1) found_skills - a dataframe of skills that were found in a job posting
# 2) enski_skills_db - a dataframe of skills from the Emsi skills database (its infoUrl values point to skills.emsidata.com)
# The merge is an 'inner' merge, meaning that only the skills that are in both dataframes will be merged together.
# The resulting dataframe is found_skills_for_tableau.
# The .head(5) command just shows the first 5 rows of the resulting dataframe.

# Both inputs carry a 'skill_name' column, so pandas' default suffixes apply: the result holds
# skill_name_x (from found_skills) and skill_name_y (from enski_skills_db).
found_skills_for_tableau = found_skills.merge(enski_skills_db, on='skill_id', how='inner')
found_skills_for_tableau.head(5)

Unnamed: 0,id_text,skill_id,skill_name_x,text,skill_description,infoUrl,isLanguage,isSoftware,skill_name_y,skill_type_name,skill_category_name,skill_subcategory_name
0,2,KS121Z26S4VJLQ1WXN21,customer service,as a listed company with around million custom...,Customer service is the provision of service t...,https://skills.emsidata.com/skills/KS121Z26S4V...,False,False,Customer Service,Common Skill,Customer and Client Support,Customer Service
1,1,KS120PL63HT8CXQ6LPDR,asset managers,swiss life asset managers is the most successf...,,https://skills.emsidata.com/skills/KS120PL63HT...,False,False,Asset Management,Specialized Skill,Business,Business Operations
2,1,KS120PL63HT8CXQ6LPDR,asset manager,swiss life asset managers is the most successf...,,https://skills.emsidata.com/skills/KS120PL63HT...,False,False,Asset Management,Specialized Skill,Business,Business Operations
3,2,KS120XP636CB5432F5TP,telecommunications,as a listed company with around million custom...,Telecommunication is the transmission of infor...,https://skills.emsidata.com/skills/KS120XP636C...,False,False,Telecommunications,Specialized Skill,Information Technology,Telecommunications
4,2,KS122556LMQ829GZCCRV,communications,as a listed company with around million custom...,Communication is the act of developing meaning...,https://skills.emsidata.com/skills/KS122556LMQ...,False,False,Communications,Common Skill,Physical and Inherent Abilities,Communication


In [190]:
# Save found_skills_for_tableau as an Excel file; index=False leaves the row index out of the sheet.


found_skills_for_tableau.to_excel('C:/Users/Aleksej Aikov/Desktop/Enablement/Master/04_Analysis/04_01_custom_analysis/04_01_tableau_data/14_tableau_analysis_ensi_skills_lost.xlsx', index=False) 