# Load data and dependencies

In [1]:
import pandas as pd
tech_example = pd.read_csv('Files\Example_Technical_Skills.csv')
data = pd.read_csv('Files\Raw_Skills_Dataset.csv')

In [2]:
pd.set_option('display.max_rows', 10)

In [3]:
tech_example.head(10)

Unnamed: 0,Technology Skills
0,SAP Fiori Developer
1,Oracle Instance Management & Strategy
2,Boomi Master Data Management
3,Digital Manufacturing on Cloud ( DMC)
4,DevOps
5,CA SAM
6,OpenShift
7,Acxiom Data Analytics
8,SAP Digital Boardroom
9,Seeburger BIS


In [4]:
data.rename(columns={'RAW DATA': 'skill'}, inplace=True)
data.columns

Index(['skill'], dtype='object')

In [5]:
data.head()

Unnamed: 0,skill
0,What ifs
1,seniority
2,familiarity
3,functionalities
4,Lambdas


# Data analysis

In [6]:
print(tech_example['Technology Skills'].nunique())
example_skills = set(tech_example['Technology Skills'].unique())

897


In [7]:
print(data.skill.nunique())
all_skills = set(data.skill.unique())
data.skill.unique()

15677


array(['What ifs', 'seniority', 'familiarity', ..., 'all applicants',
       'Self-motivated, enthusiastic and strong drive', 'negotiation'],
      dtype=object)

In [8]:
len(all_skills.difference(example_skills))

15641

In [9]:
# data.head(100)

In [10]:
# data.tail(100)

In [11]:
chars = data.skill.str.split("")
unq_chars = set()
for c in chars:
    for char in c:
        unq_chars.add(char)

In [12]:
print([char for char in unq_chars])

['', 'b', 'z', 'ö', '/', 'B', 'O', 'ß', ']', 'G', '❤', 'é', 'R', '2', '%', 'ä', '>', 'x', 'U', 'c', 'w', 'M', 'i', 'v', 'q', '„', '™', '&', '”', '\ufeff', '\uf0b7', '’', '9', '–', '4', 'm', '®', '5', 'N', '$', 'Y', ';', '(', 'V', 'Ø', '‘', '🃏', '“', 's', 'E', 'C', 'Z', '*', "'", '8', '0', 'h', '•', '\u202f', 'j', 'g', 'I', 't', 'k', '\xad', 'l', '@', 'ô', '\\', ' ', 'F', '-', '\xa0', '~', 'ü', 'W', ',', '🏗', 'f', 'r', '·', 'Q', ')', '.', '\u2028', '^', '°', '∙', '●', 'T', '?', ':', '\t', 'D', 'X', 'o', 'L', 'A', '"', '\n', 'J', 'K', 'e', 'y', 'u', '3', 'S', 'p', 'd', '1', '6', '+', 'H', '#', 'a', '7', '[', 'P', 'n']


In [13]:
len(unq_chars)

119

**all unique chars**

In [14]:
unique_chars = ['"',
 '#',
#  '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
#  ',',
 '-',
 '.',
 '/', 
 '~',
 '\xa0',
 '\xad',
 '®',
 '°',
 '·',
 'Ø',
 'ß',
 'ä',
 'é',
 'ô',
 'ö',
 'ü',
 '–',
 '‘',
#  '’',
 '“',
 '”',
 '„',
 '•',
 '\u2028',
 '\u202f',
 '™',
 '∙',
 '●',
 '❤',
 '\uf0b7',
 '\ufeff',
 '🃏',
 '🏗']

In [None]:
for char in unique_chars:
    intresting_data = data[data.skill.str.contains(char)]
    print(char)
    display(intresting_data)

In [46]:
data[data.skill.str.contains(',')].head()

Unnamed: 0,skill


In [47]:
import spacy
nlp = spacy.load('en_core_web_md')



In [51]:
word = nlp('strong motivation and dedciated')
for token in word:
    print(token,token.pos_, token.lemma_)

strong ADJ strong
motivation NOUN motivation
and CCONJ and
dedciated VERB dedciate


In [52]:
word = nlp('report writing and editing')
for token in word:
    print(token,token.pos_)

report NOUN
writing VERB
and CCONJ
editing VERB


In [53]:
word = nlp('conceptual, logical & physical data model designs')
for token in word:
    print(token,token.pos_)

conceptual ADJ
, PUNCT
logical ADJ
& CCONJ
physical ADJ
data NOUN
model NOUN
designs NOUN


In [17]:
data[data.skill.str.contains('é')]
# now we got french

Unnamed: 0,skill
6275,Montréal
11155,Développement Back End
21282,Professional / Expérimenté(e) / Professionell ...
23753,modélisation des données
23763,Développement
23764,Traitement des données dans plusieurs formats
23902,the Ordre des ingénieurs du Québec
33919,Banques de données


In [18]:
# 'Ø' remove these
data[data.skill.str.contains('ä')]

Unnamed: 0,skill
13385,Selbstständige Erstellung
13392,Aktive Unterstützung der Qualitätssicherung fü...
13406,Ausgeprägte Kenntnisse


In [19]:
'ß'
data[data.skill.str.contains('ß')]
# this is german...

Unnamed: 0,skill
13399,Umsetzung von konkreten Optimierungsmaßnahmen
13424,Fließend Englischkenntnisse


In [20]:
data[data.skill.str.contains('🏗')]

Unnamed: 0,skill
30875,🏗


In [21]:
data[data.skill.str.contains('🃏')]

Unnamed: 0,skill
21397,automated Jest 🃏


In [22]:
data[data.skill.str.contains('\ufeff' or '\u2028' or '')]

Unnamed: 0,skill
12669,﻿
15807,﻿. Mandatory Skills & Qualification Frontend d...
24187,﻿Tier 1 Educational Background Preferred - IIT...
30841,﻿
31575,teams﻿RequirementsDegree
33632,etc.)﻿. Strong aptitude


In [23]:
data.iloc[12669]
# some are empty spaces

skill    ﻿
Name: 12669, dtype: object

In [24]:
data[data.skill.str.contains('❤')]

Unnamed: 0,skill
21381,❤


In [25]:
words = data.skill.str.split(" ")
unq_words = set()
for wo in words:
    for word in wo:
        unq_words.add(word)

In [26]:
len(unq_words)

10010

**Translated skills**

In [27]:
olang_skills = set()

 # these are the charecters found in other languages
other_lang_chars = [
    'ß',
    'ä',
    'é',
    'ô',
    'ö',
    'ü'] 

# filter out the other languages
for char in other_lang_chars:
    o_data = data[data.skill.str.contains(char)]
    display(o_data)
    idx = o_data.index
    for idx_val in idx.values:
        olang_skills.add(idx_val)

Unnamed: 0,skill
13399,Umsetzung von konkreten Optimierungsmaßnahmen
13424,Fließend Englischkenntnisse


Unnamed: 0,skill
13385,Selbstständige Erstellung
13392,Aktive Unterstützung der Qualitätssicherung fü...
13406,Ausgeprägte Kenntnisse


Unnamed: 0,skill
6275,Montréal
11155,Développement Back End
21282,Professional / Expérimenté(e) / Professionell ...
23753,modélisation des données
23763,Développement
23764,Traitement des données dans plusieurs formats
23902,the Ordre des ingénieurs du Québec
33919,Banques de données


Unnamed: 0,skill
23780,Contrôle de version


Unnamed: 0,skill
13383,Schnittstellen für Cloud-native Lösungen
13390,Software-Lösungen


Unnamed: 0,skill
13378,bei der Einführung
13379,Bewertung von IT-weit gültigen Standards
13383,Schnittstellen für Cloud-native Lösungen
13392,Aktive Unterstützung der Qualitätssicherung fü...
13394,Durchführung entsprechender Tests
13408,Mindset eines „Technologie-Vordenkers“ wünsche...


In [28]:
data.iloc[list(olang_skills)]

Unnamed: 0,skill
6275,Montréal
11155,Développement Back End
21282,Professional / Expérimenté(e) / Professionell ...
13378,bei der Einführung
13379,Bewertung von IT-weit gültigen Standards
...,...
13406,Ausgeprägte Kenntnisse
13408,Mindset eines „Technologie-Vordenkers“ wünsche...
23780,Contrôle de version
13424,Fließend Englischkenntnisse


In [29]:
# these skills manually translated with google translation
trans_skills = [
    "Montreal",
    "Back End Development",
    "Professional / Experienced / Professional ...",
    "at the introduction",
    "Evaluation of IT-wide applicable standards",
    "Interfaces for cloud-native solutions",
    "Independent creation",
    "data modeling",
    "software solutions",
    "Active support of quality assurance for...",
    "Conducting appropriate tests",
    "Development",
    "Processing data in multiple formats",
    "Implementation of concrete optimization measures",
    "the Order of Engineers of Quebec",
    "Pronounced knowledge",
    "Mindset of a technology pioneer wish...",
    "Version control",
    "Fluent in English",
    "Databases"
    ]

In [30]:
# repalce skills in other languages to english
data.iloc[list(olang_skills),0] = trans_skills
data.iloc[list(olang_skills)]

Unnamed: 0,skill
6275,Montreal
11155,Back End Development
21282,Professional / Experienced / Professional ...
13378,at the introduction
13379,Evaluation of IT-wide applicable standards
...,...
13406,Pronounced knowledge
13408,Mindset of a technology pioneer wish...
23780,Version control
13424,Fluent in English


# Clean data
- clean unwanted symbols, emojis, punctuation and spaces

In [31]:
import regex
import re
from string import digits

In [32]:
# # remove unwanted chars
# def clean(skill):
#     skill = skill.lower()
#     # if skill.contains('c++' | 'c#'):
#     if 'c++' or "c#" in skill:
#         skill.replace('c++', 'cpp')
#         skill.replace('c#', 'c-hash')
#     re.sub(r'[^a-zA-Z]+', '', skill)   #  this can remove skill like c++ or C#
#     regex.sub(r'[^\u{1F600}-\u{1F6FF}\s]', '', skill)
#     return skill

In [33]:
#  clean the dataset
unwanted_chars = ['"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
#  '-',
 '.',
 '/', 
 '~',
 '\xa0',
 '\xad',
 '®',
 '°',
 '·',
 'Ø',
 'ß',
 'ä',
 'é',
 'ô',
 'ö',
 'ü',
 '–',
 '‘',
 '’',
 '“',
 '”',
 '„',
 '•',
 '\u2028',
 '\u202f',
 '™',
 '∙',
 '●',
 '❤',
 '\uf0b7',
 '\ufeff',
 '🃏',
 '🏗',
 ":"
 ]

def clean(skill):
    skill = skill.lower()
    if 'c++' or "c#" or ".net" in skill:
        skill = skill.replace('c++', 'cpp')
        skill = skill.replace('c#', 'c-hash')
        skill = skill.replace('.net', 'dot-net')
    # remove unwanted characters 
    # skill = re.sub(skill, "[d]", "")
    skill = skill.translate(digits)
    skill = skill.replace('/', ' and ')
    for char in unwanted_chars:
        skill = skill.replace(char, '')
    return skill

In [34]:
# pd.set_option('display.max_rows', None)

In [35]:
data = data.applymap(clean)

In [38]:
tech_example = tech_example.applymap(clean)

In [242]:
data.head(20)

Unnamed: 0,skill
0,what ifs
1,seniority
2,familiarity
3,functionalities
4,lambdas
...,...
15,git familiarity
16,maven
17,gradle familiarity
18,continuous integration


In [249]:
data.tail(30)

Unnamed: 0,skill
34086,roku applications development
34087,roku applications
34088,brightscript scenegraph
34089,other tech stacks
34090,beneficial html and js
...,...
34111,negotiation
34112,deadlines
34113,self-motivated enthusiastic and strong drive
34114,negotiation


In [39]:
data.iloc[21381]

skill    
Name: 21381, dtype: object

0

# Extract tech skills

- The goal of the notebook here is to seperate technincal skills from given list of skills
- tech skills --> "Technical skills include tools and machines on which you work and get proficiency after a certain time."
- soft skills --> "Soft skills on the other hand are the interpersonal skills or people skills that can be used in any/every job"

In [40]:
print(data.skill.nunique())
all_skills = set(data.skill.unique())

14752


In [41]:
tech_example.applymap(clean)
example_skills = set(tech_example['Technology Skills'].unique())

In [42]:
all_skills.intersection(example_skills)

{'activemq',
 'adobe photoshop',
 'amazon lambda',
 'apache oozie',
 'bash',
 'bitbucket',
 'bluetooth',
 'consul',
 'cortex',
 'devops',
 'drm',
 'eac',
 'eclipse',
 'embedded systems',
 'ethereum',
 'forcecom',
 'ftp',
 'github',
 'guidewire policycenter',
 'gulp',
 'javascript frameworks',
 'katalon',
 'kotlin',
 'magento',
 'microsoft azure networking',
 'mysql',
 'netflow',
 'netsuite',
 'nuget',
 'octopus deploy',
 'onestream',
 'openshift',
 'oracle database',
 'phantom',
 'postgis',
 'radius',
 'recruiting',
 'sap analytics cloud',
 'sap integration',
 'sap lumira',
 'sas base',
 'sas enterprise guide',
 'sas jmp',
 'scikit-learn',
 'scss',
 'solidity'}

- A simple and naive approach is to have a set of soft skills and filter them out from our data.

In [119]:
# list of soft skills manually collected from the web
soft_skills = [
    "Attention",
    "Integrity",
    "Persistence",
    "Time management",
    "communication",
    "Passionate",
    "motivated",
    "Flexibility",
    "Optimism",
    "Consistency",
    "organised",
    "Curiosity",
    "Taking calculated risks",
    "Teamwork",
    "Research",
    "leadership",
    "Creativity",
    "Confidence",
    "Active listening",
    "Adaptability",
    "collaboration",
    "positive attitude",
    "integrity",
    "courtesy",
    "Trustworthiness",
    "Honesty",
    "Kindness",
    "Focusing",
    "Attentiveness",
    "Public Speaking",
    "Clear Speech and Writing",
    "NonVerbal Communication",
    "Presentation Skills",
    "Listening Skills",
    "Industriousness",
    "Perseverance",
    "Determination",
    "Diligence",
    "Committing",
    "Productiveness",
    "Efficiency",
    "Self Motivation",
    "Punctuality",
    "Respectability",
    "Consideration",
    "Pragmatism",
    "Mentoring",
    "Persuasion",
    "deadlines managment",
    "Presentation skills",
    "Diplomacy",
    "Networking",
    "Patience",
    "Storytelling",
    "Confidence",
    "Negotiation",
    "Tolerance",
    "Sensitivity",
    "Innovation",
    'Reasoning',
    "Inspiration",
    "Innovation",
    "Imagination",
    'Agility',
    "Coaching",
    "Versatility",
    "self driven",
    "enthusiastic"
]

In [120]:
soft_skills_lemma = [] 

def get_softskill_lemma(word):
    word = word.lower()
    doc = nlp(word)
    for token in doc:
        soft_skills_lemma.append(token.lemma_)

In [121]:
for word in soft_skills:
    get_softskill_lemma(word)
soft_skills_lemma[:5]

['attention', 'integrity', 'persistence', 'time', 'management']

In [137]:
from nltk.stem.snowball import SnowballStemmer
def get_stem(word):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(word)

In [138]:
soft_skills_stem = []
for word in soft_skills:
    get_softskill_lemma(word)
soft_skills_lemma[:5]

['attention', 'integrity', 'persistence', 'time', 'management']

In [122]:
def get_lemma(word):
    word = word.lower()
    doc = nlp(word)
    lemma_list = []
    for token in doc:
        lemma_list.append(token.lemma_)
    return lemma_list

In [None]:
def get_stem(word):
    stemmer = SnowballStemmer("english")
    return stemmer.stem(word)

In [103]:
tech_example['techskills_lemma'] = tech_example['Technology Skills'].apply(get_lemma)
tech_example

Unnamed: 0,Technology Skills,techskills_lemma
0,sap fiori developer,"[sap, fiori, developer]"
1,oracle instance management strategy,"[oracle, instance, management, , strategy]"
2,boomi master data management,"[boomi, master, datum, management]"
3,digital manufacturing on cloud dmc,"[digital, manufacturing, on, cloud, , dmc]"
4,devops,[devop]
...,...,...
974,oracle cloud revenue management,"[oracle, cloud, revenue, management]"
975,oracle ebs grid contral mgt pack,"[oracle, ebs, grid, contral, mgt, pack]"
976,amazon elastic mapreduce emr,"[amazon, elastic, mapreduce, emr]"
977,apache kudu,"[apache, kudu]"


In [104]:
tech_skills_list = []
for tech in tech_example.techskills_lemma:
    tech_skills_list.extend(tech)

In [65]:
from nltk.corpus import  stopwords
stopwords  = stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [105]:
tech_skills_set = set(tech_skills_list) - set(stopwords)
tech_skills_set = tech_skills_set -set(unwanted_chars)
tech_skills_list = list(tech_skills_set)

In [106]:
# if we have parts of speech proper-noun tag then in most likely be some tool or technology (so extract them)
def get_nouns(word):
    doc = nlp(word)
    nouns_list = []
    for token in doc:
        if token.pos_ == 'PROPN' or token.pos_ == 'NOUN':
            nouns_list.append(token.text)
    return nouns_list
        

In [126]:
def get_pnouns(word):
    doc = nlp(word)
    nouns_list = []
    for token in doc:
        if token.pos_ == 'PROPN':
            nouns_list.append(token.text)
    return nouns_list

In [108]:
get_nouns('apache kudu developer')

['apache', 'kudu', 'developer']

In [127]:
get_pnouns('apache kudu developer')

['apache', 'kudu']

In [109]:
data['skill_lemma'] = data['skill'].apply(get_lemma)

In [None]:
data['nouns'] = data['skill'].apply(get_nouns)

In [128]:
data['pnouns'] = data['skill'].apply(get_pnouns)

In [111]:
data.tail()

Unnamed: 0,skill,skill_lemma,nouns
34111,negotiation,[negotiation],[negotiation]
34112,deadlines,[deadline],[deadlines]
34113,self-motivated enthusiastic and strong drive,"[self, -, motivate, enthusiastic, and, strong,...","[self, drive]"
34114,negotiation,[negotiation],[negotiation]
34115,deadlines,[deadline],[deadlines]


In [125]:
'negotiation' in soft_skills_lemma

True

In [136]:
get_stem('negotiation')

'negoti'

In [144]:
data.iloc[1,2]

['seniority']

In [None]:
for i in range(len(data)):
    if len(data.iloc[i,2]) > 0:
        data.iloc[i,2] = data.iloc[i,2][0]

In [145]:
extracted_skills = []
for i in range(len(data)):
    s = data.iloc[i,2]
    if len(s) > 0 and s not in soft_skills_lemma:
        extracted_skills.append(s)

In [148]:
#  the extracted skills list is not 100% accurate, but it is a good approach
#  extracted_skills is list of technical skills
extracted_skills[100:120]

[['raft'],
 ['map', 'multicast'],
 ['paxos'],
 ['ci', 'cd'],
 ['github'],
 ['microservices'],
 ['aws'],
 ['rest', 'servicesexperience', 'writing', 'tools'],
 ['ruby'],
 ['perlexperience'],
 ['bazel'],
 ['toolingscala', 'programming', 'experienceknowledge'],
 ['http'],
 ['protocolsexperience'],
 ['experience', 'building', 'cloud', 'environments'],
 ['unix', 'linux', 'experience', 'programming', 'experience'],
 ['python', 'shell'],
 ['ipc', 'mechanisms'],
 ['tcp', 'ip'],
 ['kafka']]

In [149]:
len(extracted_skills)

31060

In [151]:
extracted_skills_set = set()

for list in extracted_skills:
    for word in list:
        if word not in soft_skills_lemma:
            extracted_skills_set.add(word)

In [153]:
#  these are unique skills extracted from the data
len(extracted_skills_set)

4865

In [154]:
extracted_skills_set

{'gateways',
 'xquery',
 'solutionsagainst',
 'assignments',
 'screen',
 'pharmacies',
 'transfers',
 'calgary',
 'containment',
 'webapp',
 'analysis',
 'commons',
 'disabilities',
 'ufw',
 'choices',
 'solana',
 'lake',
 'outreach',
 'coworking',
 'procurementand',
 'qtp',
 'bazel',
 'adoptions',
 'authentication',
 'artifact',
 'dialog',
 'bots',
 'redshift',
 'memory',
 'quantities',
 'portal',
 'client',
 'allen',
 'hadoop',
 'autosar',
 'cmis',
 'slis',
 'elixir',
 'pipe',
 'plumbing',
 'cpt',
 'result',
 'supervision',
 'assistants',
 'normalisation',
 'wix',
 'workloads',
 'po',
 'contribution',
 'siebel',
 'optimisation',
 'qualification',
 'mbtcp',
 'malfunctions',
 'ant',
 'frontend',
 'pcie',
 'phone',
 'generating',
 'futures',
 'lans',
 'vusmartmaps',
 'intruder',
 'shortcomings',
 'labview',
 'jms',
 'sysc',
 'pitfalls',
 'resume',
 'purposehelm',
 'diop',
 'topics',
 'loader',
 'ship',
 'playback',
 'networks',
 'co-',
 'interactions',
 'xdr',
 'community',
 'basics',
 