In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import difflib

## Import datasets

In [2]:
# LinkedIn job postings with the extracted skills appended, one CSV per track.
# NOTE(review): IS / SE presumably stand for Information Security / Software
# Engineering, and the file-name suffix looks like an export timestamp — confirm.
# Paths are relative, so they resolve against the notebook's working directory.
linkedin_IS_appended = pd.read_csv('../extract_linkedIn_skills/cleaned_data/extracted/title_skill/appended_skill_IS_cleaned_03-10-2023_14-53-44.csv')
linkedin_SE_appended = pd.read_csv('../extract_linkedIn_skills/cleaned_data/extracted/title_skill/appended_skill_SE_cleaned_03-10-2023_14-53-53.csv')

# SIT modules: module descriptions with skills produced by the sit_crawler step,
# one CSV per programme (ICT(IS) and ICT(SE)).
ictIS_modules = pd.read_csv('../sit_crawler/data/ICT(IS)_Module_Description_Skills.csv')
ictSE_modules = pd.read_csv('../sit_crawler/data/ICT(SE)_Module_Description_Skills.csv')

## Cleaning the values

In [3]:
# check datasets

# Drop incomplete rows. With how='any' (the default) a row is removed when ANY
# column is missing, so this is stricter than "rows without a Seniority".
linkedin_IS_appended = linkedin_IS_appended.dropna(axis=0, how='any')
linkedin_SE_appended = linkedin_SE_appended.dropna(axis=0, how='any')

# check: Seniority distribution of IS, a blank line, then the one of SE
print(
    linkedin_IS_appended['Seniority'].value_counts(),
    linkedin_SE_appended['Seniority'].value_counts(),
    sep='\n\n',
)

# check: column names of both datasets
print(linkedin_IS_appended.columns.tolist())
print(linkedin_SE_appended.columns.tolist())

Mid-Senior level    311
Entry level         127
Associate            73
Executive            20
Internship            7
Director              3
Name: Seniority, dtype: int64

Mid-Senior level    925
Entry level         479
Associate           149
Executive            41
Director             40
Internship           24
Name: Seniority, dtype: int64
['Unnamed: 0', 'Job Title', 'Job URN', 'Company Name', 'Location', 'Applicants', 'Seniority', 'Employment type', 'Job function', 'Industries', 'Job description', 'Posted on', 'Skills', 'Extracted Skills']
['Unnamed: 0', 'Job Title', 'Job URN', 'Company Name', 'Location', 'Applicants', 'Seniority', 'Employment type', 'Job function', 'Industries', 'Job description', 'Posted on', 'Skills', 'Extracted Skills']


## Load the classification model

In [4]:
# for lemmatization
import nltk
# WordNetLemmatizer reads the 'wordnet' corpus; downloading only 'omw-1.4'
# leaves a fresh machine without it and the first lemmatize() call raises
# LookupError. Both downloads are skipped when the data is already up to date.
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

def lemmatize_word(word):
    """Return the WordNet lemma of ``word``.

    A fresh ``WordNetLemmatizer`` is created on every call and ``lemmatize`` is
    invoked with the word only (no part-of-speech argument is passed).
    """
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package omw-1.4 to /Users/xinhui/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
import joblib
import spacy

# Load the trained models from the files.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code —
# only load model files that come from a trusted source.
# loaded_model is the one predict_skill calls (old method);
# classifier is the one predict_skill_2 calls.
loaded_model = joblib.load('./models/skill_classifier_model.pkl')
classifier = joblib.load('./models/classifier_model_3.pkl')

# Load spaCy model for tokenization (not needed for new model).
# predict_skill feeds nlp(<skill>).vector to loaded_model as its feature vector.
nlp = spacy.load("en_core_web_sm")

In [6]:
# prediction model (OLD METHOD)
def predict_skill(skill):
    """Classify a single skill string with the spaCy-vector model.

    The skill is lower-cased, stripped and lemmatized, turned into a vector
    with ``nlp`` and handed to ``loaded_model.predict`` as a one-item batch.

    Returns the first (only) element of that prediction. The caller
    ``categorize_skill_linkedin`` treats the value 1 as a hard skill and any
    other value as a soft skill.
    """
    cleaned = lemmatize_word(skill.lower().strip())
    embedding = nlp(cleaned).vector

    # Use the loaded model for prediction (batch of one vector)
    labels = loaded_model.predict([embedding])
    return labels[0]

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the fitted vectorizer that turns a skill string into classifier features
vectorizer = joblib.load('./models/vectorizer_3.pkl')

# Deploy the model
def predict_skill_2(skill):
    """Classify a single skill string with the vectorizer + ``classifier`` pair.

    Returns the string "hard skill" when the probability in column 1 of
    ``classifier.predict_proba`` is strictly above 0.5, otherwise "soft skill".
    """
    features = vectorizer.transform([skill])
    probabilities = classifier.predict_proba(features)
    hard_probability = probabilities[0][1]
    return "hard skill" if hard_probability > 0.5 else "soft skill"

In [28]:
def categorize_skill_linkedin(df, predictor=None):
    """Split every row's extracted skills into hard and soft skills and count them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "Extracted Skills" column holding comma-separated
        skill names. The frame is modified in place: the columns
        "Hard Skill" and "Soft Skill" are added (or overwritten) with the
        comma-joined skills of each kind.
    predictor : callable, optional
        Maps one skill string to a label. Defaults to ``predict_skill``
        (old method). A label equal to 1, or the string "hard skill" as
        returned by ``predict_skill_2``, marks a hard skill; every other
        label is counted as a soft skill.

    Returns
    -------
    list
        ``[df, overall_hard_skill, overall_soft_skill]`` where the two dicts
        map a skill name to the number of times it occurred over all rows.
    """
    if predictor is None:
        predictor = predict_skill  # -> OLD METHOD

    overall_hard_skill = {}
    overall_soft_skill = {}
    hard_column = []
    soft_column = []

    for raw_skills in df["Extracted Skills"]:
        curr_hSkill = []
        curr_sSkill = []
        # A non-string cell (e.g. NaN) carries no skills instead of crashing on .split
        tokens = raw_skills.split(",") if isinstance(raw_skills, str) else []

        for token in tokens:
            # Strip so " Python" and "Python" share one counter key, and skip
            # the empty tokens produced by "a,,b" or a trailing comma.
            skill = token.strip()
            if not skill:
                continue

            label = predictor(skill)
            # Accept both label conventions used in this notebook.
            if isinstance(label, str):
                is_hard = label == "hard skill"
            else:
                is_hard = label == 1

            if is_hard:
                curr_hSkill.append(skill)
                overall_hard_skill[skill] = overall_hard_skill.get(skill, 0) + 1
            else:
                curr_sSkill.append(skill)
                overall_soft_skill[skill] = overall_soft_skill.get(skill, 0) + 1

        hard_column.append(",".join(curr_hSkill))
        soft_column.append(",".join(curr_sSkill))

    # Assign whole columns at once: positional, so it does not depend on index
    # labels, and the columns exist even when the frame has no rows.
    df["Hard Skill"] = hard_column
    df["Soft Skill"] = soft_column

    return [df, overall_hard_skill, overall_soft_skill]


# Categorise both datasets. Each call returns the frame (now carrying the
# "Hard Skill" / "Soft Skill" columns) followed by the hard- and soft-skill counts.
categorised_IS, overall_hard_skill_IS, overall_soft_skill_IS = categorize_skill_linkedin(
    linkedin_IS_appended
)
categorised_SE, overall_hard_skill_SE, overall_soft_skill_SE = categorize_skill_linkedin(
    linkedin_SE_appended
)

# Persist the categorised frames as UTF-8 CSV (the target paths carry no file extension).
categorised_IS.to_csv("./data/categorised_IS", encoding="utf-8")
categorised_SE.to_csv("./data/categorised_SE", encoding="utf-8")

In [30]:
import collections

# Create a Counter object from the IS hard-skill count dictionary so the
# skills can be ranked by how often they occurred
counter = collections.Counter(overall_hard_skill_IS)
# Bare last expression: the cell displays the 50 most frequent skills as (skill, count) pairs
counter.most_common(50)

[('Cyber Security', 313),
 ('Computer Science', 242),
 ('Operations', 159),
 ('Innovation', 151),
 ('Vulnerability', 146),
 ('Certified Information Systems Security Professional', 136),
 ('Problem Solving', 128),
 ('Firewall', 123),
 ('Auditing', 121),
 ('Information Technology', 113),
 ('Network Security', 99),
 ('Incident Response', 97),
 ('Cyber Threat Intelligence', 89),
 ('Troubleshooting (Problem Solving)', 88),
 ('Automation', 86),
 ('Python (Programming Language)', 84),
 ('Security Policies', 83),
 ('Risk Analysis', 82),
 ('Project Management', 81),
 ('Governance', 78),
 ('Security Information And Event Management (SIEM)', 78),
 ('Risk Management', 77),
 ('Leadership', 74),
 ('Agile Methodology', 71),
 ('Penetration Testing', 69),
 ('Research', 68),
 ('Amazon Web Services', 66),
 ('Certified Information System Auditor (CISA)', 64),
 ('Vulnerability Assessments', 64),
 ('Writing', 64),
 ('Artificial Intelligence', 62),
 ('Planning', 62),
 ('Scripting', 60),
 ('Presentations', 59