In [1]:
import spacy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import random

# load spaCy model
nlp = spacy.load('en_core_web_sm')

In [2]:
# for lemmatization
import nltk
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package omw-1.4 to /Users/xinhui/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
def lemmantize_word(word):
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)


def preprocess(word):
    word = word_tokenize(lemmantize_word(word.strip().lower()))
    processed_text = " ".join(word)
    return processed_text

In [4]:
import pandas as pd
allsSkills = []

summarised = pd.read_csv('./data/ml/manual_summarise.csv')
count = 0
for index, row in summarised.iterrows():
    if row['summarised'] not in allsSkills:
        allsSkills.append(row['summarised'])
        count += 1

allsSkills = [item for item in allsSkills if isinstance(item, str)]

In [6]:
combineSkills = {}
# load hardskills from txt file
with open("./data/ml/hardskills.txt", "r") as file:
    for line in file.readlines():
        combineSkills[preprocess(line)] = 1

# load softskills from txt file
# with open("./data/ml/summarized_sSkill.txt", "r") as file:
#     for line in file.readlines():
#         combineSkills[preprocess(line)] = 0

for skill in allsSkills:
    combineSkills[preprocess(skill)] = 0


# randomize the order of the skills dict
allSkills = list(combineSkills.keys())  # List of keys
random.shuffle(allSkills)
skills_df = {}
for i in allSkills:
    skills_df[i] = combineSkills[i]

skills_df["linux"]


1

In [7]:
# combine hard and soft skill data
all_skills = [skill for skill, types in skills_df.items()]
all_labels = [types for skill, types in skills_df.items()]

# vectorize
tfidf_vectorizer = TfidfVectorizer()
all_vectors = tfidf_vectorizer.fit_transform(all_skills)
total_len = len(all_labels)

In [15]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(all_vectors, all_labels, test_size=0.15, random_state=42)

# Train a logistic regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)
# G_y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.82


In [16]:
# using the model!!
input_skill = [preprocess("Chef (Configuration Management Tool)")]
input_skill_vector = tfidf_vectorizer.transform(input_skill)

prediction = clf.predict(input_skill_vector)
print(prediction)

if prediction[0] == 1:
    print(f"{input_skill} is a hard skill.")
else:
    print(f"{input_skill} is a soft skill.")

[1]
['chef ( configuration management tool )'] is a hard skill.


In [17]:
import joblib

# save the trained model into a file
joblib.dump(clf, './models/skill_classifier_model.pkl')
joblib.dump(tfidf_vectorizer, './models/tfidf_vectorize.pkl')

['./models/tfidf_vectorize.pkl']

In [None]:
# when cannot load en_core_web_sm, run this
!python3 -m spacy download en_core_web_sm