In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
#import ssl

import warnings
warnings.filterwarnings('ignore')

# download nltk packages
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to C:\Users\Khor Kean
[nltk_data]     Teng\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Khor Kean
[nltk_data]     Teng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# load data
qualification_data = pd.read_excel("../data/qualification level.xlsx")
sectors_data = pd.read_excel("../data/skill by sector.xlsx")

In [2]:
import re
text_input = "sifodfj Hi #$%#^ My na@#me is Kt"
text_input = re.sub(r"&quot;", "", text_input)
text_input = re.sub(r".hack//", "", text_input)
text_input = re.sub(r"&#039;", "", text_input)
text_input = re.sub(r"A&#039;s", "", text_input)
text_input = re.sub(r"I&#039;", "I'", text_input)
text_input = re.sub(r"&amp;", "and", text_input)
text_input

'sifodfj Hi #$%#^ My na@#me is Kt'

In [3]:
# Load qualification data from the "qualification level" Excel file
qualification_dict = dict(zip(qualification_data["qualification"], qualification_data["mqf level"]))

def preprocess_text(text):
    # lower case everything
    token = text.lower()   
    # tokenize 
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalnum()]
    # remove stop words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return " ".join(tokens)

def compare_skills(user_skills, sector_skills):
    # preprocess text
    user_skills = preprocess_text(user_skills)
    sector_skills = preprocess_text(sector_skills)

    # vectorize text and calculate cosine similarity
    vectorizer = TfidfVectorizer(stop_words="english", analyzer="word")
    tfidf_matrix = vectorizer.fit_transform([user_skills, sector_skills])
    cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

    return cosine_sim[0][0]

- We have a function that processes and clean the text
- Another function to return similarity score

In [6]:
user_skills = "English, Leadership, Problem Solving, Malay, planning"
user_qualification = 5
user_sector = "Teacher"

matching_sectors = []

for _, row in sectors_data.iterrows():
    sector = row["sector"]
    sector_skills = row["skills"]
    min_qualification = row["qualification"]
    # compute the similarity score between user skills and all sector skills
    similarity_score = compare_skills(user_skills, sector_skills)
    
    # check if the similarity score is above 0 and if the user's qualification is above the minimum qualification level
    if similarity_score > 0 and int(user_qualification) >= int(min_qualification):
        if not user_sector or user_sector.lower() in sector.lower():
            matching_sectors.append(sector)

# output the results
if matching_sectors:
    # the matches sector could be more than one, so we need to loop through all of them
    for sector in matching_sectors:
        sector_row = sectors_data.loc[sectors_data["sector"] == sector].iloc[0]
        required_skills = set(sector_row["skills"].split(","))
        user_input_skills = set(user_skills.lower().split(","))
        matching_skills = user_input_skills.intersection(required_skills)
        lacking_skills = required_skills.difference(user_input_skills)

        print(f"Sector: {sector.title()}")
        print("Matching Skills:", ", ".join(matching_skills).title())
        print("Lacking Skills:", ", ".join(lacking_skills)[2:].title())
        print(f"Minimum Qualification: MQF Level {sector_row['qualification']}")

        if len(lacking_skills) > 0:
            print("Job Description:")
            job_description = sector_row["job description"].split(";")
            for desc in job_description:
                print(desc.strip())
# if no match found, output this message
else:
    print("Sorry, no matching sectors found in our database for your skills and qualification level.")

Sector: Special Education Teacher
Matching Skills: English
Lacking Skills: Problem Solving, Speaking, Speech Terapy, Writing, Cimmunication, Mandarin, Nursing, Planning, Malay, Child Care, Leadership
Minimum Qualification: MQF Level 4
Job Description:
Designing or modifying the curricula and preparing lessons and activities adapted to students’ abilities and needs
giving instructions to individuals or groups using special techniques or teaching aids suited to the students’ needs
Giving instructions using techniques or special teaching aids such as Braille or lip-reading appropriate to the students’ disability and level and supervising work in class
Encouraging students to have confidence, helping them to discover and adopt methods which compensate for limitations imposed by their disability, and promoting a sense of achievement
Administering tests, evaluating and observing progress of each student and discussing it with parents, teachers, physiotherapists, social workers, etc.
Discussi