In [157]:
import requests
import pandas as pd
import re
import time
import random  # for random delay
from bs4 import BeautifulSoup
from rapidfuzz import fuzz
from docx import Document
import pdfplumber

In [162]:
def extract_text_pdf(file_path):
    full_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if text:
                full_text += text + " "
    return full_text.lower()

In [159]:
known_skills = ["python", "sql", "machine learning", "tensorflow", "data analysis",
                 "ml", "tf", 'pandas', 'numpy', 'llm', 'rag', 'cnn', 'transformer','rnn', 'lstm', 'gru',
                'matplotlib', 'seaborn', 'plotly', 'transfer learning', 'resnet', 'chatgpt']

def extract_skills_from_resume(text, skills_list):
    found_skills = []
    for skill in skills_list:
        if re.search(r'\b' + re.escape(skill.lower()) + r'\b', text):
            found_skills.append(skill.lower())
    return list(set(found_skills))

In [164]:
resume_text = extract_text_pdf("../Resume Ling Chin Ung 16-9-2025.pdf")

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


In [166]:
my_skills_from_resume = extract_skills_from_resume(resume_text, known_skills)

In [145]:
# Fuzzy matching threshold (0-100)
FUZZY_THRESHOLD = 80

def match_skills(text, skill_dict, threshold=FUZZY_THRESHOLD):
    matched = []
    text = text.lower()
    for skill, synonyms in skill_dict.items():
        all_terms = [skill] + synonyms
        for term in all_terms:
            # Exact match
            if re.search(r'\b' + re.escape(term.lower()) + r'\b', text):
                matched.append(skill)
                break
            # Fuzzy match
            if fuzz.partial_ratio(term.lower(), text) >= threshold:
                matched.append(skill)
                break
    return list(set(matched))  # remove duplicates

In [150]:
# Your skills and synonyms
my_skills = {
    "python": ["python3"],
    "sql": ["structured query language", "pl/sql"],
    "machine learning": ["ml", "machine-learning"],
    "tensorflow": ["tf"],
    "data analysis": ["analytics", "data analytics"]
}

In [None]:
# my_skills = [
#     "python", "sql", "machine learning", "tensorflow",
#     "data analysis", "healthcare", "streamlit",
#     "pandas", "numpy", 'deep learning', 'lstm', 'llm', 'matplotlib', 'seaborn', 'docker'
# ]

In [148]:
def get_jobs(keyword="data", pages=1, min_delay=5, max_delay=15):
    """
    Scrape MyCareersFuture jobs with a rate limiter.

    Args:
    - keyword: search keyword
    - pages: number of pages to fetch
    - min_delay: minimum delay between requests (seconds)
    - max_delay: maximum delay between requests (seconds)

    Returns:
    - pandas DataFrame with full_text for skill extraction
    """
    jobs = []

    for page in range(1, pages+1):
        url = f"https://api.mycareersfuture.gov.sg/v2/jobs?search={keyword}&page={page}"
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if response.status_code != 200:
            print(f"Failed to fetch page {page}")
            continue

        data = response.json()
        for item in data.get("results", []):
            title = item.get("title", "Unknown")

            # Actual posting company
            company = item.get("postedCompany", {}).get("name", "Unknown")

            # Convert HTML description to plain text
            description_html = item.get("description", "")
            description_text = BeautifulSoup(description_html, "html.parser").get_text(separator=" ", strip=True)

            # Extract skills from API
            skills_from_api = [s['skill'].lower() for s in item.get("skills", []) if 'skill' in s]

            # Combine all text for skill matching
            full_text = " ".join([description_text.lower()] + skills_from_api)

            # Direct job details URL
            job_url = item.get("metadata", {}).get("jobDetailsUrl", "https://www.mycareersfuture.gov.sg/jobs")

            # Original posting date
            original_date = item.get("metadata", {}).get("originalPostingDate", "Unknown")

            # Salary min & max
            salary_min = item.get("salary", {}).get("minimum", None)
            salary_max = item.get("salary", {}).get("maximum", None)

            # Match skills with synonyms and fuzzy
            matched_skills = match_skills(full_text, my_skills)
            skill_score = len(matched_skills)

            jobs.append({
                    "title": title,
                    "company": company,
                    "description": description_text,
                    "skills_from_api": skills_from_api,
                    "apply_url": job_url,
                    "original_posting_date": original_date,
                    "salary_min": salary_min,
                    "salary_max": salary_max,
                    "full_text": full_text,
                    "matched_skills": matched_skills,
                    "skill_score": skill_score
                })

        # Rate limiter: wait a random time between requests
        delay = random.uniform(min_delay, max_delay)
        print(f"Page {page} fetched. Sleeping {delay:.2f} seconds...")
        time.sleep(delay)

    df_jobs = pd.DataFrame(jobs)
    return df_jobs.sort_values(by="skill_score", ascending=False)

In [151]:
df = get_jobs(keyword="data")
df.head()


Page 1 fetched. Sleeping 8.78 seconds...


Unnamed: 0,title,company,description,skills_from_api,apply_url,original_posting_date,salary_min,salary_max,full_text,matched_skills,skill_score
3,Data Scientist (Data Empowerment Team) (DSAD),SKILLSFUTURE SINGAPORE AGENCY,SSG is a dynamic and forward-thinking organiza...,"[machine learning, missions, usability, policy...",https://www.mycareersfuture.gov.sg/job/public/...,2025-09-17,3000,6000,ssg is a dynamic and forward-thinking organiza...,"[data analysis, python, machine learning, tens...",5
16,Senior Data Engineer,RAPSYS TECHNOLOGIES PTE. LTD.,Job Responsibilities: Work across workstreams ...,"[tableau, pyspark, apache spark, data analysis...",https://www.mycareersfuture.gov.sg/job/informa...,2025-09-18,7000,11000,job responsibilities: work across workstreams ...,"[data analysis, python, machine learning, tens...",5
6,Data Modeler (12 Months Contract),BANK OF SINGAPORE LIMITED,This is an essential role that supports Bank o...,"[business intelligence, big data, data modelin...",https://www.mycareersfuture.gov.sg/job/banking...,2025-09-19,8000,16000,this is an essential role that supports bank o...,"[data analysis, python, machine learning, tens...",5
5,Sr. IT Data Analyst,TD INTERNATIONAL SERVICES LTD.,Department Overview: Residing within Enterpris...,"[uat, oracle, interpersonal skills, informatic...",https://www.mycareersfuture.gov.sg/job/informa...,2025-09-08,7300,14600,department overview: residing within enterpris...,"[data analysis, python, machine learning, tens...",5
19,"Data Analyst, Digital Platforms",STANDARD CHARTERED BANK (SINGAPORE) LIMITED,Job Summary • Leading the way in Internatio...,"[tableau, dcs, charter, logistic regression, b...",https://www.mycareersfuture.gov.sg/job/banking...,2025-09-18,8100,12400,job summary • leading the way in internatio...,"[data analysis, python, machine learning, tens...",5


In [144]:
# def match_skills(text, skill_list):
#     """Return list of skills found in text"""
#     matched = [skill for skill in skill_list if re.search(r'\b' + re.escape(skill) + r'\b', text)]
#     return matched

df['matched_skills'] = df['full_text'].apply(lambda x: match_skills(x, my_skills))
df['skill_score'] = df['matched_skills'].apply(len)

# Show top jobs by skill match
top_jobs = df.sort_values(by='skill_score', ascending=False).head(10)
print(top_jobs[['title', 'matched_skills', 'skill_score']])

AttributeError: 'list' object has no attribute 'items'

In [30]:
df[df.index == 68]

Unnamed: 0,title,date_posted,full_text,url,matched_skills,skill_score
68,Senior Data Analyst | Healthcare |2 years Cont...,,senior data analyst | healthcare |2 years cont...,,"[python, sql, machine learning, healthcare, st...",7
