In [1]:
import os
import pandas as pd
import ast
import numpy as np
import nltk, re
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, util
from collections import Counter
from rapidfuzz import process

In [2]:
df = pd.read_csv("bertopic_topic_results_1.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1220 entries, 0 to 1219
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Topic                1220 non-null   int64 
 1   Count                1220 non-null   int64 
 2   Name                 1220 non-null   object
 3   Representation       1220 non-null   object
 4   Representative_Docs  1220 non-null   object
 5   category             1220 non-null   object
dtypes: int64(2), object(4)
memory usage: 57.3+ KB


In [4]:
# Ensure correct data type
# Force 'Name' and 'category' as string
df["Name"] = df["Name"].astype(str)
df["category"] = df["category"].astype(str)

# Ensure 'Representation' and 'Representative_Docs' are lists
def ensure_list(x):
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
            if isinstance(parsed, list):
                return parsed
        except Exception:
            pass
        return [w.strip() for w in x.split(",") if w.strip()]
    return []

df["Representation"] = df["Representation"].apply(ensure_list)
df["Representative_Docs"] = df["Representative_Docs"].apply(ensure_list)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1220 entries, 0 to 1219
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Topic                1220 non-null   int64 
 1   Count                1220 non-null   int64 
 2   Name                 1220 non-null   object
 3   Representation       1220 non-null   object
 4   Representative_Docs  1220 non-null   object
 5   category             1220 non-null   object
dtypes: int64(2), object(4)
memory usage: 57.3+ KB


In [6]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yifan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yifan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
lemmatizer = WordNetLemmatizer()
sem_model = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
# lowercase, lemmatize and dedpulicate words
def clean_and_lemmatize(words):
    seen, cleaned = set(), []
    for w in words:
        lemma = lemmatizer.lemmatize(w.lower())
        lemma = re.sub(r'[^a-zA-Z\s]', '', lemma).strip()
        if lemma and lemma not in seen:
            seen.add(lemma)
            cleaned.append(lemma)
    return cleaned

In [9]:
# Merge semantically similar words using cosine similarity
def merge_similar_words(words, threshold=0.8):
    if len(words) <= 1:
        return words
    embeddings = sem_model.encode(words, convert_to_tensor=True)
    merged, used = [], set()
    for i, w in enumerate(words):
        if i in used:
            continue
        sims = util.cos_sim(embeddings[i], embeddings)[0]
        similar_idx = [j for j, s in enumerate(sims) if s > threshold]
        used.update(similar_idx)
        merged.append(w)
    return merged

In [10]:
def clean_topic_words(words):
    words = clean_and_lemmatize(words)
    words = merge_similar_words(words, threshold=0.8)
    return words

In [11]:
# Irrelevant words found in job ads e.g. report discriminatory job ads
irrelevant_keywords = {"fraud", "ad fraudulent" ,"fraudulent", "discrimination", "investigation", "investigative", "apply", "job ad", "fraudulent discrimination", "discrimination misleading", "select fraudulent", "ad fraudulent"}

In [12]:
def count_irrelevant_terms(words):
    return sum(1 for w in words if w.lower() in irrelevant_keywords)

In [13]:
dominant_skills = []

for cat, group in df.groupby("category"):
    topic0 = group[group["Topic"] == 0]
    topic1 = group[group["Topic"] == 1]

    if len(topic0) == 0:
        print(f"No Topic 0 found for {cat}")
        continue

    topic_row = topic0.iloc[0]
    raw_words = topic_row["Representation"]

    fraud_count = count_irrelevant_terms(raw_words)
    if fraud_count > 1 and len(topic1) > 0:
        print(f"Switching {cat} from Topic 0 to Topic 1 (fraud terms = {fraud_count})")
        topic_row = topic1.iloc[0]
        raw_words = topic_row["Representation"]

    cleaned_words = clean_topic_words(raw_words)
    dominant_skills.append({
        "Category": cat,
        "Original_topic_words": ", ".join(raw_words),
        "Cleaned_topic_words": ", ".join(cleaned_words),
        "Topic_Used": int(topic_row["Topic"]),
        "Topic_Count": topic_row["Count"]
    })

dominant_skills_df = pd.DataFrame(dominant_skills)

Switching Healthcare from Topic 0 to Topic 1 (fraud terms = 4)
Switching Hospitality & Services from Topic 0 to Topic 1 (fraud terms = 7)
Switching Management & Strategy from Topic 0 to Topic 1 (fraud terms = 2)
Switching Sales & Marketing from Topic 0 to Topic 1 (fraud terms = 6)


In [14]:
dominant_skills_df

Unnamed: 0,Category,Original_topic_words,Cleaned_topic_words,Topic_Used,Topic_Count
0,Accounting & Finance,"accounting, accounting skill, skill accounting...","accounting, skill accounting, reconciliation, ...",0,5296
1,Administration,"secretarial, administrative, administrative su...","secretarial, administrative, administrative su...",0,1681
2,Construction & Trades,"quantity surveyor, contract administration, co...","quantity surveyor, contract administration, co...",0,518
3,Creative & Design,"autocad, designer, architectural, architect, d...","autocad, designer, architectural, architect, d...",0,748
4,Education,"curriculum, academic, degree, teach, education...","curriculum, academic, degree, teach, education...",0,991
5,Engineering & Technology,"software development, asp net, developer, devo...","software development, asp net, developer, devo...",0,1997
6,Healthcare,"pharmacist, pharmacists, pharmacy skill, pharm...","pharmacist, pharmacy skill, pharmacy, medicine...",1,123
7,Hospitality & Services,"store manager, retail management, manager, sal...","store manager, retail management, manager, sal...",1,310
8,Human Resources,"hr policies, hr, human resource, human resourc...","hr policies, hr, human resource, employee rela...",0,1586
9,Legal & Compliance,"law, legal, agreements, skill legal, litigatio...","law, legal, agreement, skill legal, litigation...",0,485


In [15]:
dominant_skills_df.to_csv("dominant_cleaned_skills_no_mapping.csv", index=False)
print("download completed")

download completed


In [16]:
esco = pd.read_csv("skills_en.csv", usecols=["conceptUri", "preferredLabel"])
esco["preferredLabel"] = esco["preferredLabel"].str.lower().str.strip()

In [17]:
model = SentenceTransformer("all-MiniLM-L6-v2")
esco_emb = model.encode(esco["preferredLabel"].tolist(), convert_to_tensor=True)

def map_to_esco(skill_words, top_k=1, threshold=0.7):
    results = []
    for w in skill_words:
        emb = model.encode(w, convert_to_tensor=True)
        sims = util.cos_sim(emb, esco_emb)[0]
        best_idx = int(np.argmax(sims))
        best_score = float(sims[best_idx])
        if best_score > threshold:
            mapped_skill = esco.iloc[best_idx]["preferredLabel"]
            uri = esco.iloc[best_idx]["conceptUri"]
            results.append((w, mapped_skill, best_score, uri))
        else:
            results.append((w, None, best_score, None))
    return results

In [18]:
dominant_skills_df["ESCO_Mapping"] = dominant_skills_df["Cleaned_topic_words"].apply(lambda s: map_to_esco([w.strip() for w in s.split(",")]))

In [19]:
def extract_valid_esco(esco_mapping, min_conf=0.75):
    # Handle missing or invalid values
    if esco_mapping is None:
        return []
    if isinstance(esco_mapping, float) and pd.isna(esco_mapping):
        return []
    if isinstance(esco_mapping, (np.ndarray, list, tuple)) and len(esco_mapping) == 0:
        return []

    if isinstance(esco_mapping, str):
        try:
            esco_mapping = ast.literal_eval(esco_mapping)
        except Exception:
            return []

    # Make sure it is a list or array
    if not isinstance(esco_mapping, (list, tuple, np.ndarray)):
        return []

    # Extract valid ESCO skills
    valid_skills = []
    for m in esco_mapping:
        if isinstance(m, (list, tuple)) and len(m) >= 3:
            skill, confidence = m[1], m[2]
            if skill and isinstance(confidence, (int, float)) and confidence >= min_conf:
                valid_skills.append(skill.lower().strip())

    return list(set(valid_skills)) 



In [20]:
dominant_skills_df["Valid_ESCO_Skills"] = dominant_skills_df["ESCO_Mapping"].apply(extract_valid_esco)

In [21]:
def fill_with_cleaned(row):
    if not row["Valid_ESCO_Skills"]:
        return [w.strip() for w in row["Cleaned_topic_words"].split(",")]
    return row["Valid_ESCO_Skills"]

dominant_skills_df["Final_Skills"] = dominant_skills_df.apply(fill_with_cleaned, axis=1)


In [22]:
dominant_skills_df

Unnamed: 0,Category,Original_topic_words,Cleaned_topic_words,Topic_Used,Topic_Count,ESCO_Mapping,Valid_ESCO_Skills,Final_Skills
0,Accounting & Finance,"accounting, accounting skill, skill accounting...","accounting, skill accounting, reconciliation, ...",0,5296,"[(accounting, accounting, 1.0, http://data.eur...","[accounting, financial management, audit techn...","[accounting, financial management, audit techn..."
1,Administration,"secretarial, administrative, administrative su...","secretarial, administrative, administrative su...",0,1681,"[(secretarial, None, 0.4476470351219177, None)...","[office administration, deliver correspondence...","[office administration, deliver correspondence..."
2,Construction & Trades,"quantity surveyor, contract administration, co...","quantity surveyor, contract administration, co...",0,518,"[(quantity surveyor, quantity surveying, 0.860...","[procurement legislation, contract law, quanti...","[procurement legislation, contract law, quanti..."
3,Creative & Design,"autocad, designer, architectural, architect, d...","autocad, designer, architectural, architect, d...",0,748,"[(autocad, None, 0.6866293549537659, None), (d...","[architectural design, design process]","[architectural design, design process]"
4,Education,"curriculum, academic, degree, teach, education...","curriculum, academic, degree, teach, education...",0,991,"[(curriculum, analyse curriculum, 0.8558096289...","[academic english, analyse curriculum]","[academic english, analyse curriculum]"
5,Engineering & Technology,"software development, asp net, developer, devo...","software development, asp net, developer, devo...",0,1997,"[(software development, None, 0.66632562875747...","[java (computer programming), devops, asp.net,...","[java (computer programming), devops, asp.net,..."
6,Healthcare,"pharmacist, pharmacists, pharmacy skill, pharm...","pharmacist, pharmacy skill, pharmacy, medicine...",1,123,"[(pharmacist, None, 0.6252124309539795, None),...","[teach pharmacy principles, pharmacy law, phar...","[teach pharmacy principles, pharmacy law, phar..."
7,Hospitality & Services,"store manager, retail management, manager, sal...","store manager, retail management, manager, sal...",1,310,"[(store manager, None, 0.574077308177948, None...","[achieve sales targets, own management skills,...","[achieve sales targets, own management skills,..."
8,Human Resources,"hr policies, hr, human resource, human resourc...","hr policies, hr, human resource, employee rela...",0,1586,"[(hr policies, None, 0.6318604946136475, None)...","[identify talent, human resource management, m...","[identify talent, human resource management, m..."
9,Legal & Compliance,"law, legal, agreements, skill legal, litigatio...","law, legal, agreement, skill legal, litigation...",0,485,"[(law, criminal law, 0.8008726835250854, http:...","[criminal law, contract law, assist with litig...","[criminal law, contract law, assist with litig..."


In [24]:
dominant_skills_df.to_csv("dominant_skills_df_with_ESCO_mapping.csv", index=False)
print("download completed")

download completed
