In [10]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from skillNer.skill_extractor_class import SkillExtractor
import json
from tqdm import tqdm

# Input dataset; the annotation loop in a later cell reads its
# "processed_description" column.
csv_file_path = "1.7_dataset.csv"
df = pd.read_csv(csv_file_path)

# spaCy pipeline handed to the skill extractor below.
nlp = spacy.load("en_core_web_lg")

# Skill database: a JSON object keyed by skill ID. The encoding is pinned to
# UTF-8 so the file is decoded the same way on every platform instead of
# falling back to the locale's default encoding.
with open("skill_db_relax_20.json", "r", encoding="utf-8") as json_file:
    skill_db = json.load(json_file)

# Build the extractor from the pipeline, the skill database and spaCy's
# PhraseMatcher class.
skill_extractor = SkillExtractor(nlp, skill_db, PhraseMatcher)

# Function to extract and process skills
def process_skills(description):
    """Annotate one job description with the module-level ``skill_extractor``.

    Args:
        description: Text to annotate. ``None``, a float NaN (what pandas
            yields for an empty CSV cell) and blank/whitespace-only strings
            are treated as "no text".

    Returns:
        The value of ``skill_extractor.annotate(description)``. For "no text"
        inputs the extractor is not called and an empty result is returned
        with the same ``["results"]["full_matches"]`` /
        ``["results"]["ngram_scored"]`` keys that ``categorize_skills`` reads.
    """
    # NaN is the only float that is not equal to itself.
    is_missing = description is None or (
        isinstance(description, float) and description != description
    )
    is_blank = isinstance(description, str) and not description.strip()

    if is_missing or is_blank:
        # Nothing to match against: skip the extractor instead of letting a
        # single empty row abort a long-running batch.
        return {"text": "", "results": {"full_matches": [], "ngram_scored": []}}

    return skill_extractor.annotate(description)

def get_skill_names(skill_ids):
    """Translate skill IDs into the ``skill_name`` stored in ``skill_db``.

    The output has one entry per input ID, in the same order. An ID that is
    absent from ``skill_db``, or whose entry has no ``"skill_name"`` key,
    contributes ``None``.
    """
    names = []
    for sid in skill_ids:
        entry = skill_db.get(sid, {})
        names.append(entry.get("skill_name"))
    return names

# Function to categorize skills
def categorize_skills(skill_annotations):
    """Split the skill IDs of an annotation result into hard and soft skills.

    Entries under ``results["full_matches"]`` are visited first, then those
    under ``results["ngram_scored"]``. Each entry's ``skill_id`` is looked up
    in ``skill_db`` and filed according to its ``skill_type``. IDs whose type
    is neither "Hard Skill" nor "Soft Skill" (including IDs missing from
    ``skill_db``) are dropped; repeated IDs are kept.

    Returns:
        A ``(hard_skills, soft_skills)`` pair of ID lists in encounter order.
    """
    hard_ids, soft_ids = [], []
    results = skill_annotations["results"]

    # Both result sections carry a "skill_id" per entry, so one loop serves both.
    for section in ("full_matches", "ngram_scored"):
        for match in results[section]:
            sid = match["skill_id"]
            kind = skill_db.get(sid, {}).get("skill_type")

            if kind == "Hard Skill":
                hard_ids.append(sid)
            elif kind == "Soft Skill":
                soft_ids.append(sid)

    return hard_ids, soft_ids


loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [None]:
# Collect one list of skill names per row, then attach each column in a single
# assignment instead of writing list objects into the frame cell by cell.
hard_skill_column = []
soft_skill_column = []

# tqdm wraps the column directly, so the progress bar advances once per
# description without manual update() calls.
for description in tqdm(df["processed_description"], total=len(df)):
    annotations = process_skills(description)
    hard_skill_ids, soft_skill_ids = categorize_skills(annotations)
    hard_skill_column.append(get_skill_names(hard_skill_ids))
    soft_skill_column.append(get_skill_names(soft_skill_ids))

# dtype=object keeps every cell a plain Python list; reusing df.index lines the
# values up with the rows in the order they were iterated above.
df["hard_skill"] = pd.Series(hard_skill_column, index=df.index, dtype=object)
df["soft_skill"] = pd.Series(soft_skill_column, index=df.index, dtype=object)


7it [00:17,  3.15s/it]

In [None]:

# Write the frame, including the new skill columns, to a separate CSV file;
# the row index is left out of the output.
updated_csv_file_path = "1.9_dataset.csv"
df.to_csv(path_or_buf=updated_csv_file_path, index=False)