In [1]:
import pandas as pd
import spacy
from spacy.matcher import PhraseMatcher
from skillNer.skill_extractor_class import SkillExtractor
import json
from tqdm import tqdm

# Load the CSV dataset into the working DataFrame `df`.
csv_file_path = "1.7.2_dataset.csv"
df = pd.read_csv(csv_file_path)

# Load the spaCy pipeline that the skill extractor is built on.
nlp = spacy.load("en_core_web_lg")

# Load the skill database from the JSON file. The encoding is given
# explicitly: without it, open() falls back to the platform's locale
# encoding, which can mis-decode non-ASCII skill names on non-UTF-8 systems.
with open("skill_db_relax_20.json", "r", encoding="utf-8") as json_file:
    skill_db = json.load(json_file)

# Initialize the skill extractor with the spaCy pipeline, the loaded skill
# database and spaCy's PhraseMatcher class.
skill_extractor = SkillExtractor(nlp, skill_db, PhraseMatcher)

# Function to extract and process skills
def process_skills(description):
    """Annotate ``description`` using the module-level ``skill_extractor``.

    Parameters
    ----------
    description
        Value forwarded unchanged to ``skill_extractor.annotate``.

    Returns
    -------
    The object returned by ``skill_extractor.annotate(description)``,
    passed through without modification.
    """
    return skill_extractor.annotate(description)

def get_skill_names(skill_ids):
    """Look up the ``"skill_name"`` for each ID in the module-level ``skill_db``.

    Parameters
    ----------
    skill_ids
        Iterable of skill IDs used as keys into ``skill_db``.

    Returns
    -------
    list
        One entry per input ID, in input order. An entry is ``None`` when the
        ID is not in ``skill_db`` or its record has no ``"skill_name"`` key.
    """
    names = []
    for sid in skill_ids:
        # Unknown IDs fall back to an empty record, which yields None below.
        record = skill_db.get(sid, {})
        names.append(record.get("skill_name"))
    return names

# Function to categorize skills
def categorize_skills(skill_annotations):
    """Split annotated skill IDs into hard-skill and soft-skill lists.

    Parameters
    ----------
    skill_annotations
        Mapping with a ``"results"`` entry that holds two iterables,
        ``"full_matches"`` and ``"ngram_scored"``; every item in them must
        provide a ``"skill_id"`` key.

    Returns
    -------
    tuple[list, list]
        ``(hard_skills, soft_skills)``: skill IDs whose ``"skill_type"`` in
        the module-level ``skill_db`` is ``"Hard Skill"`` or ``"Soft Skill"``
        respectively. Full matches come first, then n-gram matches; IDs are
        not de-duplicated. IDs that are unknown or of any other type are
        dropped.
    """
    hard_ids, soft_ids = [], []
    results = skill_annotations["results"]

    # Both result groups are handled identically, full matches first.
    for group in ("full_matches", "ngram_scored"):
        for match in results[group]:
            sid = match["skill_id"]
            kind = skill_db.get(sid, {}).get("skill_type")

            if kind == "Hard Skill":
                hard_ids.append(sid)
            elif kind == "Soft Skill":
                soft_ids.append(sid)

    return hard_ids, soft_ids


loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


In [38]:
# Re-read the dataset from disk. This rebinds `df`, so any in-memory changes
# made to the previously loaded frame are discarded.
csv_file_path = "1.7.2_dataset.csv"
df = pd.read_csv(csv_file_path)

In [39]:
# Inclusive positional bounds of the rows to process.
start_index = 0

end_index = 18040

# iloc clips the slice to the frame's length, so it may hold fewer rows than
# end_index - start_index + 1. The progress-bar total is therefore taken from
# the slice itself rather than from the bounds.
descriptions = df.iloc[start_index:end_index + 1]["processed_description"]

# Make sure both output columns exist with object dtype before strings are
# written into them. A column that read_csv inferred as float (e.g. all-NaN
# from an earlier run) would otherwise hit pandas' incompatible-dtype
# deprecation on assignment.
for column in ("hard_skills", "soft_skills"):
    if column in df.columns:
        df[column] = df[column].astype("object")
    else:
        df[column] = pd.Series(index=df.index, dtype="object")

# tqdm wraps the iterable directly, so the bar advances once per row.
for index, description in tqdm(descriptions.items(), total=len(descriptions)):
    if isinstance(description, str) and description.strip():
        skill_annotations = process_skills(description)
        hard_skills, soft_skills = categorize_skills(skill_annotations)
    else:
        # Missing (NaN) or blank descriptions contain no skills; skip the
        # extractor instead of passing it a non-string value.
        hard_skills, soft_skills = [], []

    # Get skill names from skill IDs
    hard_skill_names = get_skill_names(hard_skills)
    soft_skill_names = get_skill_names(soft_skills)

    # Convert lists of skill names to comma-separated strings.
    # get_skill_names yields None for records without a name; those are
    # dropped so str.join cannot raise TypeError.
    hard_skill_str = ", ".join(name for name in hard_skill_names if name)
    soft_skill_str = ", ".join(name for name in soft_skill_names if name)

    # Update the DataFrame row by row, so results gathered so far remain in
    # `df` if the loop is interrupted.
    df.at[index, "hard_skills"] = hard_skill_str
    df.at[index, "soft_skills"] = soft_skill_str



  vec_similarity = token1.similarity(token2)
100%|██████████| 21/21 [00:06<00:00,  3.36it/s]


In [40]:

# # Save the updated DataFrame to a new CSV file
# updated_csv_file_path = "1.8_dataset.csv"
# df.to_csv(updated_csv_file_path, index=False)

In [42]:
# Save the updated DataFrame to CSV. NOTE: this path is the same file the
# dataset was read from, so the input file is overwritten in place rather
# than a new file being created.
output_csv_file_path = "1.7.2_dataset.csv"
df.to_csv(output_csv_file_path, index=False)
