In [None]:
import spacy
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK corpora this notebook relies on: the stop-word list used for
# filtering and the WordNet data used by the lemmatizer.
required_corpora = ['stopwords', 'wordnet']
nltk.download(required_corpora)

In [None]:
# Location of the JSONL file holding the entity-ruler patterns for skills.
skill_pattern_path = "./data/preprocessing/jz_skill_patterns.jsonl"

# Load the large English spaCy pipeline and attach an entity ruler to it.
nlp = spacy.load("en_core_web_lg")
ruler = nlp.add_pipe("entity_ruler")

# Populate the ruler from the pattern file on disk.
# NOTE(review): presumably these patterns carry the "SKILL" label that
# get_skills filters on -- confirm against the JSONL file.
ruler.from_disk(skill_pattern_path)

In [None]:
def create_spacy_pattern(title):
    """Build a token-pattern dict labelled ``TITLE`` from a job title.

    The title is split on whitespace; each resulting word becomes one token
    spec of the form ``{"LOWER": <word lower-cased>}``.

    Args:
        title: Job title string, e.g. ``"Data Scientist"``.

    Returns:
        A dict with keys ``"label"`` (always ``"TITLE"``) and ``"pattern"``
        (the list of per-token specs, empty for a blank title).
    """
    token_specs = []
    for token in title.split():
        token_specs.append({"LOWER": token.lower()})
    return {"label": "TITLE", "pattern": token_specs}

In [None]:
def get_skills(doc):
    """Return the unique ``SKILL`` entity texts found in ``doc``.

    Duplicates are removed while keeping the order in which each skill first
    appears in ``doc.ents``, so the result is deterministic across runs
    (a plain ``set`` round-trip yields an arbitrary, hash-dependent order).

    Args:
        doc: An object exposing ``ents``, an iterable of entities that each
            have ``text`` and ``label_`` attributes (e.g. a spaCy ``Doc``).

    Returns:
        A list of distinct skill strings in first-seen order; empty if the
        document contains no ``SKILL`` entities.
    """
    skills = (ent.text for ent in doc.ents if ent.label_ == "SKILL")
    # dict preserves insertion order, giving an order-stable de-duplication.
    return list(dict.fromkeys(skills))

In [None]:
def get_title(doc):
    """Return the text of the first ``TITLE`` entity in ``doc``, or ``None``.

    Args:
        doc: An object exposing ``ents``, an iterable of entities that each
            have ``text`` and ``label_`` attributes (e.g. a spaCy ``Doc``).

    Returns:
        The ``text`` of the first entity whose ``label_`` is ``"TITLE"``;
        ``None`` when no such entity exists.
    """
    title_texts = (entity.text for entity in doc.ents if entity.label_ == "TITLE")
    return next(title_texts, None)

In [None]:
def extract_experience(cv_text):
    """Sum every "<number> year(s)" / "<number> month(s)" mention, in months.

    Matching is case-insensitive and accepts both singular and plural units,
    so the function works on raw text ("5 Years") as well as on lemmatized
    text ("5 year"). The number and unit may be separated by optional
    whitespace. Every mention is added to the total, so durations that are
    stated more than once in the text are counted more than once.

    Args:
        cv_text: Resume text to scan. Non-string values (e.g. ``None`` or a
            NaN from a missing cell) are treated as containing no mentions.

    Returns:
        Total experience as an ``int`` number of months (years count as 12
        months each); ``0`` when nothing matches.
    """
    if not isinstance(cv_text, str):
        return 0

    # The optional "s" sits before the closing \b; without it the word
    # boundary after "year"/"month" rejects the plural forms.
    experience_pattern = r'\b(\d+)\s*(year|month)s?\b'
    months_per_unit = {"year": 12, "month": 1}

    mentions = re.findall(experience_pattern, cv_text, flags=re.IGNORECASE)
    # unit is lower-cased because IGNORECASE may capture "Year", "MONTH", ...
    return sum(int(value) * months_per_unit[unit.lower()] for value, unit in mentions)

In [None]:
def clean_text(text):
    """Normalise raw text into a space-joined string of lemmatized tokens.

    Steps, in order:
      1. Replace with a space: ``@handle`` mentions, any character that is
         not an ASCII letter, digit, space or tab, ``scheme://...`` URLs, a
         leading ``rt``, and ``http...`` runs up to the next double quote.
      2. Lower-case the text and split it on whitespace.
      3. Drop NLTK English stop words, then lemmatize the remaining tokens
         with ``WordNetLemmatizer`` (default part of speech).

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces (empty string if nothing
        survives).
    """
    # Raw string: avoids invalid-escape warnings for \w, \/ and \S.
    noise_pattern = r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"'
    tokens = re.sub(noise_pattern, " ", text).lower().split()

    # Build the stop-word set once per call rather than once per token.
    stop_words = set(stopwords.words("english"))
    lm = WordNetLemmatizer()
    return " ".join(lm.lemmatize(token) for token in tokens if token not in stop_words)

In [None]:
# Read the job-title list and register one TITLE pattern per title on the
# entity ruler.
titles_df = pd.read_json('./data/preprocessing/job-titles.json')
title_patterns = list(map(create_spacy_pattern, titles_df['job-titles'].to_numpy()))
ruler.add_patterns(title_patterns)

In [None]:
# Sampling configuration. A fixed seed makes the sampled resumes (and so the
# table below) identical on every Restart & Run All.
SAMPLE_SIZE = 10
RANDOM_SEED = 42

resumes_raw = pd.read_csv('./data/training/resume-dataset.csv')

# Draw a random subset directly instead of shuffling the whole frame; min()
# keeps this working when the dataset has fewer rows than SAMPLE_SIZE.
resumes_sample = resumes_raw.sample(
    n=min(SAMPLE_SIZE, len(resumes_raw)),
    random_state=RANDOM_SEED,
)

# Run each sampled resume through cleaning, the spaCy pipeline and the
# extractors, collecting one record per resume.
processed_resumes = []
for text in resumes_sample['Resume']:
    cv_text = clean_text(text)
    doc = nlp(cv_text)
    processed_resumes.append({
        "job-title": get_title(doc),
        "skills": get_skills(doc),
        "experience_months": extract_experience(cv_text),
    })

# `df` holds the extracted fields (one row per processed resume).
df = pd.DataFrame(processed_resumes)
df.head()

In [None]:
# Display the DataFrame of extracted resume fields built in the previous cell.
df