In [None]:
import spacy
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK data this notebook relies on: the stop-word lists used by
# clean_text and the WordNet corpus behind WordNetLemmatizer.
nltk_resources = ['stopwords', 'wordnet']
nltk.download(nltk_resources)

In [None]:
def parse_experience_expectations(experience_str):
    """Convert an experience range such as ``"5 to 15 Years"`` into months.

    Parameters
    ----------
    experience_str : str
        Text expected to start with ``"<min> to <max> Years"``.

    Returns
    -------
    tuple
        ``(min_months, max_months)`` as integers, or ``(None, None)`` when the
        value is not a string or does not start with the expected pattern.
    """
    # An empty CSV cell reaches this function as a float NaN; re.match would
    # raise TypeError on it, so any non-string is treated as "no information".
    if not isinstance(experience_str, str):
        return None, None

    match = re.match(r'(\d+)\s+to\s+(\d+)\s+Years', experience_str)
    if match is None:
        return None, None

    min_years, pref_years = (int(years) for years in match.groups())
    return min_years * 12, pref_years * 12

def split_skills(skills_str):
    """Split a run-together skills string into lower-cased phrases.

    A phrase begins at an upper-case letter, continues through the lower-case
    letters that follow it, and absorbs any subsequent whitespace-separated
    words made only of lower-case letters. Text that does not fit this shape
    is skipped.

    Returns the phrases as a list of lower-case strings, in order of
    appearance.
    """
    phrases = re.findall(r'([A-Z][a-z]*(?:\s[a-z]+)*)', skills_str)
    return list(map(str.lower, phrases))

# Load the raw job postings.
jobs_df = pd.read_csv('./data/training/job_descriptions.csv')

# Expand every 'Experience' string into a two-column frame:
# column 0 holds the lower bound in months, column 1 the upper bound.
experience = jobs_df['Experience'].apply(
    lambda raw_range: pd.Series(parse_experience_expectations(raw_range))
)

# Replace the raw frame with the normalised columns used downstream.
jobs_df = pd.DataFrame(dict(
    job_id=jobs_df['Job Id'],
    job_title=jobs_df['Job Title'].str.lower(),
    skills=jobs_df['skills'].apply(split_skills),
    min_experience=experience[0],
    max_experience=experience[1],
))

In [None]:
def create_spacy_pattern(title):
    """Build an entity-ruler pattern dict that labels ``title`` as ``TITLE``.

    The title is split on whitespace; each piece becomes one token spec that
    matches on the piece's lower-cased form.
    """
    token_specs = []
    for piece in title.split():
        token_specs.append({"LOWER": piece.lower()})
    return {"label": "TITLE", "pattern": token_specs}

# Load the large English pipeline and append an entity ruler to it.
nlp = spacy.load("en_core_web_lg")
ruler = nlp.add_pipe("entity_ruler")

# Ruler patterns stored on disk (presumably the SKILL patterns that
# get_skills filters on -- confirm against the JSONL file).
skill_pattern_path = "./data/preprocessing/jz_skill_patterns.jsonl"
ruler.from_disk(skill_pattern_path)

# Add one TITLE pattern per entry in the job-titles JSON file.
titles_df = pd.read_json('./data/preprocessing/job-titles.json')
title_patterns = list(map(create_spacy_pattern, titles_df['job-titles'].to_numpy()))
ruler.add_patterns(title_patterns)

def get_skills(doc):
    """Return the distinct ``SKILL`` entity texts found in ``doc``.

    Parameters
    ----------
    doc
        Any object exposing ``ents``, an iterable of entities with ``text``
        and ``label_`` attributes.

    Returns
    -------
    list of str
        Each skill text once, in order of first appearance.
    """
    skill_texts = (ent.text for ent in doc.ents if ent.label_ == "SKILL")
    # dict.fromkeys de-duplicates while keeping insertion order. A plain
    # list(set(...)) yields an order that varies between interpreter runs
    # (string hash randomisation), which makes the saved output unstable.
    return list(dict.fromkeys(skill_texts))

def get_title(doc):
    """Return the text of the first ``TITLE`` entity in ``doc``.

    Entities are scanned in the order ``doc.ents`` yields them; ``None`` is
    returned when no entity carries the ``TITLE`` label.
    """
    title_texts = (ent.text for ent in doc.ents if ent.label_ == "TITLE")
    return next(title_texts, None)

def extract_experience(cv_text):
    """Sum every "<number> year(s)" / "<number> month(s)" mention as months.

    Parameters
    ----------
    cv_text : str
        Resume text, raw or already cleaned.

    Returns
    -------
    int
        Total months across all mentions (a year counts as 12 months);
        ``0`` when nothing matches or ``cv_text`` is not a string.
    """
    # Non-string input (e.g. NaN from a missing cell) carries no mentions.
    if not isinstance(cv_text, str):
        return 0

    # The optional "s" lets plural forms match; without it the trailing \b
    # rejects "years"/"months" and only singular (lemmatised) text is counted.
    experience_pattern = r'\b(\d+)\s*(year|month)s?\b'

    total_months = 0
    for value, unit in re.findall(experience_pattern, cv_text, flags=re.IGNORECASE):
        # Matching is case-insensitive, so normalise the unit before comparing.
        months_per_unit = 12 if unit.lower() == "year" else 1
        total_months += int(value) * months_per_unit

    return total_months

def clean_text(text):
    """Normalise free text before it is passed to the NLP pipeline.

    Steps, in order:
      1. Replace with a space: ``@`` followed by alphanumerics, any character
         that is not an ASCII letter, digit, space or tab, ``scheme://...``
         URLs, a leading ``rt``, and ``http ... "`` spans.
      2. Lower-case the result and split it on whitespace.
      3. Drop English stop words and lemmatise the remaining tokens.

    Returns the surviving tokens joined by single spaces.
    """
    # Raw string: same regex as before, but without the invalid string escapes
    # (\w, \/, \S) that newer Python versions warn about.
    review = re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"', " ", text)
    tokens = review.lower().split()

    # Build the stop-word set once per call rather than once per token.
    stop_words = set(stopwords.words("english"))
    lm = WordNetLemmatizer()
    return " ".join(lm.lemmatize(word) for word in tokens if word not in stop_words)

# Load the raw resumes and give every row a 1-based candidate id.
resume_df = pd.read_csv('./data/training/resume-dataset.csv')
resume_df['candidate_id'] = range(1, len(resume_df) + 1)

# Clean each resume, run it through the spaCy pipeline, and collect one
# record per candidate with the extracted title, skills and experience.
processed_resumes = []
for _, row in resume_df.iterrows():
    cv_text = clean_text(row['Resume'])
    doc = nlp(cv_text)
    processed_resumes.append(dict(
        candidate_id=row['candidate_id'],
        job_title=get_title(doc),
        skills=get_skills(doc),
        experience=extract_experience(cv_text),
    ))

# Replace the raw frame with the extracted features and preview the result.
resume_df = pd.DataFrame(processed_resumes)
resume_df.head()

In [None]:
# Persist both processed tables. No index option is passed, so to_csv's
# default applies and the DataFrame index is written as the first column.
jobs_df.to_csv(path_or_buf='./data/processed/job_descriptions_processed.csv')
resume_df.to_csv(path_or_buf="./data/processed/resume-dataset-processed.csv")