In [None]:
import spacy
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK resources used by clean_text further down: the stop-word
# lists (stopwords.words) and the WordNet data behind WordNetLemmatizer.
# NOTE(review): some NLTK releases also need 'omw-1.4' for the lemmatizer —
# confirm against the installed version.
nltk.download(['stopwords', 'wordnet'])

In [None]:
def job_get_experience_expectations(experience_str):
    """Parse an experience range such as ``"5 to 15 Years"`` into months.

    Args:
        experience_str: Raw value of a job's experience field. Only text that
            starts with ``"<int> to <int> Years"`` is recognised.

    Returns:
        tuple: ``(min_months, preferred_months)`` as ints (years * 12), or
        ``(None, None)`` when the value is not a string or does not match.
    """
    # Empty CSV cells are read by pandas as float NaN (or may be None);
    # re.match raises TypeError on those, so treat them as "no information".
    if not isinstance(experience_str, str):
        return None, None

    match = re.match(r'(\d+)\s+to\s+(\d+)\s+Years', experience_str)
    if match is None:
        return None, None

    min_years, pref_years = match.groups()
    return int(min_years) * 12, int(pref_years) * 12

def job_get_skills(skills_str):
    """Split a free-text skills string into lower-cased skill phrases.

    A phrase starts at an ASCII capital letter, continues through the
    lowercase letters that follow it, and absorbs any subsequent
    whitespace-separated all-lowercase words. For example
    ``"Sales Customer relationship management"`` yields
    ``["sales", "customer relationship management"]``.

    Args:
        skills_str: Raw skills text of a job posting.

    Returns:
        list[str]: Lower-cased phrases in order of appearance; an empty list
        when the value is not a string (e.g. NaN for an empty CSV cell).
    """
    # Non-string cells (NaN/None) would make re raise TypeError.
    if not isinstance(skills_str, str):
        return []

    # NOTE(review): an all-caps acronym such as "SEO" is split into one phrase
    # per letter by this pattern — behaviour kept as-is, confirm it is intended.
    phrases = re.findall(r'([A-Z][a-z]*(?:\s[a-z]+)*)', skills_str)
    return [phrase.lower() for phrase in phrases]

In [None]:
# Load the raw job postings. The name jobs_df is rebound below to the reduced
# frame, so the raw columns are not available after this cell has run.
jobs_df = pd.read_csv('./data/training/job_descriptions.csv')
# Optional while prototyping: shuffle the rows and/or keep only the first 10.
# (The shuffle line needs numpy imported as np, which this notebook does not do.)
# jobs_df = jobs_df.reindex(np.random.permutation(jobs_df.index))
# jobs_df = jobs_df.copy().iloc[0:10,]

# 1-based sequential identifier, one per row.
jobs_df['job_id'] = range(1, len(jobs_df) + 1)
# Each (min, preferred) tuple is expanded into a two-column frame:
# column 0 holds the first element, column 1 the second (both in months,
# or missing when the 'Experience' text could not be parsed).
experience = jobs_df['Experience'].apply(lambda x: pd.Series(job_get_experience_expectations(x)))
# Keep only the fields used downstream, with the title lower-cased and the
# skills text converted to a list of phrases.
jobs_df = pd.DataFrame({
    'job_id': jobs_df['job_id'],
    'job_title': jobs_df['Job Title'].str.lower(),
    'skills': jobs_df['skills'].apply(job_get_skills),
    'min_experience': experience[0],
    'max_experience': experience[1],
})

In [None]:
# Preview the first 100 rows of the processed job postings.
jobs_df.head(100)

In [None]:
def create_spacy_pattern(title):
    """Turn a job title into a token-pattern dict labelled ``TITLE``.

    The title is split on whitespace and every word becomes one
    ``{"LOWER": <word lower-cased>}`` token spec, so the resulting pattern
    describes the title's words case-insensitively and in order.

    Args:
        title: Job title text.

    Returns:
        dict: ``{"label": "TITLE", "pattern": [...]}`` with one token spec
        per word (an empty list for an empty or blank title).
    """
    token_specs = []
    for word in title.split():
        token_specs.append({"LOWER": word.lower()})
    return {"label": "TITLE", "pattern": token_specs}

# Load the large English spaCy pipeline used to analyse the resume text.
nlp = spacy.load("en_core_web_lg")
# JSONL file with the entity-ruler patterns.
# NOTE(review): resume_get_skills filters on the "SKILL" label, so this file
# presumably defines SKILL entities — confirm against the file contents.
skill_pattern_path = "./data/preprocessing/jz_skill_patterns.jsonl"
# No position argument is given, so the ruler is added at the end of the pipeline.
# NOTE(review): a ruler placed after the model's NER may not override spans the
# NER has already labelled — confirm this ordering is intended.
ruler = nlp.add_pipe("entity_ruler")
ruler.from_disk(skill_pattern_path)

In [None]:
def resume_get_skills(doc):
    """Return the distinct ``SKILL`` entity texts of a processed document.

    Args:
        doc: Object exposing ``ents``, an iterable of entities that each have
            ``label_`` and ``text`` attributes (e.g. a spaCy ``Doc``).

    Returns:
        list[str]: Unique skill texts in order of first appearance.
    """
    skill_texts = (ent.text for ent in doc.ents if ent.label_ == "SKILL")
    # dict.fromkeys de-duplicates while preserving insertion order. The former
    # list(set(...)) produced a different order from one interpreter run to the
    # next (string hash randomisation), so the exported CSV was not reproducible.
    return list(dict.fromkeys(skill_texts))

# def resume_get_title(doc):
#     for ent in doc.ents:
#         if ent.label_ == "TITLE":
#             return ent.text
#     return None

def resume_get_title(text):
    """Extract the upper-case heading at the very start of a resume.

    The heading is the leading run of ASCII capitals and whitespace, cut at
    the first ``,``, ``:``, ``;``, newline or run of two or more whitespace
    characters, and returned in title case.

    Args:
        text: Raw resume text.

    Returns:
        str | None: The heading in title case (possibly ``""`` when the text
        starts with whitespace or a Title-case word only), or ``None`` when
        the text is not a string or does not start with a capital letter or
        whitespace.
    """
    if not isinstance(text, str):
        return None

    match = re.match(r'[A-Z\s]+', text)
    if match is None:
        return None

    heading = match.group(0)
    # The run above also swallows the initial capital of a following Title-case
    # word ("ACCOUNTANT Summary" -> "ACCOUNTANT S"). If the run stopped right
    # before a lowercase letter, drop that stray single capital.
    if text[match.end():match.end() + 1].islower():
        heading = re.sub(r'(?:^|(?<=\s))[A-Z]\Z', '', heading)

    # Keep only what precedes the first separator / wide gap, then normalise case.
    heading = re.split(r'[,:;\n]|\s{2,}', heading.strip())[0].strip()
    return heading.title()

# def resume_get_experience(cv_text):
#     # Define a pattern for experience
#     experience_pattern = r'\b(\d+)\s*(year|month)\b'
# 
#     # Find all experience mentions in the CV text
#     experience_found = re.findall(experience_pattern, cv_text)
# 
#     # Filter out None or empty entries and sum up experience
#     total_months = 0
#     for value, unit in experience_found:
#         if unit == "year":
#             total_months += int(value) * 12
#         elif unit == "month":
#             total_months += int(value)
# 
#     return total_months

def clean_text(text):
    """Normalise raw text for entity extraction.

    Steps, in order:
      1. Replace ``@handles``, every character that is not an ASCII letter,
         digit, space or tab, ``scheme://...`` URLs, a leading ``rt`` and
         ``http..."`` fragments with a space.
      2. Lower-case the result and split it on whitespace.
      3. Drop English stop words and lemmatize the remaining words with
         ``WordNetLemmatizer``.

    Args:
        text: Raw text.

    Returns:
        str: The surviving lemmas joined by single spaces.
    """
    # Raw string literal: the pattern handed to re is unchanged, but the
    # source no longer contains invalid escape sequences (\w, \/, \S).
    review = re.sub(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"', " ", text)
    words = review.lower().split()

    # Build the stop-word set once per call; previously the corpus list was
    # reloaded and converted to a set for every single word.
    stop_words = set(stopwords.words("english"))
    lm = WordNetLemmatizer()
    return " ".join(lm.lemmatize(word) for word in words if word not in stop_words)

def extract_experience_section(text):
    """Return the text that follows the first occurrence of "experience".

    The keyword is matched case-insensitively anywhere in the text. The
    section runs up to (not including) the first newline that is directly
    followed by an upper-case ASCII letter, or to the end of the text.

    Args:
        text: Raw resume text.

    Returns:
        str | None: The stripped section text (may be empty), or ``None``
        when the keyword is absent or the value is not a string.
    """
    if not isinstance(text, str):
        return None

    # Only the keyword is case-insensitive, via the scoped (?i:...) group.
    # A global re.IGNORECASE would also make the "\n[A-Z]" lookahead match
    # lowercase letters, cutting the section at the first line that starts
    # with any letter.
    section = re.search(r'(?i:experience)\s*([\s\S]*?)(?=\n[A-Z]|$)', text)
    return section.group(1).strip() if section else None

def extract_dates_and_calculate_months(text, now=None):
    """Sum the lengths, in whole months, of the ``MM/YYYY`` date ranges in text.

    Dates are read left to right and consumed as (start, end) pairs. A start
    date immediately followed by ``to``/dash and ``Current`` or ``Present``
    is treated as an ongoing range ending at ``now``; so is a final date
    left without a partner. Ranges whose length is not positive are ignored.

    Args:
        text: Text to scan, typically the experience section of a resume.
        now: ``datetime`` used as the end of ongoing ranges. Defaults to
            ``datetime.now()``; pass a fixed value for reproducible results.

    Returns:
        int: Total number of months; 0 when no dates are found or the value
        is not a string.
    """
    if not isinstance(text, str):
        return 0
    if now is None:
        now = datetime.now()

    # Group 1 = month, group 2 = year, group 3 = optional open-ended marker
    # ("to Current", "- Present", ...) directly after the date.
    date_token = re.compile(
        r'\b(0[1-9]|1[0-2])/((?:19|20)\d{2})\b'
        r'(\s*(?:to|[-\u2013\u2014])\s*(?:current|present)\b)?',
        re.IGNORECASE,
    )
    tokens = [
        (int(m.group(2)), int(m.group(1)), m.group(3) is not None)
        for m in date_token.finditer(text)
    ]

    total_months = 0
    i = 0
    while i < len(tokens):
        start_year, start_month, open_ended = tokens[i]
        if open_ended or i + 1 == len(tokens):
            # Ongoing position: consume only the start date. Without this,
            # "06/2018 to Current" was paired with the next job's start date
            # and every subsequent pair was shifted by one.
            end_year, end_month = now.year, now.month
            i += 1
        else:
            end_year, end_month, _ = tokens[i + 1]
            i += 2

        months_diff = (end_year - start_year) * 12 + end_month - start_month
        if months_diff > 0:
            total_months += months_diff

    return total_months

def normalize_skills(skills):
    """Return a new list holding the lower-cased form of every skill.

    Args:
        skills: Iterable of skill strings.

    Returns:
        list[str]: Lower-cased skills, in the input order.
    """
    lowered = []
    for skill in skills:
        lowered.append(skill.lower())
    return lowered

def resume_get_experience(text):
    """Total months of experience found in a resume's experience section.

    Args:
        text: Raw resume text.

    Returns:
        The value computed by ``extract_dates_and_calculate_months`` on the
        section returned by ``extract_experience_section``, or 0 when that
        section is missing or empty.
    """
    section = extract_experience_section(text)
    # Guard clause: nothing to measure without a non-empty section.
    if not section:
        return 0
    return extract_dates_and_calculate_months(section)

In [None]:
# Load the raw resumes. The name resume_df is rebound at the end of this cell
# to the processed frame (one row per candidate).
resume_df = pd.read_csv('./data/training/general-resumes-dataset.csv')
# Optional while prototyping: shuffle the rows and/or keep only the first 10.
# resume_df = resume_df.reindex(np.random.permutation(resume_df.index))
# resume_df = resume_df.copy().iloc[0:10,]

# 1-based sequential identifier, one per resume.
resume_df['candidate_id'] = range(1, len(resume_df) + 1)

processed_resumes = []
for _, row in resume_df.iterrows():
    # An empty CSV cell is read as NaN; treat it as empty text so the regex
    # and NLP helpers below do not raise on a non-string.
    raw_text = row['Resume_str']
    text = raw_text if isinstance(raw_text, str) else ""

    # resume_get_title returns None when the text has no leading heading;
    # calling .lower() on that unconditionally raised AttributeError.
    title = resume_get_title(text)
    job_title = title.lower() if title is not None else None

    doc = nlp(clean_text(text))
    skills = resume_get_skills(doc)
    # skills_section = re.search(r'Skills\s*([\s\S]*?)(?=\n[A-Z]|$)', text, re.IGNORECASE)
    # skills = normalize_skills(skills_section.group(1).strip().split(', ')) if skills_section else resume_get_skills(doc)

    processed_resumes.append({
        "candidate_id": row['candidate_id'],
        "category": row['Category'].lower(),
        "job_title": job_title,
        "skills": skills,
        # Same computation as the former inline section/date extraction,
        # now done through the helper written for it.
        "experience": resume_get_experience(text),
    })

resume_df = pd.DataFrame(processed_resumes)

In [None]:
# Preview the first 100 rows of the processed resumes.
resume_df.head(100)

In [None]:
# Persist both processed frames as CSV.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column — confirm downstream readers expect it.
# NOTE(review): the 'skills' column holds Python lists, which CSV stores as their
# string representation — readers must parse them back.
jobs_df.to_csv('./data/processed/job_descriptions_processed.csv')
resume_df.to_csv("./data/processed/general-resume-dataset-processed-v1.csv")