In [14]:
from IPython.core.display import HTML
from bs4 import BeautifulSoup as bs
from pathlib import Path
import sys
import os
from collections import Counter
import numpy as np
import pandas as pd

def load_job_requirements_df(html_files_dir):
    """Build a DataFrame of job titles and their requirement bullets.

    Every ``*.htm*`` file directly inside *html_files_dir* is parsed. The job
    title is the lower-cased text of the page ``<title>`` up to its first
    ``-`` (``'unknown'`` when the page has no usable title). The requirements
    are the stripped, non-empty texts of all ``<li>`` elements. Postings that
    share a title are merged into a single row.

    Args:
        html_files_dir: Directory containing the saved job-posting pages.

    Returns:
        A ``pandas.DataFrame`` with the columns ``'Job Title'`` and
        ``'List of Requirements'``; the latter holds a de-duplicated list of
        strings per title. Titles without any requirement are left out.
    """
    # Maps each job title to every requirement listed under that title.
    job_requirements = {}

    # Sorted so the result does not depend on the filesystem's listing order.
    for html_file in sorted(Path(html_files_dir).glob("*.htm*")):
        # The pattern can also match directories; only regular files are pages.
        if not html_file.is_file():
            continue

        # Binary mode lets BeautifulSoup work out the page encoding itself
        # instead of relying on the platform default; naming the parser keeps
        # the parse tree the same regardless of which parsers are installed.
        with open(html_file, 'rb') as fp:
            soup = bs(fp, 'html.parser')

        title_tag = soup.find('title')
        job = ''
        if title_tag is not None:
            job = title_tag.text.split('-')[0].strip().lower()
        if not job:
            job = 'unknown'

        # Keep the text of each bullet, dropping bullets that are blank once
        # surrounding whitespace is removed.
        bullet_texts = (li.text.strip() for li in soup.find_all('li'))
        requirements = [text for text in bullet_texts if text]

        # A title seen before accumulates requirements; duplicates are removed
        # when the rows are built below.
        job_requirements.setdefault(job, []).extend(requirements)

    # One (job, requirements) row per title that has at least one requirement.
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output is reproducible from run to run.
    raw_data = [
        (job, list(dict.fromkeys(requirements)))
        for job, requirements in job_requirements.items()
        if requirements
    ]

    return pd.DataFrame(raw_data, columns=['Job Title', 'List of Requirements'])

    
"""
Assumptions: 
 (1) executed from within the resume-job-posting-nlp-project cloned repo
 (2) ./data/html_job_postings.zip already unzipped
"""

# Default location of the unzipped postings, relative to the current working
# directory.
html_files_dir = os.path.join(os.getcwd(), 'data', 'html_job_postings')

# Require an actual directory: a regular file at this path would pass a plain
# existence check, after which the glob finds nothing and an empty frame is
# silently pickled.
if not os.path.isdir(html_files_dir):
    # Pasted paths often carry stray leading/trailing whitespace.
    html_files_dir = input("Please enter the absolute path to the html files directory").strip()
    if not os.path.isdir(html_files_dir):
        sys.exit("Failed to locate html files folder!")

# Parse every posting into a (job title, requirements) DataFrame.
data = load_job_requirements_df(html_files_dir)

# Save to file (written relative to the current working directory).
pickled_file = 'data_frame_from_step_1.pkl'

data.to_pickle(pickled_file)

