### Avoid nested scrolling (scroll-within-scroll) across the entire notebook

In [1]:
from IPython.display import Javascript
def resize_colab_cell(info=None):
    """Ask Colab to resize the current output iframe (``maxHeight: 5000``).

    Parameters
    ----------
    info : optional
        Execution-info object that IPython documents passing to
        ``pre_run_cell`` callbacks. It is accepted so the callback matches
        that signature and is otherwise unused; the ``None`` default keeps
        zero-argument calls working.
    """
    display(Javascript('google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'))


# Run the resize hook before every cell execution.
get_ipython().events.register('pre_run_cell', resize_colab_cell)

# Preprocessing **`Job Description`**

### Importing dataset from Hugging Face

In [2]:
!pip install datasets

<IPython.core.display.Javascript object>

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.1-py3-none-a

In [3]:
from datasets import load_dataset

# Module-level dataset object; the JD_extract_* helpers below index into dataset["train"].
dataset = load_dataset("jacob-hugging-face/job-descriptions")
# Print the dataset summary (splits, feature names, row counts).
print(dataset)

<IPython.core.display.Javascript object>

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.77M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['company_name', 'job_description', 'position_title', 'description_length', 'model_response'],
        num_rows: 853
    })
})


### Extracting *`Company Name`*

In [4]:
def JD_extract_company(job_num):
    """Return the ``company_name`` field of row ``job_num`` in the train split.

    Reads the module-level ``dataset`` object.
    """
    job_record = dataset["train"][job_num]
    return job_record["company_name"]

<IPython.core.display.Javascript object>

### Extracting *`Position`*

In [5]:
def JD_extract_position(job_num):
    """Return the ``position_title`` field of row ``job_num`` in the train split.

    Reads the module-level ``dataset`` object.
    """
    job_record = dataset["train"][job_num]
    return job_record["position_title"]

<IPython.core.display.Javascript object>

### Extracting *`Required Skills`*

In [6]:
def JD_extract_required_skills(job_num):
    """Return the "Required Skills" entry of a job's model response.

    The ``model_response`` field of row ``job_num`` in the train split is a
    string holding a dict literal; it is parsed and the "Required Skills"
    value is returned, or the string "N/A" when that key is absent.

    Raises
    ------
    ValueError, SyntaxError
        If ``model_response`` is not a valid Python literal.
    """
    # Function-scope import keeps this cell runnable on its own.
    import ast

    raw_response = dataset["train"][job_num]["model_response"]
    # literal_eval accepts only literals, so text coming from the downloaded
    # dataset cannot execute arbitrary code the way eval() would allow.
    model_response = ast.literal_eval(raw_response)
    if "Required Skills" in model_response:
        return model_response["Required Skills"]
    return "N/A"

<IPython.core.display.Javascript object>

### Extracting *`Educational Requirements`*

In [7]:
def JD_extract_educational_requirements(job_num):
    """Return the "Educational Requirements" entry of a job's model response.

    The ``model_response`` field of row ``job_num`` in the train split is a
    string holding a dict literal; it is parsed and the
    "Educational Requirements" value is returned, or the string "N/A" when
    that key is absent.

    Raises
    ------
    ValueError, SyntaxError
        If ``model_response`` is not a valid Python literal.
    """
    # Function-scope import keeps this cell runnable on its own.
    import ast

    raw_response = dataset["train"][job_num]["model_response"]
    # literal_eval accepts only literals, so text coming from the downloaded
    # dataset cannot execute arbitrary code the way eval() would allow.
    model_response = ast.literal_eval(raw_response)
    if "Educational Requirements" in model_response:
        return model_response["Educational Requirements"]
    return "N/A"

<IPython.core.display.Javascript object>

# Preprocessing **`Resume Data`**

### Extracting text from resume PDF

In [8]:
!pip install PyPDF2

<IPython.core.display.Javascript object>

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [9]:
import PyPDF2


def extract_resume_text(filePath):
    """Return the concatenated text of every page of the PDF at ``filePath``.

    Parameters
    ----------
    filePath : path to a PDF file, opened in binary mode.

    Returns
    -------
    str
        Page texts joined in page order with no separator; an empty string
        for a PDF with no pages.
    """
    # Keep the file open while the pages are read; the context manager then
    # closes the handle, which the previous version never did.
    with open(filePath, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() for page in reader.pages)

<IPython.core.display.Javascript object>

### Extracting *`Category (Job Role)`*

In [10]:
def extract_category(text):
    """Return the first line of ``text`` with surrounding whitespace removed.

    Leading and trailing whitespace of the whole text is dropped first.
    Returns None when nothing but whitespace remains.
    """
    trimmed = text.strip()
    if not trimmed:
        return None
    first_line = trimmed.split("\n")[0]
    return first_line.strip()

<IPython.core.display.Javascript object>

### Extracting *`Education`*

In [11]:
import re


def extract_education(text):
    """Return the education keywords found in ``text``.

    Each keyword is searched case-insensitively as a whole word. For every
    keyword that occurs, the first matched substring (with the casing used in
    ``text``) is collected, in keyword-list order.
    """
    education_keywords = ['Bsc', 'B. Pharmacy', 'B Pharmacy', 'Msc', 'M. Pharmacy', 'Ph.D', 'Bachelor', 'Master']

    # One search per keyword; keywords that do not occur yield None and are skipped.
    hits = (
        re.search(r"(?i)\b{}\b".format(re.escape(keyword)), text)
        for keyword in education_keywords
    )
    return [hit.group() for hit in hits if hit]

<IPython.core.display.Javascript object>

### Extracting *`Skills`*

In [12]:
!pip install spacy
!python -m spacy info
!python -m pip freeze | grep spacy
!python -m spacy download en_core_web_sm

<IPython.core.display.Javascript object>

2023-09-16 08:28:45.544821: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[1m

spaCy version    3.6.1                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-5.15.109+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.6.0)        

en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl#sha256=83276fc78a70045627144786b52e1f2728ad5e29e5e43916ec37ea9c26a11212
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.4
2023-09-16 08:28:56.444650: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in pe

In [13]:
import spacy
# Load the en_core_web_sm pipeline once at cell level; extract_skills calls this object for every text.
nlp = spacy.load("en_core_web_sm")


def extract_skills(text):
    """Return the unique token texts of ``text`` whose tag contains "NN".

    ``text`` is run through the module-level ``nlp`` pipeline and every token
    whose ``tag_`` contains "NN" is kept (presumably the noun tags of the
    pipeline's tag set; confirm against the model's label scheme).

    Returns
    -------
    list of str
        Unique token texts in order of first appearance. Going through a
        plain ``set`` gave an order that could differ between runs, which
        made the printed results non-reproducible.
    """
    doc = nlp(text)
    # dict.fromkeys drops duplicates while preserving first-seen order.
    return list(dict.fromkeys(token.text for token in doc if "NN" in token.tag_))

<IPython.core.display.Javascript object>

### Uploading Resume PDFs as `Zip` and Extracting

In [14]:
from google.colab import files
import zipfile
import io


# files.upload() returns a mapping of uploaded file name -> file content (bytes).
resume_folder = files.upload()

# Stays None unless a ZIP archive is uploaded, so the check below can report
# the problem clearly instead of failing with a NameError on `path`.
path = None

for folder in resume_folder.keys():
    if folder.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(resume_folder[folder]), "r") as zip_ref:
            zip_ref.extractall("/content/")
        # Drop only the trailing ".zip"; str.replace would also remove any
        # other ".zip" occurring earlier in the name.
        # Assumes the archive holds a top-level folder named after the ZIP file.
        path = "/content/" + folder[: -len(".zip")]
    else:
        print("Enter a ZIP file containing resume PDFs")

if path is None:
    raise ValueError("No ZIP file was uploaded; upload a ZIP file containing resume PDFs")

print("Path: ", path)

<IPython.core.display.Javascript object>

Saving small_tempResume.zip to small_tempResume.zip
Path:  /content/small_tempResume


# Driver Function

In [18]:
import PyPDF2, glob, os


# considering first 10 jobs from dataset
jobs = 10
pdf_files = glob.glob(os.path.join(path, "*.pdf"))

# Parse every resume once up front: the extracted fields do not depend on the
# job, so re-reading and re-tagging each PDF for every job repeated the same
# work `jobs` times.
resume_features = []
for pdf_file in pdf_files:
    thisPDF_text = extract_resume_text(pdf_file)
    resume_features.append(
        (
            extract_category(thisPDF_text),
            extract_skills(thisPDF_text),
            extract_education(thisPDF_text),
        )
    )

# A distinct loop variable per loop: the inner loop previously reused `i`,
# shadowing the job index with a file path.
for job_num in range(jobs):
    JD_company = JD_extract_company(job_num)
    JD_position = JD_extract_position(job_num)
    JD_req_skills = JD_extract_required_skills(job_num)
    JD_req_edu = JD_extract_educational_requirements(job_num)

    print(JD_company)
    print(JD_position)
    print(JD_req_skills)
    print(JD_req_edu)
    print()

    # Print every resume's extracted fields under the current job.
    for CV_category, CV_skills, CV_education in resume_features:
        print(CV_category)
        print(CV_skills)
        print(CV_education)
        print()

    print("\n\n\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n\n\n")

<IPython.core.display.Javascript object>

Google
Sales Specialist
Bachelor's degree or equivalent experience. Experience managing enterprise SaaS accounts and sales cycles.
Bachelor's degree or equivalent experience.

SOFTWARE ENGINEERING MANAGER
['Beijing', 'engine', 'product', 'City', 'Engineer', 'encoder', 'servers', 'DTA', 'formats', 'bridge', 'projects', 'engineers', 'service', 'Development', 'Visual', 'Transcoder', 'Helper', 'code', 'Current', 'startup', 'developers', 'Project', 'India', 'scalability', 'development', 'management', 'HTML5', 'backup', 'explorer', 'Hercules', 'crash', 'concept', 'player', 'Principal', 'personnel', 'holding', 'desktop', 'years', 'tools', 'architecture', 'Helios', 'Engineering', 'protocol', 'SAX', 'integration', 'documents', 'ScrumMaster', '-', 'project', 'document', 'Experience', 'teams', 'Editing', 'client', 'layer', 'Scavenger', 'backend', 'Technology', 'functions', 'engineering', 'spinoff', 'URI', 'nodes', 'Initiated', 'Encoder', 'MediaBase', 'AAC', 'JavaScript', 'time', 'solution', 'CLI'