### Avoid nested scrolling (a scroll box inside the scrolling page) in cell outputs across the entire notebook

In [None]:
from IPython.display import Javascript
def resize_colab_cell(info=None):
    """Ask the Colab frontend to let the current cell's output grow up to 5000px.

    Registered below as a ``pre_run_cell`` hook, so it runs before every cell.

    Args:
        info: Execution-info object that IPython passes to ``pre_run_cell``
            callbacks. It is not used here; the parameter is optional so the
            function also works when called with no arguments.
    """
    display(Javascript('google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'))


# NOTE: re-running this cell registers one more copy of the hook for the session.
get_ipython().events.register('pre_run_cell', resize_colab_cell)

### Extracting text from resume PDF

In [None]:
!pip install PyPDF2

<IPython.core.display.Javascript object>

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/232.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import PyPDF2


def extract_resume_text(filePath):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        filePath: Path of the PDF file to read.

    Returns:
        str: The extracted text of all pages joined without a separator.
        A page that yields no text contributes an empty string.
    """
    # Use a context manager so the file handle is closed even if parsing fails.
    # Text is extracted inside the block because the reader needs the stream open.
    with open(filePath, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        return "".join(page.extract_text() or "" for page in reader.pages)

<IPython.core.display.Javascript object>

### Extracting *_`Category (Job Role)`_*

In [None]:
def extract_category(text):
    """Return the first line of ``text`` (whitespace-trimmed) as the category.

    The text is stripped first, so leading blank lines are skipped.
    Returns ``None`` when ``text`` is empty or contains only whitespace.
    """
    stripped = text.strip()
    if not stripped:
        return None

    # Everything before the first newline is the first line.
    first_line, _, _ = stripped.partition("\n")
    return first_line.strip()

<IPython.core.display.Javascript object>

### Extracting *_`Education`_*

In [None]:
import re


def extract_education(text):
    """Find education-related keywords (degrees, diplomas, ...) in ``text``.

    Every keyword is searched case-insensitively as a whole term, i.e. not as
    part of a longer word ("Doctor" does not match inside "Doctorate").

    Args:
        text: The text to search.

    Returns:
        list[str]: For each keyword found, its first occurrence exactly as it
        is written in ``text``, ordered like the keyword list. Empty when
        nothing matches.
    """
    education_keywords = [
        "High School", "Certificate", "Associate", "Diploma", "High School Diploma", "GED", "Undergraduate",
        "B.A.", "B.S.", "B.Sc.", "B.Eng.", "B.Pharm.", "B. Pharmacy", "B Pharmacy", "Bachelor", "Graduate",
        "M.A.", "M.S.", "M.Sc.", "M.Eng.", "M.Pharm.", "M. Pharmacy", "MBA", "Master", "Postgraduate",
        "Ph.D.", "Doctorate", "Doctor", "Doctor of Medicine", "Doctor of Science"
    ]

    education = []

    for word in education_keywords:
        # A trailing \b after a keyword that ends in "." only matches when a
        # word character follows the period, so "B.Sc. in Physics" or "Ph.D.,"
        # were never found. Only require a word boundary at an end of the
        # keyword that actually is a word character.
        trailing = r"(?!\w)" if word[-1].isalnum() else ""
        pattern = r"(?i)(?<!\w){}".format(re.escape(word)) + trailing
        match = re.search(pattern, text)
        if match:
            education.append(match.group())

    return education

<IPython.core.display.Javascript object>

### Extracting *_`Skills`_*

In [None]:
# Install spaCy into the runtime.
!pip install spacy
# Show the installed spaCy version, its location and the available pipelines.
!python -m spacy info
# List the spaCy-related packages that pip reports as installed.
!python -m pip freeze | grep spacy
# Download the small English pipeline that is loaded later with spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

<IPython.core.display.Javascript object>

2023-09-16 19:48:18.980331: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[1m

spaCy version    3.6.1                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-5.15.109+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.6.0)        

en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl#sha256=83276fc78a70045627144786b52e1f2728ad5e29e5e43916ec37ea9c26a11212
spacy==3.6.1
spacy-legacy==3.0.12
spacy-loggers==1.0.4
2023-09-16 19:48:30.514910: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in pe

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")


def extract_skills(text):
    """Collect the distinct noun tokens of ``text`` as candidate skills.

    The text is run through the module-level ``nlp`` pipeline and every token
    whose fine-grained tag contains "NN" is kept.

    Args:
        text: The text to analyse.

    Returns:
        list[str]: Unique token texts in order of first appearance.
    """
    doc = nlp(text)

    # NOTE(review): "NN" presumably covers the noun tags NN/NNS/NNP/NNPS of the
    # English pipeline — confirm against the model's tag set.
    noun_texts = (token.text for token in doc if "NN" in token.tag_)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (a set has no stable order across sessions).
    return list(dict.fromkeys(noun_texts))

<IPython.core.display.Javascript object>

### Uploading folder containing Resume/CVs as *`ZIP`*
> Here I mounted Google Drive (`drive.mount`); you can load the ZIP from any other source instead (local storage, cloud storage, a database).

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so its files can be read by path.
drive.mount('/content/drive')

<IPython.core.display.Javascript object>

Mounted at /content/drive


In [None]:
import zipfile, os


# Location of the ZIP file with the resume PDFs (on the mounted Google Drive).
resume_folder = "/content/drive/MyDrive/Resume-to-Job-Matcher/dataset/resume_PDF_dataset.zip"

# Reset first: if the check below fails, `path` must not keep a folder left
# over from an earlier execution of this cell.
path = None

if zipfile.is_zipfile(resume_folder):
    extract_root = os.getcwd()
    with zipfile.ZipFile(resume_folder, 'r') as zip_ref:
        zip_ref.extractall(extract_root)
    # Assumes the archive contains a top-level folder named like the ZIP file
    # (without ".zip") — TODO confirm against the dataset.
    path = os.path.join(extract_root, os.path.splitext(os.path.basename(resume_folder))[0])
else:
    print("# ERROR: Please upload a valid ZIP file with resume PDFs.")

<IPython.core.display.Javascript object>

### Driver function
> Here I extract only the `category (job role)`, `skills` and `education`. <br>
> You may extract more fields if needed.

In [None]:
import PyPDF2, glob, os


def main(resume_dir=None):
    """Run all extractors over every PDF in the resume folder.

    Args:
        resume_dir: Folder that contains the resume PDFs. When omitted, the
            notebook-level ``path`` variable is used.

    Returns:
        list[dict]: One record per PDF, ordered by file path, with the keys
        "file", "category", "skills" and "education".
    """
    if resume_dir is None:
        resume_dir = path

    # glob does not guarantee an order; sort so every run processes the
    # resumes in the same sequence.
    pdf_files = sorted(glob.glob(os.path.join(resume_dir, "*.pdf")))

    results = []
    for pdf_file in pdf_files:
        thisPDF_text = extract_resume_text(pdf_file)

        results.append({
            "file": pdf_file,
            "category": extract_category(thisPDF_text),
            "skills": extract_skills(thisPDF_text),
            "education": extract_education(thisPDF_text),
        })

    return results

In [None]:
# Run the driver only when this code is executed as the main module.
if __name__ == "__main__":
    main()