In [None]:
!pip install PyPDF2
!pip install transformers
!pip install datasets
!pip install fitz
!pip install PyMuPDF


In [23]:
import os
import PyPDF2
import fitz
import re
import torch
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from datasets import load_dataset

In [24]:
import zipfile

with zipfile.ZipFile('archive (2).zip', 'r') as zip_ref:
    zip_ref.extractall('/content/')

In [None]:

def get_pdfs(data_folder_path):

  pdfs = []
  for root, dirs, files in os.walk(data_folder_path):
    for file in files:
      if file.endswith(".pdf"):
        pdfs.append(os.path.join(root, file))
  return pdfs

# Get the path to the data folder.
data_folder_path = "/content/data"

# Get a list of all PDF files in the data folder.
pdfs = get_pdfs(data_folder_path)

# Print the list of PDF files.
for pdf in pdfs:
  print(pdf)

In [None]:
len(pdfs)

In [None]:
# List of resume PDF file paths
resume_files = pdfs

def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            text += page.get_text()
    except Exception as e:
        print(f"Error reading PDF file: {e}")
    return text

# Function to extract Category, Skills, and Education
def extract_details(text):
    # Extract Category (Job Role) based on the first line
    category_match = re.search(r'^([^\n]+)', text)
    if category_match:
        category = category_match.group(1).strip()
    else:
        category = None

    education_pattern = r'Education((?:(?!Skills|Experience|Accomplishments|Work History|ProfessionalExperience|Languages|Additional Information|Highlights|Interests).)+)'
    education_match = re.search(education_pattern, text, re.DOTALL | re.IGNORECASE)
    if education_match:
        education = education_match.group(1).strip()
    else:
        education = None

    # Extract Skills
    skills_pattern = r'Skills((?:(?!Education|Education and Training|Experience|Accomplishments|Work History|ProfessionalExperience|Languages|Additional Information|Highlights|Interests).)+)'
    skills_match = re.search(skills_pattern, text, re.DOTALL | re.IGNORECASE)
    if skills_match:
        skills = skills_match.group(1).strip()
    else:
        skills = None
    return {
        "Category": category,
        "Skills": skills,
        "Education": education,
        "File Name": resume_file
    }
# Process each resume and extract information
resume_data = []
for resume_file in resume_files:
    resume_text = extract_text_from_pdf(resume_file)
    extracted_info = extract_details(resume_text)
    if extracted_info:
        extracted_info["File Name"] = resume_file  # Add file name to the extracted info
        resume_data.append(extracted_info)

# Print the extracted information along with file names
for i, data in enumerate(resume_data, start=1):
    print(f"Resume {i} Data:")
    print(f"File Name: {data['File Name']}")
    print(f"Category: {data['Category']}")
    print(f"Skills:\n{data['Skills']}")
    print(f"Education:\n{data['Education']}")
    print("\n")


In [44]:
print(resume_data[44])


{'Category': 'CUSTOMER CARE REPRESENTATIVE', 'Skills': 'Account reconciliation\nExceptional organization\nBilling and collections expert\nInvoice and payment transactions', 'Education': 'Associate of Applied Business Administration : Human Resources, Management Studies, Payroll Records, Accounting, and Business Law , 6\n2007 \nTrumbull Business College - City , State \nHuman Resources, Management Studies, Payroll Records, Accounting, and Business Law', 'File Name': '/content/data/data/FITNESS/19938081.pdf'}


In [29]:
from transformers import DistilBertTokenizer, DistilBertModel

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')


In [30]:
# Initialize an empty list to store the embeddings
resume_embeddings = []

for resume in resume_data:
    # Combine all relevant text fields
    text = f"{resume['Category']}\n{resume['Skills']}\n{resume['Education']}"

    # Tokenize and embed the text using DistilBERT
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    with torch.no_grad():
        embeddings = model(**tokens).last_hidden_state.mean(dim=1)

    # Now we have the embeddings for this resume, which we can use for similarity calculations
    resume_embeddings.append(embeddings)


In [None]:
resume_embeddings

In [32]:
# Convert the list of PyTorch tensors into a numpy array for easier computation
resume_embeddings_np = np.array([embeddings.numpy().flatten() for embeddings in resume_embeddings])

In [40]:
resume_embeddings_np

array([[-0.25966945,  0.15504622,  0.45100433, ..., -0.06747317,
        -0.08497264, -0.04510937],
       [-0.11657897,  0.28492567,  0.29582697, ..., -0.21612859,
        -0.08118855,  0.02277062],
       [-0.1293561 ,  0.25318   ,  0.3022909 , ..., -0.22822148,
        -0.0369491 , -0.11637843],
       ...,
       [-0.23886997,  0.26157835,  0.3607795 , ..., -0.19203323,
        -0.03768524, -0.03823875],
       [-0.5063967 ,  0.12057196,  0.20492427, ..., -0.23346125,
        -0.08815116, -0.02434005],
       [-0.12091623,  0.32280862,  0.46017826, ..., -0.18726407,
        -0.09763461, -0.03962423]], dtype=float32)

In [34]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [None]:
dataset = load_dataset("jacob-hugging-face/job-descriptions")
job_descriptions = dataset["train"]["job_description"][:10]
company_names = dataset["train"]["company_name"][:10]

# Initialize DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Tokenize and embed job descriptions
job_description_embeddings = []
for job_desc in job_descriptions:
    tokens = tokenizer(job_desc, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**tokens)
    # Take the mean of embeddings across tokens to get a single vector for the entire description
    job_desc_embedding = torch.mean(embeddings.last_hidden_state, dim=1)
    job_description_embeddings.append(job_desc_embedding)

# Convert the list of embeddings to a NumPy array for efficient cosine similarity calculation
job_description_embeddings = torch.cat(job_description_embeddings).numpy()

# Print company names and job descriptions
for company_name, job_desc in zip(company_names, job_descriptions):
    print(f"Company Name: {company_name}")
    print(f"Job Description: {job_desc}")
    print("-" * 50)


In [39]:
job_description_embeddings

array([[-0.22112422,  0.24726777,  0.46985587, ..., -0.41906098,
         0.1289057 , -0.15868504],
       [-0.21113673,  0.2663724 ,  0.39599907, ..., -0.42854762,
         0.15848167, -0.20625532],
       [-0.10257979,  0.22048831,  0.48648548, ..., -0.22881685,
         0.15171622, -0.2467409 ],
       ...,
       [-0.23925798,  0.25549638,  0.36412224, ..., -0.32255176,
         0.02410375, -0.13327275],
       [-0.20862173,  0.20706552,  0.3475924 , ..., -0.24943605,
         0.05624549, -0.13983488],
       [-0.20905495,  0.23487322,  0.32924074, ..., -0.2898    ,
         0.03931919, -0.0595683 ]], dtype=float32)

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

# Initialize a list to store the top 5 resume indices for each job description
top_resume_indices = []

# Iterate through each job description and its embedding
for job_desc_embedding, job_desc_text in zip(job_description_embeddings, job_descriptions):
    # Calculate cosine similarities between the current job description and all resumes
    similarities = cosine_similarity([job_desc_embedding], resume_embeddings_np)

    # Get the indices of the top 5 most similar resumes
    top_indices = similarities.argsort()[0][::-1][:5]
    top_resume_indices.append(top_indices)

# Now, top_resume_indices contains the top 5 resume indices for each job description

# Display the results
for i, top_indices in enumerate(top_resume_indices):
    print(f"Company Name: {company_names[i]}")
    print(f"Job Description: {job_descriptions[i]}")


    # Calculate cosine similarities for the current job description
    similarities = cosine_similarity([job_description_embeddings[i]], resume_embeddings_np)

    for j, idx in enumerate(top_indices):
        print(f"Top Resume {j + 1}: Similarity Score = {similarities[0][idx]}")
        resume_info = resume_data[idx]  # Assuming resume_data contains the resume content
        print(f"File Name: {resume_info.get('File Name', 'Not Available')}")
        # print(resume_info)  # Print the resume content
        print("\n")
    print("-" * 50)


Company Name: Google
Job Description: minimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our produ