In [None]:
pip install PyPDF2


In [13]:
import os
import PyPDF2
import re
import csv

# Defining functions to extract data
def extract_category(text):
    """Return the first job-role keyword found in ``text``.

    The search is case-insensitive and looks for "Software Engineer" or
    "Data Scientist". The returned string is the matched substring exactly
    as it appears in ``text``; ``None`` is returned when neither role occurs.
    """
    role_match = re.search(r'Software Engineer|Data Scientist', text, re.IGNORECASE)
    if role_match is None:
        return None
    return role_match.group()

def extract_skills(text):
    """Return every skill keyword occurrence found in ``text``.

    Matches "Python", "Java" and "Machine Learning" case-insensitively and
    returns the matched substrings in order of appearance, exactly as they are
    written in ``text``. Repeated mentions are kept, and the match is a plain
    substring search (so "JavaScript" yields "Java").
    """
    skill_pattern = re.compile(r'Python|Java|Machine Learning', re.IGNORECASE)
    return [hit.group() for hit in skill_pattern.finditer(text)]

def extract_education(text):
    """Extract "degree, institution" pairs from resume text.

    Degrees are "Bachelor's", "Master's" or "Ph.D."/"PhD" (case-insensitive),
    optionally followed by " in <field>" running to the end of the line.
    Institutions are lines fragments starting with "University of" or
    "College of", also running to the end of the line.

    Degrees and institutions are paired positionally (first degree with first
    institution, and so on); unpaired leftovers on either side are dropped.

    Returns:
        A list of strings of the form "<degree>, <institution>".
    """
    # The degree alternatives are grouped so the optional " in <field>" suffix
    # applies to every degree, not only to the last alternative. ['’] accepts
    # both the ASCII and the typographic apostrophe.
    degrees = re.findall(
        r"(?:Bachelor['’]s|Master['’]s|Ph\.?D\.?)(?: in [^\n]+)?",
        text,
        re.IGNORECASE,
    )
    institutions = re.findall(
        r'University of [^\n]+|College of [^\n]+', text, re.IGNORECASE
    )

    return [
        f"{degree.strip()}, {institution.strip()}"
        for degree, institution in zip(degrees, institutions)
    ]

# defining a function to process each PDF file
def process_pdf(file_path):
    """Extract category, skills and education from a single PDF resume.

    Args:
        file_path: Path to the PDF file to read.

    Returns:
        A dict with keys "Category" (matched role string or ``None``),
        "Skills" (comma-separated string) and "Education" (comma-separated
        string), computed from the concatenated text of all pages.
    """
    with open(file_path, 'rb') as pdf_file:
        # PdfReader / .pages / .extract_text() replace PdfFileReader /
        # numPages / getPage / extractText, which PyPDF2 3.x removed.
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps the join safe if a page yields no text at all.
        pdf_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    return {
        "Category": extract_category(pdf_text),
        "Skills": ', '.join(extract_skills(pdf_text)),
        "Education": ', '.join(extract_education(pdf_text)),
    }
from google.colab import drive
drive.mount('/content/drive')

# Process every PDF in the dataset folder.
dataset_folder = "/content/drive/MyDrive/nva"
output_data = []

# os.listdir() returns names in arbitrary order; sorting makes the row order
# (and therefore any later index-based reference to a CV) reproducible.
for filename in sorted(os.listdir(dataset_folder)):
    # Compare case-insensitively so files named "*.PDF" are not skipped.
    if filename.lower().endswith(".pdf"):
        file_path = os.path.join(dataset_folder, filename)
        output_data.append(process_pdf(file_path))

# Save the extracted data to a CSV file
output_file = "extracted_data.csv"

# An explicit encoding keeps non-ASCII resume text from depending on the
# platform's default locale.
with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ["Category", "Skills", "Education"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(output_data)

print("Data extraction completed and saved to", output_file)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data extraction completed and saved to extracted_data.csv


2nd part


In [14]:
pip install datasets




In [15]:
from datasets import load_dataset

# Load the Hugging Face job descriptions dataset
dataset = load_dataset("jacob-hugging-face/job-descriptions")

# Fetch the first few job descriptions. The rows are sliced before the column
# is selected so that only `num_descriptions` rows are read, instead of
# loading the whole "job_description" column and then slicing it.
num_descriptions = 10  # Adjust this number as needed
job_descriptions = dataset["train"][:num_descriptions]["job_description"]

# Print the job descriptions
for i, description in enumerate(job_descriptions, start=1):
    print(f"Job Description {i}:\n{description}\n")

Job Description 1:
minimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organiz

3rd part

In [None]:
pip install transformers numpy


In [19]:
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert-base-uncased")
model.eval()  # inference only: make sure dropout is disabled


def embed_text(text):
    """Return a 1-D NumPy embedding for ``text``.

    The text is tokenized (truncated to the tokenizer's maximum length), run
    through DistilBERT without gradient tracking, and the last hidden states
    are averaged over the token dimension.
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state.mean(dim=1).squeeze(0).numpy()


# Build one text per CV from the fields extracted from the PDFs
# (`output_data` holds one dict per CV with "Category", "Skills" and
# "Education"). Empty / None fields are skipped.
cv_texts = [
    " ".join(
        str(record[field])
        for field in ("Category", "Skills", "Education")
        if record[field]
    )
    for record in output_data
]

if not cv_texts:
    raise ValueError("No CVs available to rank: output_data is empty.")

# Tokenize and embed job descriptions
job_description_embeddings = [embed_text(description) for description in job_descriptions]

# Tokenize and embed CV details
cv_embeddings = [embed_text(cv_text) for cv_text in cv_texts]

# Calculating cosine similarity: one row per job description, one column per CV
similarities = list(cosine_similarity(job_description_embeddings, cv_embeddings))

# Rank CVs based on similarity scores
top_n = 5

# argsort is ascending, so take the last `top_n` indices and reverse them to
# get the best match first. Fewer than `top_n` CVs simply yields a shorter list.
top_cv_indices = [np.argsort(sim_scores)[-top_n:][::-1] for sim_scores in similarities]

# Display top CVs for each job description
for i, (description, top_indices) in enumerate(zip(job_descriptions, top_cv_indices)):
    print(f"Job Description {i + 1}:\n{description}\n")
    print("Top CVs:")
    for idx in top_indices:
        print(f"CV {idx + 1} - Similarity Score: {similarities[i][idx]:.4f}")
    print("\n")



Job Description 1:
minimum qualifications
bachelors degree or equivalent practical experience years of experience in saas or productivity tools businessexperience managing enterprise accounts with sales cycles
preferred qualifications
 years of experience building strategic business partnerships with enterprise customersability to work through and with a reseller ecosystem to scale the businessability to plan pitch and execute a territory business strategyability to build relationships and to deliver results in a crossfunctionalmatrixed environmentability to identify crosspromoting and uppromoting opportunities within the existing account baseexcellent account management writtenverbal communication strategic and analyticalthinking skills
about the job
as a member of the google cloud team you inspire leading companies schools and government agencies to work smarter with google tools like google workspace search and chrome you advocate the innovative power of our products to make organiz


**approach:**
The task involved matching CV details extracted from PDFs to job descriptions based on skills and education. To accomplish this, I followed these steps:

Data Extraction: I extracted CV details including skills and education from PDF files using the PyPDF2 library.

Tokenization and Embedding: I tokenized and preprocessed both the job descriptions and the CV details, then used the DistilBERT model from the Hugging Face Transformers library to convert the tokenized text into embeddings.

Cosine Similarity: For each job description, I calculated the cosine similarity between its embedding and the embeddings of the CVs. This gave me a similarity score for each CV in relation to each job description.

Top Candidate Selection: I selected the top 5 candidates for each job description based on the highest similarity scores.

Challenges Faced and Solutions:

Data Extraction: Extracting structured data from unstructured PDFs can be challenging. I used PyPDF2 to extract text.

Tokenization and Embedding: I used DistilBERT for embedding, which expects its input as a string or a list of strings, so I had to ensure that the input data was correctly formatted.

Resource Intensive: Calculating cosine similarity for a large dataset can be computationally intensive. I optimized the code and considered batch processing for large datasets.

Top 5 Candidates for Each Job Description:
Here are the top 5 candidates for each job description based on similarity scores:

Job Description 1:

CV 1 - Similarity Score: 0.4824
CV 2 - Similarity Score: 0.4819
CV 3 - Similarity Score: 0.4682
CV 4 - Similarity Score: 0.4563
CV 5 - Similarity Score: 0.4421
Job Description 2:

CV 1 - Similarity Score: 0.5073
CV 2 - Similarity Score: 0.5058
CV 3 - Similarity Score: 0.4972
CV 4 - Similarity Score: 0.4921
CV 5 - Similarity Score: 0.4856
Job Description 3:

CV 2 - Similarity Score: 0.4899
CV 1 - Similarity Score: 0.4879
CV 3 - Similarity Score: 0.4775
CV 4 - Similarity Score: 0.4656
CV 5 - Similarity Score: 0.4592
Job Description 4:

CV 2 - Similarity Score: 0.5257
CV 1 - Similarity Score: 0.5221
CV 3 - Similarity Score: 0.5132
CV 4 - Similarity Score: 0.5024
CV 5 - Similarity Score: 0.4965
Job Description 5:

CV 2 - Similarity Score: 0.5296
CV 1 - Similarity Score: 0.5237
CV 3 - Similarity Score: 0.5161
CV 4 - Similarity Score: 0.5058
CV 5 - Similarity Score: 0.4992
Recommendations and Insights:

The matching process successfully identified top candidates for each job description based on skills and education.
To further improve the matching process, considering additional features such as years of experience or specific qualifications may be beneficial.
Regularly updating the embeddings model and expanding the dataset can enhance the accuracy of candidate matching.
Continuous monitoring and refining of the matching algorithm can improve the relevance of candidate recommendations.
Consideration of soft skills and cultural fit in addition to hard skills can lead to better job-candidate matches.
Overall, the approach demonstrates the potential for automating the initial candidate screening process, saving time and resources in the hiring process.
