<a href="https://colab.research.google.com/github/kADALIdurgasivasankarprasad/resume-ranking-system/blob/main/resume_ranking_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 !pip install PyPDF2 gensim nltk




In [None]:
import os
import PyPDF2
import numpy as np
import re
import gensim.downloader as api
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# word_tokenize looks up 'tokenizers/punkt_tab' on current NLTK releases and
# 'tokenizers/punkt' on older ones, so fetch both alongside the stopword list.
# nltk.download() reports (rather than raises) when a resource id is unknown.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from google.colab import files
# Opens Colab's upload widget. The following cells read the uploaded file names
# from `uploaded.keys()`.
uploaded = files.upload()  # Manually upload PDF files


Saving 47470864.pdf to 47470864.pdf


In [None]:
import os
# Show the names of the files that were just uploaded.
print("Uploaded files:", uploaded.keys())


Uploaded files: dict_keys(['47470864.pdf'])


In [None]:
import shutil
os.makedirs("./resumes", exist_ok=True)  # Ensure the folder exists

for filename in uploaded.keys():
    # Only move files that are still in the working directory, so re-running
    # this cell after the files were already moved does not raise
    # FileNotFoundError.
    if os.path.exists(filename):
        shutil.move(filename, os.path.join("./resumes", filename))

print("Files now in resumes folder:", os.listdir("./resumes"))


Files now in resumes folder: ['resume.pdf.csv', '47470864.pdf']


In [None]:
# List everything currently stored in ./resumes (not only PDFs).
print("Files in resumes folder:", os.listdir("./resumes"))


Files in resumes folder: ['resume.pdf.csv', '47470864.pdf']


In [None]:
resume_folder = "./resumes"
if os.path.exists(resume_folder):
    # Select PDFs by extension (case-insensitively). Filtering on ".csv" here
    # would report CSV files as "PDF resumes" and miss the real PDFs.
    pdf_files = [f for f in os.listdir(resume_folder) if f.lower().endswith(".pdf")]
    if pdf_files:
        print("Found PDF resumes:", pdf_files)
    else:
        print("No PDFs found in the folder!")
else:
    print("Resumes folder does not exist!")


Found PDF resumes: ['resume.pdf.csv']


In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')
# NOTE: this rebinds the notebook-level `resume_folder`; the cells that call
# rank_resumes assign it again before use.
resume_folder = "/content/drive/My Drive/resumes"  # Adjust this path accordingly


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def preprocess_text(text):
    """Clean raw text and return its non-stopword tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, the result is tokenized with NLTK's
    ``word_tokenize``, and English stopwords are dropped.

    Args:
        text: Raw text to clean.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    # Load the stopword list once and keep it in a set. Calling
    # stopwords.words() inside the comprehension would rebuild the list and
    # scan it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in word_tokenize(text) if word not in english_stopwords]

In [None]:

def extract_text_from_pdf(pdf_path):
    """Read all pages of a PDF and return the preprocessed tokens.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        Whatever ``preprocess_text`` produces for the concatenated page text.
    """
    with open(pdf_path, "rb") as pdf_handle:
        # Text must be pulled out while the file handle is still open.
        page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_handle).pages]
    # Pages without extractable text are skipped; the rest are space-separated.
    combined = " ".join(chunk for chunk in page_texts if chunk)
    return preprocess_text(combined.strip())

In [None]:

def load_glove_model():
    """Return the pre-trained ``glove-wiki-gigaword-300`` model.

    The model is loaded through the gensim downloader on the first call and
    memoised on the function object, so later calls (for example repeated
    ``rank_resumes`` runs) reuse it instead of loading it again.

    Returns:
        The object returned by ``api.load("glove-wiki-gigaword-300")``.
    """
    model = getattr(load_glove_model, "_cached_model", None)
    if model is None:
        # Using a higher-dimensional model for better accuracy
        model = api.load("glove-wiki-gigaword-300")
        load_glove_model._cached_model = model
    return model

In [None]:

def get_vector_representation(model, words):
    """Average the embedding vectors of the words known to ``model``.

    Args:
        model: Embedding lookup supporting ``word in model`` and
            ``model[word]``. If it exposes a ``vector_size`` attribute, that
            value sets the dimension of the fallback vector.
        words: Iterable of tokens.

    Returns:
        numpy.ndarray: Element-wise mean of the vectors of the known words, or
        a zero vector when none of the words is in the model.
    """
    vectors = [model[word] for word in words if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback from the model so it stays compatible with the model's
    # real vectors; 300 matches the GloVe model this notebook loads and is
    # kept as the default for lookups that do not expose vector_size.
    return np.zeros(getattr(model, "vector_size", 300))

In [None]:

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    A zero-length vector has no direction, so the similarity is defined as
    0.0 whenever either input has zero norm (this also avoids dividing by 0).
    """
    magnitudes = (np.linalg.norm(vec1), np.linalg.norm(vec2))
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0.0
    first_norm, second_norm = magnitudes
    return np.dot(vec1, vec2) / (first_norm * second_norm)

In [None]:
def rank_resumes(resume_folder, job_description):
    """Rank the PDF resumes in a folder by similarity to a job description.

    Each resume and the job description are reduced to the average GloVe
    vector of their tokens; resumes are then ordered by cosine similarity to
    the job description.

    Args:
        resume_folder: Path of the directory that holds the PDF resumes.
        job_description: Free-text job description to compare against.

    Returns:
        list[tuple[str, float]]: ``(file_name, similarity)`` pairs, highest
        similarity first. Empty when the folder is missing, is not a
        directory, or contains no PDF with extractable text.
    """
    # isdir (not exists): a path that points at a regular file would otherwise
    # slip through and make os.listdir raise NotADirectoryError.
    if not os.path.isdir(resume_folder):
        print(f"Error: The folder '{resume_folder}' does not exist!")
        return []

    resumes = []
    file_names = []

    # Read PDFs from the folder. sorted() makes the processing order
    # deterministic (os.listdir order is arbitrary); the extension check is
    # case-insensitive so "CV.PDF" is picked up too.
    for file in sorted(os.listdir(resume_folder)):
        file_path = os.path.join(resume_folder, file)
        if not file.lower().endswith(".pdf") or not os.path.isfile(file_path):
            continue
        text = extract_text_from_pdf(file_path)
        if text:  # skip PDFs that yielded no tokens
            resumes.append(text)
            file_names.append(file)

    if not file_names:
        print("No PDFs were found in the folder!")
        return []

    print("Processing the following resumes:", file_names)

    # Load the GloVe model only once there is something to rank.
    glove_model = load_glove_model()

    # Get vector representations
    job_description_tokens = preprocess_text(job_description)
    job_desc_vector = get_vector_representation(glove_model, job_description_tokens)
    resume_vectors = [get_vector_representation(glove_model, resume) for resume in resumes]

    # Compute similarity and rank resumes. float() normalises the scores to
    # plain Python floats (cosine_similarity may return 0.0 or a NumPy scalar).
    similarity_scores = [
        float(cosine_similarity(job_desc_vector, resume_vector))
        for resume_vector in resume_vectors
    ]
    ranked_indices = np.argsort(similarity_scores)[::-1]  # Sort in descending order
    return [(file_names[i], similarity_scores[i]) for i in ranked_indices]


In [None]:
import os
# Sanity check before ranking: confirm ./resumes exists and show its contents.
print("Folder exists:", os.path.exists("./resumes"))
print("Files in folder:", os.listdir("./resumes") if os.path.exists("./resumes") else "Folder not found")


Folder exists: True
Files in folder: ['resume.pdf.csv', '47470864.pdf']


In [None]:
job_description = """
Looking for a Data Analyst with experience in Python, SQL, and Machine Learning.
Should have knowledge of data visualization tools like Tableau or Power BI.
"""

# Point at the folder the uploaded PDFs were moved into. A path ending in
# ".csv" here would make the makedirs call below create an empty directory
# with that name, and no resumes would ever be found.
resume_folder = "./resumes"  # Folder where resumes (PDFs) are stored

# Ensure the folder exists
if not os.path.exists(resume_folder):
    os.makedirs(resume_folder)
    print(f"Created folder: {resume_folder}")

# Run the ranking function
rankings = rank_resumes(resume_folder, job_description)

# Display results
print("\nRanked Resumes:")
for rank, (file, score) in enumerate(rankings, start=1):
    print(f"{rank}. {file} - Score: {score:.4f}")


No PDFs were found in the folder!

Ranked Resumes:


In [None]:

# Script-style entry point: rank the PDFs in ./resumes against a sample job
# description and print them in ranked order.
# NOTE: the recorded run of this cell ended in a LookupError for the NLTK
# 'punkt_tab' resource, which has to be downloaded before tokenizing.
if __name__ == "__main__":
    job_description = """
    Looking for a Data Analyst with experience in Python, SQL, and Machine Learning.
    Should have knowledge of data visualization tools like Tableau or Power BI.
    """

    resume_folder = "./resumes"  # Folder where resumes (PDFs) are stored
    # rank_resumes returns (file name, similarity score) pairs, highest score first.
    rankings = rank_resumes(resume_folder, job_description)

    print("Ranked Resumes:")
    for rank, (file, score) in enumerate(rankings, start=1):
        print(f"{rank}. {file} - Score: {score:.4f}")

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
