In [1]:
import pandas as pd
import re
import math
from collections import Counter


In [6]:
# Load dataset
file_path = "UpdatedResumeDataSet.csv"
df = pd.read_csv(file_path)

# Display first few rows
print(df.head())


       Category                                             Resume
0  Data Science  Skills * Programming Languages: Python (pandas...
1  Data Science  Education Details \r\nMay 2013 to May 2017 B.E...
2  Data Science  Areas of Interest Deep Learning, Control Syste...
3  Data Science  Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4  Data Science  Education Details \r\n MCA   YMCAUST,  Faridab...


In [7]:
# Function to clean and tokenize text
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = text.split()  # Tokenize by splitting on spaces
    return tokens

# Apply preprocessing to resumes
df['cleaned_resume'] = df['Resume'].apply(preprocess_text)


In [8]:
# Function to count word occurrences (Term Frequency)
def word_frequency(tokens):
    return Counter(tokens)

# Compute word frequency for resumes
df['word_freq'] = df['cleaned_resume'].apply(word_frequency)


In [9]:
# Sample job description for matching
job_description = "Looking for a Data Scientist with experience in Python, Machine Learning, NLP. Must have MSc in Computer Science and at least 4 years of experience."

# Preprocess job description
cleaned_job_desc = preprocess_text(job_description)

# Compute word frequency for job description
job_desc_word_freq = word_frequency(cleaned_job_desc)

# Function to calculate Cosine Similarity
def cosine_similarity(freq1, freq2):
    all_words = set(freq1.keys()).union(set(freq2.keys()))

    vec1 = [freq1.get(word, 0) for word in all_words]
    vec2 = [freq2.get(word, 0) for word in all_words]

    dot_product = sum(x * y for x, y in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(x**2 for x in vec1))
    magnitude2 = math.sqrt(sum(x**2 for x in vec2))

    return dot_product / (magnitude1 * magnitude2) if magnitude1 and magnitude2 else 0

# Compute similarity scores
df['similarity_score'] = df['word_freq'].apply(lambda x: cosine_similarity(x, job_desc_word_freq))

# Rank candidates based on similarity scores
df_sorted = df.sort_values(by='similarity_score', ascending=False)

# Display top 10 ranked candidates
print(df_sorted[['Resume', 'Category', 'similarity_score']].head(10))


                                               Resume      Category  \
38  Personal Skills â¢ Ability to quickly grasp t...  Data Science   
28  Personal Skills â¢ Ability to quickly grasp t...  Data Science   
18  Personal Skills â¢ Ability to quickly grasp t...  Data Science   
8   Personal Skills â¢ Ability to quickly grasp t...  Data Science   
17  Education Details \r\n B.Tech   Rayat and Bahr...  Data Science   
7   Education Details \r\n B.Tech   Rayat and Bahr...  Data Science   
27  Education Details \r\n B.Tech   Rayat and Bahr...  Data Science   
37  Education Details \r\n B.Tech   Rayat and Bahr...  Data Science   
19  Expertise â Data and Quantitative Analysis â...  Data Science   
39  Expertise â Data and Quantitative Analysis â...  Data Science   

    similarity_score  
38          0.496469  
28          0.496469  
18          0.496469  
8           0.496469  
17          0.426289  
7           0.426289  
27          0.426289  
37          0.426289  
19         