Importing necessary packages:

In [1]:
import pandas as pd
import numpy as np
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from sentence_transformers import util
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Embedding Model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [3]:
import os

# Folder containing resumes and job description
FOLDER_PATH = "resumes"

# Read Job Description
job_desc_path = os.path.join(FOLDER_PATH, "jobdesc.txt")
with open(job_desc_path, "r", encoding="utf-8") as f:
    job_description = f.read()

# Read Resumes
resumes = []
resume_names = []

for file in os.listdir(FOLDER_PATH):
    if file.startswith("resume") and file.endswith(".txt"):
        resume_names.append(file)
        with open(os.path.join(FOLDER_PATH, file), "r", encoding="utf-8") as f:
            resumes.append(f.read())

In [4]:
# Embed Resumes and Job Description
resume_vectors = [embedding_model.embed_query(resume) for resume in resumes]
job_vector = embedding_model.embed_query(job_description)

In [5]:
# Compute Cosine Similarity
similarity_scores = [util.cos_sim(job_vector, resume_vec).item() for resume_vec in resume_vectors]

In [6]:
print(similarity_scores)

[0.7809348106384277, 0.7244797945022583, 0.8186139464378357, 0.6920086741447449, 0.7510493397712708]


In [7]:
# Extract Additional Features for ML Model
additional_features = [
    [len(resume.split()), resume.count("AI") + resume.count("deep learning")] for resume in resumes
]

In [10]:
import regex as re

# Extract Additional Features
def extract_features(resume_text):
    keywords = {
        "cloud_platforms": ["AWS", "Azure", "GCP"],
        "security_tools": ["SIEM", "IDS", "IPS", "Firewalls", "VPN", "ELK", "Splunk"],
        "programming": ["Python", "Bash"],
        "compliance": ["ISO 27001", "SOC 2", "NIST", "GDPR", "HIPAA"],
        "certifications": ["AWS Certified Security", "CCSP", "CISSP", "CEH", "OSCP"]
    }
    
    features = {
        "word_count": len(resume_text.split()),
        "cloud_platform_count": sum(resume_text.count(term) for term in keywords["cloud_platforms"]),
        "security_tool_count": sum(resume_text.count(term) for term in keywords["security_tools"]),
        "programming_count": sum(resume_text.count(term) for term in keywords["programming"]),
        "compliance_count": sum(resume_text.count(term) for term in keywords["compliance"]),
        "certification_count": sum(resume_text.count(term) for term in keywords["certifications"]),
        "years_experience": max(map(int, re.findall(r"(\d+) ?(?:years|yrs) ?(?:of experience)?", resume_text)), default=0)
    }
    
    return list(features.values())

# Extract Features for Each Resume
additional_features = [extract_features(resume) for resume in resumes]

In [11]:
# Create DataFrame for ML Training
feature_columns = ["word_count", "cloud_platform_count", "security_tool_count", "programming_count",
                   "compliance_count", "certification_count", "years_experience"]
data = pd.DataFrame(additional_features, columns=feature_columns)
data["similarity_score"] = similarity_scores  # Cosine similarity feature

# Hybrid Scoring Formula (Adjust Weights if Needed)
data["final_score"] = (
    data["similarity_score"] * 0.5 +
    data["cloud_platform_count"] * 0.1 +
    data["security_tool_count"] * 0.1 +
    data["programming_count"] * 0.1 +
    data["certification_count"] * 0.1 +
    data["years_experience"] * 0.1
)

In [12]:
# Train ML Model (XGBoost)
X = data.drop(columns=["final_score"])
y = data["final_score"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
model = xgb.XGBRegressor(n_estimators=50, max_depth=3)
model.fit(X,y)

In [43]:
# Predict Scores for Ranking
predictions = model.predict(X)

In [44]:
# Rank Resumes
data["predicted_score"] = predictions
# ranked_resumes = data.sort_values(by="predicted_score", ascending=False)

In [45]:
print(data)

   word_count  cloud_platform_count  security_tool_count  programming_count  \
0         112                     7                    3                  2   
1          96                     3                    3                  0   
2         101                     3                    0                  1   
3          88                     3                    0                  0   
4          92                     5                    2                  2   

   compliance_count  certification_count  years_experience  similarity_score  \
0                 4                    1                 0          0.780935   
1                 1                    0                 2          0.724480   
2                 0                    1                 0          0.818614   
3                 0                    0                 0          0.692009   
4                 0                    1                 0          0.751049   

   final_score  predicted_score  
0     1.69

In [46]:
# Rank Resumes in Ascending Order (Best Score First)
ranked_resumes = data.sort_values(by="predicted_score", ascending=False)

output = "output.txt"
# Write to File
with open(output, "w", encoding="utf-8") as f:
    for rank, row in enumerate(ranked_resumes.iterrows(), start=1):
        index, row_data = row
        f.write(f"Rank {rank}: {resume_names[index]} - Score: {row_data['predicted_score']:.4f}\n")

print(f"Ranked resumes saved to {output}")

Ranked resumes saved to output.txt


In [23]:
# testing data

# Read Resumes
test_resumes = []
test_resume_names = []
FOLDER_PATH = "test_resumes"
for file in os.listdir(FOLDER_PATH):
    if file.startswith("testresume") and file.endswith(".txt"):
        test_resume_names.append(file)
        with open(os.path.join(FOLDER_PATH, file), "r", encoding="utf-8") as f:
            test_resumes.append(f.read())

In [24]:
# Embed Resumes 
test_resume_vectors = [embedding_model.embed_query(test_resume) for test_resume in test_resumes]

In [25]:
similarity_scores = [util.cos_sim(job_vector, resume_vec).item() for resume_vec in test_resume_vectors]

In [28]:
print(similarity_scores)

[0.6876025199890137, 0.7245613932609558, 0.6515498757362366]


In [48]:
# Extract Features for Each Resume
additional_features_test = [extract_features(resume) for resume in test_resumes]
# Create DataFrame for testing
feature_columns = ["word_count", "cloud_platform_count", "security_tool_count", "programming_count",
                   "compliance_count", "certification_count", "years_experience"]
test_data = pd.DataFrame(additional_features_test, columns=feature_columns)
test_data["similarity_score"] = similarity_scores  # Cosine similarity feature

# # Hybrid Scoring Formula (Adjust Weights if Needed)
# test_data["final_score"] = (
#     data["similarity_score"] * 0.5 +
#     data["cloud_platform_count"] * 0.1 +
#     data["security_tool_count"] * 0.1 +
#     data["programming_count"] * 0.1 +
#     data["certification_count"] * 0.1 +
#     data["years_experience"] * 0.1
# )

y_pred = model.predict(test_data)

In [49]:
test_data["predicted_score"] = y_pred

# Rank Resumes in Ascending Order (Best Score First)
test_ranked_resumes = test_data.sort_values(by="predicted_score", ascending=False)

test_output = "test_output.txt"
# Write to File
with open(test_output, "w", encoding="utf-8") as f:
    for rank, row in enumerate(test_ranked_resumes.iterrows(), start=1):
        index, row_data = row
        f.write(f"Rank {rank}: {test_resume_names[index]} - Score: {row_data['predicted_score']:.4f}\n")

print(f"Ranked resumes saved to {output}")

Ranked resumes saved to output.txt
