In [1]:
import pandas as pd

# Loading the Dataset

In [2]:
df = pd.read_csv("resume_dataset.csv")
df

Unnamed: 0,Candidate Name,Experience,Skills,Achievements,Education,Score
0,Ramesh Shrestha,"3 years junior developer, 6 months intern","Python, Django, Git, JavaScript","Completed 4 major projects, Built internal tool",BSc Hons Computing,8.5
1,Sanjay Thapa,"5 years backend Java dev, 1 year internship","Java, Spring Boot, SQL, AWS","Developed 6 enterprise apps, Contributed to op...",Master in Machine Learning,9.2
2,Priya Magar,2 years front-end developer,"HTML, CSS, JavaScript, React, Bootstrap","Built responsive website, Developed e-commerce...",BSc CSIT,7.8
3,Anil Gurung,6 months internship,"Python, Flask, Django",Completed mini project,+2 in Science,5.2
4,Suman Adhikari,1 year Java developer,"Java, MySQL, Python","Bug fixes, Code optimization",BIT,6.0
...,...,...,...,...,...,...
5232,Sunil Tamang,,,,,2.6
5233,Puja Karki,5 years enterprise architect,"Enterprise Architecture, Business Alignment",Aligned tech with business,Master in Enterprise Architecture,8.7
5234,Rupa Adhikari,4 years senior scrum master,"Agile Coaching, Scrum, Facilitation",Coached 8 teams,Bachelor in Management,7.7
5235,Bikash Chhetri,,"Java, Spring Data JPA",,Bachelor Running,4.9


In [3]:
df.head()


Unnamed: 0,Candidate Name,Experience,Skills,Achievements,Education,Score
0,Ramesh Shrestha,"3 years junior developer, 6 months intern","Python, Django, Git, JavaScript","Completed 4 major projects, Built internal tool",BSc Hons Computing,8.5
1,Sanjay Thapa,"5 years backend Java dev, 1 year internship","Java, Spring Boot, SQL, AWS","Developed 6 enterprise apps, Contributed to op...",Master in Machine Learning,9.2
2,Priya Magar,2 years front-end developer,"HTML, CSS, JavaScript, React, Bootstrap","Built responsive website, Developed e-commerce...",BSc CSIT,7.8
3,Anil Gurung,6 months internship,"Python, Flask, Django",Completed mini project,+2 in Science,5.2
4,Suman Adhikari,1 year Java developer,"Java, MySQL, Python","Bug fixes, Code optimization",BIT,6.0


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5237 entries, 0 to 5236
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Candidate Name  5237 non-null   object 
 1   Experience      4189 non-null   object 
 2   Skills          4794 non-null   object 
 3   Achievements    4479 non-null   object 
 4   Education       5057 non-null   object 
 5   Score           5208 non-null   float64
dtypes: float64(1), object(5)
memory usage: 245.6+ KB


In [5]:
df.isnull().sum()


Candidate Name       0
Experience        1048
Skills             443
Achievements       758
Education          180
Score               29
dtype: int64

# Cleaning the Dataset


In [6]:
columns = ["Experience", "Skills", "Achievements", "Education"]
for col in columns:
    df[col] = df[col].fillna("")

df = df[df["Score"].notna()].copy()


# Combining all Dataset Columns into One Text Column


In [7]:
df["combined_Columns"] = (
    df["Experience"].astype(str) + " " +
    df["Skills"].astype(str) + " " +
    df["Achievements"].astype(str) + " " +
    df["Education"].astype(str)
)

In [8]:
df[["Candidate Name", "combined_Columns"]].head()

Unnamed: 0,Candidate Name,combined_Columns
0,Ramesh Shrestha,"3 years junior developer, 6 months intern Pyth..."
1,Sanjay Thapa,"5 years backend Java dev, 1 year internship Ja..."
2,Priya Magar,"2 years front-end developer HTML, CSS, JavaScr..."
3,Anil Gurung,"6 months internship Python, Flask, Django Comp..."
4,Suman Adhikari,"1 year Java developer Java, MySQL, Python Bug ..."


In [9]:
x = df["combined_Columns"]
y = df["Score"]


# Spliting the Data for Training and Testing


In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)


# Creating TF-IDF vectorizer, Transforming training and testing data

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_dt = TfidfVectorizer(max_features=1000, stop_words="english")

x_train_tfidf = tfidf_dt.fit_transform(x_train)
x_test_tfidf = tfidf_dt.transform(x_test)


# Creating and training the Decision Tree model

In [12]:
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor(max_depth=8, random_state=42)

dt_model.fit(x_train_tfidf, y_train)


# Predicting Scores Using Decision Tree Model


In [13]:
y_train_dt = dt_model.predict(x_train_tfidf)
y_test_dt = dt_model.predict(x_test_tfidf)


# Calculating accuracy and error of Decision Tree Model

In [14]:
from sklearn.metrics import r2_score, mean_squared_error


r2_train_dt = r2_score(y_train, y_train_dt)
r2_test_dt = r2_score(y_test, y_test_dt)
mse_test_dt = mean_squared_error(y_test, y_test_dt)

print(f"Decision Tree - R2 Train: {r2_train_dt:.3f}")
print(f"Decision Tree - R2 Test: {r2_test_dt:.3f}")
print(f"Decision Tree - MSE Test: {mse_test_dt:.3f}")



Decision Tree - R2 Train: 0.908
Decision Tree - R2 Test: 0.892
Decision Tree - MSE Test: 0.447


# Predicting score for a new cv text using Decision Tree Model

In [15]:
new_cv = """
1 years backend developer, 3 months internship
Python, Django, Flask, AWS, SQL
Built 5 enterprise apps, open source contributor
Master in Machine Learning
"""

new_cv_dt = tfidf_dt.transform([new_cv])
predicted_score_dt = dt_model.predict(new_cv_dt)
print(f"Decision Tree Predicted score: {predicted_score_dt[0]:.2f}")


Decision Tree Predicted score: 8.73


# Creating and training the Random Forest model

In [16]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=42)

rf_model.fit(x_train_tfidf, y_train)  


# Predicting Scores Using Random Forest Model

In [17]:
y_train_rf = rf_model.predict(x_train_tfidf)
y_test_rf = rf_model.predict(x_test_tfidf)


# Calculating accuracy and error of Random Forest Model

In [18]:
r2_train_rf = r2_score(y_train, y_train_rf)
r2_test_rf = r2_score(y_test, y_test_rf)
mse_test_rf = mean_squared_error(y_test, y_test_rf)

print(f"Random Forest - R2 Train: {r2_train_rf:.3f}")
print(f"Random Forest - R2 Test: {r2_test_rf:.3f}")
print(f"Random Forest - MSE Test: {mse_test_rf:.3f}")



Random Forest - R2 Train: 0.920
Random Forest - R2 Test: 0.908
Random Forest - MSE Test: 0.383


# Predicting score for a new cv text using Random Forest Model

In [19]:
predicted_score_rf = rf_model.predict(new_cv_dt)
print(f"Random Forest Predicted score: {predicted_score_rf[0]:.2f}")


Random Forest Predicted score: 8.73


# Creating and training the Linear Regression model

In [20]:
from sklearn.linear_model import LinearRegression

lr_model = LinearRegression()
lr_model.fit(x_train_tfidf, y_train)  


# Predicting Scores Using Linear Regression Model

In [21]:
y_train_lr = lr_model.predict(x_train_tfidf)
y_test_lr = lr_model.predict(x_test_tfidf)



# Calculating accuracy and error of Linear Regression Model

In [22]:
r2_train_lr = r2_score(y_train, y_train_lr)
r2_test_lr = r2_score(y_test, y_test_lr)
mse_test_lr = mean_squared_error(y_test, y_test_lr)

print(f"Linear Regression - R2 Train: {r2_train_lr:.3f}")
print(f"Linear Regression - R2 Test: {r2_test_lr:.3f}")
print(f"Linear Regression - MSE Test: {mse_test_lr:.3f}")


Linear Regression - R2 Train: 0.927
Linear Regression - R2 Test: 0.878
Linear Regression - MSE Test: 0.508


# Predicting score for a new cv text using Linear Regression Model

In [23]:
predicted_score_lr = lr_model.predict(new_cv_dt)
print(f"Linear Regression Predicted score: {predicted_score_lr[0]:.2f}")

import warnings
warnings.filterwarnings("ignore")

Linear Regression Predicted score: 9.96


# Choosing the best models

In [24]:
best_r2_score = max(r2_test_dt, r2_test_rf, r2_test_lr)

if best_r2_score == r2_test_rf:
    best_model = rf_model
    best_model_name = "Random Forest"
elif best_r2_score == r2_test_dt:
    best_model = dt_model
    best_model_name = "Decision Tree"
else:
    best_model = lr_model
    best_model_name = "Linear Regression"

print(f"\n✅ BEST MODEL: {best_model_name} with R2 score: {best_r2_score:.3f}")



✅ BEST MODEL: Random Forest with R2 score: 0.908


# Function to Read all PDF resumes from a folder

In [25]:
import pdfplumber
import os

def extract_text_from_pdfs(folder_path):
    
    extracted_resumes = {}

    
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):  
            pdf_path = os.path.join(folder_path, file)
            text = ""

            try:
                with pdfplumber.open(pdf_path) as pdf:
                    for page in pdf.pages:
                        page_text = page.extract_text()
                        if page_text:  
                            text += page_text + " "
            except Exception as e:
                print(f"Error reading {pdf_path}: {e}")

            extracted_resumes[file] = text.strip()  
    
    return extracted_resumes


# Reading all PDF resumes from the folder

In [26]:
folder = "Resumes"

pdf_texts = extract_text_from_pdfs(folder)


print("PDF files read:", list(pdf_texts.keys()))


if pdf_texts:  
    first_pdf_name = list(pdf_texts.keys())[0]
    print("\nFirst PDF name:", first_pdf_name)

Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could not get FontBBox from font descriptor because None cannot be parsed as 4 floats


PDF files read: ['Bibek Shrestha.pdf', 'Krish Bikram Thapa.pdf', 'Krishal Ale.pdf', 'Kriti Bista CV.pdf', 'Manoj Ghimire.pdf', 'Niruta-Adhikari-FlowCV-Resume-20251207.pdf', 'PRABHA RAI CV.pdf', 'Sahajadi_LohaniCV.pdf']

First PDF name: Bibek Shrestha.pdf


# Cleaning Extracted PDF Text

In [27]:
import re

def clean_basic(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[•●▪–-]', ' ', text)
    text = re.sub(r'[^a-z0-9,. ]', ' ', text)
    return text.strip()

cleaned_pdfs = {name: clean_basic(txt) for name, txt in pdf_texts.items()}


# Removing Useless Sections

In [28]:
def remove_unwanted(text):
    remove_words = [
        "about me", "summary", "objective",
        "hobbies", "interests",
        "linkedin", "github", "facebook", "twitter",
        "contact", "personal details"
    ]
    for word in remove_words:
        text = text.replace(word, "")
    return text

filtered_pdfs = {name: remove_unwanted(txt) for name, txt in cleaned_pdfs.items()}


# Extracting Important Sections

In [29]:
def extract_section(text, keywords, stopwords):
    text_lower = text.lower()
    for key in keywords:
        if key in text_lower:
            start = text_lower.index(key)
            end = len(text)
            for stop in stopwords:
                if stop in text_lower[start+1:]:
                    e = text_lower.index(stop, start+1)
                    end = min(end, e)
            return text[start:end]
    return ""

def extract_all_sections(text):
    experience = extract_section(text,
                                 ["experience", "work experience", "employment"],
                                 ["education", "skills", "projects", "achievements"])
    skills = extract_section(text,
                             ["skills", "technical skills"],
                             ["experience", "education", "projects"])
    education = extract_section(text,
                                ["education", "academic"],
                                ["skills", "experience", "projects"])
    achievements = extract_section(text,
                                   ["projects", "achievements", "certifications"],
                                   ["education", "skills", "experience"])
    return experience + " " + skills + " " + education + " " + achievements

final_cleaned_resumes = {
    name: extract_all_sections(text)
    for name, text in filtered_pdfs.items()
}


# Displaying first 5 cv with there clean text

In [30]:
for i, (name, text) in enumerate(final_cleaned_resumes.items()):
    if i >= 5:
        break
    print(f"{i+1}. {name}:\n{text[:500]}...\n")


1. Bibek Shrestha.pdf:
   ...

2. Krish Bikram Thapa.pdf:
experience in xxx, seeking full time xxx roles.  skills technical skills reactjs, springboot, aws, mysql soft skills time management, work under pressure, communication xyz a, b, c, d  education high school, dav suhil kedia viswa bharati 2022 23 bachelor of science  hons  computing, itahari internation college 2023   2026  projects automatic fire fighting robot.  esp32, iot, embedded systems  gym website. designed and developed a responsive gym website to showcase services, memberships plans and...

3. Krishal Ale.pdf:
 skills technical skills  programming languages  html, css, javascript, sql, python, java frameworks  react applications and software  visual studio code, mysql workbench, anaconda, jupyter notebook,  operating systems  windows 10 11, ubuntu linux soft skills  effective communication time management interpersonal skills team collaboration reference mr.nishesh bishwas  number  9801597005 educational background bsc 

# Converting PDF Text to Numbers

In [31]:
pdf_vectors = tfidf_dt.transform(final_cleaned_resumes.values())


# Predicting Scores Using All Three Models


In [32]:
dt_scores = dt_model.predict(pdf_vectors)
rf_scores = rf_model.predict(pdf_vectors)
lr_scores = lr_model.predict(pdf_vectors)

import pandas as pd

compare_scores = pd.DataFrame({
    "CV Name": list(final_cleaned_resumes.keys()),
    "Decision Tree Score": dt_scores,
    "Random Forest Score": rf_scores,
    "Linear Regression Score": lr_scores
})

compare_scores
best_model

# Predict Final Scores with Best Model


In [33]:
final_scores = best_model.predict(pdf_vectors)

final_results = pd.DataFrame({
    "CV Name": list(final_cleaned_resumes.keys()),
    "Final Score": final_scores
})


# Ranking All CVS


In [34]:
final_results = final_results.sort_values(by="Final Score", ascending=False).reset_index(drop=True)


final_results["Rank"] = final_results.index + 1


final_results = final_results[["Rank", "CV Name", "Final Score"]]


final_results


Unnamed: 0,Rank,CV Name,Final Score
0,1,Niruta-Adhikari-FlowCV-Resume-20251207.pdf,5.97751
1,2,Sahajadi_LohaniCV.pdf,3.594771
2,3,Kriti Bista CV.pdf,3.4008
3,4,Krish Bikram Thapa.pdf,3.155487
4,5,Bibek Shrestha.pdf,3.136272
5,6,Krishal Ale.pdf,3.136272
6,7,Manoj Ghimire.pdf,3.136272
7,8,PRABHA RAI CV.pdf,3.136272
