In [6]:
# ================= INSTALL =================
!pip install python-docx

# ================= IMPORTS =================
import os
import pandas as pd
import re
from docx import Document
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ================= CLEAN TEXT =================
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9 ]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# ================= SKILL EXTRACTION =================
def extract_skills(text):
    skills = [
        "python", "machine learning", "data analysis",
        "software development", "pandas", "numpy",
        "html", "css", "javascript"
    ]
    return " ".join(skill for skill in skills if skill in text)

# ================= JOB DESCRIPTION =================
job_description = """
Looking for a candidate with skills in Python,
machine learning, data analysis, and software development.
"""

job_description = extract_skills(clean_text(job_description))

# ================= READ ALL CV FILES =================
resumes = []
names = []

for file in os.listdir():
    # WORD CV
    if file.endswith(".docx"):
        doc = Document(file)
        text = " ".join(p.text for p in doc.paragraphs)
        resumes.append(extract_skills(clean_text(text)))
        names.append(file)


    # CSV CV
    elif file.endswith(".csv"):
        df = pd.read_csv(file)
        text = " ".join(df["resume_text"].astype(str))
        resumes.append(extract_skills(clean_text(text)))
        names.append(file)


# ================= TF-IDF & SIMILARITY =================
documents = resumes + [job_description]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

scores = cosine_similarity(tfidf_matrix[:-1], tfidf_matrix[-1]).flatten() * 100

# ================= RANKING =================
result = pd.DataFrame({
    "CV File": names,
    "Match Percentage": scores.round(2)
})

result = result.sort_values(by="Match Percentage", ascending=False)
result["Rank"] = range(1, len(result) + 1)

result




Unnamed: 0,CV File,Match Percentage,Rank
0,r.csv,100.0,1
3,e.csv,100.0,2
2,r - resumes.csv.csv,12.77,3
1,t.csv,0.0,4
4,Resume.docx,0.0,5
