# Install Dependencies

In [1]:
# Use the %pip magic rather than !pip: %pip installs into the environment of the
# running kernel, whereas !pip runs whichever pip is first on the shell PATH.
%pip install PyPDF2 nltk
%pip install llama-index-llms-groq llama-index
%pip install sentence-transformers



# Imports, Setup, and Basic Variables

## Imports

In [2]:
import io
import os
import re

import nltk
import PyPDF2
from google.colab import files

## NLTK downloads

In [3]:
# Fetch the NLTK data packages used by the helpers in this notebook.
# Re-running is cheap: packages that are already present are reported as up to date.
# punkt / punkt_tab: presumably the tokenizer models behind nltk.sent_tokenize and
# nltk.word_tokenize (which one is needed depends on the NLTK version - confirm).
nltk.download("punkt")
# stopwords: backs stopwords.words("english") in lemma_tokenize.
nltk.download("stopwords")
# wordnet / omw-1.4: presumably the data WordNetLemmatizer relies on - confirm.
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Attempt to import Groq LLM

In [4]:
try:
    from llama_index.llms.groq import Groq
    from llama_index.core.llms import ChatMessage
    USE_GROQ = True
except ImportError:
    USE_GROQ = False

## For partial code-based parse

In [5]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## For semantic matching

In [6]:
try:
    from sentence_transformers import SentenceTransformer, util
    USE_SEMANTIC = True
except ImportError:
    USE_SEMANTIC = False

## Groq API key

In [7]:
# Never commit the key itself to the notebook: read it from the environment.
# Set it before running this cell, e.g. in a scratch cell that is not saved:
#     os.environ["GROQ_API_KEY"] = getpass.getpass("Groq API key: ")
# An empty string means "no key configured".
# NOTE(review): a real-looking key was previously hard-coded here; treat it as
# leaked and revoke/rotate it in the Groq console.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

## A small list of known skills for code-based detection

In [8]:
# Skill names that code_based_parse looks for in the resume text.
KNOWN_SKILLS = [
    "python",
    "java",
    "aws",
    "machine learning",
    "html",
    "css",
    "javascript",
]

# Helper Functions

## 1) Resume Parsing

In [9]:
def extract_text_from_pdf(pdf_file) -> str:
    """
    Return the text of every page of a PDF, joined with newlines.

    The file is opened with PyPDF2.PdfReader; a page whose extract_text()
    yields a falsy value contributes an empty string.
    """
    pdf = PyPDF2.PdfReader(pdf_file)
    return "\n".join((pg.extract_text() or "") for pg in pdf.pages)

In [10]:
def code_based_parse(resume_text: str, known_skills=None) -> dict:
    """
    Pull phone numbers, e-mail addresses and known skills out of resume text.

    Args:
        resume_text: Plain text of the resume.
        known_skills: Optional iterable of skill names. Matching is
            case-insensitive and requires the skill to appear as a whole term,
            so "java" is not reported just because "javascript" is present.
            Any run of whitespace (including a line break) may separate the
            words of a multi-word skill.

    Returns:
        dict with keys "phones", "emails" and "skills". Each value is a list
        without duplicates, in first-seen order (skills follow the order of
        ``known_skills``).
    """
    phone_pattern = r'\+?\d[\d\s\-.()]{7,}\d'
    phones = re.findall(phone_pattern, resume_text)

    email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    # The domain part of the pattern accepts "." and "-", so an address that
    # ends a sentence ("... me@x.com.") drags the punctuation along; trim it.
    emails = [addr.rstrip(".-") for addr in re.findall(email_pattern, resume_text)]

    detected_skills = []
    if known_skills:
        lower_txt = resume_text.lower()
        for skill in known_skills:
            words = skill.lower().split()
            if not words:
                continue  # blank skill entry: nothing to search for
            core = r"\s+".join(re.escape(word) for word in words)
            # Left side: no letter/digit directly before the skill.
            # Right side: no letter directly after it; trailing digits are
            # allowed so "python3" / "html5" still count as the skill.
            if re.search(r"(?<![a-z0-9])" + core + r"(?![a-z])", lower_txt):
                detected_skills.append(skill)

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    partial_data = {
        "phones": list(dict.fromkeys(phones)),
        "emails": list(dict.fromkeys(emails)),
        "skills": list(dict.fromkeys(detected_skills)),
    }
    return partial_data

In [11]:
def finalize_summary_with_llm(resume_text: str, partial_data: dict) -> str:
    """
    Ask the Groq-hosted LLM to verify the code-based parse and summarise the resume.

    Args:
        resume_text: Full plain text of the resume (sent to the LLM verbatim).
        partial_data: Output of code_based_parse; its "phones", "emails" and
            "skills" entries are included in the prompt for verification.

    Returns:
        The assistant's reply text. If the Groq integration is not installed,
        no API key is configured, or the call raises, a fallback message that
        embeds ``partial_data`` is returned instead; this function does not raise
        for LLM failures.
    """
    if not USE_GROQ:
        return "Groq LLM not installed. Partial parse:\n" + str(partial_data)

    # Without a key the request cannot succeed; skip the network round-trip.
    if not GROQ_API_KEY:
        return "Groq API key not set. Partial parse:\n" + str(partial_data)

    system_msg = "You finalize resume data, removing incorrect info, returning subpoints like Education, Experience, Skills, etc."
    user_msg = f"""
Here is a partial parse from code logic:
Phones: {partial_data.get("phones", [])}
Emails: {partial_data.get("emails", [])}
Skills: {partial_data.get("skills", [])}

Resume text:
{resume_text}

INSTRUCTIONS:
1) Verify these fields. If any are incorrect, remove or correct them.
2) Summarize the entire resume in subpoints: Education, Experience, Projects, Skills, etc.
3) Return the final summary in bullet points.
"""

    try:
        # NOTE(review): the whole resume, including contact details, is sent
        # to a third-party API here.
        llm = Groq(model="llama-3.1-8b-instant", api_key=GROQ_API_KEY)
        messages = [
            ChatMessage(role="system", content=system_msg),
            ChatMessage(role="user", content=user_msg),
        ]
        response = llm.chat(messages)
        # str(response) renders as "assistant: <text>"; return only the text.
        # Fall back to str(response) if the reply has no string content.
        content = getattr(getattr(response, "message", None), "content", None)
        if isinstance(content, str) and content:
            return content
        return str(response)
    except Exception as e:
        return f"LLM call failed: {e}\nPartial parse data:\n{partial_data}"

## 2) Matching Approaches

In [12]:
def lemma_tokenize(text: str) -> set:
    """
    Turn text into a set of lemmas.

    The text is split into sentences and then word tokens; each token is
    lower-cased, kept only if it is alphanumeric and not an English stopword,
    and finally lemmatized with WordNetLemmatizer.
    """
    english_stops = set(stopwords.words("english"))
    wnl = WordNetLemmatizer()
    lowered_tokens = (
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
    )
    return {
        wnl.lemmatize(word)
        for word in lowered_tokens
        if word.isalnum() and word not in english_stops
    }

In [13]:
def lemma_based_match(jd_text: str, resume_text: str) -> dict:
    """
    Compare job description and resume by the overlap of their lemma sets.

    Returns a dict with:
      - "score": Jaccard similarity of the two lemma sets, as a percentage
        rounded to two decimals (0.0 when either set is empty)
      - "matched": sorted lemmas present in both texts
      - "jd_only" / "resume_only": sorted lemmas found in only one of them
    """
    jd_set = lemma_tokenize(jd_text)
    cv_set = lemma_tokenize(resume_text)

    # Nothing to compare if either side produced no lemmas.
    if not (jd_set and cv_set):
        return {"score": 0.0, "matched": [], "jd_only": [], "resume_only": []}

    common = jd_set.intersection(cv_set)
    combined_size = len(jd_set.union(cv_set))
    pct = len(common) / combined_size * 100

    result = {"score": round(pct, 2)}
    result["matched"] = sorted(common)
    result["jd_only"] = sorted(jd_set.difference(common))
    result["resume_only"] = sorted(cv_set.difference(common))
    return result

In [14]:
# Loaded SentenceTransformer instances, keyed by model name, so the model is
# loaded once per kernel session instead of once per semantic_match call.
_SEMANTIC_MODEL_CACHE = {}


def _get_semantic_model(model_name: str = "all-MiniLM-L6-v2"):
    """Return a cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _SEMANTIC_MODEL_CACHE:
        _SEMANTIC_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SEMANTIC_MODEL_CACHE[model_name]


def semantic_match(jd_text: str, resume_text: str, threshold=0.4) -> dict:
    """
    Embedding-based similarity between a job description and a resume.

    1) Overall score: cosine similarity of the two whole-text embeddings.
    2) Sentence level: every JD sentence is compared against the whole-resume
       embedding and filed as matched or unmatched.

    Args:
        jd_text: Job description text.
        resume_text: Resume text.
        threshold: Minimum cosine similarity (raw value, not a percentage)
            for a JD sentence to count as matched.

    Returns:
        dict with "overall_score" (percentage, two decimals) plus
        "matched_sents" and "unmatched_sents", each a list of
        (sentence, score_pct) tuples. When either text is blank or
        sentence-transformers is unavailable, the score is 0.0, both lists are
        empty and a "note" key explains why.
    """
    # Blank input has no meaningful embedding; report a zero score instead,
    # as lemma_based_match does.
    if not (jd_text or "").strip() or not (resume_text or "").strip():
        return {
            "overall_score": 0.0,
            "matched_sents": [],
            "unmatched_sents": [],
            "note": "Empty job description or resume text"
        }

    if not USE_SEMANTIC:
        return {
            "overall_score": 0.0,
            "matched_sents": [],
            "unmatched_sents": [],
            "note": "No sentence-transformers installed"
        }

    model = _get_semantic_model()
    # Overall
    # NOTE(review): the encoder presumably truncates long inputs to its maximum
    # sequence length, so a long resume may be only partly represented - confirm.
    jd_emb = model.encode(jd_text, convert_to_tensor=True)
    res_emb = model.encode(resume_text, convert_to_tensor=True)
    sim_val = float(util.cos_sim(jd_emb, res_emb)[0][0])
    overall_score = round(sim_val * 100, 2)

    # Sentence-level
    matched_sents = []
    unmatched_sents = []
    lines = nltk.sent_tokenize(jd_text)
    if lines:
        # Encode all JD sentences in one batch; cos_sim then yields one row
        # per sentence with a single column for the resume embedding.
        line_embs = model.encode(lines, convert_to_tensor=True)
        sims = util.cos_sim(line_embs, res_emb)
        for line, row in zip(lines, sims):
            score_f = float(row[0])
            score_pct = round(score_f * 100, 2)
            if score_f >= threshold:
                matched_sents.append((line, score_pct))
            else:
                unmatched_sents.append((line, score_pct))

    return {
        "overall_score": overall_score,
        "matched_sents": matched_sents,
        "unmatched_sents": unmatched_sents
    }

In [15]:
def combined_score(lemma_val: float, sem_val: float, lemma_weight=0.5, sem_weight=0.5) -> float:
    """
    Weighted average of the lemma-based and semantic scores.

    Args:
        lemma_val: Lemma-based score.
        sem_val: Semantic score.
        lemma_weight: Relative weight of ``lemma_val``.
        sem_weight: Relative weight of ``sem_val``.

    Returns:
        The weighted average rounded to two decimals. The weights are
        normalised by their sum, so they need not add up to 1 and the result
        stays on the same scale as the inputs. If the weights sum to 0 the
        result is 0.0.
    """
    total_weight = lemma_weight + sem_weight
    if total_weight == 0:
        # No usable weighting; avoid dividing by zero.
        return 0.0
    weighted_sum = (lemma_weight * lemma_val) + (sem_weight * sem_val)
    return round(weighted_sum / total_weight, 2)

# Main Code Execution

In [16]:
def main():
    """
    Interactive Colab entry point.

    Reads a job description from stdin, asks for uploaded PDF resumes via the
    Colab upload widget, and for every readable PDF prints: the raw text, the
    code-based parse, the LLM summary, the lemma-based match, the semantic
    match and the combined score. Files that are not PDFs, cannot be read, or
    contain no extractable text are skipped with a message.
    """
    # 1) Prompt user for JD
    print("Paste your job description below, then press Enter:")
    jd_text = input().strip()
    if not jd_text:
        print("No job description provided. Exiting.")
        return

    # 2) Ask user to upload PDF resume
    print("Please upload your PDF resume now in the Colab file upload widget.")
    uploaded_files = files.upload()
    if not uploaded_files:
        print("No file uploaded. Exiting.")
        return

    processed_any = False
    for filename, file_bytes in uploaded_files.items():
        if not filename.lower().endswith(".pdf"):
            print(f"Skipping file '{filename}' since it's not a PDF.")
            continue

        # A corrupt or encrypted PDF should not abort the remaining uploads.
        try:
            resume_raw = extract_text_from_pdf(io.BytesIO(file_bytes))
        except Exception as e:
            print(f"Skipping file '{filename}': could not read the PDF ({e}).")
            continue

        # Image-only PDFs yield no text; parsing or scoring them is pointless.
        if not resume_raw.strip():
            print(f"Skipping file '{filename}': no extractable text found.")
            continue

        processed_any = True

        # NOTE(review): the raw text (and thus personal contact details) ends
        # up in the saved notebook output; clear outputs before sharing.
        print("\n--- RAW Resume Text ---\n")
        print(resume_raw)

        # 3) Code-based partial parse
        partial_data = code_based_parse(resume_raw, known_skills=KNOWN_SKILLS)
        print("\n--- Partial Parse (Code-based) ---\n", partial_data)

        # 4) LLM finalization
        final_summary = finalize_summary_with_llm(resume_raw, partial_data)
        print("\n--- Finalized Summary (LLM) ---\n")
        print(final_summary)

        # 5) Lemma-based match
        lemma_result = lemma_based_match(jd_text, resume_raw)
        lemma_score = lemma_result["score"]
        matched = lemma_result["matched"]
        jd_only = lemma_result["jd_only"]
        res_only = lemma_result["resume_only"]

        print("\n--- Lemma-Based Matching ---\n")
        print(f"Score: {lemma_score}%")
        print("Matched tokens:", matched)
        print("JD-only tokens:", jd_only)
        print("Resume-only tokens:", res_only)

        # 6) Semantic match
        sem_result = semantic_match(jd_text, resume_raw, threshold=0.4)
        sem_score = sem_result["overall_score"]
        matched_sents = sem_result["matched_sents"]
        unmatched_sents = sem_result["unmatched_sents"]

        print("\n--- Semantic Matching ---\n")
        print(f"Overall score: {sem_score}%")
        print("\nMatched JD Sentences:")
        for line, sc in matched_sents:
            print(f"- {line} ({sc}%)")
        print("\nUnmatched JD Sentences:")
        for line, sc in unmatched_sents:
            print(f"- {line} ({sc}%)")

        # 7) Combined Score
        combo = combined_score(lemma_score, sem_score, lemma_weight=0.5, sem_weight=0.5)
        print(f"\n--- Combined Matching Score: {combo}% ---\n")

    if not processed_any:
        print("No readable PDF resume was processed.")

if __name__ == "__main__":
    main()


Paste your job description below, then press Enter:
Job Title: Full Stack Software Engineer  Overview: We are seeking a Full Stack Software Engineer to join our dynamic team. In this role, you will design, develop, and maintain modern web applications while collaborating closely with cross-functional teams. You will work on both front-end and back-end components and, when needed, integrate data analytics or machine learning elements to enhance our products.  Key Responsibilities:  Web Application Development: Develop, test, and deploy responsive web applications using modern frameworks (e.g., React, Next.js, Node.js, Django).  Back-End & Database Management: Design and maintain scalable back-end systems and databases (SQL and NoSQL) for efficient data storage and retrieval.  Cloud & DevOps: Leverage cloud platforms (AWS, Azure) for application deployment and scalability, and work with CI/CD pipelines to ensure robust and continuous delivery.  Data & Machine Learning Integration: When r

Saving Kushal Patel.pdf to Kushal Patel (3).pdf

--- RAW Resume Text ---

Kushal Patel
Vadodara, India
/ne+91-9879580177 kushalpatel0265@gmail.com Linked In /gtbGitHub /cdeCodeforces /cdeLeetcode
EDUCATION
Nirma University 10/2022 – Present
B.Tech Computer Science and Engineering - Percentage -8.23 Ahmedabad, India
Parth School of Science and Competition 06/2020 – 06/2022
Higher Secondary - Science - Percentage -90% Vadodara, India
EXPERIENCE
Dhyey Consultancy 05/2024 – 07/2024
Machine Learning Intern Vadodara, India
•Executed SQL queries on a music database, analyzing 10,000+ records to identify sales trends and track
popularity across genres.
•Built detailed reports correlating track plays with sales, influencing promotional strategies.
•Improved marketing by leveraging analysis results to refine promotional tactics.
PROJECTS
Code Craft |Next.js, Convex, Clerk, TypeScript 01/2024
•Implemented a SaaS code editor with Next.js, Convex, Clerk, and TypeScript, supporting 10 programming
la

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



--- Semantic Matching ---

Overall score: 34.23%

Matched JD Sentences:
- Data & Machine Learning Integration: When required, integrate data analytics and machine learning components to support intelligent features in our applications. (43.58%)

Unmatched JD Sentences:
- Job Title: Full Stack Software Engineer  Overview: We are seeking a Full Stack Software Engineer to join our dynamic team. (27.65%)
- In this role, you will design, develop, and maintain modern web applications while collaborating closely with cross-functional teams. (26.96%)
- You will work on both front-end and back-end components and, when needed, integrate data analytics or machine learning elements to enhance our products. (39.14%)
- Key Responsibilities:  Web Application Development: Develop, test, and deploy responsive web applications using modern frameworks (e.g., React, Next.js, Node.js, Django). (19.28%)
- Back-End & Database Management: Design and maintain scalable back-end systems and databases (SQL and N