In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
folder_path = "/content/drive/MyDrive/Colab Notebooks/CVs1"

In [3]:
import os

pdfs = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
print("Total PDFs found:", len(pdfs))
print(pdfs[:5])


Total PDFs found: 200
['C1791.pdf', 'C1781.pdf', 'C1164.pdf', 'C1627.pdf', 'C1236.pdf']


In [4]:
!pip install pdfplumber tqdm




In [5]:
import os
import pdfplumber
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n"
    return text

def extract_all_resumes(folder_path):
    resume_texts = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            path = os.path.join(folder_path, filename)
            text = extract_text_from_pdf(path)
            resume_texts.append({
                "filename": filename,
                "text": text
            })
    return resume_texts

In [6]:
all_resume_texts = extract_all_resumes(folder_path)



In [7]:
print(all_resume_texts[0]["filename"])
print(all_resume_texts[0]["text"])

C1791.pdf
Candidate Resume (ID: C1791)
Name: April Duarte
Email: aprilduarte34@gmail.com
Phone: +1-918-8393
Education
Bachelor of Engineering in Information Technology (2014-2018)
Concentrated on database management, networking, and cybersecurity.
Diploma in Software Engineering (2013-2015)
Hands-on experience in full-stack web development and mobile app creation.
Work Experience
Software Engineer at XYZ Corp (2018-2022)
Developed scalable backend applications, improved system efficiency by 30%, and led agile
development sprints.
Skills
Cloud Computing - Expert in AWS, Azure, and GCP; implemented scalable cloud architectures for
enterprise applications.
Certifications
Certified Ethical Hacker (CEH) - Demonstrated proficiency in ethical hacking, network security, and
vulnerability assessment.
Achievements
Developed an AI chatbot - Built a chatbot that reduced customer support tickets by 40%, enhancing
user experience and efficiency.
Tech Stack
Cybersecurity Tools: Metasploit, Wireshark,

In [8]:
! pip install langchain_groq



In [9]:
from langchain_groq import ChatGroq

In [10]:
llm = ChatGroq(
    model_name="llama3-8b-8192",
    groq_api_key="YOUR_GROQ_API_KEY"
)

In [11]:
! pip install langchain



In [12]:
from langchain.prompts import PromptTemplate

In [13]:
prompt_template = PromptTemplate(
    input_variables=["Resume_text"],
    template='''You are an AI assistant specialized in extracting and summarizing key information from Resume Texts. Given a Resume text data, analyze
it and return a structured summary in valid JSON format, capturing only the most essential details.

Resume Texts:
{Resume_text}

Return the following output strictly in the JSON format:

"Full Name": "[Extracted Full Name of Candidate]",
"Email": "[Extracted Email Address of Candidate (if available)]",
"Phone Number": "[Extracted Phone Number of Candidate (if available)]",
"Education":[
     "Extracted Education 1 (Degree,Field,University,Location,Year)",
     "Extracted Education 2 (Degree,Field,University,Location,Year)",
     "Extracted Education 3 (Degree,Field,University,Location,Year)"
],
"Experiences": [
    "Experience 1 in any company with role and duration",
    "Experience 2 in any company with role and duration"
],
"Skills": [
    "Extracted technical or soft skill 1",
    "Extracted technical or soft skill 2"
],
"Core Qualifications": [
    " qualification 1 ",
    " qualification 2 "
]
"Tech Stack":[
    "Tech Stack 1",
    "Tech Stack 2"
]
Guidelines:

Ensure the JSON output is valid and properly formatted.

Omit any section if the relevant information is not available in the Resume text data.

Keep responses concise and free from unnecessary text.'''
)

In [14]:
from langchain.chains import LLMChain
chain = LLMChain(llm=llm, prompt=prompt_template)

  chain = LLMChain(llm=llm, prompt=prompt_template)


In [15]:
!pip install chromadb




In [41]:
import chromadb
from chromadb.config import Settings

client = chromadb.Client(Settings(
    persist_directory="/content/chroma_resume_db",
    anonymized_telemetry=False
))

collection = client.get_or_create_collection(name="resumes")


InternalError: Database error: error returned from database: (code: 1) no such table: tenants

In [17]:
!pip install chromadb sentence-transformers




In [18]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
model = model.to('cuda')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [19]:
import torch

if torch.cuda.is_available():
    print("yes")

yes


In [20]:
import json
import uuid

def embed_text(text):
    return model.encode(text, convert_to_numpy=True)

for resume in all_resume_texts:
    structured_info = chain.run(Resume_text=resume["text"])
    doc_text = json.dumps(structured_info)

    embedding = embed_text(doc_text)
    doc_id = str(uuid.uuid4())

    collection.add(
        documents=[doc_text],
        ids=[doc_id],
        embeddings=[embedding],
        metadatas=[{"filename": resume["filename"]}]
    )


  structured_info = chain.run(Resume_text=resume["text"])


In [21]:
query = "Cybersecurity expert with cloud certifications"
query_embedding = embed_text(query)

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

for doc in results["documents"][0]:
    print(doc)


"Here is the extracted summary in valid JSON format:\n\n```\n{\n  \"Full Name\": \"Charles Garcia\",\n  \"Email\": \"charlesgarcia79@gmail.com\",\n  \"Phone Number\": \"+1-516-6814\",\n  \"Education\": [\n    \"Bachelor of Engineering in Information Technology, unknown university, unknown location, 2014-2018\"\n  ],\n  \"Experiences\": [\n    \"Data Scientist at ABC Inc. (2019-2023)\",\n    \"Software Engineer at XYZ Corp (2018-2022)\"\n  ],\n  \"Skills\": [\n    \"Cloud Computing - Expert in AWS, Azure, and GCP\",\n    \"Certified Ethical Hacker (CEH) - Demonstrated proficiency in ethical hacking, network security, and vulnerability assessment\"\n  ],\n  \"Core Qualifications\": [\n    \"Built predictive models that enhanced decision-making processes, reducing operational costs by 25%\",\n    \"Developed an AI chatbot - Built a chatbot that reduced customer support tickets by 40%, enhancing user experience and efficiency\"\n  ],\n  \"Tech Stack\": [\n    \"Python\", \"TensorFlow\", \"

In [30]:
from langchain.prompts import PromptTemplate

job_prompt_template = PromptTemplate(
    input_variables=["Job_Description"],
    template="""
You are a helpful AI assistant that extracts structured information from job descriptions.
Given a job posting, extract key structured information in valid JSON format.

Job Description:
{Job_Description}

Return the output in the following JSON format:

{{
  "Job Title": "[Extracted Job Title]",
  "Responsibilities": [
    "Responsibility 1",
    "Responsibility 2"
  ],
  "Qualifications": [
    "Qualification 1",
    "Qualification 2"
  ],
  "Skills": [
    "Skill 1",
    "Skill 2"
  ],
  "Tech Stack": [
    "Technology 1",
    "Technology 2"
  ]
}}

Guidelines:
- If technologies or tools are mentioned (like Python, Tableau), add them to 'Tech Stack'.
- Put soft/hard skills (like problem-solving, communication) into 'Skills'.
- Ensure proper JSON formatting with no extra text.
"""
)


In [31]:
from langchain.chains import LLMChain

job_chain = LLMChain(llm=llm, prompt=job_prompt_template)


In [32]:
Job_Description = '''
Data Scientist,"Job Description:
We are looking for a skilled Data Scientist to analyze complex datasets, develop predictive models, and provide actionable insights. You will collaborate with cross-functional teams to optimize business strategies and drive data-driven decision-making.

Responsibilities:
Collect, clean, and analyze large datasets.
Develop and deploy machine learning models.
Build predictive analytics solutions to improve business outcomes.
Communicate findings through reports and visualizations.
Stay updated with advancements in data science and AI.

Qualifications:
Bachelor’s or Master’s degree in Data Science, Computer Science, or a related field.
Proficiency in Python, R, SQL, and machine learning frameworks.
Experience with data visualization tools like Tableau or Power BI.
Strong analytical and problem-solving skills.
Ability to work independently and in a team environment."
'''

response = job_chain.run(Job_Description=Job_Description)
print(response)


Here is the extracted information in the specified JSON format:

{
  "Job Title": "Data Scientist",
  "Responsibilities": [
    "Collect, clean, and analyze large datasets.",
    "Develop and deploy machine learning models.",
    "Build predictive analytics solutions to improve business outcomes.",
    "Communicate findings through reports and visualizations.",
    "Stay updated with advancements in data science and AI."
  ],
  "Qualifications": [
    "Bachelor’s or Master’s degree in Data Science, Computer Science, or a related field.",
    "Proficiency in Python, R, SQL, and machine learning frameworks.",
    "Experience with data visualization tools like Tableau or Power BI.",
    "Strong analytical and problem-solving skills.",
    "Ability to work independently and in a team environment."
  ],
  "Skills": [
    "Strong analytical and problem-solving skills.",
    "Ability to work independently and in a team environment.",
    "Communication"
  ],
  "Tech Stack": [
    "Python",
  

In [36]:
def flatten_json_for_embedding(data):
    parts = []
    if isinstance(data, dict):
        for k, v in data.items():
            if isinstance(v, list):
                parts.append(" ".join(v))
            elif isinstance(v, str):
                parts.append(v)
    return " ".join(parts)


In [37]:
job_flattened = flatten_json_for_embedding(Job_Description)
job_embedding = embed_text(job_flattened)


In [39]:
result = collection.query(
    query_embeddings=[job_embedding],
    n_results=len(all_resume_texts),
    include=['distances', 'documents', 'metadatas']
)
distances = result['distances'][0]
documents = result['documents'][0]
metadatas = result['metadatas'][0]

scored_resumes = sorted(zip(distances, documents, metadatas), key=lambda x: x[0])


InternalError: Error getting collection: Database error: error returned from database: (code: 1) no such table: collections