In [10]:
!pip install pdfplumber python-docx spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
import pdfplumber
import re
import json
import spacy
from docx import Document
from google.colab import files

nlp = spacy.load("en_core_web_sm")


In [12]:
uploaded = files.upload()
resume_path = next(iter(uploaded))
print("Uploaded file:", resume_path)


Saving sample_resume.pdf to sample_resume (5).pdf
Uploaded file: sample_resume (5).pdf


In [13]:
text = ""

if resume_path.lower().endswith(".pdf"):
    try:
        with pdfplumber.open(resume_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
    except Exception as e:
        print("PDF read error:", e)

elif resume_path.lower().endswith(".docx"):
    try:
        doc = Document(resume_path)
        for para in doc.paragraphs:
            text += para.text + "\n"
    except Exception as e:
        print("DOCX read error:", e)
else:
    print("Unsupported format. Please upload a .pdf or .docx")

if not text.strip():
    print("No text extracted from resume. Please check the file.")
else:
    print("Text extracted. Preview below:\n")
    print(text[:1000])

lines = [line.strip() for line in text.split("\n") if line.strip()]
full_text = " ".join(lines)


Text extracted. Preview below:

Vijay Pagare
(+91)889XXXXX28|xyz@gmail.com|Miraroad,Thane,MH,INDhttps://
www.linkedin.com/in/xyz/
Afrontend-leaningsoftwareengineerwhohas4.5+yearsofexperienceinbuildingandmaintaininghigh-quality(B2B)
saasproductsandwebapplications.Provenabilitytoworkindependentlyandaspartofateaminfast-moving,
resource-constraintenvironmentswhereshortturnaroundtimesareanorm.Exceptionalatleveraginginterpersonalskills
tofacilitateacollaborativerelationshipamongcross-functionalteamstogettheworkdone.Excellentproblem-solverwith
anaptitudefortroubleshootingandtheabilitytoquicklymasternewskills,technology,orarole.
PROFESSIONALEXPERIENCE
PROPELLOR.AI Pune-Remote
SoftwareEngineer-Frontend August2021–Present
● Architected,builtandmaintainedbusinesscriticalmodulesforadatauni  cationandvisualisation
platform.
○ Introduced20+chartsincludingsankey,wordcloud,heatmap,tree,bubble,Map-Indiaand
USA,withafewcustombarcharts&tables.BuiltthemusingSVG,Canvas,andOpen-source
librarieslikeApacheEch

In [14]:
# Extract name
first_name = ""
last_name = ""
if lines:
    name_line = lines[0]
    name_parts = name_line.split()
    if len(name_parts) > 0:
        first_name = name_parts[0].strip().lower()
    if len(name_parts) > 1:
        last_name = " ".join(name_parts[1:]).strip().lower()

# Extract email
email_match = re.search(r"[\w.-]+@[\w.-]+", full_text)
email = email_match.group(0) if email_match else ""

# Extract phone number including masked formats like 889XXXXX28
phone_match = re.search(r"(?:(?:\+91|91)[\s\-]?)?[6-9][0-9Xx]{2}X{3,}[0-9]{2}", full_text)
phone = ""
if phone_match:
    phone = phone_match.group(0).replace(" ", "").replace("-", "")
    if not phone.startswith("+91") and not phone.startswith("91"):
        phone = "+91" + phone
    elif phone.startswith("91") and not phone.startswith("+"):
        phone = "+" + phone

print("Phone:\n", phone)

def extract_city_state(address_text):
    doc = nlp(address_text)
    city = ""
    state = ""

    # Known state abbreviations to full names
    known_states = {
        "MH": "Maharashtra", "KA": "Karnataka", "DL": "Delhi", "TN": "Tamil Nadu",
        "GJ": "Gujarat", "UP": "Uttar Pradesh", "WB": "West Bengal", "RJ": "Rajasthan",
        "AP": "Andhra Pradesh", "TS": "Telangana", "PB": "Punjab", "HR": "Haryana"
    }

    # Check if there's a state abbreviation
    state_match = re.search(r"\b([A-Z]{2})\b", address_text)
    if state_match:
        state_abbr = state_match.group(1)
        state = known_states.get(state_abbr, state_abbr)

    # Detect city from spaCy entities
    for ent in doc.ents:
        if ent.label_ == "GPE":
            city = ent.text
            break

    return {"city": city, "state": state, "country": "India"}

# Example usage:
address_text = "Miraroad, Thane, MH, IND"
extract_city_state(address_text)

print("ADDRESS:\n", extract_city_state(full_text))

summary = ""
if full_text.strip():
    # Normalize whitespace
    full_text = re.sub(r'\s+', ' ', full_text).strip()

    # Try to extract section explicitly titled "summary" or "profile"
    lines = full_text.split(". ")  # Split by sentences instead of lines for cleaner breaks
    start_idx = -1
    for idx, line in enumerate(lines):
        if 'summary' in line.lower() or 'profile' in line.lower():
            start_idx = idx
            break

    if start_idx != -1:
        summary_lines = []
        for i in range(start_idx + 1, len(lines)):
            if any(keyword in lines[i].lower() for keyword in ['experience', 'education', 'skills']):
                break
            summary_lines.append(lines[i].strip())
        summary = ". ".join(summary_lines)
    else:
        # Fallback: use first 3–4 full sentences from SpaCy
        doc = nlp(full_text)
        summary = " ".join([sent.text.strip() for sent in list(doc.sents)[:4]])

    summary = re.sub(r'\s+', ' ', summary).strip()

# Output
print("SUMMARY:\n", summary)



Phone:
 +91889XXXXX28
ADDRESS:
 {'city': 'Thane', 'state': 'Maharashtra', 'country': 'India'}
SUMMARY:
 Vijay Pagare (+91)889XXXXX28|xyz@gmail.com|Miraroad,Thane,MH,INDhttps:// www.linkedin.com/in/xyz/ Afrontend-leaningsoftwareengineerwhohas4.5+yearsofexperienceinbuildingandmaintaininghigh-quality(B2B) saasproductsandwebapplications. Provenabilitytoworkindependentlyandaspartofateaminfast-moving, resource-constraintenvironmentswhereshortturnaroundtimesareanorm. Exceptionalatleveraginginterpersonalskills tofacilitateacollaborativerelationshipamongcross-functionalteamstogettheworkdone. Excellentproblem-solverwith anaptitudefortroubleshootingandtheabilitytoquicklymasternewskills,technology,orarole.


In [15]:
keywords = [
    "react", "nextjs", "javascript", "typescript", "redux", "html", "css", "tailwindcss", "scss",
    "git", "nodejs", "linux", "material", "es6", "rxjs", "echarts", "d3.js", "three.js", "socket",
    "pwa", "mongodb", "express", "bootstrap", "angular", "ant design", "python", "sql"
]
skills = [{"skill": kw} for kw in keywords if kw.lower() in full_text.lower()]


In [16]:
import re
import json

# Example lines (replace this with lines = full_text.splitlines())
lines = [
    "Education",
    "Bachelor of Engineering - Computers",
    "Rajiv Gandhi Institute of Technology"
]

education_history = []
for i, line in enumerate(lines):
    if any(word in line.lower() for word in ['education', 'qualification']):
        for j in range(i+1, min(i+6, len(lines))):
            degree_match = re.search(r'(bachelor|master|mba|b\.tech|m\.tech|bsc|msc)[^,\n]*', lines[j], re.IGNORECASE)
            if degree_match:
                degree = degree_match.group().strip()
                institute = lines[j+1].strip() if j+1 < len(lines) else ""
                education_history.append({
                    "degree": degree.lower(),
                    "name": institute.lower(),
                    "from_date": " ",
                    "to_date": " "
                })
                break  # Only first degree match is needed
        break

print(json.dumps(education_history, indent=2))


[
  {
    "degree": "bachelor of engineering - computers",
    "name": "rajiv gandhi institute of technology",
    "from_date": " ",
    "to_date": " "
  }
]


In [17]:
import re

# Example resume text from a PDF (replace this with extracted PDF text)
full_text = """
Professional Experience

PROPELLOR.AI
Software Engineer - Frontend 08-01-2021 - 12-12-2023
Architected, built and maintained business critical modules for a data unification and visualization platform.

ERAGAP VENTURES
Software Engineer - Founder - 08-01-2021
Worked on web-based saas tools, media initiatives, and client projects.

FLEXILOANS
Software Engineer - Frontend 06-01-2019 - Present
Built client onboarding and lead generation platform.

LUMINAIRE ACADEMY
Lecturer - Founder 01-01-2015 - 01-01-2019
Taught physics and mentored students.
"""

# Split into lines
lines = [line.strip() for line in full_text.split('\n') if line.strip()]
work_history = []
exp_section = False

for i, line in enumerate(lines):
    if 'experience' in line.lower() or 'professional background' in line.lower():
        exp_section = True
        continue
    if exp_section:
        if line.lower().startswith('education') or line.lower().startswith('skills'):
            break
        year_match = re.findall(r'(20\d{2}|19\d{2})', line)
        if "-" in line and len(year_match) > 0:
            company = lines[i - 1] if i > 0 else ""
            title = line
            from_date = year_match[0]
            to_date = year_match[1] if len(year_match) > 1 else "Present"
            description = " ".join(lines[i+1:i+5]) if i+1 < len(lines) else ""
            description = re.sub(r'\s+', ' ', description)
            work_history.append({
                "company": company,
                "title": title,
                "from_date": from_date,
                "to_date": to_date,
                "description": description.strip()
            })

import json
print(json.dumps(work_history, indent=2))


[
  {
    "company": "PROPELLOR.AI",
    "title": "Software Engineer - Frontend 08-01-2021 - 12-12-2023",
    "from_date": "2021",
    "to_date": "2023",
    "description": "Architected, built and maintained business critical modules for a data unification and visualization platform. ERAGAP VENTURES Software Engineer - Founder - 08-01-2021 Worked on web-based saas tools, media initiatives, and client projects."
  },
  {
    "company": "ERAGAP VENTURES",
    "title": "Software Engineer - Founder - 08-01-2021",
    "from_date": "2021",
    "to_date": "Present",
    "description": "Worked on web-based saas tools, media initiatives, and client projects. FLEXILOANS Software Engineer - Frontend 06-01-2019 - Present Built client onboarding and lead generation platform."
  },
  {
    "company": "FLEXILOANS",
    "title": "Software Engineer - Frontend 06-01-2019 - Present",
    "from_date": "2019",
    "to_date": "Present",
    "description": "Built client onboarding and lead generation platfor

In [18]:
final_output = {
    "first_name": first_name,
    "last_name": last_name,
    "email": email,
    "phone": phone,
    "address": {
        "city": "city",
        "state": "state",
        "country": "country"
    },
    "summary": summary,
    "skills": skills,
    "education_history": education_history,
    "work_history": work_history
}

with open("resume_output.json", "w") as f:
    json.dump(final_output, f, indent=4)

files.download("resume_output.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>