In [1]:
!pip install spacy tqdm




In [2]:
import json

# Load file
with open('/content/Entity Recognition in Resumes.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(data)} resumes")


Loaded 220 resumes


In [16]:
def convert_to_spacy_format(data):
    spacy_data = []

    for record in data:
        text = record["content"]
        entity_set = set()
        entities = []

        for ann in record["annotation"]:
            if not ann["label"]:
                continue
            label = ann["label"][0]
            for point in ann["points"]:
                entity_text = point["text"]
                start = point["start"]
                end = point["end"]

                # Exact match
                if text[start:end] == entity_text:
                    ent_tuple = (start, end, label)
                else:
                    # Fallback to find()
                    corrected_start = text.find(entity_text)
                    if corrected_start != -1:
                        corrected_end = corrected_start + len(entity_text)
                        ent_tuple = (corrected_start, corrected_end, label)
                    else:
                        continue  # skip if still can't align

                if ent_tuple not in entity_set:
                    entity_set.add(ent_tuple)

        # 🧹 Remove overlapping spans
        sorted_entities = sorted(list(entity_set), key=lambda x: x[0])
        non_overlapping = []
        last_end = -1
        for start, end, label in sorted_entities:
            if start >= last_end:
                non_overlapping.append((start, end, label))
                last_end = end

        spacy_data.append((text, {"entities": non_overlapping}))

    return spacy_data

training_data = convert_to_spacy_format(data)
print(f"✅ Final training samples: {len(training_data)}")


✅ Final training samples: 220


In [17]:
training_data = convert_to_spacy_format(data)


In [43]:
import spacy
from spacy.training.example import Example
from spacy.util import minibatch, compounding
import random

# Create a blank English NLP pipeline
nlp = spacy.blank("en")

# Add the NER pipeline component
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Add all unique labels to the NER model
for _, annotations in training_data:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

import warnings
from spacy.training import Example
from spacy.util import minibatch, compounding

# Optional: suppress warning flood
warnings.filterwarnings("ignore", category=UserWarning, module="spacy")

# Disable other pipeline components for training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()

    for i in range(20):  # Number of epochs
        random.shuffle(training_data)
        losses = {}
        batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
        skipped = 0  # Count how many were skipped

        for batch in batches:
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                try:
                    example = Example.from_dict(doc, annotations)
                    nlp.update([example], drop=0.3, losses=losses)
                except Exception:
                    skipped += 1
                    continue







In [23]:
output_dir = "/content/ner_resume_model"

# Save model to disk
nlp.to_disk(output_dir)
print(f"✅ Model saved to: {output_dir}")


✅ Model saved to: /content/ner_resume_model


In [24]:
import spacy

# Load your saved model
model_path = "/content/ner_resume_model"
nlp = spacy.load(model_path)
print("✅ Model loaded successfully.")

✅ Model loaded successfully.


FROM PDF

In [38]:
import spacy
import pdfplumber
import re
from collections import defaultdict
import json

# Load your trained spaCy NER model
model_path = "/content/ner_resume_model"
nlp = spacy.load(model_path)
print("✅ Model loaded successfully.")

# Load and read text from PDF
pdf_path = "/content/X4c50D3WQk.pdf"  # Replace with your file path
text = ""
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text += page.extract_text() + "\n"

# Use your NER model
doc = nlp(text)

# Extract entities from model
entities = defaultdict(list)
for ent in doc.ents:
    entities[ent.label_].append(ent.text.strip())

print("\n📄 Extracted Entities from PDF:")
for label, values in entities.items():
    for val in values:
        print(f"{val} ➤ {label}")

# Regex Post-processing
email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
phone_match = re.search(r'\+91[-\s]?[0-9]{10}', text)
linkedin_match = re.search(r'linkedin\.com\/[^\s]+', text)

print("\n🔧 Regex Post-processing:")
if email_match:
    print(f"Email ➤ {email_match.group()}")
if phone_match:
    print(f"Phone ➤ {phone_match.group()}")
if linkedin_match:
    print(f"LinkedIn ➤ {linkedin_match.group()}")


✅ Model loaded successfully.

📄 Extracted Entities from PDF:
Rahul Sharma ➤ Name
Bachelor of Technology in Computer Science ➤ Degree
Indian Institute of Technology Bombay (IIT Bombay) ➤ College Name
Software Engineer ➤ Designation
2019 ➤ Graduation Year

🔧 Regex Post-processing:
Email ➤ rahul.sharma@email.com
Phone ➤ +91-9876543210
LinkedIn ➤ linkedin.com/in/rahulsharma


FROM TXT

In [41]:
# 📄 Upload + Parse + Extract from TXT resume
from google.colab import files
import re

# Upload the resume
uploaded = files.upload()
resume_path = list(uploaded.keys())[0]

# Read text content
with open(resume_path, "r", encoding="utf-8") as f:
    resume_text = f.read()

# Run the trained NLP model
doc = nlp(resume_text)

# Show extracted entities
print("\n🔍 Extracted Entities:")
for ent in doc.ents:
    print(f"{ent.label_:25s} ➤ {ent.text}")

# Regex post-processing for additional info
email_match = re.search(r'[\w\.-]+@[\w\.-]+', resume_text)
phone_match = re.search(r'\+91[-\s]?[0-9]{10}', resume_text)
linkedin_match = re.search(r'linkedin\.com\/[^\s]+', resume_text)

print("\n🔧 Regex Post-processing:")
if email_match:
    print(f"Email    ➤ {email_match.group()}")
if phone_match:
    print(f"Phone    ➤ {phone_match.group()}")
if linkedin_match:
    print(f"LinkedIn ➤ {linkedin_match.group()}")


Saving sample_resume_ananya.txt to sample_resume_ananya (1).txt

🔍 Extracted Entities:
Name                      ➤ Ananya Sharma
Degree                    ➤ M.S. in Software Engineering, Carnegie Mellon University
Graduation Year           ➤ 2020
Degree                    ➤ B.E. in Information Technology
Graduation Year           ➤ 2017
Skills                    ➤ Java, Python, AWS, Docker, Kubernetes, REST APIs, PostgreSQL


🔧 Regex Post-processing:
Email    ➤ ananya.sharma24@email.com
Phone    ➤ +91 9123456789
LinkedIn ➤ linkedin.com/in/ananyasharma


In [42]:
# 📄 Upload + Parse + Extract from TXT resume
from google.colab import files
import re

# Upload the resume
uploaded = files.upload()
resume_path = list(uploaded.keys())[0]

# Read text content
with open(resume_path, "r", encoding="utf-8") as f:
    resume_text = f.read()

# Run the trained NLP model
doc = nlp(resume_text)

# Show extracted entities
print("\n🔍 Extracted Entities:")
for ent in doc.ents:
    print(f"{ent.label_:25s} ➤ {ent.text}")

# Regex post-processing for additional info
email_match = re.search(r'[\w\.-]+@[\w\.-]+', resume_text)
phone_match = re.search(r'\+91[-\s]?[0-9]{10}', resume_text)
linkedin_match = re.search(r'linkedin\.com\/[^\s]+', resume_text)

print("\n🔧 Regex Post-processing:")
if email_match:
    print(f"Email    ➤ {email_match.group()}")
if phone_match:
    print(f"Phone    ➤ {phone_match.group()}")
if linkedin_match:
    print(f"LinkedIn ➤ {linkedin_match.group()}")


Saving Rahul Verma.txt to Rahul Verma (1).txt

🔍 Extracted Entities:
Name                      ➤ Rahul Verma
Degree                    ➤ M.Tech in Data Science, Indian Institute of Technology
Degree                    ➤ B.Tech in Computer Science, NIT Trichy —
Graduation Year           ➤ 2019

🔧 Regex Post-processing:
Email    ➤ rahul.verma@gmail.com
Phone    ➤ +91-9876543210
LinkedIn ➤ linkedin.com/in/rahul-verma
