# 🔍 Metadata Extraction Test Notebook
This notebook helps you test the `extract_metadata()` pipeline using a custom text chunk input.

You can modify the `text` variable in the next cell to test different examples.

In [None]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import spacy
from sentence_transformers import SentenceTransformer, util
import os
import json
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [26]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import spacy
from sentence_transformers import SentenceTransformer, util
import os
import json

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_trf")
except Exception:
    nlp = spacy.load("en_core_web_sm")

# Helper: normalize entity
def normalize_entity(e):
    text = re.sub(r'\s+', ' ', re.sub(r'\.', '', e.lower())).strip()
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

# Intent keywords and examples (copy from your scripts)
intent_keywords = {
    "claim_process": ["claim", "process", "file", "submit", "insurance"],
    "case_status": ["status", "case", "update", "progress", "judgement", "order", "appeal"],
    "document_request": ["document", "request", "copies", "forms"],
    "technical_support": ["error", "issue", "problem", "technical"],
    "general_info": ["information", "contact", "hours", "location"],
    "resume_info": [
        "skills", "resume", "cv", "proficiencies", "abilities", "expertise", "competencies", "qualifications",
        "experience", "work", "education", "background", "certifications", "projects", "programming", "languages",
        "achievements", "awards", "contact", "career", "summary", "tools", "technologies", "roles", "responsibilities",
        "soft skills", "applicant", "candidate", "developer", "engineer", "profile", "professional", "employment", "history",
        "management", "software", "admin", "implementation", "tracking", "project", "system", "solution", "platform", "application",
        "team", "lead", "player", "restocking", "inventory", "tournament", "manual", "implemented"
    ]
}
project_root = os.environ.get('PROJECT_ROOT', os.getcwd())
intent_examples_path = os.path.join(project_root,'..','data', 'intent_categories', 'intent_examples.json')
with open(intent_examples_path, 'r', encoding='utf-8') as f:
    intent_examples = json.load(f)
# intent_examples = {
#     "resume_info": [
#         "What skills are listed in the resume?",
#         "Show me the proficiencies in this CV.",
#         "List the abilities mentioned in the candidate's resume.",
#         "What expertise does the applicant have?",
#         "Which competencies are present in the resume?",
#         "What qualifications are included in the CV?",
#         "List the work experience of the candidate.",
#         "What is the educational background of the applicant?",
#         "Show me the certifications in this resume.",
#         "What are the technical skills mentioned?",
#         "Summarize the professional experience section.",
#         "What projects has the candidate worked on?",
#         "List the programming languages known by the applicant.",
#         "What are the achievements or awards?",
#         "Show me the contact information in the resume.",
#         "What is the career objective or summary?",
#         "List the tools and technologies used by the candidate.",
#         "What is the total experience in years?",
#         "Show me the roles and responsibilities held.",
#         "What are the soft skills mentioned?",
#         "List the languages spoken by the applicant.",
#         "What is the applicant's job title?",
#         "Describe the applicant's professional profile.",
#         "What companies has the candidate worked for?",
#         "List the frameworks and libraries used.",
#         "What cloud platforms does the candidate have experience with?",
#         "What development methodologies are mentioned?",
#         "List the certifications and licenses.",
#         "What leadership roles has the candidate held?",
#         "Summarize the applicant's employment history.",
#         "What is the candidate's GitHub or portfolio link?",
#         "Describe the candidate's experience in management and software projects.",
#         "What inventory or tracking systems has the applicant implemented?",
#         "List any admin or manual processes managed by the candidate.",
#         "What experience does the candidate have with tournaments or players?",
#         "Describe the candidate's role in restocking or inventory management.",
#         "What solutions or platforms has the applicant developed or led?",
#         "List any applications or systems the candidate has worked on.",
#         "What teams has the candidate led or been a part of?",
#         "Describe the candidate's experience with project implementation."
#     ],
#     "claim_process": ["How do I file a claim?", "What is the process for submitting an insurance claim?"],
#     "case_status": ["What is the current status of the case?", "Show me the progress of case number 511605."],
#     "document_request": ["Can I get a copy of the case order?", "How do I request the judgment document?"],
#     "technical_support": ["I have a technical issue.", "There is a problem with the system."],
#     "general_info": ["What is the purpose of this document?", "Give me a summary of the file."]
# }
intent_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def get_intent(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc]
    detected_intent = None
    max_matches = 0
    for intent, keywords in intent_keywords.items():
        matches = sum(kw in tokens for kw in keywords)
        if matches > max_matches:
            max_matches = matches
            detected_intent = intent
    intent_confidence = max_matches / max(1, len(intent_keywords.get(detected_intent, []))) if detected_intent else 0.0
    if not detected_intent or max_matches == 0:
        query_emb = intent_model.encode(text, convert_to_tensor=True)
        best_intent, best_score = None, 0
        for intent, examples in intent_examples.items():
            example_embs = intent_model.encode(examples, convert_to_tensor=True)
            scores = util.pytorch_cos_sim(query_emb, example_embs)
            max_score = scores.max().item()
            if max_score > best_score:
                best_score = max_score
                best_intent = intent
        if best_score > 0.35:
            detected_intent = best_intent
            intent_confidence = best_score
    if not detected_intent:
        detected_intent = "general_info"
        intent_confidence = 0.0
    return detected_intent, intent_confidence, None

def extract_metadata(text):
    doc = nlp(text)
    # Entities
    entities = [normalize_entity(ent.text) for ent in doc.ents]
    # Keywords (nouns, proper nouns, not stopwords)
    keywords = [normalize_entity(token.text) for token in doc if token.pos_ in ["NOUN", "PROPN"] and not token.is_stop and token.lemma_.lower() not in ENGLISH_STOP_WORDS and len(token.text) > 2]
    # Intent
    detected_intent, intent_confidence, _ = get_intent(text)
    return {
        "entities": entities,
        "keywords": keywords,
        "intent": detected_intent,
        "intent_confidence": intent_confidence
    }


In [45]:
import sys
import sentence_transformers
import transformers
import spacy

# Print environment and config for debugging
try:
    print('spaCy version:', spacy.__version__)
except Exception:
    print('spaCy not available')
try:
    print('SentenceTransformers version:', sentence_transformers.__version__)
except Exception:
    print('SentenceTransformers not available')
try:
    print('Transformers version:', transformers.__version__)
except Exception:
    print('Transformers not available')
try:
    print('spaCy model:', nlp.meta['name'] if nlp else 'None')
except Exception:
    print('spaCy model: None')
try:
    print('Intent keywords:', intent_keywords)
except Exception:
    print('Intent keywords: not loaded')
try:
    print('Intent examples:', list(intent_examples.keys()))
except Exception:
    print('Intent examples: not loaded')
if 'text' in globals():
    print('Text sample:', repr(text[:500]))
else:
    print('No text loaded yet.')


spaCy version: 3.8.7
SentenceTransformers version: 4.1.0
Transformers version: 4.52.4
spaCy model: core_web_trf
Intent keywords: {'claim_process': ['claim', 'process', 'file', 'submit', 'insurance'], 'case_status': ['status', 'case', 'update', 'progress', 'judgement', 'order', 'appeal'], 'document_request': ['document', 'request', 'copies', 'forms'], 'technical_support': ['error', 'issue', 'problem', 'technical'], 'general_info': ['information', 'contact', 'hours', 'location'], 'resume_info': ['skills', 'resume', 'cv', 'proficiencies', 'abilities', 'expertise', 'competencies', 'qualifications', 'experience', 'work', 'education', 'background', 'certifications', 'projects', 'programming', 'languages', 'achievements', 'awards', 'contact', 'career', 'summary', 'tools', 'technologies', 'roles', 'responsibilities', 'soft skills', 'applicant', 'candidate', 'developer', 'engineer', 'profile', 'professional', 'employment', 'history', 'management', 'software', 'admin', 'implementation', 'trackin

In [1]:
%pip install python-dotenv

import sys
import os, json, yaml

project_root = 'c:\\New folder (5)\\new-search-models'
sys.path.insert(0, project_root)

# Load config.yaml using the correct path
config_path = os.path.join(project_root,  'config.yaml')
if os.path.exists(config_path):
	with open(config_path, 'r') as f:
		config = yaml.safe_load(f)
else:
	print(f"⚠️ config.yaml not found at {config_path}")
	config = {}

from dotenv import load_dotenv
load_dotenv(os.path.join(project_root, 'config', '.env'))
from scripts.entity_utils import normalize_entity, get_spacy_nlp
from scripts.search_pipeline import get_openai_embedding
from scripts.intent_utils import get_intent
# from scripts.metadata import extract_metadata  # <-- You must have your full extraction code in this file


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm
  model.load_state_dict(torch.load(filelike, map_location=device))


In [7]:
with open(r'C:\New folder (5)\new-search-models\data\chunks\Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk1.txt', 'r', encoding='utf-8') as file:
    text = file.read()
print('✅ Sample Text Loaded')

✅ Sample Text Loaded


In [42]:
metadata = extract_metadata(text)

metadata

{'entities': ['devzen',
  'spring boot spring security',
  'spring batch',
  'elasticsearch',
  'angular typescript',
  'spring boot angular',
  'aw ec2 s3 rd iam',
  'spring boot',
  'angular',
  '25',
  '50',
  'portal',
  '20',
  'six month of launch',
  '40',
  '90'],
 'keywords': ['java',
  'stack',
  'developer',
  'devzen',
  'software',
  'solution',
  'jwt',
  'authentication',
  'verification',
  'apis',
  'spring',
  'boot',
  'spring',
  'security',
  'batch',
  'processing',
  'workflow',
  'spring',
  'batch',
  'scale',
  'datum',
  'management',
  'rest',
  'apis',
  'time',
  'inventory',
  'tracking',
  'alert',
  'stock',
  'management',
  'elasticsearch',
  'search',
  'retrieval',
  'volume',
  'application',
  'component',
  'angular',
  'typescript',
  'product',
  'management',
  'payment',
  'gateway',
  'integration',
  'spring',
  'boot',
  'angular',
  'transaction',
  'dashboard',
  'key',
  'insight',
  'tournament',
  'inventory',
  'management',
  'appli

In [43]:
from pprint import pprint
print('✅ Extracted Metadata:')
pprint(metadata)

✅ Extracted Metadata:
{'entities': ['devzen',
              'spring boot spring security',
              'spring batch',
              'elasticsearch',
              'angular typescript',
              'spring boot angular',
              'aw ec2 s3 rd iam',
              'spring boot',
              'angular',
              '25',
              '50',
              'portal',
              '20',
              'six month of launch',
              '40',
              '90'],
 'intent': 'resume_info',
 'intent_confidence': 0.28846153846153844,
 'keywords': ['java',
              'stack',
              'developer',
              'devzen',
              'software',
              'solution',
              'jwt',
              'authentication',
              'verification',
              'apis',
              'spring',
              'boot',
              'spring',
              'security',
              'batch',
              'processing',
              'workflow',
              'spring',
      

In [14]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Test alternative intent detection techniques

# 1. Simple keyword matching (baseline)
def detect_intent_keywords(text, intent_keywords):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc]
    scores = {}
    for intent, keywords in intent_keywords.items():
        matches = sum(kw in tokens for kw in keywords)
        scores[intent] = matches
    best_intent = max(scores, key=scores.get)
    confidence = scores[best_intent] / max(1, len(intent_keywords[best_intent]))
    return best_intent, confidence

# 2. Embedding similarity (SentenceTransformer, already used in get_intent)
def detect_intent_embedding(text, intent_examples, intent_model):
    query_emb = intent_model.encode(text, convert_to_tensor=True)
    best_intent, best_score = None, 0
    for intent, examples in intent_examples.items():
        example_embs = intent_model.encode(examples, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_emb, example_embs)
        max_score = scores.max().item()
        if max_score > best_score:
            best_score = max_score
            best_intent = intent
    return best_intent, best_score

# 3. Regex-based intent detection (very basic)
def detect_intent_regex(text):
    patterns = {
        "resume_info": r"\b(resume|cv|skills|experience|project|education|certification)\b",
        "claim_process": r"\b(claim|insurance|submit|file)\b",
        "case_status": r"\b(status|case|update|progress|judgement|order|appeal)\b",
        "document_request": r"\b(document|request|copy|copies|form)\b",
        "technical_support": r"\b(error|issue|problem|technical)\b",
        "general_info": r"\b(information|contact|hours|location|summary|purpose)\b"
    }
    for intent, pattern in patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            return intent, 1.0
    return "general_info", 0.0

# Run all techniques on the loaded text
print("== Simple Keyword Matching ==")
intent_kw, conf_kw = detect_intent_keywords(text, intent_keywords)
print(f"Intent: {intent_kw}, Confidence: {conf_kw:.2f}")

print("\n== Embedding Similarity ==")
intent_emb, conf_emb = detect_intent_embedding(text, intent_examples, intent_model)
print(f"Intent: {intent_emb}, Similarity: {conf_emb:.2f}")

print("\n== Regex-based Detection ==")
intent_rgx, conf_rgx = detect_intent_regex(text)
print(f"Intent: {intent_rgx}, Confidence: {conf_rgx:.2f}")
# 4. Majority voting among techniques

def majority_vote(*intents):
    count = Counter(intents)
    best, freq = count.most_common(1)[0]
    return best, freq / len(intents)

intents = [intent_kw, intent_emb, intent_rgx]
majority_intent, majority_conf = majority_vote(*intents)
print("\n== Majority Voting ==")
print(f"Intent: {majority_intent}, Confidence: {majority_conf:.2f}")

# 5. Bag-of-words cosine similarity (very basic)

def detect_intent_bow(text, intent_examples):
    vectorizer = CountVectorizer().fit([text] + [ex for exs in intent_examples.values() for ex in exs])
    text_vec = vectorizer.transform([text])
    best_intent, best_score = None, 0
    for intent, examples in intent_examples.items():
        ex_vecs = vectorizer.transform(examples)
        sim = cosine_similarity(text_vec, ex_vecs).max()
        if sim > best_score:
            best_score = sim
            best_intent = intent
    return best_intent, best_score

intent_bow, conf_bow = detect_intent_bow(text, intent_examples)
print("\n== Bag-of-Words Cosine Similarity ==")
print(f"Intent: {intent_bow}, Similarity: {conf_bow:.2f}")

== Simple Keyword Matching ==
Intent: resume_info, Confidence: 0.29

== Embedding Similarity ==
Intent: resume_info, Confidence: 0.29

== Embedding Similarity ==
Intent: resume_info, Similarity: 0.41

== Regex-based Detection ==
Intent: resume_info, Confidence: 1.00

== Majority Voting ==
Intent: resume_info, Confidence: 1.00

== Bag-of-Words Cosine Similarity ==
Intent: resume_info, Similarity: 0.32
Intent: resume_info, Similarity: 0.41

== Regex-based Detection ==
Intent: resume_info, Confidence: 1.00

== Majority Voting ==
Intent: resume_info, Confidence: 1.00

== Bag-of-Words Cosine Similarity ==
Intent: resume_info, Similarity: 0.32


## using both regex and the Transformers NER pipeline, along with KeyBERT for keyword extraction. 

In [10]:
# --- Alternative Metadata Extraction: Regex + Transformers NER ---
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from keybert import KeyBERT
import re

# Load NER pipeline (Roberta large NER)
ner_pipe = pipeline(
    "ner",
    model=AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english"),
    tokenizer=AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english"),
    aggregation_strategy="simple",
    device=-1  # CPU
)

# KeyBERT for keyword extraction
keyword_model = KeyBERT()

def extract_metadata_alt(text):
    # Entities using transformers NER
    entities = set()
    for ent in ner_pipe(text):
        if ent['score'] > 0.8:
            entities.add(ent['word'].strip().lower())
    # Regex for names (simple)
    name_matches = re.findall(r'([A-Z][a-z]+(?: [A-Z][a-z]+)+)', text)
    for name in name_matches:
        entities.add(name.lower())
    # Keywords using KeyBERT
    keywords = [kw for kw, _ in keyword_model.extract_keywords(text, top_n=10)]
    # Intent using previous get_intent
    detected_intent, intent_confidence, _ = get_intent(text)
    return {
        "entities": sorted(list(entities)),
        "keywords": keywords,
        "intent": detected_intent,
        "intent_confidence": intent_confidence
    }

# alt_metadata = extract_metadata_alt(text)
# print('✅ Alternative Metadata Extraction:')
# from pprint import pprint
# pprint(alt_metadata)

Device set to use cpu


## 🚀 Production-Grade Intent Detection Solutions

For real-world, scalable, and robust intent detection, consider these best practices:

- **Hybrid Pipeline:** Combine fast rule-based/keyword/regex checks for high-precision intents with embedding-based similarity for flexible, robust matching.
- **ML/Deep Learning Models:** Use fine-tuned transformer models (e.g., BERT, RoBERTa, DistilBERT) for intent classification if you have enough labeled data.
- **Fallbacks:** Always provide a fallback (e.g., "general_info") for ambiguous or low-confidence cases.
- **Confidence Thresholds:** Use thresholds to decide when to trust a prediction or escalate to a human/manual review.
- **Monitoring & Logging:** Log predictions, confidence, and input for continuous improvement and error analysis.
- **Versioning:** Version your models, configs, and intent definitions for reproducibility and safe updates.
- **Batch & Real-Time Support:** Design your pipeline to work both in batch (offline) and real-time (API) modes.

Below is a modular, production-ready intent detection pipeline you can adapt and extend.

In [44]:
import logging
from typing import Dict, Any

# Configure logging for production
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

class ProductionIntentDetector:
    def __init__(self, intent_keywords, intent_examples, intent_model, threshold=0.35):
        self.intent_keywords = intent_keywords
        self.intent_examples = intent_examples
        self.intent_model = intent_model
        self.threshold = threshold

    def detect(self, text: str) -> Dict[str, Any]:
        # 1. Fast rule-based keyword/lemmatized match
        doc = nlp(text.lower())
        tokens = [token.lemma_ for token in doc]
        detected_intent = None
        max_matches = 0
        for intent, keywords in self.intent_keywords.items():
            matches = sum(kw in tokens for kw in keywords)
            if matches > max_matches:
                max_matches = matches
                detected_intent = intent
        intent_confidence = max_matches / max(1, len(self.intent_keywords.get(detected_intent, []))) if detected_intent else 0.0
        # 2. Embedding similarity fallback
        if not detected_intent or max_matches == 0:
            query_emb = self.intent_model.encode(text, convert_to_tensor=True)
            best_intent, best_score = None, 0
            for intent, examples in self.intent_examples.items():
                example_embs = self.intent_model.encode(examples, convert_to_tensor=True)
                scores = util.pytorch_cos_sim(query_emb, example_embs)
                max_score = scores.max().item()
                if max_score > best_score:
                    best_score = max_score
                    best_intent = intent
            if best_score > self.threshold:
                detected_intent = best_intent
                intent_confidence = best_score
        # 3. Fallback to general_info
        if not detected_intent:
            detected_intent = "general_info"
            intent_confidence = 0.0
        # 4. Logging for monitoring
        logging.info(f"Intent: {detected_intent}, Confidence: {intent_confidence:.2f}, Text: {text[:80]}...")
        return {
            "intent": detected_intent,
            "intent_confidence": intent_confidence
        }

# Usage example
prod_detector = ProductionIntentDetector(intent_keywords, intent_examples, intent_model, threshold=0.35)
chunk_dir = os.path.join(project_root, '..', 'data', 'chunks')
chunk_files = glob.glob(os.path.join(chunk_dir, '*.txt'))

chunk_intents = {}
for chunk_path in chunk_files:
    with open(chunk_path, 'r', encoding='utf-8') as f:
        chunk_text = f.read()
    result = prod_detector.detect(chunk_text)
    chunk_intents[os.path.basename(chunk_path)] = result

print('✅ Intent detection for all chunks:')
for fname, res in chunk_intents.items():
    print(f"{fname}: {res}")
# prod_result = prod_detector.detect(text)
# print('✅ Production-Grade Intent Detection:')
# print(prod_result)

2025-06-22 12:08:29,087 INFO Intent: resume_info, Confidence: 0.29, Text: Java Full Stack Developer 2023 - Present Devzen Software Solutions Developed sec...
2025-06-22 12:08:29,277 INFO Intent: resume_info, Confidence: 0.21, Text: reducing base by 20 within six months of launch manual administration time by 40...
2025-06-22 12:08:29,277 INFO Intent: resume_info, Confidence: 0.21, Text: reducing base by 20 within six months of launch manual administration time by 40...
2025-06-22 12:08:29,758 INFO Intent: resume_info, Confidence: 0.25, Text: JIMSON RATNAM KAPAVARAPU Java Fullstack Developer 91 9154631932 jimsonjimmy2008g...
2025-06-22 12:08:29,758 INFO Intent: resume_info, Confidence: 0.25, Text: JIMSON RATNAM KAPAVARAPU Java Fullstack Developer 91 9154631932 jimsonjimmy2008g...
2025-06-22 12:08:30,121 INFO Intent: resume_info, Confidence: 0.21, Text: real-time updates for matches fixtures Implemented secure role-based access cont...
2025-06-22 12:08:30,121 INFO Intent: resume_info, Co

✅ Intent detection for all chunks:
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk1.txt: {'intent': 'resume_info', 'intent_confidence': 0.28846153846153844}
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk2.txt: {'intent': 'resume_info', 'intent_confidence': 0.21153846153846154}
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk3.txt: {'intent': 'resume_info', 'intent_confidence': 0.25}
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk4.txt: {'intent': 'resume_info', 'intent_confidence': 0.21153846153846154}
MHC_CaseStatus_511605_chunk1.txt: {'intent': 'case_status', 'intent_confidence': 0.42857142857142855}
MHC_CaseStatus_511605_chunk10.txt: {'intent': 'claim_process', 'intent_confidence': 0.2}
MHC_CaseStatus_511605_chunk11.txt: {'intent': 'claim_process', 'intent_confidence': 0.4}
MHC_CaseStatus_511605_chunk12.txt: {'intent': 'case_status', 'intent_confidence': 0.2857142857142857}
MHC_CaseStatus_511605_chunk13.txt: {'intent': 'case_status', 'intent_confidence': 0.28571428571428

In [12]:
from pprint import pprint

# Test and compare all metadata extraction techniques side by side

print("== spaCy + SentenceTransformer Pipeline ==")
spacy_metadata = extract_metadata(text)
pprint(spacy_metadata)

print("\n== Transformers NER + KeyBERT Pipeline ==")
alt_metadata = extract_metadata_alt(text)
pprint(alt_metadata)

print("\n== Comparison ==")
print("Entities overlap:", set(spacy_metadata['entities']) & set(alt_metadata['entities']))
print("Keywords overlap:", set(spacy_metadata['keywords']) & set(alt_metadata['keywords']))
print("Intent (spaCy pipeline):", spacy_metadata['intent'], "| Confidence:", spacy_metadata['intent_confidence'])
print("Intent (Transformers+KeyBERT):", alt_metadata['intent'], "| Confidence:", alt_metadata['intent_confidence'])


== spaCy + SentenceTransformer Pipeline ==
{'entities': ['devzen',
              'spring boot spring security',
              'spring batch',
              'elasticsearch',
              'angular typescript',
              'spring boot angular',
              'aw ec2 s3 rd iam',
              'spring boot',
              'angular',
              '25',
              '50',
              'portal',
              '20',
              'six month of launch',
              '40',
              '90'],
 'intent': 'resume_info',
 'intent_confidence': 0.28846153846153844,
 'keywords': ['java',
              'stack',
              'developer',
              'devzen',
              'software',
              'solution',
              'jwt',
              'authentication',
              'verification',
              'apis',
              'spring',
              'boot',
              'spring',
              'security',
              'batch',
              'processing',
              'workflow',
         

In [16]:
from transformers import pipeline
from collections import Counter

# Advanced Intent Detection Techniques

# 1. Zero-shot classification with HuggingFace Transformers (e.g., facebook/bart-large-mnli)

zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def detect_intent_zero_shot(text, candidate_labels):
    result = zero_shot_classifier(text, candidate_labels)
    best_intent = result['labels'][0]
    confidence = result['scores'][0]
    return best_intent, confidence

candidate_labels = list(intent_examples.keys())
intent_zs, conf_zs = detect_intent_zero_shot(text, candidate_labels)
print("== Zero-Shot Classification ==")
print(f"Intent: {intent_zs}, Confidence: {conf_zs:.2f}")

# 2. Fine-tuned intent classification model (if you have labeled data)
# (Placeholder: You would train a classifier using your labeled dataset, e.g., using sklearn, Keras, or HuggingFace Trainer.)

# 3. Ensemble: Combine multiple techniques (majority vote, weighted average, etc.)
# Ensure detect_intent_keywords is defined (use from cell 7 if available)
if 'detect_intent_keywords' not in globals():
    def detect_intent_keywords(text, intent_keywords):
        doc = nlp(text.lower())
        tokens = [token.lemma_ for token in doc]
        scores = {}
        for intent, keywords in intent_keywords.items():
            matches = sum(kw in tokens for kw in keywords)
            scores[intent] = matches
        best_intent = max(scores, key=scores.get)
        confidence = scores[best_intent] / max(1, len(intent_keywords[best_intent]))
        return best_intent, confidence

def ensemble_intent_detection(text):
    results = []
    # Keyword
    kw_intent, kw_conf = detect_intent_keywords(text, intent_keywords)
    results.append((kw_intent, kw_conf))
    # Embedding
    emb_intent, emb_conf = detect_intent_embedding(text, intent_examples, intent_model)
    results.append((emb_intent, emb_conf))
    # Regex
    rgx_intent, rgx_conf = detect_intent_regex(text)
    results.append((rgx_intent, rgx_conf))
    # Zero-shot
    zs_intent, zs_conf = detect_intent_zero_shot(text, candidate_labels)
    results.append((zs_intent, zs_conf))
    # Majority vote
    intents = [intent for intent, _ in results]
    best, freq = Counter(intents).most_common(1)[0]
    return best, freq / len(results), results

ensemble_intent, ensemble_conf, all_results = ensemble_intent_detection(text)
print("\n== Ensemble Intent Detection ==")
print(f"Intent: {ensemble_intent}, Confidence: {ensemble_conf:.2f}")
print("All results:", all_results)

Device set to use cuda:0


== Zero-Shot Classification ==
Intent: case_status, Confidence: 0.18

== Ensemble Intent Detection ==
Intent: resume_info, Confidence: 0.75
All results: [('resume_info', 0.28846153846153844), ('resume_info', 0.4149797260761261), ('resume_info', 1.0), ('case_status', 0.17770253121852875)]

== Ensemble Intent Detection ==
Intent: resume_info, Confidence: 0.75
All results: [('resume_info', 0.28846153846153844), ('resume_info', 0.4149797260761261), ('resume_info', 1.0), ('case_status', 0.17770253121852875)]


## Fine-Tuned Transformer Classifier for Intent Detection

This section demonstrates how to train and use a transformer (DistilBERT) for intent classification using Hugging Face Transformers. You need a labeled dataset (text, intent) for this. The example below uses realistic intent examples from your domain.

In [49]:
import pandas as pd

# Example labeled data for intent fine-tuning (expand with more real samples for best results)
data = [
    {"text": "How do I file a claim?", "label": "claim_process"},
    {"text": "What is the process for submitting an insurance claim?", "label": "claim_process"},
    {"text": "What is the current status of the case?", "label": "case_status"},
    {"text": "Show me the progress of case number 511605.", "label": "case_status"},
    {"text": "Can I get a copy of the case order?", "label": "document_request"},
    {"text": "How do I request the judgment document?", "label": "document_request"},
    {"text": "What skills are listed in the resume?", "label": "resume_info"},
    {"text": "List the programming languages known by the applicant.", "label": "resume_info"},
    {"text": "Who is the presiding judge for this case?", "label": "court_details"},
    {"text": "Who are the parties involved in this case?", "label": "party_information"},
    {"text": "When was the last hearing held?", "label": "hearing_information"},
    {"text": "I have a technical issue with the system.", "label": "technical_support"},
    {"text": "Give me a summary of the file.", "label": "general_info"},
    {"text": "Tell me about this document.", "label": "general_info"},
]
df = pd.DataFrame(expanded_data)
df

Unnamed: 0,text,label
0,How do I file a claim?,claim_process
1,What is the process for submitting an insuranc...,claim_process
2,I want to submit a new claim for my car accident.,claim_process
3,Guide me through the claim submission steps.,claim_process
4,Where do I upload my claim documents?,claim_process
...,...,...
231,the petitioner side in the context of Section ...,case_status
232,contend that once the search was over on 25012...,claim_process
233,been issued which has either not been issued o...,case_status
234,petitioner has relied upon various judgments a...,claim_process


In [50]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# 1. Prepare label mappings
df['label_id'] = df['label'].astype('category').cat.codes
label2id = {label: i for i, label in enumerate(df['label'].astype('category').cat.categories)}
id2label = {i: label for label, i in label2id.items()}

# 2. Convert to Hugging Face Dataset
# Rename 'label_id' to 'labels' for Trainer compatibility
df_for_hf = df.rename(columns={'label_id': 'labels'})
# Remove the 'label' column (string) to avoid Trainer confusion
df_for_hf = df_for_hf.drop(columns=['label'])
dataset = Dataset.from_pandas(df_for_hf)

# 3. Tokenize
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=64)
dataset = dataset.map(preprocess, batched=True)

# 4. Train/Test split
split = dataset.train_test_split(test_size=0.2)
train_ds, test_ds = split["train"], split["test"]

# 5. Model and Trainer
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id)
training_args = TrainingArguments(
    output_dir="./intent_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    do_eval=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
)

# 6. Train (this will take a few minutes on CPU, much faster on GPU)
trainer.train()

Map: 100%|██████████| 236/236 [00:00<00:00, 1289.62 examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.3039,1.110577
2,1.0479,0.944183
3,0.7932,0.825132


TrainOutput(global_step=141, training_loss=1.1095424970836505, metrics={'train_runtime': 11.7058, 'train_samples_per_second': 48.181, 'train_steps_per_second': 12.045, 'total_flos': 9339950886912.0, 'train_loss': 1.1095424970836505, 'epoch': 3.0})

In [53]:
import glob

# 7. Inference: Predict intent for new text
def predict_intent(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits
    pred_id = logits.argmax(dim=1).item()
    intent = id2label[pred_id]
    confidence = logits.softmax(dim=1)[0, pred_id].item()
    return intent, confidence
# def predict_intent(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
#     device = next(model.parameters()).device
#     inputs = {k: v.to(device) for k, v in inputs.items()}
#     outputs = model(**inputs)
#     pred_id = outputs.logits.argmax(dim=1).item()
#     return id2label[pred_id]
chunk_dir = os.path.join(project_root, '..', 'data', 'chunks')
chunk_files = glob.glob(os.path.join(chunk_dir, '*.txt'))



# Example usage
# print(predict_intent(open(r'C:\New folder (5)\new-search-models\data\chunks\MHC_CaseStatus_511605_chunk1.txt', 'r', encoding='utf-8').read()))
for chunk_path in chunk_files:
    with open(chunk_path, 'r', encoding='utf-8') as f:
        chunk_text = f.read()
    print(f"{os.path.basename(chunk_path)}: {predict_intent(chunk_text)}")
# print(predict_intent("List the skills in this resume."))
# print(predict_intent("Who is the presiding judge?"))
# print(predict_intent("I have a technical issue with the system."))

Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk1.txt: ('resume_info', 0.6400291323661804)
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk2.txt: ('resume_info', 0.6304816007614136)
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk3.txt: ('resume_info', 0.640763521194458)
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk4.txt: ('resume_info', 0.6496602296829224)
MHC_CaseStatus_511605_chunk1.txt: ('case_status', 0.8032860159873962)
MHC_CaseStatus_511605_chunk10.txt: ('claim_process', 0.4902017116546631)
MHC_CaseStatus_511605_chunk11.txt: ('case_status', 0.5927583575248718)
MHC_CaseStatus_511605_chunk12.txt: ('case_status', 0.7081291079521179)
MHC_CaseStatus_511605_chunk13.txt: ('case_status', 0.7823941707611084)
MHC_CaseStatus_511605_chunk14.txt: ('claim_process', 0.5909293293952942)
MHC_CaseStatus_511605_chunk15.txt: ('case_status', 0.5937987565994263)
MHC_CaseStatus_511605_chunk16.txt: ('claim_process', 0.508154034614563)
MHC_CaseStatus_511605_chunk17.txt: ('claim_process', 0.45

In [47]:
# 📥 Build labeled training data from correct_intents.txt
import re

correct_intents_path = r'C:\New folder (5)\new-search-models\correct_intents.txt'
chunk_dir = os.path.join(project_root, '..', 'data', 'chunks')

labeled_from_file = []
with open(correct_intents_path, 'r', encoding='utf-8') as f:
    for line in f:
        m = re.match(r'([^:]+):.*?\'intent\': \'([^\']+)\'', line)
        if m:
            fname, intent = m.group(1).strip(), m.group(2).strip()
            chunk_path = os.path.join(chunk_dir, fname)
            if os.path.exists(chunk_path):
                with open(chunk_path, 'r', encoding='utf-8') as cf:
                    chunk_text = cf.read()
                labeled_from_file.append({"text": chunk_text, "label": intent})
            else:
                print(f"⚠️ Chunk file not found: {chunk_path}")
        else:
            print(f"⚠️ Could not parse line: {line.strip()}")

print(f"Loaded {len(labeled_from_file)} labeled examples from correct_intents.txt.")
# Add these to your expanded_data list before retraining:
# expanded_data.extend(labeled_from_file)


Loaded 88 labeled examples from correct_intents.txt.


In [48]:
# ➕ Add labeled examples from correct_intents.txt to your training data
if 'expanded_data' in globals() and 'labeled_from_file' in globals():
    expanded_data.extend(labeled_from_file)
    print(f"expanded_data now has {len(expanded_data)} examples (including those from correct_intents.txt).")
else:
    print("⚠️ Make sure both expanded_data and labeled_from_file are defined before running this cell.")


expanded_data now has 236 examples (including those from correct_intents.txt).


In [None]:
## 🛠️ Iterative Improvement: Add Misclassified Examples

# If you notice misclassified chunks, copy their text and true intent below. Add them to your training data to help the model learn from its mistakes. Retrain and re-evaluate for better accuracy.
# Example: Add misclassified examples to your training data
# Replace the text and label with your real misclassified cases
misclassified_examples = [
    {"text": "<Paste misclassified chunk text here>", "label": "<correct_intent>"},
    # Example:
    # {"text": "The case was closed on 2023-05-01.", "label": "case_status"},
    # {"text": "Please send me a copy of the final judgment.", "label": "document_request"},
]

# Add these to your expanded_data list before retraining:
# expanded_data.extend(misclassified_examples)


## ⚠️ Expand Your Labeled Dataset for Better Classifier Performance

Your current labeled dataset is too small and unbalanced, which causes the model to predict the same intent for most inputs. 

- Add at least 10–20 diverse, realistic examples for each intent category.
- Include edge cases and ambiguous queries.
- The more varied and representative your data, the better your classifier will perform.

After expanding, retrain the model and re-run the evaluation cell below.

In [40]:

# 8. Evaluate classifier performance on the test set
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Get true and predicted labels for the test set
true_labels = [id2label[i] for i in test_ds['labels']]
pred_labels = []
for text in test_ds['text']:
    pred = predict_intent(text)
    pred_labels.append(pred)

# Accuracy
acc = accuracy_score(true_labels, pred_labels)
print(f"Test Accuracy: {acc:.2f}")

# Classification report
print("\nClassification Report:")
print(classification_report(true_labels, pred_labels))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(true_labels, pred_labels, labels=list(label2id.keys())))

Test Accuracy: 0.58

Classification Report:
                   precision    recall  f1-score   support

      case_status       1.00      1.00      1.00         2
    claim_process       0.50      0.50      0.50         2
 document_request       0.50      1.00      0.67         3
     general_info       0.00      0.00      0.00         3
      resume_info       0.50      1.00      0.67         1
technical_support       0.00      0.00      0.00         1

         accuracy                           0.58        12
        macro avg       0.42      0.58      0.47        12
     weighted avg       0.42      0.58      0.47        12

Confusion Matrix:
[[2 0 0 0 0 0]
 [0 1 1 0 0 0]
 [0 0 3 0 0 0]
 [0 1 1 0 1 0]
 [0 0 0 0 1 0]
 [0 0 1 0 0 0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 🏷️ Expand Labeled Data for Better Intent Classification

To improve classifier performance, add more diverse and realistic examples for each intent. This helps the model generalize and reduces bias toward majority classes. Below is an expanded dataset template you can use and modify for your domain.

In [10]:
## 🏷️ Expand Labeled Data for Better Intent Classification

# To improve classifier performance, add more diverse and realistic examples for each intent. This helps the model generalize and reduces bias toward majority classes. Below is an expanded dataset template you can use and modify for your domain.
# Template: Expanded labeled data for intent fine-tuning
# Copy, edit, and expand this list with your real examples
expanded_data = [
    # claim_process
    {"text": "How do I file a claim?", "label": "claim_process"},
    {"text": "What is the process for submitting an insurance claim?", "label": "claim_process"},
    {"text": "I want to submit a new claim for my car accident.", "label": "claim_process"},
    {"text": "Guide me through the claim submission steps.", "label": "claim_process"},
    {"text": "Where do I upload my claim documents?", "label": "claim_process"},
    {"text": "How long does it take to process a claim?", "label": "claim_process"},
    {"text": "Can I check the status of my insurance claim?", "label": "claim_process"},
    {"text": "What documents are needed to file a claim?", "label": "claim_process"},
    {"text": "Is there a deadline for submitting claims?", "label": "claim_process"},
    {"text": "Can I cancel a claim after submitting?", "label": "claim_process"},
    # case_status
    {"text": "What is the current status of the case?", "label": "case_status"},
    {"text": "Show me the progress of case number 511605.", "label": "case_status"},
    {"text": "Has a judgment been issued in my case?", "label": "case_status"},
    {"text": "Is my case still pending?", "label": "case_status"},
    {"text": "When is the next hearing for my case?", "label": "case_status"},
    {"text": "What was the outcome of the last court session?", "label": "case_status"},
    {"text": "Who is the presiding judge for this case?", "label": "case_status"},
    {"text": "Has an appeal been filed?", "label": "case_status"},
    {"text": "Is there an order available for my case?", "label": "case_status"},
    {"text": "What is the next step in my case?", "label": "case_status"},
    # document_request
    {"text": "Can I get a copy of the case order?", "label": "document_request"},
    {"text": "How do I request the judgment document?", "label": "document_request"},
    {"text": "I need certified copies of my case documents.", "label": "document_request"},
    {"text": "Where can I download the court forms?", "label": "document_request"},
    {"text": "Request a copy of the final order.", "label": "document_request"},
    {"text": "How do I obtain previous hearing transcripts?", "label": "document_request"},
    {"text": "Can I get a digital copy of my case file?", "label": "document_request"},
    {"text": "What is the fee for document requests?", "label": "document_request"},
    {"text": "How long does it take to receive requested documents?", "label": "document_request"},
    {"text": "Is there a limit to the number of documents I can request?", "label": "document_request"},
    # resume_info
    {"text": "What skills are listed in the resume?", "label": "resume_info"},
    {"text": "List the programming languages known by the applicant.", "label": "resume_info"},
    {"text": "Show me the candidate's work experience.", "label": "resume_info"},
    {"text": "What certifications does the applicant have?", "label": "resume_info"},
    {"text": "Summarize the professional experience section.", "label": "resume_info"},
    {"text": "List the tools and technologies used by the candidate.", "label": "resume_info"},
    {"text": "What is the educational background of the applicant?", "label": "resume_info"},
    {"text": "What are the achievements or awards?", "label": "resume_info"},
    {"text": "Show me the contact information in the resume.", "label": "resume_info"},
    {"text": "What is the career objective or summary?", "label": "resume_info"},
    # technical_support
    {"text": "I have a technical issue with the system.", "label": "technical_support"},
    {"text": "There is a problem with the website.", "label": "technical_support"},
    {"text": "I can't log in to my account.", "label": "technical_support"},
    {"text": "The upload button is not working.", "label": "technical_support"},
    {"text": "How do I reset my password?", "label": "technical_support"},
    {"text": "The page is loading very slowly.", "label": "technical_support"},
    {"text": "I received an error message while submitting my form.", "label": "technical_support"},
    {"text": "The system crashed during my session.", "label": "technical_support"},
    {"text": "How do I contact technical support?", "label": "technical_support"},
    {"text": "The app keeps freezing.", "label": "technical_support"},
    # general_info
    {"text": "Give me a summary of the file.", "label": "general_info"},
    {"text": "Tell me about this document.", "label": "general_info"},
    {"text": "What is the purpose of this document?", "label": "general_info"},
    {"text": "Provide general information about the case.", "label": "general_info"},
    {"text": "What are the office hours?", "label": "general_info"},
    {"text": "How do I contact the support team?", "label": "general_info"},
    {"text": "Where is the office located?", "label": "general_info"},
    {"text": "What services are offered?", "label": "general_info"},
    {"text": "How do I register for an account?", "label": "general_info"},
    {"text": "What is the refund policy?", "label": "general_info"},
]

# You can now use expanded_data instead of the old 'data' list for training your classifier.


## keyword

# 🔑 Keyword Extraction Techniques: Code Examples

Below are code snippets for the most common keyword extraction methods. You can run these to compare results on your text.

In [19]:
# 1. TF-IDF (Term Frequency–Inverse Document Frequency)
from sklearn.feature_extraction.text import TfidfVectorizer

text = open(r'C:\New folder (5)\new-search-models\data\chunks\Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk1.txt', 'r', encoding='utf-8').read()

def extract_keywords_tfidf(text, top_n=10):
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
    tfidf = vectorizer.fit_transform([text])
    scores = zip(vectorizer.get_feature_names_out(), tfidf.toarray()[0])
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    return [w for w, s in sorted_scores[:top_n]]

print('TF-IDF keywords:', extract_keywords_tfidf(text))

TF-IDF keywords: ['management', 'developed', 'user', 'document', 'spring', 'tournament', 'angular', 'implemented', 'secure', 'security']


In [56]:
%pip install rake-nltk

Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [63]:


# 2. RAKE (Rapid Automatic Keyword Extraction)
from rake_nltk import Rake

def extract_keywords_rake(text, top_n=10):
    rake = Rake()
    rake.extract_keywords_from_text(text)
    return rake.get_ranked_phrases()[:top_n]

print('RAKE keywords:', extract_keywords_rake(text))

RAKE keywords: ['based authentication verification apis using spring boot spring security built batch processing workflows', '50 satisfaction scores enhanced document management improved tournament efficiency implemented robust features', 'publication key achievements strengthened system security improved user interaction leveraged angular', 'docker ensuring consistency across development production environments designed cicd pipelines', 'volume applications developed dynamic responsive ui components using angular typescript', 'time inventory tracking alerts optimizing stock management integrated elasticsearch', 'automate workflow improve deployment efficiency deployed scalable applications', 'automated build test deployment processes utilized github actions', 'overall tournament management process robust admin control real', 'responsive ui using angular enhancing user experience']


In [60]:
%pip install yake

Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
     ---------------------------------------- 0.0/60.2 kB ? eta -:--:--
     ---------------------------------------- 60.2/60.2 kB 3.1 MB/s eta 0:00:00
Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Collecting jellyfish
  Downloading jellyfish-1.2.0-cp310-cp310-win_amd64.whl (217 kB)
     ---------------------------------------- 0.0/217.1 kB ? eta -:--:--
     -------------------------------------- 217.1/217.1 kB 6.7 MB/s eta 0:00:00
Collecting segtok
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Installing collected packages: tabulate, segtok, jellyfish, yake
Successfully installed jellyfish-1.2.0 segtok-1.5.11 tabulate-0.9.0 yake-0.4.8
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [64]:


# 3. YAKE (Yet Another Keyword Extractor)
import yake

def extract_keywords_yake(text, top_n=10):
    kw_extractor = yake.KeywordExtractor(top=top_n, stopwords=None)
    keywords = kw_extractor.extract_keywords(text)
    return [kw for kw, score in keywords]

print('YAKE keywords:', extract_keywords_yake(text))

YAKE keywords: ['Spring Boot Spring', 'Spring Boot Angular', 'Boot Spring Security', 'Spring Security Built', 'Software Solutions Developed', 'Interaction Leveraged Angular', 'Full Stack Developer', 'Present Devzen Software', 'Devzen Software Solutions', 'System Security Improved']


In [36]:
# 5. KeyBERT (Embedding-based, already in your notebook)
from keybert import KeyBERT
keyword_model = KeyBERT()
keywords = [kw for kw, _ in keyword_model.extract_keywords(text, top_n=10)]

print('KeyBERT keywords:', [kw for kw, _ in keyword_model.extract_keywords(text, top_n=10)])

KeyBERT keywords: ['apis', 'authentication', 'api', 'aws', 'jwt', 'cloud', 'workflows', 'angular', 'secure', 'java']


In [67]:
# 6. spaCy POS-based (Nouns, Noun Phrases)
def extract_keywords_spacy(text, top_n=10):
    doc = nlp(text)
    noun_chunks = list(set(chunk.text.strip().lower() for chunk in doc.noun_chunks))
    nouns = list(set(token.lemma_ for token in doc if token.pos_ == 'NOUN' and not token.is_stop))
    return (noun_chunks + nouns)[:top_n]

print('spaCy POS keywords:', extract_keywords_spacy(text))

spaCy POS keywords: ['large-scale data management', 'dynamic responsive ui components', 'product management', 'seamless document creation socialization', 'a', 'real-time inventory tracking alerts', 'user authentication', 'spring boot', 'a secure api', 'github actions']


---

You can compare the outputs of these techniques on your sample text. For best results, try ensemble or hybrid approaches (e.g., combine TF-IDF, KeyBERT, and POS-based keywords).

# 🏆 Recommended Industry Pipeline: Robust Keyword Extraction

This pipeline combines the strengths of multiple methods for high accuracy and robustness, suitable for production and real-world data:

- **KeyBERT** (embedding-based, semantic): Best for context-aware, relevant keywords.
- **TF-IDF** (statistical): Captures frequent, document-specific terms.
- **spaCy POS-based**: Ensures inclusion of important noun phrases.
- **Ensemble/Hybrid**: Merges and deduplicates keywords from all methods.

You can further filter or rank keywords by frequency, relevance, or domain-specific rules.

In [None]:
# --- Robust Keyword Extraction Pipeline ---
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

# Load models (reuse if already loaded)
keyword_model = KeyBERT()
nlp = spacy.load("en_core_web_trf") if spacy.util.is_package("en_core_web_trf") else spacy.load("en_core_web_sm")

# 1. KeyBERT keywords
keybert_keywords = [kw for kw, _ in keyword_model.extract_keywords(text, top_n=10)]

# 2. TF-IDF keywords
def extract_keywords_tfidf(text, top_n=10):
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
    tfidf = vectorizer.fit_transform([text])
    scores = zip(vectorizer.get_feature_names_out(), tfidf.toarray()[0])
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    return [w for w, s in sorted_scores[:top_n]]
tfidf_keywords = extract_keywords_tfidf(text)

# 3. spaCy POS-based keywords
def extract_keywords_spacy(text, top_n=10):
    doc = nlp(text)
    noun_chunks = list(set(chunk.text.strip().lower() for chunk in doc.noun_chunks))
    nouns = list(set(token.lemma_ for token in doc if token.pos_ == 'NOUN' and not token.is_stop))
    return (noun_chunks + nouns)[:top_n]
spacy_keywords = extract_keywords_spacy(text)

# 4. Hybrid/ensemble: merge and deduplicate
all_keywords = keybert_keywords + tfidf_keywords + spacy_keywords
unique_keywords = []
for kw in all_keywords:
    if kw not in unique_keywords:
        unique_keywords.append(kw)

print("KeyBERT:", keybert_keywords)
print("TF-IDF:", tfidf_keywords)
print("spaCy POS:", spacy_keywords)
print("\n---\nEnsemble (deduplicated):", unique_keywords)

KeyBERT: ['apis', 'authentication', 'api', 'aws', 'jwt', 'cloud', 'workflows', 'angular', 'secure', 'java']
TF-IDF: ['management', 'developed', 'user', 'document', 'spring', 'tournament', 'angular', 'implemented', 'secure', 'security']
spaCy POS: ['high-volume applications', 'user authentication', 'unauthorized access functionality', 'large-scale data management', '50 satisfaction scores', 'live score updates', 'user interaction', 'seamless transactions', 'a secure api', 'scalable applications']

---
Ensemble (deduplicated): ['apis', 'authentication', 'api', 'aws', 'jwt', 'cloud', 'workflows', 'angular', 'secure', 'java', 'management', 'developed', 'user', 'document', 'spring', 'tournament', 'implemented', 'security', 'high-volume applications', 'user authentication', 'unauthorized access functionality', 'large-scale data management', '50 satisfaction scores', 'live score updates', 'user interaction', 'seamless transactions', 'a secure api', 'scalable applications']


**Best Practices:**
- Adjust `top_n` for each method based on your needs.
- Optionally, filter out keywords that are too short, too common, or not domain-relevant.
- For domain-specific tasks, add custom rules or fine-tune KeyBERT with a domain model.
- For very high accuracy, consider adding a supervised NER/sequence labeling model as a final filter.

## Entity Extraction

# 🏷️ Entity Extraction Techniques: Code Examples

Below are code snippets for common entity extraction methods. You can run these to compare results on your text.

In [30]:
# 1. spaCy Named Entity Recognition (NER)
import spacy

# Load spaCy model (already loaded as nlp in previous cells)
doc = nlp(text)
spacy_entities = [(ent.text, ent.label_) for ent in doc.ents]
print('spaCy Entities:', spacy_entities)

spaCy Entities: [('Devzen', 'ORG'), ('Spring Boot Spring Security', 'PRODUCT'), ('Spring Batch', 'PRODUCT'), ('Elasticsearch', 'PRODUCT'), ('Angular TypeScript', 'PRODUCT'), ('Spring Boot Angular', 'PRODUCT'), ('AWS EC2 S3 RDS IAM', 'PRODUCT'), ('Spring Boot', 'PRODUCT'), ('Angular', 'PRODUCT'), ('25', 'CARDINAL'), ('50', 'CARDINAL'), ('portals', 'ORG'), ('20', 'CARDINAL'), ('six months of launch', 'DATE'), ('40', 'CARDINAL'), ('90', 'CARDINAL')]


In [31]:
# 2. Transformers-based NER (e.g., HuggingFace pipeline)
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

ner_pipe = pipeline(
    "ner",
    model=AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english"),
    tokenizer=AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english"),
    aggregation_strategy="simple",
    device=-1  # CPU
)
transformers_entities = [(ent['word'], ent['entity_group'], ent['score']) for ent in ner_pipe(text) if ent['score'] > 0.8]
print('Transformers NER Entities:', transformers_entities)

Device set to use cpu


Transformers NER Entities: [(' Java Full Stack Developer 20', 'MISC', np.float32(0.8558725)), (' Devzen Software Solutions', 'ORG', np.float32(0.99328136)), (' JWT-based', 'MISC', np.float32(0.95012635)), (' Spring Boot Spring Security', 'MISC', np.float32(0.98164487)), (' Spring Batch', 'MISC', np.float32(0.9609308)), (' Elasticsearch', 'MISC', np.float32(0.9405725)), (' Angular TypeScript', 'MISC', np.float32(0.990668)), (' Spring Boot Angular', 'MISC', np.float32(0.9826927)), (' Docker', 'MISC', np.float32(0.97396314)), (' GitHub Actions', 'MISC', np.float32(0.9847411)), (' AWS EC2 S3 RDS IAM', 'MISC', np.float32(0.9680392)), (' Spring Boot', 'MISC', np.float32(0.9820712)), (' Angular', 'MISC', np.float32(0.98616624)), (' Angular', 'MISC', np.float32(0.98883575))]


In [32]:
# 3. Regex-based Entity Extraction (for custom patterns)
import re

# Example: Extract email addresses and dates
emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
dates = re.findall(r'\b\d{4}-\d{2}-\d{2}\b', text)
print('Emails:', emails)
print('Dates:', dates)

Emails: []
Dates: []


In [56]:
# 4. Ensemble/Hybrid: Combine spaCy, Transformers, and Regex
def extract_entities_hybrid(text):
    entities = set()
    entity_types = []
    entity_details = []
    # spaCy
    for ent in nlp(text).ents:
        entities.add(ent.text)
        entity_types.append(ent.label_)
        entity_details.append({
            "text": ent.text,
            "type": ent.label_,
            "score": None
        })
    # Transformers
    for ent in ner_pipe(text):
        if ent['score'] > 0.8:
            entities.add(ent['word'])
            entity_types.append(ent['entity_group'])
            entity_details.append({
                "text": ent['word'],
                "type": ent['entity_group'],
                "score": ent['score']
            })
    # Regex (add more patterns as needed)
    for email in re.findall(r'[\w\.-]+@[\w\.-]+', text):
        entities.add(email)
        entity_types.append("EMAIL")
        entity_details.append({
            "text": email,
            "type": "EMAIL",
            "score": None
        })
    for date in re.findall(r'\b\d{4}-\d{2}-\d{2}\b', text):
        entities.add(date)
        entity_types.append("DATE")
        entity_details.append({
            "text": date,
            "type": "DATE",
            "score": None
        })
    return {
        "entities": sorted(entities),
        "entity_types": entity_types,
        "entity_details": entity_details
    }

# for chunk_path in chunk_files:
#     with open(chunk_path, 'r', encoding='utf-8') as f:
#         chunk_text = f.read()
#     print(f"{os.path.basename(chunk_path)}: {extract_entities_hybrid(chunk_text)}")

    # print('Hybrid Entities:', extract_entities_hybrid(text))

---

You can compare the outputs of these techniques on your sample text. For best results, use a hybrid approach and add domain-specific regex patterns as needed.

In [57]:
# Utility: Deduplicate entity_types in structured metadata output

def get_metadata_structured_dedup(
    text, 
    filename="", 
    document_name="", 
    summary="", 
    embedding=None
):
    keywords = unique_keywords
    intent, intent_confidence = predict_intent(text)
    ner_results = extract_entities_hybrid(text)
    entities = ner_results.get("entities", [])
    entity_types = list(dict.fromkeys(ner_results.get("entity_types", [])))  # Deduplicate, preserve order
    entity_details = ner_results.get("entity_details", [])
    if embedding is None:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model.distilbert(**inputs)
            embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().tolist()
    return {
        "keyword": keywords,
        "intent": intent,
        "intent_confidence": intent_confidence,
        "entities": entities,
        "entity_types": entity_types,
        "entity_details": entity_details,
        "summary": summary,
        "embedding": embedding,
        "text": text,
        "document_name": document_name,
        "filename": filename
    }

# Example usage:
metadata_structured_dedup = get_metadata_structured_dedup(
    text=chunk_text,
    filename=chunk_path,
    document_name="Jimson_Ratnam_JavaFullStackDeveloper_2+years",
    summary="",
)
import pprint
pprint.pprint(metadata_structured_dedup)

{'document_name': 'Jimson_Ratnam_JavaFullStackDeveloper_2+years',
 'embedding': [-0.3491048812866211,
               0.14838749170303345,
               -0.98483806848526,
               -1.0254418849945068,
               1.3563048839569092,
               0.4853346049785614,
               -0.3664737641811371,
               1.0934932231903076,
               -0.041868966072797775,
               0.3456886410713196,
               -0.9854051470756531,
               0.397349089384079,
               -0.6585312485694885,
               0.2738799750804901,
               0.8393633961677551,
               1.3065966367721558,
               0.025908133015036583,
               0.10038670897483826,
               -0.8247460722923279,
               0.09762020409107208,
               0.32058557868003845,
               -0.1351531445980072,
               -0.3738860785961151,
               0.3770873546600342,
               -0.10935547202825546,
               -0.12841953337192535,
     