# 🔍 Metadata Extraction Test Notebook
This notebook helps you test the `extract_metadata()` pipeline using a custom text chunk input.

You can change the `text` variable (loaded from a chunk file in a later cell) to test different examples.

In [4]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import spacy
from sentence_transformers import SentenceTransformer, util
import os
import json

# Load spaCy model: try the transformer pipeline first and fall back to the
# small pipeline when that load raises.
try:
    nlp = spacy.load("en_core_web_trf")
except Exception:
    # Any exception from the first load (not only a missing model) lands here.
    nlp = spacy.load("en_core_web_sm")

# Helper: normalize entity
def normalize_entity(e):
    """Return a canonical, lemmatised form of the entity string ``e``.

    The input is lower-cased, every period is removed, runs of whitespace are
    collapsed to one space and the ends are trimmed. The cleaned string is then
    passed through the module-level spaCy pipeline ``nlp`` and the lemmas of
    its tokens are joined with single spaces.
    """
    without_periods = re.sub(r'\.', '', e.lower())
    cleaned = re.sub(r'\s+', ' ', without_periods).strip()
    lemmas = []
    for tok in nlp(cleaned):
        lemmas.append(tok.lemma_)
    return ' '.join(lemmas)

# Intent keywords and examples (copy from your scripts)
# Maps each intent name to the keywords that get_intent() looks for among the
# lemmas of the input text; the per-intent confidence is the number of matched
# keywords divided by the length of that intent's list.
# NOTE(review): keywords are compared against single-token lemmas, so the
# two-word entry "soft skills" presumably never matches - confirm.
# NOTE(review): "contact" appears under both "general_info" and "resume_info".
intent_keywords = {
    "claim_process": ["claim", "process", "file", "submit", "insurance"],
    "case_status": ["status", "case", "update", "progress", "judgement", "order", "appeal"],
    "document_request": ["document", "request", "copies", "forms"],
    "technical_support": ["error", "issue", "problem", "technical"],
    "general_info": ["information", "contact", "hours", "location"],
    "resume_info": [
        "skills", "resume", "cv", "proficiencies", "abilities", "expertise", "competencies", "qualifications",
        "experience", "work", "education", "background", "certifications", "projects", "programming", "languages",
        "achievements", "awards", "contact", "career", "summary", "tools", "technologies", "roles", "responsibilities",
        "soft skills", "applicant", "candidate", "developer", "engineer", "profile", "professional", "employment", "history",
        "management", "software", "admin", "implementation", "tracking", "project", "system", "solution", "platform", "application",
        "team", "lead", "player", "restocking", "inventory", "tournament", "manual", "implemented"
    ]
}
# Project root comes from the PROJECT_ROOT environment variable, defaulting to
# the current working directory.
project_root = os.environ.get('PROJECT_ROOT', os.getcwd())
# Example sentences per intent, read from <project_root>/data/intent_categories/.
intent_examples_path = os.path.join(project_root,'data', 'intent_categories', 'intent_examples.json')
with open(intent_examples_path, 'r', encoding='utf-8') as f:
    intent_examples = json.load(f)
# intent_examples = {
#     "resume_info": [
#         "What skills are listed in the resume?",
#         "Show me the proficiencies in this CV.",
#         "List the abilities mentioned in the candidate's resume.",
#         "What expertise does the applicant have?",
#         "Which competencies are present in the resume?",
#         "What qualifications are included in the CV?",
#         "List the work experience of the candidate.",
#         "What is the educational background of the applicant?",
#         "Show me the certifications in this resume.",
#         "What are the technical skills mentioned?",
#         "Summarize the professional experience section.",
#         "What projects has the candidate worked on?",
#         "List the programming languages known by the applicant.",
#         "What are the achievements or awards?",
#         "Show me the contact information in the resume.",
#         "What is the career objective or summary?",
#         "List the tools and technologies used by the candidate.",
#         "What is the total experience in years?",
#         "Show me the roles and responsibilities held.",
#         "What are the soft skills mentioned?",
#         "List the languages spoken by the applicant.",
#         "What is the applicant's job title?",
#         "Describe the applicant's professional profile.",
#         "What companies has the candidate worked for?",
#         "List the frameworks and libraries used.",
#         "What cloud platforms does the candidate have experience with?",
#         "What development methodologies are mentioned?",
#         "List the certifications and licenses.",
#         "What leadership roles has the candidate held?",
#         "Summarize the applicant's employment history.",
#         "What is the candidate's GitHub or portfolio link?",
#         "Describe the candidate's experience in management and software projects.",
#         "What inventory or tracking systems has the applicant implemented?",
#         "List any admin or manual processes managed by the candidate.",
#         "What experience does the candidate have with tournaments or players?",
#         "Describe the candidate's role in restocking or inventory management.",
#         "What solutions or platforms has the applicant developed or led?",
#         "List any applications or systems the candidate has worked on.",
#         "What teams has the candidate led or been a part of?",
#         "Describe the candidate's experience with project implementation."
#     ],
#     "claim_process": ["How do I file a claim?", "What is the process for submitting an insurance claim?"],
#     "case_status": ["What is the current status of the case?", "Show me the progress of case number 511605."],
#     "document_request": ["Can I get a copy of the case order?", "How do I request the judgment document?"],
#     "technical_support": ["I have a technical issue.", "There is a problem with the system."],
#     "general_info": ["What is the purpose of this document?", "Give me a summary of the file."]
# }
# Sentence-embedding model used by the embedding-similarity fallback in get_intent().
intent_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def get_intent(text):
    """Classify ``text`` into one of the configured intents.

    Steps:
      1. Keyword pass: the lower-cased text is lemmatised with ``nlp`` and, for
         each intent in ``intent_keywords``, the keywords found among the
         lemmas are counted. The intent with the strictly highest count wins
         (the first one on ties) and its confidence is
         ``matched / number_of_keywords_for_that_intent``.
      2. Embedding pass, only when no keyword matched: the text is encoded with
         ``intent_model`` and compared by cosine similarity with every example
         in ``intent_examples``. The best intent is accepted when its top
         similarity exceeds 0.35, and that similarity becomes the confidence.
      3. Otherwise the result is ``"general_info"`` with confidence 0.0.

    Returns:
        tuple: ``(intent, confidence, None)``; the third element is always None.
    """
    lemma_pool = {tok.lemma_ for tok in nlp(text.lower())}

    detected_intent, top_hits = None, 0
    for name, vocab in intent_keywords.items():
        hits = len([word for word in vocab if word in lemma_pool])
        if hits > top_hits:
            detected_intent, top_hits = name, hits

    if detected_intent:
        intent_confidence = top_hits / max(1, len(intent_keywords.get(detected_intent, [])))
    else:
        intent_confidence = 0.0

    if top_hits == 0 or not detected_intent:
        text_vec = intent_model.encode(text, convert_to_tensor=True)
        closest_intent, closest_score = None, 0
        for name, samples in intent_examples.items():
            sample_vecs = intent_model.encode(samples, convert_to_tensor=True)
            similarity = util.pytorch_cos_sim(text_vec, sample_vecs).max().item()
            if similarity > closest_score:
                closest_intent, closest_score = name, similarity
        if closest_score > 0.35:
            detected_intent, intent_confidence = closest_intent, closest_score

    if not detected_intent:
        return "general_info", 0.0, None
    return detected_intent, intent_confidence, None

def extract_metadata(text):
    """Extract entities, keywords and intent from a text chunk.

    Returns a dict with:
      * ``entities``: every spaCy entity span in ``text``, normalised with
        ``normalize_entity``.
      * ``keywords``: normalised text of each token tagged NOUN or PROPN that
        is not a spaCy stop word, whose lower-cased lemma is not in
        ``ENGLISH_STOP_WORDS`` and whose text is longer than two characters.
        Duplicates are kept, in document order.
      * ``intent`` / ``intent_confidence``: the first two values returned by
        ``get_intent(text)``.
    """
    parsed = nlp(text)

    # Entities
    entities = []
    for span in parsed.ents:
        entities.append(normalize_entity(span.text))

    # Keywords (nouns, proper nouns, not stopwords)
    def _is_keyword(tok):
        if tok.pos_ not in ["NOUN", "PROPN"]:
            return False
        if tok.is_stop:
            return False
        if tok.lemma_.lower() in ENGLISH_STOP_WORDS:
            return False
        return len(tok.text) > 2

    keywords = [normalize_entity(tok.text) for tok in parsed if _is_keyword(tok)]

    # Intent
    intent_name, intent_score, _unused = get_intent(text)

    metadata = {}
    metadata["entities"] = entities
    metadata["keywords"] = keywords
    metadata["intent"] = intent_name
    metadata["intent_confidence"] = intent_score
    return metadata


In [45]:
import sys
import sentence_transformers
import transformers
import spacy

# Print environment and config for debugging.
# Each probe is evaluated lazily inside the try block, so a missing name or
# attribute prints the fallback line instead of aborting the cell.
_probes = (
    ('spaCy version:', lambda: spacy.__version__, 'spaCy not available'),
    ('SentenceTransformers version:', lambda: sentence_transformers.__version__, 'SentenceTransformers not available'),
    ('Transformers version:', lambda: transformers.__version__, 'Transformers not available'),
    ('spaCy model:', lambda: nlp.meta['name'] if nlp else 'None', 'spaCy model: None'),
    ('Intent keywords:', lambda: intent_keywords, 'Intent keywords: not loaded'),
    ('Intent examples:', lambda: list(intent_examples.keys()), 'Intent examples: not loaded'),
)
for _label, _probe, _fallback in _probes:
    try:
        print(_label, _probe())
    except Exception:
        print(_fallback)

# The sample text is loaded by a separate cell; show its first 500 characters if present.
if 'text' not in globals():
    print('No text loaded yet.')
else:
    print('Text sample:', repr(text[:500]))


spaCy version: 3.8.7
SentenceTransformers version: 4.1.0
Transformers version: 4.52.4
spaCy model: core_web_trf
Intent keywords: {'claim_process': ['claim', 'process', 'file', 'submit', 'insurance'], 'case_status': ['status', 'case', 'update', 'progress', 'judgement', 'order', 'appeal'], 'document_request': ['document', 'request', 'copies', 'forms'], 'technical_support': ['error', 'issue', 'problem', 'technical'], 'general_info': ['information', 'contact', 'hours', 'location'], 'resume_info': ['skills', 'resume', 'cv', 'proficiencies', 'abilities', 'expertise', 'competencies', 'qualifications', 'experience', 'work', 'education', 'background', 'certifications', 'projects', 'programming', 'languages', 'achievements', 'awards', 'contact', 'career', 'summary', 'tools', 'technologies', 'roles', 'responsibilities', 'soft skills', 'applicant', 'candidate', 'developer', 'engineer', 'profile', 'professional', 'employment', 'history', 'management', 'software', 'admin', 'implementation', 'trackin

In [1]:
%pip install python-dotenv

import sys
import os, json, yaml

# Project root: honour the PROJECT_ROOT environment variable (the same variable
# the intent-examples loader in this notebook reads) and fall back to the
# original local checkout path when it is not set.
project_root = os.environ.get('PROJECT_ROOT', 'c:\\New folder (5)\\new-search-models')
# Keep project_root first on sys.path without stacking a duplicate entry each
# time the cell is re-run.
sys.path[:] = [project_root] + [entry for entry in sys.path if entry != project_root]

# Load config.yaml using the correct path
config_path = os.path.join(project_root, 'config.yaml')
if os.path.exists(config_path):
    # Explicit encoding so the file is read the same way on every platform.
    with open(config_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; normalise to a dict so
        # later lookups on `config` do not fail.
        config = yaml.safe_load(f) or {}
else:
    print(f"⚠️ config.yaml not found at {config_path}")
    config = {}

from dotenv import load_dotenv
load_dotenv(os.path.join(project_root, 'config', '.env'))
from scripts.entity_utils import normalize_entity, get_spacy_nlp
from scripts.search_pipeline import get_openai_embedding
from scripts.intent_utils import get_intent
# from scripts.metadata import extract_metadata  # <-- You must have your full extraction code in this file


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm
  model.load_state_dict(torch.load(filelike, map_location=device))


In [7]:
# Read one chunk file from disk; `text` is the sample input used by the cells below.
sample_chunk_path = r'C:\New folder (5)\new-search-models\data\chunks\Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk1.txt'
with open(sample_chunk_path, mode='r', encoding='utf-8') as file:
    text = file.read()
print('✅ Sample Text Loaded')

✅ Sample Text Loaded


In [42]:
# Run the extraction pipeline on the loaded sample text.
metadata = extract_metadata(text)

# Bare expression: the notebook renders the resulting dict as the cell output.
metadata

{'entities': ['devzen',
  'spring boot spring security',
  'spring batch',
  'elasticsearch',
  'angular typescript',
  'spring boot angular',
  'aw ec2 s3 rd iam',
  'spring boot',
  'angular',
  '25',
  '50',
  'portal',
  '20',
  'six month of launch',
  '40',
  '90'],
 'keywords': ['java',
  'stack',
  'developer',
  'devzen',
  'software',
  'solution',
  'jwt',
  'authentication',
  'verification',
  'apis',
  'spring',
  'boot',
  'spring',
  'security',
  'batch',
  'processing',
  'workflow',
  'spring',
  'batch',
  'scale',
  'datum',
  'management',
  'rest',
  'apis',
  'time',
  'inventory',
  'tracking',
  'alert',
  'stock',
  'management',
  'elasticsearch',
  'search',
  'retrieval',
  'volume',
  'application',
  'component',
  'angular',
  'typescript',
  'product',
  'management',
  'payment',
  'gateway',
  'integration',
  'spring',
  'boot',
  'angular',
  'transaction',
  'dashboard',
  'key',
  'insight',
  'tournament',
  'inventory',
  'management',
  'appli

In [43]:
from pprint import pprint
# Pretty-print the metadata dict produced by extract_metadata(text).
print('✅ Extracted Metadata:')
pprint(metadata)

✅ Extracted Metadata:
{'entities': ['devzen',
              'spring boot spring security',
              'spring batch',
              'elasticsearch',
              'angular typescript',
              'spring boot angular',
              'aw ec2 s3 rd iam',
              'spring boot',
              'angular',
              '25',
              '50',
              'portal',
              '20',
              'six month of launch',
              '40',
              '90'],
 'intent': 'resume_info',
 'intent_confidence': 0.28846153846153844,
 'keywords': ['java',
              'stack',
              'developer',
              'devzen',
              'software',
              'solution',
              'jwt',
              'authentication',
              'verification',
              'apis',
              'spring',
              'boot',
              'spring',
              'security',
              'batch',
              'processing',
              'workflow',
              'spring',
      

In [14]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Test alternative intent detection techniques

# 1. Simple keyword matching (baseline)
def detect_intent_keywords(text, intent_keywords):
    """Keyword-count baseline for intent detection.

    The lower-cased ``text`` is lemmatised with ``nlp``; each intent is scored
    by how many of its keywords occur among the lemmas. The highest-scoring
    intent is returned (the first one on ties, including when every score is
    zero) together with ``score / number_of_keywords_for_that_intent``.

    Returns:
        tuple: ``(best_intent, confidence)``.
    """
    lemma_pool = {tok.lemma_ for tok in nlp(text.lower())}
    scores = {
        intent: sum(1 for kw in keywords if kw in lemma_pool)
        for intent, keywords in intent_keywords.items()
    }
    best_intent = max(scores, key=lambda name: scores[name])
    hits = scores[best_intent]
    return best_intent, hits / max(1, len(intent_keywords[best_intent]))

# 2. Embedding similarity (SentenceTransformer, already used in get_intent)
def detect_intent_embedding(text, intent_examples, intent_model):
    """Pick the intent whose examples are most similar to ``text``.

    ``text`` and every intent's example list are encoded with ``intent_model``.
    An intent's score is the maximum cosine similarity between the text and
    any of its examples. The intent with the strictly highest score above 0 is
    returned; if no score exceeds 0 the result is ``(None, 0)``.

    Returns:
        tuple: ``(best_intent, best_score)``.
    """
    text_vec = intent_model.encode(text, convert_to_tensor=True)
    winner, winner_score = None, 0
    for name, samples in intent_examples.items():
        sample_vecs = intent_model.encode(samples, convert_to_tensor=True)
        similarity = util.pytorch_cos_sim(text_vec, sample_vecs).max().item()
        if similarity > winner_score:
            winner, winner_score = name, similarity
    return winner, winner_score

# 3. Regex-based intent detection (very basic)
def detect_intent_regex(text):
    """Return the first intent whose whole-word pattern occurs in ``text``.

    Rules are tried in the order listed below and matching is case-insensitive.
    The first rule that matches wins with confidence 1.0, regardless of how
    many other rules would also match. When nothing matches the result is
    ``("general_info", 0.0)``.
    """
    ordered_rules = (
        ("resume_info", r"\b(resume|cv|skills|experience|project|education|certification)\b"),
        ("claim_process", r"\b(claim|insurance|submit|file)\b"),
        ("case_status", r"\b(status|case|update|progress|judgement|order|appeal)\b"),
        ("document_request", r"\b(document|request|copy|copies|form)\b"),
        ("technical_support", r"\b(error|issue|problem|technical)\b"),
        ("general_info", r"\b(information|contact|hours|location|summary|purpose)\b"),
    )
    first_hit = next(
        (intent for intent, pattern in ordered_rules if re.search(pattern, text, re.IGNORECASE)),
        None,
    )
    if first_hit is None:
        return "general_info", 0.0
    return first_hit, 1.0

# Run all techniques on the loaded text
# Keyword baseline: confidence is matched keywords / keywords for the winning intent.
print("== Simple Keyword Matching ==")
intent_kw, conf_kw = detect_intent_keywords(text, intent_keywords)
print(f"Intent: {intent_kw}, Confidence: {conf_kw:.2f}")

# Embedding similarity: the reported number is a cosine similarity, not a probability.
print("\n== Embedding Similarity ==")
intent_emb, conf_emb = detect_intent_embedding(text, intent_examples, intent_model)
print(f"Intent: {intent_emb}, Similarity: {conf_emb:.2f}")

# Regex detection: confidence is 1.0 on any pattern match, otherwise 0.0.
print("\n== Regex-based Detection ==")
intent_rgx, conf_rgx = detect_intent_regex(text)
print(f"Intent: {intent_rgx}, Confidence: {conf_rgx:.2f}")
# 4. Majority voting among techniques

def majority_vote(*intents):
    """Return the most frequent intent and the share of votes it received.

    Args:
        *intents: one intent label per voting technique.

    Returns:
        tuple: ``(winner, votes_for_winner / number_of_votes)``.
    """
    total_votes = len(intents)
    tally = Counter()
    tally.update(intents)
    leader = tally.most_common(1)[0]
    return leader[0], leader[1] / total_votes

# Combine the keyword, embedding and regex predictions; the reported
# confidence is the fraction of techniques that agree with the winner.
intents = [intent_kw, intent_emb, intent_rgx]
majority_intent, majority_conf = majority_vote(*intents)
print("\n== Majority Voting ==")
print(f"Intent: {majority_intent}, Confidence: {majority_conf:.2f}")

# 5. Bag-of-words cosine similarity (very basic)

def detect_intent_bow(text, intent_examples):
    """Bag-of-words variant of the example-similarity intent detector.

    A ``CountVectorizer`` is fitted on ``text`` plus every example sentence.
    Each intent is scored by the maximum cosine similarity between the count
    vector of ``text`` and the count vectors of that intent's examples. The
    intent with the strictly highest score above 0 is returned; if no score
    exceeds 0 the result is ``(None, 0)``.

    Returns:
        tuple: ``(best_intent, best_score)``.
    """
    corpus = [text]
    for sentences in intent_examples.values():
        corpus.extend(sentences)

    vectorizer = CountVectorizer()
    vectorizer.fit(corpus)
    query_counts = vectorizer.transform([text])

    winner, winner_score = None, 0
    for name, sentences in intent_examples.items():
        example_counts = vectorizer.transform(sentences)
        similarity = cosine_similarity(query_counts, example_counts).max()
        if similarity > winner_score:
            winner, winner_score = name, similarity
    return winner, winner_score

# Run the bag-of-words detector on the loaded text and report its best match.
intent_bow, conf_bow = detect_intent_bow(text, intent_examples)
print("\n== Bag-of-Words Cosine Similarity ==")
print(f"Intent: {intent_bow}, Similarity: {conf_bow:.2f}")

== Simple Keyword Matching ==
Intent: resume_info, Confidence: 0.29

== Embedding Similarity ==
Intent: resume_info, Confidence: 0.29

== Embedding Similarity ==
Intent: resume_info, Similarity: 0.41

== Regex-based Detection ==
Intent: resume_info, Confidence: 1.00

== Majority Voting ==
Intent: resume_info, Confidence: 1.00

== Bag-of-Words Cosine Similarity ==
Intent: resume_info, Similarity: 0.32
Intent: resume_info, Similarity: 0.41

== Regex-based Detection ==
Intent: resume_info, Confidence: 1.00

== Majority Voting ==
Intent: resume_info, Confidence: 1.00

== Bag-of-Words Cosine Similarity ==
Intent: resume_info, Similarity: 0.32


## Alternative metadata extraction: regex and the Transformers NER pipeline, with KeyBERT for keyword extraction

In [10]:
# --- Alternative Metadata Extraction: Regex + Transformers NER ---
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from keybert import KeyBERT
import re

# Load NER pipeline (Roberta large NER)
# The checkpoint name is spelled once and shared by the model and tokenizer loads.
NER_MODEL_NAME = "Jean-Baptiste/roberta-large-ner-english"
ner_model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)
ner_tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
ner_pipe = pipeline(
    "ner",
    model=ner_model,
    tokenizer=ner_tokenizer,
    aggregation_strategy="simple",
    device=-1  # CPU
)

# KeyBERT for keyword extraction
keyword_model = KeyBERT()

def extract_metadata_alt(text, min_entity_score=0.8, top_n_keywords=10):
    """Extract metadata using the Transformers NER pipeline, a regex and KeyBERT.

    Args:
        text: the chunk to analyse.
        min_entity_score: NER predictions are kept only when their ``score`` is
            strictly greater than this value. Defaults to 0.8, the value that
            was previously hard-coded.
        top_n_keywords: number of keywords requested from KeyBERT. Defaults to
            10, the value that was previously hard-coded.

    Returns:
        dict with
          * ``entities``: sorted, de-duplicated, lower-cased entity strings from
            ``ner_pipe`` plus every regex match of two or more consecutive
            capitalised words;
          * ``keywords``: the keyword strings returned by ``keyword_model``;
          * ``intent`` / ``intent_confidence``: the first two values returned by
            ``get_intent(text)``.
    """
    # Entities using transformers NER. Skip predictions whose word is empty
    # after stripping so the result never contains a blank entity.
    entities = {
        ent['word'].strip().lower()
        for ent in ner_pipe(text)
        if ent['score'] > min_entity_score and ent['word'].strip()
    }
    # Regex for names (simple): runs of two or more Capitalised words.
    entities.update(
        name.lower()
        for name in re.findall(r'([A-Z][a-z]+(?: [A-Z][a-z]+)+)', text)
    )
    # Keywords using KeyBERT (the score in each pair is discarded).
    keywords = [kw for kw, _ in keyword_model.extract_keywords(text, top_n=top_n_keywords)]
    # Intent using previous get_intent
    detected_intent, intent_confidence, _ = get_intent(text)
    return {
        "entities": sorted(entities),
        "keywords": keywords,
        "intent": detected_intent,
        "intent_confidence": intent_confidence
    }

# alt_metadata = extract_metadata_alt(text)
# print('✅ Alternative Metadata Extraction:')
# from pprint import pprint
# pprint(alt_metadata)

Device set to use cpu


## 🚀 Production-Grade Intent Detection Solutions

For real-world, scalable, and robust intent detection, consider these best practices:

- **Hybrid Pipeline:** Combine fast rule-based/keyword/regex checks for high-precision intents with embedding-based similarity for flexible, robust matching.
- **ML/Deep Learning Models:** Use fine-tuned transformer models (e.g., BERT, RoBERTa, DistilBERT) for intent classification if you have enough labeled data.
- **Fallbacks:** Always provide a fallback (e.g., "general_info") for ambiguous or low-confidence cases.
- **Confidence Thresholds:** Use thresholds to decide when to trust a prediction or escalate to a human/manual review.
- **Monitoring & Logging:** Log predictions, confidence, and input for continuous improvement and error analysis.
- **Versioning:** Version your models, configs, and intent definitions for reproducibility and safe updates.
- **Batch & Real-Time Support:** Design your pipeline to work both in batch (offline) and real-time (API) modes.

Below is a modular, production-ready intent detection pipeline you can adapt and extend.

In [None]:
import logging
from typing import Dict, Any

# Configure logging for production
# NOTE(review): logging.basicConfig only configures the root logger when it has
# no handlers yet, so re-running this cell in a live kernel presumably keeps the
# earlier configuration - confirm if the format ever needs to change.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

class ProductionIntentDetector:
    """Hybrid intent detector: keyword matching first, embedding similarity as fallback.

    Args:
        intent_keywords: mapping of intent name to its list of keywords.
        intent_examples: mapping of intent name to its example sentences.
        intent_model: object exposing ``encode(texts, convert_to_tensor=True)``.
        threshold: minimum cosine similarity (exclusive) for the embedding
            fallback to be accepted.

    The detector relies on the module-level spaCy pipeline ``nlp`` and on
    ``util.pytorch_cos_sim`` from sentence-transformers.
    """

    def __init__(self, intent_keywords, intent_examples, intent_model, threshold=0.35):
        self.intent_keywords = intent_keywords
        self.intent_examples = intent_examples
        self.intent_model = intent_model
        self.threshold = threshold
        # intent -> (snapshot of its examples, their embeddings). Filled lazily
        # so repeated detect() calls do not re-encode the same examples.
        self._example_cache = {}

    def _example_embeddings(self, intent, examples):
        """Return the embeddings of ``examples``, encoding them at most once.

        The cache entry is refreshed when the examples for ``intent`` differ
        from the snapshot taken when they were last encoded.
        """
        snapshot = tuple(examples) if isinstance(examples, (list, tuple)) else examples
        cached = self._example_cache.get(intent)
        if cached is None or cached[0] != snapshot:
            cached = (snapshot, self.intent_model.encode(examples, convert_to_tensor=True))
            self._example_cache[intent] = cached
        return cached[1]

    def detect(self, text: str) -> Dict[str, Any]:
        """Detect the intent of ``text``.

        Returns:
            dict with ``intent`` and ``intent_confidence``. Confidence is the
            matched-keyword ratio for keyword hits, the best cosine similarity
            for embedding hits, and 0.0 for the ``"general_info"`` default.
        """
        # 1. Fast rule-based keyword/lemmatized match (set gives O(1) lookups)
        lemmas = {token.lemma_ for token in nlp(text.lower())}
        detected_intent = None
        max_matches = 0
        for intent, keywords in self.intent_keywords.items():
            matches = sum(kw in lemmas for kw in keywords)
            # Strict comparison: the first intent wins on ties.
            if matches > max_matches:
                max_matches = matches
                detected_intent = intent
        intent_confidence = max_matches / max(1, len(self.intent_keywords.get(detected_intent, []))) if detected_intent else 0.0
        # 2. Embedding similarity fallback
        if not detected_intent or max_matches == 0:
            query_emb = self.intent_model.encode(text, convert_to_tensor=True)
            best_intent, best_score = None, 0
            for intent, examples in self.intent_examples.items():
                if not examples:
                    # Nothing to compare against; taking max() of an empty
                    # similarity matrix would raise.
                    continue
                example_embs = self._example_embeddings(intent, examples)
                max_score = util.pytorch_cos_sim(query_emb, example_embs).max().item()
                if max_score > best_score:
                    best_score = max_score
                    best_intent = intent
            if best_score > self.threshold:
                detected_intent = best_intent
                intent_confidence = best_score
        # 3. Fallback to general_info
        if not detected_intent:
            detected_intent = "general_info"
            intent_confidence = 0.0
        # 4. Logging for monitoring (lazy %-formatting: the message is only
        #    built when INFO records are actually emitted)
        logging.info("Intent: %s, Confidence: %.2f, Text: %s...", detected_intent, intent_confidence, text[:80])
        return {
            "intent": detected_intent,
            "intent_confidence": intent_confidence
        }

# Usage example
# `glob` is used below but is not imported in any visible cell of this
# notebook; import it here so the cell runs on a fresh kernel.
import glob

prod_detector = ProductionIntentDetector(intent_keywords, intent_examples, intent_model, threshold=0.35)
chunk_dir = os.path.join(project_root, '..', 'data', 'chunks')
# glob returns files in arbitrary order; sort so the report is reproducible.
chunk_files = sorted(glob.glob(os.path.join(chunk_dir, '*.txt')))
if not chunk_files:
    # Make an empty or misplaced directory visible instead of printing an empty report.
    print(f"⚠️ No chunk files found in {chunk_dir}")

# Map each chunk file name to the detector result for its contents.
chunk_intents = {}
for chunk_path in chunk_files:
    with open(chunk_path, 'r', encoding='utf-8') as f:
        chunk_text = f.read()
    result = prod_detector.detect(chunk_text)
    chunk_intents[os.path.basename(chunk_path)] = result

print('✅ Intent detection for all chunks:')
for fname, res in chunk_intents.items():
    print(f"{fname}: {res}")
# prod_result = prod_detector.detect(text)
# print('✅ Production-Grade Intent Detection:')
# print(prod_result)

2025-06-21 08:58:14,084 INFO Intent: resume_info, Confidence: 0.29, Text: Java Full Stack Developer 2023 - Present Devzen Software Solutions Developed sec...
2025-06-21 08:58:14,277 INFO Intent: resume_info, Confidence: 0.21, Text: reducing base by 20 within six months of launch manual administration time by 40...
2025-06-21 08:58:14,277 INFO Intent: resume_info, Confidence: 0.21, Text: reducing base by 20 within six months of launch manual administration time by 40...
2025-06-21 08:58:14,719 INFO Intent: resume_info, Confidence: 0.25, Text: JIMSON RATNAM KAPAVARAPU Java Fullstack Developer 91 9154631932 jimsonjimmy2008g...
2025-06-21 08:58:14,719 INFO Intent: resume_info, Confidence: 0.25, Text: JIMSON RATNAM KAPAVARAPU Java Fullstack Developer 91 9154631932 jimsonjimmy2008g...
2025-06-21 08:58:15,055 INFO Intent: resume_info, Confidence: 0.21, Text: real-time updates for matches fixtures Implemented secure role-based access cont...
2025-06-21 08:58:15,055 INFO Intent: resume_info, Co

✅ Intent detection for all chunks:
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk1.txt: {'intent': 'resume_info', 'intent_confidence': 0.28846153846153844}
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk2.txt: {'intent': 'resume_info', 'intent_confidence': 0.21153846153846154}
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk3.txt: {'intent': 'resume_info', 'intent_confidence': 0.25}
Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk4.txt: {'intent': 'resume_info', 'intent_confidence': 0.21153846153846154}
MHC_CaseStatus_511605_chunk1.txt: {'intent': 'case_status', 'intent_confidence': 0.42857142857142855}
MHC_CaseStatus_511605_chunk10.txt: {'intent': 'claim_process', 'intent_confidence': 0.2}
MHC_CaseStatus_511605_chunk11.txt: {'intent': 'claim_process', 'intent_confidence': 0.4}
MHC_CaseStatus_511605_chunk12.txt: {'intent': 'case_status', 'intent_confidence': 0.2857142857142857}
MHC_CaseStatus_511605_chunk13.txt: {'intent': 'case_status', 'intent_confidence': 0.28571428571428

In [12]:
from pprint import pprint

# Test and compare all metadata extraction techniques side by side

# Pipeline 1: spaCy entities and keywords plus the get_intent() classifier.
print("== spaCy + SentenceTransformer Pipeline ==")
spacy_metadata = extract_metadata(text)
pprint(spacy_metadata)

# Pipeline 2: Transformers NER, the capitalised-name regex and KeyBERT keywords.
print("\n== Transformers NER + KeyBERT Pipeline ==")
alt_metadata = extract_metadata_alt(text)
pprint(alt_metadata)

# Overlaps are exact string matches between the two pipelines' outputs.
print("\n== Comparison ==")
for _field, _heading in (("entities", "Entities overlap:"), ("keywords", "Keywords overlap:")):
    print(_heading, set(spacy_metadata[_field]).intersection(set(alt_metadata[_field])))
for _heading, _meta in (
    ("Intent (spaCy pipeline):", spacy_metadata),
    ("Intent (Transformers+KeyBERT):", alt_metadata),
):
    print(_heading, _meta['intent'], "| Confidence:", _meta['intent_confidence'])


== spaCy + SentenceTransformer Pipeline ==
{'entities': ['devzen',
              'spring boot spring security',
              'spring batch',
              'elasticsearch',
              'angular typescript',
              'spring boot angular',
              'aw ec2 s3 rd iam',
              'spring boot',
              'angular',
              '25',
              '50',
              'portal',
              '20',
              'six month of launch',
              '40',
              '90'],
 'intent': 'resume_info',
 'intent_confidence': 0.28846153846153844,
 'keywords': ['java',
              'stack',
              'developer',
              'devzen',
              'software',
              'solution',
              'jwt',
              'authentication',
              'verification',
              'apis',
              'spring',
              'boot',
              'spring',
              'security',
              'batch',
              'processing',
              'workflow',
         

In [16]:
from transformers import pipeline
from collections import Counter

# Advanced Intent Detection Techniques

# 1. Zero-shot classification with HuggingFace Transformers (e.g., facebook/bart-large-mnli)

# Built once at cell level and reused by detect_intent_zero_shot().
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def detect_intent_zero_shot(text, candidate_labels):
    """Classify ``text`` against ``candidate_labels`` with the zero-shot pipeline.

    Calls the module-level ``zero_shot_classifier`` and returns the first
    entry of its ``labels`` list together with the first entry of its
    ``scores`` list.

    Returns:
        tuple: ``(best_intent, confidence)``.
    """
    prediction = zero_shot_classifier(text, candidate_labels)
    top_label, top_score = prediction['labels'][0], prediction['scores'][0]
    return top_label, top_score

# Use the intent names from intent_examples as the zero-shot candidate labels.
candidate_labels = list(intent_examples.keys())
intent_zs, conf_zs = detect_intent_zero_shot(text, candidate_labels)
print("== Zero-Shot Classification ==")
print(f"Intent: {intent_zs}, Confidence: {conf_zs:.2f}")

# 2. Fine-tuned intent classification model (if you have labeled data)
# (Placeholder: You would train a classifier using your labeled dataset, e.g., using sklearn, Keras, or HuggingFace Trainer.)

# 3. Ensemble: Combine multiple techniques (majority vote, weighted average, etc.)
# Ensure detect_intent_keywords is defined (use from cell 7 if available)
if 'detect_intent_keywords' not in globals():
    def detect_intent_keywords(text, intent_keywords):
        """Keyword-count intent detector, defined only if no earlier cell provided one.

        The lower-cased ``text`` is lemmatised with ``nlp``; each intent is
        scored by how many of its keywords occur among the lemmas. Returns the
        highest-scoring intent (the first one on ties) and
        ``score / number_of_keywords_for_that_intent``.
        """
        lemma_pool = {tok.lemma_ for tok in nlp(text.lower())}
        scores = {
            intent: sum(1 for kw in keywords if kw in lemma_pool)
            for intent, keywords in intent_keywords.items()
        }
        best_intent = max(scores, key=lambda name: scores[name])
        hits = scores[best_intent]
        return best_intent, hits / max(1, len(intent_keywords[best_intent]))

def ensemble_intent_detection(text):
    """Run four intent detectors on ``text`` and combine them by majority vote.

    The detectors run in this order: keyword, embedding, regex, zero-shot.
    They read the module-level ``intent_keywords``, ``intent_examples``,
    ``intent_model`` and ``candidate_labels``.

    Returns:
        tuple: ``(winning_intent, share_of_detectors_that_voted_for_it,
        results)`` where ``results`` is the list of ``(intent, confidence)``
        pairs in detector order.
    """
    detectors = (
        lambda: detect_intent_keywords(text, intent_keywords),              # Keyword
        lambda: detect_intent_embedding(text, intent_examples, intent_model),  # Embedding
        lambda: detect_intent_regex(text),                                  # Regex
        lambda: detect_intent_zero_shot(text, candidate_labels),            # Zero-shot
    )
    results = []
    for run_detector in detectors:
        label, score = run_detector()
        results.append((label, score))

    # Majority vote
    votes = Counter(label for label, _ in results)
    winner, winner_votes = votes.most_common(1)[0]
    return winner, winner_votes / len(results), results

# Run the ensemble on the loaded text; the confidence printed is the share of
# detectors that agreed with the winning intent.
ensemble_intent, ensemble_conf, all_results = ensemble_intent_detection(text)
print("\n== Ensemble Intent Detection ==")
print(f"Intent: {ensemble_intent}, Confidence: {ensemble_conf:.2f}")
print("All results:", all_results)

Device set to use cuda:0


== Zero-Shot Classification ==
Intent: case_status, Confidence: 0.18

== Ensemble Intent Detection ==
Intent: resume_info, Confidence: 0.75
All results: [('resume_info', 0.28846153846153844), ('resume_info', 0.4149797260761261), ('resume_info', 1.0), ('case_status', 0.17770253121852875)]

== Ensemble Intent Detection ==
Intent: resume_info, Confidence: 0.75
All results: [('resume_info', 0.28846153846153844), ('resume_info', 0.4149797260761261), ('resume_info', 1.0), ('case_status', 0.17770253121852875)]
