# 🔍 Metadata Extraction Test Notebook
This notebook helps you test the `extract_metadata()` pipeline using a custom text chunk input.

You can modify the `text` variable in the next cell to test different examples.

In [40]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import spacy
from sentence_transformers import SentenceTransformer, util
import os
import json

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_trf")
except Exception:
    nlp = spacy.load("en_core_web_sm")

# Helper: normalize entity
def normalize_entity(e):
    text = re.sub(r'\s+', ' ', re.sub(r'\.', '', e.lower())).strip()
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

# Intent keywords and examples (copy from your scripts)
intent_keywords = {
    "claim_process": ["claim", "process", "file", "submit", "insurance"],
    "case_status": ["status", "case", "update", "progress", "judgement", "order", "appeal"],
    "document_request": ["document", "request", "copies", "forms"],
    "technical_support": ["error", "issue", "problem", "technical"],
    "general_info": ["information", "contact", "hours", "location"],
    "resume_info": [
        "skills", "resume", "cv", "proficiencies", "abilities", "expertise", "competencies", "qualifications",
        "experience", "work", "education", "background", "certifications", "projects", "programming", "languages",
        "achievements", "awards", "contact", "career", "summary", "tools", "technologies", "roles", "responsibilities",
        "soft skills", "applicant", "candidate", "developer", "engineer", "profile", "professional", "employment", "history",
        "management", "software", "admin", "implementation", "tracking", "project", "system", "solution", "platform", "application",
        "team", "lead", "player", "restocking", "inventory", "tournament", "manual", "implemented"
    ]
}
project_root = os.environ.get('PROJECT_ROOT', os.getcwd())
intent_examples_path = os.path.join(project_root, 'data', 'intent_categories', 'intent_examples.json')
with open(intent_examples_path, 'r', encoding='utf-8') as f:
    intent_examples = json.load(f)
# intent_examples = {
#     "resume_info": [
#         "What skills are listed in the resume?",
#         "Show me the proficiencies in this CV.",
#         "List the abilities mentioned in the candidate's resume.",
#         "What expertise does the applicant have?",
#         "Which competencies are present in the resume?",
#         "What qualifications are included in the CV?",
#         "List the work experience of the candidate.",
#         "What is the educational background of the applicant?",
#         "Show me the certifications in this resume.",
#         "What are the technical skills mentioned?",
#         "Summarize the professional experience section.",
#         "What projects has the candidate worked on?",
#         "List the programming languages known by the applicant.",
#         "What are the achievements or awards?",
#         "Show me the contact information in the resume.",
#         "What is the career objective or summary?",
#         "List the tools and technologies used by the candidate.",
#         "What is the total experience in years?",
#         "Show me the roles and responsibilities held.",
#         "What are the soft skills mentioned?",
#         "List the languages spoken by the applicant.",
#         "What is the applicant's job title?",
#         "Describe the applicant's professional profile.",
#         "What companies has the candidate worked for?",
#         "List the frameworks and libraries used.",
#         "What cloud platforms does the candidate have experience with?",
#         "What development methodologies are mentioned?",
#         "List the certifications and licenses.",
#         "What leadership roles has the candidate held?",
#         "Summarize the applicant's employment history.",
#         "What is the candidate's GitHub or portfolio link?",
#         "Describe the candidate's experience in management and software projects.",
#         "What inventory or tracking systems has the applicant implemented?",
#         "List any admin or manual processes managed by the candidate.",
#         "What experience does the candidate have with tournaments or players?",
#         "Describe the candidate's role in restocking or inventory management.",
#         "What solutions or platforms has the applicant developed or led?",
#         "List any applications or systems the candidate has worked on.",
#         "What teams has the candidate led or been a part of?",
#         "Describe the candidate's experience with project implementation."
#     ],
#     "claim_process": ["How do I file a claim?", "What is the process for submitting an insurance claim?"],
#     "case_status": ["What is the current status of the case?", "Show me the progress of case number 511605."],
#     "document_request": ["Can I get a copy of the case order?", "How do I request the judgment document?"],
#     "technical_support": ["I have a technical issue.", "There is a problem with the system."],
#     "general_info": ["What is the purpose of this document?", "Give me a summary of the file."]
# }
intent_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def get_intent(text):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc]
    detected_intent = None
    max_matches = 0
    for intent, keywords in intent_keywords.items():
        matches = sum(kw in tokens for kw in keywords)
        if matches > max_matches:
            max_matches = matches
            detected_intent = intent
    intent_confidence = max_matches / max(1, len(intent_keywords.get(detected_intent, []))) if detected_intent else 0.0
    if not detected_intent or max_matches == 0:
        query_emb = intent_model.encode(text, convert_to_tensor=True)
        best_intent, best_score = None, 0
        for intent, examples in intent_examples.items():
            example_embs = intent_model.encode(examples, convert_to_tensor=True)
            scores = util.pytorch_cos_sim(query_emb, example_embs)
            max_score = scores.max().item()
            if max_score > best_score:
                best_score = max_score
                best_intent = intent
        if best_score > 0.35:
            detected_intent = best_intent
            intent_confidence = best_score
    if not detected_intent:
        detected_intent = "general_info"
        intent_confidence = 0.0
    return detected_intent, intent_confidence, None

def extract_metadata(text):
    doc = nlp(text)
    # Entities
    entities = [normalize_entity(ent.text) for ent in doc.ents]
    # Keywords (nouns, proper nouns, not stopwords)
    keywords = [normalize_entity(token.text) for token in doc if token.pos_ in ["NOUN", "PROPN"] and not token.is_stop and token.lemma_.lower() not in ENGLISH_STOP_WORDS and len(token.text) > 2]
    # Intent
    detected_intent, intent_confidence, _ = get_intent(text)
    return {
        "entities": entities,
        "keywords": keywords,
        "intent": detected_intent,
        "intent_confidence": intent_confidence
    }


In [45]:
import sys
import sentence_transformers
import transformers
import spacy

# Print environment and config for debugging
try:
    print('spaCy version:', spacy.__version__)
except Exception:
    print('spaCy not available')
try:
    print('SentenceTransformers version:', sentence_transformers.__version__)
except Exception:
    print('SentenceTransformers not available')
try:
    print('Transformers version:', transformers.__version__)
except Exception:
    print('Transformers not available')
try:
    print('spaCy model:', nlp.meta['name'] if nlp else 'None')
except Exception:
    print('spaCy model: None')
try:
    print('Intent keywords:', intent_keywords)
except Exception:
    print('Intent keywords: not loaded')
try:
    print('Intent examples:', list(intent_examples.keys()))
except Exception:
    print('Intent examples: not loaded')
if 'text' in globals():
    print('Text sample:', repr(text[:500]))
else:
    print('No text loaded yet.')


spaCy version: 3.8.7
SentenceTransformers version: 4.1.0
Transformers version: 4.52.4
spaCy model: core_web_trf
Intent keywords: {'claim_process': ['claim', 'process', 'file', 'submit', 'insurance'], 'case_status': ['status', 'case', 'update', 'progress', 'judgement', 'order', 'appeal'], 'document_request': ['document', 'request', 'copies', 'forms'], 'technical_support': ['error', 'issue', 'problem', 'technical'], 'general_info': ['information', 'contact', 'hours', 'location'], 'resume_info': ['skills', 'resume', 'cv', 'proficiencies', 'abilities', 'expertise', 'competencies', 'qualifications', 'experience', 'work', 'education', 'background', 'certifications', 'projects', 'programming', 'languages', 'achievements', 'awards', 'contact', 'career', 'summary', 'tools', 'technologies', 'roles', 'responsibilities', 'soft skills', 'applicant', 'candidate', 'developer', 'engineer', 'profile', 'professional', 'employment', 'history', 'management', 'software', 'admin', 'implementation', 'trackin

In [16]:
%pip install python-dotenv

import sys
import os, json, yaml

project_root = 'c:\\New folder (5)\\new-search-models'
sys.path.insert(0, project_root)

# Load config.yaml using the correct path
config_path = os.path.join(project_root,  'config.yaml')
if os.path.exists(config_path):
	with open(config_path, 'r') as f:
		config = yaml.safe_load(f)
else:
	print(f"⚠️ config.yaml not found at {config_path}")
	config = {}

from dotenv import load_dotenv
load_dotenv(os.path.join(project_root, 'config', '.env'))
from scripts.entity_utils import normalize_entity, get_spacy_nlp
from scripts.search_pipeline import get_openai_embedding
from scripts.intent_utils import get_intent
# from scripts.metadata import extract_metadata  # <-- You must have your full extraction code in this file



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





Device set to use cpu


In [41]:
with open(r'C:\New folder (5)\new-search-models\data\chunks\Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk1.txt', 'r', encoding='utf-8') as file:
    text = file.read()
print('✅ Sample Text Loaded')

✅ Sample Text Loaded


In [42]:
metadata = extract_metadata(text)

metadata

{'entities': ['devzen',
  'spring boot spring security',
  'spring batch',
  'elasticsearch',
  'angular typescript',
  'spring boot angular',
  'aw ec2 s3 rd iam',
  'spring boot',
  'angular',
  '25',
  '50',
  'portal',
  '20',
  'six month of launch',
  '40',
  '90'],
 'keywords': ['java',
  'stack',
  'developer',
  'devzen',
  'software',
  'solution',
  'jwt',
  'authentication',
  'verification',
  'apis',
  'spring',
  'boot',
  'spring',
  'security',
  'batch',
  'processing',
  'workflow',
  'spring',
  'batch',
  'scale',
  'datum',
  'management',
  'rest',
  'apis',
  'time',
  'inventory',
  'tracking',
  'alert',
  'stock',
  'management',
  'elasticsearch',
  'search',
  'retrieval',
  'volume',
  'application',
  'component',
  'angular',
  'typescript',
  'product',
  'management',
  'payment',
  'gateway',
  'integration',
  'spring',
  'boot',
  'angular',
  'transaction',
  'dashboard',
  'key',
  'insight',
  'tournament',
  'inventory',
  'management',
  'appli

In [43]:
from pprint import pprint
print('✅ Extracted Metadata:')
pprint(metadata)

✅ Extracted Metadata:
{'entities': ['devzen',
              'spring boot spring security',
              'spring batch',
              'elasticsearch',
              'angular typescript',
              'spring boot angular',
              'aw ec2 s3 rd iam',
              'spring boot',
              'angular',
              '25',
              '50',
              'portal',
              '20',
              'six month of launch',
              '40',
              '90'],
 'intent': 'resume_info',
 'intent_confidence': 0.28846153846153844,
 'keywords': ['java',
              'stack',
              'developer',
              'devzen',
              'software',
              'solution',
              'jwt',
              'authentication',
              'verification',
              'apis',
              'spring',
              'boot',
              'spring',
              'security',
              'batch',
              'processing',
              'workflow',
              'spring',
      