# Keyword

In [12]:


from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# 1. Prepare label mappings
df = pd.DataFrame(expanded_data)
df['label_id'] = df['label'].astype('category').cat.codes
label2id = {label: i for i, label in enumerate(df['label'].astype('category').cat.categories)}
id2label = {i: label for label, i in label2id.items()}

# 2. Convert to Hugging Face Dataset
# Rename 'label_id' to 'labels' for Trainer compatibility
df_for_hf = df.rename(columns={'label_id': 'labels'})
# Remove the 'label' column (string) to avoid Trainer confusion
df_for_hf = df_for_hf.drop(columns=['label'])
dataset = Dataset.from_pandas(df_for_hf)

# 3. Tokenize
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=64)
dataset = dataset.map(preprocess, batched=True)

# 4. Train/Test split
split = dataset.train_test_split(test_size=0.2)
train_ds, test_ds = split["train"], split["test"]

# 5. Model and Trainer
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=len(label2id), id2label=id2label, label2id=label2id)
training_args = TrainingArguments(
    output_dir="./intent_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    do_eval=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
)

# 6. Train (this will take a few minutes on CPU, much faster on GPU)
trainer.train()

ReadTimeout: (ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: d47c8f26-aafb-49d8-a6bf-91d459e9005d)')

PI

In [None]:
# 📥 Build labeled training data from correct_intents.txt
import re

correct_intents_path = r'C:\New folder (5)\new-search-models\correct_intents.txt'
chunk_dir = os.path.join(project_root, '..', 'data', 'chunks')

labeled_from_file = []
with open(correct_intents_path, 'r', encoding='utf-8') as f:
    for line in f:
        m = re.match(r'([^:]+):.*?\'intent\': \'([^\']+)\'', line)
        if m:
            fname, intent = m.group(1).strip(), m.group(2).strip()
            chunk_path = os.path.join(chunk_dir, fname)
            if os.path.exists(chunk_path):
                with open(chunk_path, 'r', encoding='utf-8') as cf:
                    chunk_text = cf.read()
                labeled_from_file.append({"text": chunk_text, "label": intent})
            else:
                print(f"⚠️ Chunk file not found: {chunk_path}")
        else:
            print(f"⚠️ Could not parse line: {line.strip()}")

print(f"Loaded {len(labeled_from_file)} labeled examples from correct_intents.txt.")
# Add these to your expanded_data list before retraining:
# expanded_data.extend(labeled_from_file)

import glob

# 7. Inference: Predict intent for new text
def predict_intent(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    outputs = model(**inputs)
    logits = outputs.logits
    pred_id = logits.argmax(dim=1).item()
    intent = id2label[pred_id]
    confidence = logits.softmax(dim=1)[0, pred_id].item()
    return intent, confidence
# def predict_intent(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
#     device = next(model.parameters()).device
#     inputs = {k: v.to(device) for k, v in inputs.items()}
#     outputs = model(**inputs)
#     pred_id = outputs.logits.argmax(dim=1).item()
#     return id2label[pred_id]
chunk_dir = os.path.join(project_root, '..', 'data', 'chunks')
chunk_files = glob.glob(os.path.join(chunk_dir, '*.txt'))



# Example usage
# print(predict_intent(open(r'C:\New folder (5)\new-search-models\data\chunks\MHC_CaseStatus_511605_chunk1.txt', 'r', encoding='utf-8').read()))
for chunk_path in chunk_files:
    with open(chunk_path, 'r', encoding='utf-8') as f:
        chunk_text = f.read()
    print(f"{os.path.basename(chunk_path)}: {predict_intent(chunk_text)}")
# print(predict_intent("List the skills in this resume."))
# print(predict_intent("Who is the presiding judge?"))
# print(predict_intent("I have a technical issue with the system."))

# ➕ Add labeled examples from correct_intents.txt to your training data
if 'expanded_data' in globals() and 'labeled_from_file' in globals():
    expanded_data.extend(labeled_from_file)
    print(f"expanded_data now has {len(expanded_data)} examples (including those from correct_intents.txt).")
else:
    print("⚠️ Make sure both expanded_data and labeled_from_file are defined before running this cell.")

import pandas as pd

# Example labeled data for intent fine-tuning (expand with more real samples for best results)
# data = [
#     {"text": "How do I file a claim?", "label": "claim_process"},
#     {"text": "What is the process for submitting an insurance claim?", "label": "claim_process"},
#     {"text": "What is the current status of the case?", "label": "case_status"},
#     {"text": "Show me the progress of case number 511605.", "label": "case_status"},
#     {"text": "Can I get a copy of the case order?", "label": "document_request"},
#     {"text": "How do I request the judgment document?", "label": "document_request"},
#     {"text": "What skills are listed in the resume?", "label": "resume_info"},
#     {"text": "List the programming languages known by the applicant.", "label": "resume_info"},
#     {"text": "Who is the presiding judge for this case?", "label": "court_details"},
#     {"text": "Who are the parties involved in this case?", "label": "party_information"},
#     {"text": "When was the last hearing held?", "label": "hearing_information"},
#     {"text": "I have a technical issue with the system.", "label": "technical_support"},
#     {"text": "Give me a summary of the file.", "label": "general_info"},
#     {"text": "Tell me about this document.", "label": "general_info"},
# ]
df = pd.DataFrame(expanded_data)
df


NameError: name 'os' is not defined

In [9]:
## 🏷️ Expand Labeled Data for Better Intent Classification

# To improve classifier performance, add more diverse and realistic examples for each intent. This helps the model generalize and reduces bias toward majority classes. Below is an expanded dataset template you can use and modify for your domain.
# Template: Expanded labeled data for intent fine-tuning
# Copy, edit, and expand this list with your real examples
expanded_data = [
    # claim_process
    {"text": "How do I file a claim?", "label": "claim_process"},
    {"text": "What is the process for submitting an insurance claim?", "label": "claim_process"},
    {"text": "I want to submit a new claim for my car accident.", "label": "claim_process"},
    {"text": "Guide me through the claim submission steps.", "label": "claim_process"},
    {"text": "Where do I upload my claim documents?", "label": "claim_process"},
    {"text": "How long does it take to process a claim?", "label": "claim_process"},
    {"text": "Can I check the status of my insurance claim?", "label": "claim_process"},
    {"text": "What documents are needed to file a claim?", "label": "claim_process"},
    {"text": "Is there a deadline for submitting claims?", "label": "claim_process"},
    {"text": "Can I cancel a claim after submitting?", "label": "claim_process"},
    # case_status
    {"text": "What is the current status of the case?", "label": "case_status"},
    {"text": "Show me the progress of case number 511605.", "label": "case_status"},
    {"text": "Has a judgment been issued in my case?", "label": "case_status"},
    {"text": "Is my case still pending?", "label": "case_status"},
    {"text": "When is the next hearing for my case?", "label": "case_status"},
    {"text": "What was the outcome of the last court session?", "label": "case_status"},
    {"text": "Who is the presiding judge for this case?", "label": "case_status"},
    {"text": "Has an appeal been filed?", "label": "case_status"},
    {"text": "Is there an order available for my case?", "label": "case_status"},
    {"text": "What is the next step in my case?", "label": "case_status"},
    # document_request
    {"text": "Can I get a copy of the case order?", "label": "document_request"},
    {"text": "How do I request the judgment document?", "label": "document_request"},
    {"text": "I need certified copies of my case documents.", "label": "document_request"},
    {"text": "Where can I download the court forms?", "label": "document_request"},
    {"text": "Request a copy of the final order.", "label": "document_request"},
    {"text": "How do I obtain previous hearing transcripts?", "label": "document_request"},
    {"text": "Can I get a digital copy of my case file?", "label": "document_request"},
    {"text": "What is the fee for document requests?", "label": "document_request"},
    {"text": "How long does it take to receive requested documents?", "label": "document_request"},
    {"text": "Is there a limit to the number of documents I can request?", "label": "document_request"},
    # resume_info
    {"text": "What skills are listed in the resume?", "label": "resume_info"},
    {"text": "List the programming languages known by the applicant.", "label": "resume_info"},
    {"text": "Show me the candidate's work experience.", "label": "resume_info"},
    {"text": "What certifications does the applicant have?", "label": "resume_info"},
    {"text": "Summarize the professional experience section.", "label": "resume_info"},
    {"text": "List the tools and technologies used by the candidate.", "label": "resume_info"},
    {"text": "What is the educational background of the applicant?", "label": "resume_info"},
    {"text": "What are the achievements or awards?", "label": "resume_info"},
    {"text": "Show me the contact information in the resume.", "label": "resume_info"},
    {"text": "What is the career objective or summary?", "label": "resume_info"},
    # technical_support
    {"text": "I have a technical issue with the system.", "label": "technical_support"},
    {"text": "There is a problem with the website.", "label": "technical_support"},
    {"text": "I can't log in to my account.", "label": "technical_support"},
    {"text": "The upload button is not working.", "label": "technical_support"},
    {"text": "How do I reset my password?", "label": "technical_support"},
    {"text": "The page is loading very slowly.", "label": "technical_support"},
    {"text": "I received an error message while submitting my form.", "label": "technical_support"},
    {"text": "The system crashed during my session.", "label": "technical_support"},
    {"text": "How do I contact technical support?", "label": "technical_support"},
    {"text": "The app keeps freezing.", "label": "technical_support"},
    # general_info
    {"text": "Give me a summary of the file.", "label": "general_info"},
    {"text": "Tell me about this document.", "label": "general_info"},
    {"text": "What is the purpose of this document?", "label": "general_info"},
    {"text": "Provide general information about the case.", "label": "general_info"},
    {"text": "What are the office hours?", "label": "general_info"},
    {"text": "How do I contact the support team?", "label": "general_info"},
    {"text": "Where is the office located?", "label": "general_info"},
    {"text": "What services are offered?", "label": "general_info"},
    {"text": "How do I register for an account?", "label": "general_info"},
    {"text": "What is the refund policy?", "label": "general_info"},
]

# You can now use expanded_data instead of the old 'data' list for training your classifier.


In [10]:
import pandas as pd

# Example labeled data for intent fine-tuning (expand with more real samples for best results)
# data = [
#     {"text": "How do I file a claim?", "label": "claim_process"},
#     {"text": "What is the process for submitting an insurance claim?", "label": "claim_process"},
#     {"text": "What is the current status of the case?", "label": "case_status"},
#     {"text": "Show me the progress of case number 511605.", "label": "case_status"},
#     {"text": "Can I get a copy of the case order?", "label": "document_request"},
#     {"text": "How do I request the judgment document?", "label": "document_request"},
#     {"text": "What skills are listed in the resume?", "label": "resume_info"},
#     {"text": "List the programming languages known by the applicant.", "label": "resume_info"},
#     {"text": "Who is the presiding judge for this case?", "label": "court_details"},
#     {"text": "Who are the parties involved in this case?", "label": "party_information"},
#     {"text": "When was the last hearing held?", "label": "hearing_information"},
#     {"text": "I have a technical issue with the system.", "label": "technical_support"},
#     {"text": "Give me a summary of the file.", "label": "general_info"},
#     {"text": "Tell me about this document.", "label": "general_info"},
# ]
df = pd.DataFrame(expanded_data)
df

Unnamed: 0,text,label
0,How do I file a claim?,claim_process
1,What is the process for submitting an insuranc...,claim_process
2,I want to submit a new claim for my car accident.,claim_process
3,Guide me through the claim submission steps.,claim_process
4,Where do I upload my claim documents?,claim_process
5,How long does it take to process a claim?,claim_process
6,Can I check the status of my insurance claim?,claim_process
7,What documents are needed to file a claim?,claim_process
8,Is there a deadline for submitting claims?,claim_process
9,Can I cancel a claim after submitting?,claim_process


In [18]:
import os
import re
import glob
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

class IntentClassifier:
    def __init__(self, expanded_data, project_root, model_name="distilbert-base-uncased"):
        self.expanded_data = expanded_data
        self.project_root = project_root
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = None
        self.label2id = None
        self.id2label = None
        self.trainer = None
        self.train_ds = None
        self.test_ds = None

    def build_label_mappings(self, df):
        df['label_id'] = df['label'].astype('category').cat.codes
        self.label2id = {label: i for i, label in enumerate(df['label'].astype('category').cat.categories)}
        self.id2label = {i: label for label, i in self.label2id.items()}
        return df

    def prepare_dataset(self):
        df = pd.DataFrame(self.expanded_data)
        df = self.build_label_mappings(df)
        df_for_hf = df.rename(columns={'label_id': 'labels'}).drop(columns=['label'])
        dataset = Dataset.from_pandas(df_for_hf)
        dataset = dataset.map(lambda example: self.tokenizer(example["text"], truncation=True, padding="max_length", max_length=64), batched=True)
        split = dataset.train_test_split(test_size=0.2)
        self.train_ds, self.test_ds = split["train"], split["test"]

    def setup_model_and_trainer(self):
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, num_labels=len(self.label2id), id2label=self.id2label, label2id=self.label2id)
        training_args = TrainingArguments(
            output_dir="./intent_model",
            eval_strategy="epoch",
            save_strategy="epoch",
            do_eval=True,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            num_train_epochs=3,
            logging_steps=10,
            load_best_model_at_end=True,
        )
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.train_ds,
            eval_dataset=self.test_ds,
            tokenizer=self.tokenizer,
        )

    def train(self):
        self.prepare_dataset()
        self.setup_model_and_trainer()
        self.trainer.train()

    def predict_intent(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
        device = next(self.model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = self.model(**inputs)
        logits = outputs.logits
        pred_id = logits.argmax(dim=1).item()
        intent = self.id2label[pred_id]
        confidence = logits.softmax(dim=1)[0, pred_id].item()
        return intent, confidence

    def add_labeled_from_file(self, correct_intents_path):
        chunk_dir = os.path.join(self.project_root, 'data', 'chunks')
        labeled_from_file = []
        with open(correct_intents_path, 'r', encoding='utf-8') as f:
            for line in f:
                m = re.match(r'([^:]+):.*?\'intent\': \'([^\']+)\'', line)
                if m:
                    fname, intent = m.group(1).strip(), m.group(2).strip()
                    chunk_path = os.path.join(chunk_dir, fname)
                    if os.path.exists(chunk_path):
                        with open(chunk_path, 'r', encoding='utf-8') as cf:
                            chunk_text = cf.read()
                        labeled_from_file.append({"text": chunk_text, "label": intent})
                    else:
                        print(f"⚠️ Chunk file not found: {chunk_path}")
                else:
                    print(f"⚠️ Could not parse line: {line.strip()}")
        print(f"Loaded {len(labeled_from_file)} labeled examples from correct_intents.txt.")
        self.expanded_data.extend(labeled_from_file)
        print(f"expanded_data now has {len(self.expanded_data)} examples (including those from correct_intents.txt).")

    def batch_predict_chunks(self):
        chunk_dir = os.path.join(self.project_root, 'data', 'chunks')
        chunk_files = glob.glob(os.path.join(chunk_dir, '*.txt'))
        for chunk_path in chunk_files:
            with open(chunk_path, 'r', encoding='utf-8') as f:
                chunk_text = f.read()
            print(f"{os.path.basename(chunk_path)}: {self.predict_intent(chunk_text) , self.get_embedding(chunk_text)}")

    def get_embedding(self, text):
        # Get the embedding for the input text using the model's encoder
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
        device = next(self.model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            if hasattr(self.model, "distilbert"):
                outputs = self.model.distilbert(**inputs)
                embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
            elif hasattr(self.model, "bert"):
                outputs = self.model.bert(**inputs)
                embedding = outputs.last_hidden_state[:, 0, :]
            elif hasattr(self.model, "roberta"):
                outputs = self.model.roberta(**inputs)
                embedding = outputs.last_hidden_state[:, 0, :]
            else:
                outputs = self.model.base_model(**inputs)
                embedding = outputs.last_hidden_state[:, 0, :]
        return embedding.cpu().numpy().flatten()



In [20]:
project_root = r'C:\Users\91807\Downloads\search_models'
classifier = IntentClassifier(expanded_data, project_root)
classifier.add_labeled_from_file(correct_intents_path)
classifier.train()
classifier.batch_predict_chunks()


NameError: name 'correct_intents_path' is not defined

# keywords

In [None]:

# # 1. TF-IDF (Term Frequency–Inverse Document Frequency)
# from sklearn.feature_extraction.text import TfidfVectorizer

# text = open(r'C:\New folder (5)\new-search-models\data\chunks\Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk1.txt', 'r', encoding='utf-8').read()

# def extract_keywords_tfidf(text, top_n=10):
#     vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
#     tfidf = vectorizer.fit_transform([text])
#     scores = zip(vectorizer.get_feature_names_out(), tfidf.toarray()[0])
#     sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
#     return [w for w, s in sorted_scores[:top_n]]

# print('TF-IDF keywords:', extract_keywords_tfidf(text))

# # 5. KeyBERT (Embedding-based, already in your notebook)
# from keybert import KeyBERT
# keyword_model = KeyBERT()
# keywords = [kw for kw, _ in keyword_model.extract_keywords(text, top_n=10)]

# print('KeyBERT keywords:', [kw for kw, _ in keyword_model.extract_keywords(text, top_n=10)])
# # 6. spaCy POS-based (Nouns, Noun Phrases)
# def extract_keywords_spacy(text, top_n=10):
#     doc = nlp(text)
#     noun_chunks = list(set(chunk.text.strip().lower() for chunk in doc.noun_chunks))
#     nouns = list(set(token.lemma_ for token in doc if token.pos_ == 'NOUN' and not token.is_stop))
#     return (noun_chunks + nouns)[:top_n]

# print('spaCy POS keywords:', extract_keywords_spacy(text))


# # --- Robust Keyword Extraction Pipeline ---
# from keybert import KeyBERT
# from sklearn.feature_extraction.text import TfidfVectorizer
# import spacy

# # Load models (reuse if already loaded)
# keyword_model = KeyBERT()
# nlp = spacy.load("en_core_web_trf") if spacy.util.is_package("en_core_web_trf") else spacy.load("en_core_web_sm")

# # 1. KeyBERT keywords
# keybert_keywords = [kw for kw, _ in keyword_model.extract_keywords(text, top_n=10)]

# # 2. TF-IDF keywords
# def extract_keywords_tfidf(text, top_n=10):
#     vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
#     tfidf = vectorizer.fit_transform([text])
#     scores = zip(vectorizer.get_feature_names_out(), tfidf.toarray()[0])
#     sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
#     return [w for w, s in sorted_scores[:top_n]]
# tfidf_keywords = extract_keywords_tfidf(text)

# # 3. spaCy POS-based keywords
# def extract_keywords_spacy(text, top_n=10):
#     doc = nlp(text)
#     noun_chunks = list(set(chunk.text.strip().lower() for chunk in doc.noun_chunks))
#     nouns = list(set(token.lemma_ for token in doc if token.pos_ == 'NOUN' and not token.is_stop))
#     return (noun_chunks + nouns)[:top_n]
# spacy_keywords = extract_keywords_spacy(text)

# # 4. Hybrid/ensemble: merge and deduplicate
# all_keywords = keybert_keywords + tfidf_keywords + spacy_keywords
# unique_keywords = []
# for kw in all_keywords:
#     if kw not in unique_keywords:
#         unique_keywords.append(kw)

# print("KeyBERT:", keybert_keywords)
# print("TF-IDF:", tfidf_keywords)
# print("spaCy POS:", spacy_keywords)
# print("\n---\nEnsemble (deduplicated):", unique_keywords)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from keybert import KeyBERT
import spacy

class KeywordExtractor:
    def __init__(self, model=None, nlp_model=None):
        self.keyword_model = model if model is not None else KeyBERT()
        self.nlp = nlp_model if nlp_model is not None else spacy.load("en_core_web_trf") if spacy.util.is_package("en_core_web_trf") else spacy.load("en_core_web_sm")

    def extract_keywords_tfidf(self, text, top_n=10):
        vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
        tfidf = vectorizer.fit_transform([text])
        scores = zip(vectorizer.get_feature_names_out(), tfidf.toarray()[0])
        sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
        return [w for w, s in sorted_scores[:top_n]]

    def extract_keywords_keybert(self, text, top_n=10):
        return [kw for kw, _ in self.keyword_model.extract_keywords(text, top_n=top_n)]

    def extract_keywords_spacy(self, text, top_n=10):
        doc = self.nlp(text)
        noun_chunks = list(set(chunk.text.strip().lower() for chunk in doc.noun_chunks))
        nouns = list(set(token.lemma_ for token in doc if token.pos_ == 'NOUN' and not token.is_stop))
        return (noun_chunks + nouns)[:top_n]

    def extract_all(self, text, top_n=10):
        keybert_keywords = self.extract_keywords_keybert(text, top_n)
        tfidf_keywords = self.extract_keywords_tfidf(text, top_n)
        spacy_keywords = self.extract_keywords_spacy(text, top_n)
        all_keywords = keybert_keywords + tfidf_keywords + spacy_keywords
        unique_keywords = []
        for kw in all_keywords:
            if kw not in unique_keywords:
                unique_keywords.append(kw)
        return {
            # "keybert": keybert_keywords,
            # "tfidf": tfidf_keywords,
            # "spacy": spacy_keywords,
            "ensemble": unique_keywords
        }


In [27]:
keyword = KeywordExtractor(model=KeyBERT(), nlp_model=spacy.load("en_core_web_trf") if spacy.util.is_package("en_core_web_trf") else spacy.load("en_core_web_sm"))


In [28]:
keyword.extract_all(open(r'C:\New folder (5)\new-search-models\data\chunks\MHC_CaseStatus_511605_chunk1.txt', 'r', encoding='utf-8').read(), top_n=10)

{'ensemble': ['appeal',
  'decree',
  'procedure',
  'petition',
  'madras',
  'filed',
  'sriarumbayal',
  'court',
  'judicature',
  'cmpno6648',
  '2018',
  'delay',
  'assrno19304',
  'assrno19304 2018',
  'civil',
  'cmpno6648 2018',
  '2018 assrno19304',
  '2540',
  'a submission',
  'perusal',
  'respondents',
  'the entire affidavit',
  'their case',
  'the appeal suit',
  'the wife',
  'behalf',
  'they',
  'the petitioners']}

# Entities

In [None]:
# # 1. spaCy Named Entity Recognition (NER)
# import spacy

# # Load spaCy model (already loaded as nlp in previous cells)
# doc = nlp(text)
# spacy_entities = [(ent.text, ent.label_) for ent in doc.ents]
# print('spaCy Entities:', spacy_entities)

# # 2. Transformers-based NER (e.g., HuggingFace pipeline)
# from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# ner_pipe = pipeline(
#     "ner",
#     model=AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english"),
#     tokenizer=AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english"),
#     aggregation_strategy="simple",
#     device=-1  # CPU
# )
# transformers_entities = [(ent['word'], ent['entity_group'], ent['score']) for ent in ner_pipe(text) if ent['score'] > 0.8]
# print('Transformers NER Entities:', transformers_entities)

# # 3. Regex-based Entity Extraction (for custom patterns)
# import re

# # Example: Extract email addresses and dates
# emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
# dates = re.findall(r'\b\d{4}-\d{2}-\d{2}\b', text)
# print('Emails:', emails)
# print('Dates:', dates)
# # Utility: Deduplicate entity_types in structured metadata output
# # 4. Ensemble/Hybrid: Combine spaCy, Transformers, and Regex
# def extract_entities_hybrid(text):
#     entities = set()
#     entity_types = []
#     entity_details = []
#     # spaCy
#     for ent in nlp(text).ents:
#         entities.add(ent.text)
#         entity_types.append(ent.label_)
#         entity_details.append({
#             "text": ent.text,
#             "type": ent.label_,
#             "score": None
#         })
#     # Transformers
#     for ent in ner_pipe(text):
#         if ent['score'] > 0.8:
#             entities.add(ent['word'])
#             entity_types.append(ent['entity_group'])
#             entity_details.append({
#                 "text": ent['word'],
#                 "type": ent['entity_group'],
#                 "score": ent['score']
#             })
#     # Regex (add more patterns as needed)
#     for email in re.findall(r'[\w\.-]+@[\w\.-]+', text):
#         entities.add(email)
#         entity_types.append("EMAIL")
#         entity_details.append({
#             "text": email,
#             "type": "EMAIL",
#             "score": None
#         })
#     for date in re.findall(r'\b\d{4}-\d{2}-\d{2}\b', text):
#         entities.add(date)
#         entity_types.append("DATE")
#         entity_details.append({
#             "text": date,
#             "type": "DATE",
#             "score": None
#         })
#     return {
#         "entities": sorted(entities),
#         "entity_types": entity_types,
#         "entity_details": entity_details
#     }

# # for chunk_path in chunk_files:
# #     with open(chunk_path, 'r', encoding='utf-8') as f:
# #         chunk_text = f.read()
# #     print(f"{os.path.basename(chunk_path)}: {extract_entities_hybrid(chunk_text)}")

#     # print('Hybrid Entities:', extract_entities_hybrid(text))
# def get_metadata_structured_dedup(
#     text, 
#     filename="", 
#     document_name="", 
#     summary="", 
#     embedding=None
# ):
#     keywords = unique_keywords
#     intent, intent_confidence = predict_intent(text)
#     ner_results = extract_entities_hybrid(text)
#     entities = ner_results.get("entities", [])
#     entity_types = list(dict.fromkeys(ner_results.get("entity_types", [])))  # Deduplicate, preserve order
#     entity_details = ner_results.get("entity_details", [])
#     if embedding is None:
#         inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64)
#         device = next(model.parameters()).device
#         inputs = {k: v.to(device) for k, v in inputs.items()}
#         with torch.no_grad():
#             outputs = model.distilbert(**inputs)
#             embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().tolist()
#     return {
#         "keyword": keywords,
#         "intent": intent,
#         "intent_confidence": intent_confidence,
#         "entities": entities,
#         "entity_types": entity_types,
#         "entity_details": entity_details,
#         "summary": summary,
#         "embedding": embedding,
#         "text": text,
#         "document_name": document_name,
#         "filename": filename
#     }

# # Example usage:
# metadata_structured_dedup = get_metadata_structured_dedup(
#     text=chunk_text,
#     filename=chunk_path,
#     document_name="Jimson_Ratnam_JavaFullStackDeveloper_2+years",
#     summary="",
# )
# import pprint
# pprint.pprint(metadata_structured_dedup)

In [3]:
class EntityExtractor:
    def __init__(self, nlp, ner_pipe):
        self.nlp = nlp
        self.ner_pipe = ner_pipe
        self.email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
        self.date_pattern = re.compile(r'\b\d{4}-\d{2}-\d{2}\b')

    def extract_spacy(self, text):
        doc = self.nlp(text)
        return [(ent.text, ent.label_) for ent in doc.ents]

    def extract_transformers(self, text, score_threshold=0.8):
        return [
            (ent['word'], ent['entity_group'], ent['score'])
            for ent in self.ner_pipe(text)
            if ent['score'] > score_threshold
        ]

    def extract_regex(self, text):
        emails = self.email_pattern.findall(text)
        dates = self.date_pattern.findall(text)
        return {'emails': emails, 'dates': dates}

    def extract_entities_hybrid(self, text):
        entities = set()
        entity_types = []
        entity_details = []

        # spaCy
        for ent in self.nlp(text).ents:
            entities.add(ent.text)
            entity_types.append(ent.label_)
            entity_details.append({
                "text": ent.text,
                "type": ent.label_,
                "score": None
            })

        # Transformers
        for ent in self.ner_pipe(text):
            if ent['score'] > 0.8:
                entities.add(ent['word'])
                entity_types.append(ent['entity_group'])
                entity_details.append({
                    "text": ent['word'],
                    "type": ent['entity_group'],
                    "score": ent['score']
                })

        # Regex
        for email in self.email_pattern.findall(text):
            entities.add(email)
            entity_types.append("EMAIL")
            entity_details.append({
                "text": email,
                "type": "EMAIL",
                "score": None
            })
        for date in self.date_pattern.findall(text):
            entities.add(date)
            entity_types.append("DATE")
            entity_details.append({
                "text": date,
                "type": "DATE",
                "score": None
            })

        return {
            "entities": sorted(entities),
            "entity_types": set(entity_types),
            "entity_details": entity_details
        }


In [8]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

ner_pipe = pipeline(
    "ner",
    model=AutoModelForTokenClassification.from_pretrained("Jean-Baptiste/roberta-large-ner-english"),
    tokenizer=AutoTokenizer.from_pretrained("Jean-Baptiste/roberta-large-ner-english"),
    aggregation_strategy="simple",
    device=-1  # CPU
)

# Ensure nlp is defined (should already be available from previous cells)
# If not, uncomment the following lines:
import spacy
nlp = spacy.load("en_core_web_trf") if spacy.util.is_package("en_core_web_trf") else spacy.load("en_core_web_sm")

entity = EntityExtractor(nlp, ner_pipe)

Device set to use cpu


In [9]:


entity.extract_entities_hybrid(open(r'C:\New folder (5)\new-search-models\data\output_data\Jimson_Ratnam_JavaFullStackDeveloper_2+years_chunk1.json', 'r', encoding='utf-8').read())

{'entities': ['20',
  '25',
  '40',
  '50',
  '90',
  'AWS',
  'AWS EC2 S3',
  'Angular',
  'Angular TypeScript',
  'Devzen Software Solutions',
  'Elasticsearch',
  'MISC',
  'ORG',
  'RDS IAM',
  'Spring Batch',
  'Spring Boot',
  'Spring Boot Angular',
  'Spring Boot Spring Security',
  'ach',
  'achi',
  'achie',
  'achiev',
  'achieve',
  'act',
  'acti',
  'ang',
  'angu',
  'angul',
  'angula',
  'api',
  'apis',
  'aws',
  'aws ec',
  'boot',
  'control',
  'dev',
  'deve',
  'devz',
  'devze',
  'devzen',
  'elastic',
  'elasticse',
  'elasticsea',
  'elasticsear',
  'elasticsearc',
  'enhan',
  'lev',
  'leve',
  'lever',
  'levera',
  'leverag',
  'leverage',
  'man',
  'mana',
  'manag',
  'manage',
  'managem',
  'manageme',
  'managemen',
  'management',
  'pre',
  'pres',
  'prese',
  'presen',
  'robust',
  'robust admin control real',
  'six months',
  'software',
  'sol',
  'solu',
  'solut',
  'soluti',
  'solutio',
  'solution',
  'spr',
  'spri'],
 'entity_types': 