In [None]:
import pandas as pd
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Load the pre-trained NER model from Hugging Face.
# The same checkpoint name is used for both the tokenizer and the model so the
# two stay in sync.
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a Named Entity Recognition pipeline.
# No aggregation_strategy is passed here; the extraction function in this cell
# reads the per-result 'entity' key accordingly.
nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

# Load the dataset (path is relative to the notebook's working directory).
# The rest of this cell expects a "Text" column in the sheet.
file_path = "news_excerpts_parsed.xlsx"  # Update with your actual file path
df = pd.read_excel(file_path)

# Function to perform NER on text
def extract_ner_entities(text):
    """Run the module-level NER pipeline on ``text``.

    Returns a list of ``(word, entity, score)`` tuples, one per pipeline
    result, in the order the pipeline produced them. Anything that is not a
    ``str`` is not sent to the pipeline and yields an empty list.
    """
    # Guard clause: only real strings are worth sending to the model
    if not isinstance(text, str):
        return []

    triples = []
    for hit in nlp_pipeline(text):
        triples.append((hit['word'], hit['entity'], hit['score']))
    return triples

# Apply NER extraction to each text entry.
# Each row of the new column holds the list returned by extract_ner_entities
# (an empty list for non-string cells).
df["NER_Entities"] = df["Text"].apply(extract_ner_entities)

# Save results to a new Excel file (index=False keeps the row index out of the sheet)
df.to_excel("ner_extracted_results.xlsx", index=False)

print("NER extraction complete. Results saved to 'ner_extracted_results.xlsx'.")

In [None]:
import pandas as pd
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Load the pre-trained NER model from Hugging Face.
# Tokenizer and model are loaded from the same checkpoint name.
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a Named Entity Recognition pipeline (with auto-aggregation).
# With aggregation_strategy="simple" the extraction function in this cell reads
# the 'entity_group' key from each result instead of the per-token 'entity' key.
nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Load the dataset (path is relative to the notebook's working directory).
# The rest of this cell expects a "Text" column in the sheet.
file_path = "news_excerpts_parsed.xlsx"  # Update with actual path
df = pd.read_excel(file_path)

# Function to extract unique NER entities with the highest confidence score
def extract_unique_ner_entities(text):
    """Run NER on ``text`` and keep one entry per entity name.

    When the same name is reported more than once, only the occurrence with
    the highest confidence is kept (its type is kept with it). Entries are
    returned in the order each name was first seen.

    Args:
        text: The excerpt to analyse. Anything that is not a ``str`` is
            skipped.

    Returns:
        A list of ``(name, entity_type, confidence)`` tuples where
        ``confidence`` is a built-in ``float`` rounded to 4 decimals, or an
        empty list for non-string input.
    """
    if not isinstance(text, str):
        return []

    best_by_name = {}  # name -> (entity_type, confidence) of the best occurrence
    for entity in nlp_pipeline(text):
        name = entity['word']
        # Cast to a built-in float before rounding. This column is written to
        # Excel as text and parsed back later in the notebook, so the values
        # must print as plain numeric literals.
        # NOTE(review): the pipeline is believed to return NumPy scalar scores,
        # whose repr is not a plain literal on newer NumPy versions - confirm.
        confidence = round(float(entity['score']), 4)

        previous = best_by_name.get(name)
        # Store only the highest-confidence occurrence of each name
        if previous is None or confidence > previous[1]:
            best_by_name[name] = (entity['entity_group'], confidence)

    # Convert back to list format: [(Entity, Type, Confidence)]
    return [
        (name, entity_type, confidence)
        for name, (entity_type, confidence) in best_by_name.items()
    ]

# Apply the function to extract and filter unique NER entities.
# Each row of the new column holds a list of (name, type, confidence) tuples.
df["Final_NER_Entities"] = df["Text"].apply(extract_unique_ner_entities)

# Save results to a new Excel file.
# This file is the input of the later alias-normalisation cells, which read
# "cleaned_ner_results.xlsx" and parse the "Final_NER_Entities" column back.
df.to_excel("cleaned_ner_results.xlsx", index=False)

print("✅ NER extraction complete. Duplicates removed. Results saved to 'cleaned_ner_results.xlsx'.")

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from collections import defaultdict
from fuzzywuzzy import process, fuzz
from sentence_transformers import SentenceTransformer, util

# Load the dataset produced by the de-duplicating NER cell
file_path = "cleaned_ner_results.xlsx"  # Update this with the actual file path
df = pd.read_excel(file_path)

print("✅ 1. Load a sentence transformer model for semantic similarity")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

print("✅ 2. Extract all unique entity names dynamically from the dataset")
all_entities = set()
for entity_list in df["Final_NER_Entities"]:
    # NOTE(review): eval() executes whatever expression is stored in the
    # spreadsheet cell. Only run this against files this notebook produced.
    # ast.literal_eval would be safer if the stored values are plain literals
    # - confirm before switching.
    for name, entity_type, _ in eval(entity_list):  # Convert the cell's string back into a list of tuples
        all_entities.add(name)

print("✅ 3. Compute word embeddings for all entity names using SentenceTransformer")
# One encode() call per entity; build_alias_map looks embeddings up in this
# module-level dict by entity name.
entity_embeddings = {
    entity: embedding_model.encode(entity, convert_to_tensor=True)
    for entity in all_entities
}

print("✅ 4. Automatically generate alias mappings using fuzzy matching and semantic similarity")
def build_alias_map(entities, threshold_fuzzy=85, threshold_semantic=0.85):
    """
    Build a mapping from entity name to a canonical entity name.

    - Uses fuzzy matching (``process.extractOne``) against the entities that
      come later in the list to find a textually similar entity.
    - Uses cosine similarity of the module-level ``entity_embeddings`` to find
      the most semantically similar entity among all others.
    - When both candidates clear their threshold the semantic one wins.
    - The longer of the two names becomes the canonical name for the pair.
    - Pairings made later can re-point a name that earlier aliases already
      map to, so chains are resolved before returning: every alias maps
      directly to its final canonical name.

    Args:
        entities: Iterable of entity names; each must be a key of
            ``entity_embeddings``.
        threshold_fuzzy: Minimum fuzzy score for a fuzzy match to count.
        threshold_semantic: Minimum cosine similarity for a semantic match
            to count.

    Returns:
        dict mapping each matched entity to its canonical name. Entities for
        which no match cleared a threshold are absent from the dict.
    """
    alias_map = {}
    processed = set()
    entity_list = list(entities)
    print("Number of entities to complete: ", len(entity_list))

    for i, entity in enumerate(entity_list):
        print("Completed entity: ", i)
        if entity in processed:
            continue

        # ✅ 4.1 Fuzzy matching to find textually similar entities
        remaining = entity_list[i + 1:]
        match, score = process.extractOne(entity, remaining) if remaining else (None, 0)

        # ✅ 4.2 Compute semantic similarity (cosine similarity of embeddings)
        best_semantic_match = None
        best_semantic_score = 0

        for other_entity in entity_list:
            if entity == other_entity:
                continue
            semantic_score = util.pytorch_cos_sim(
                entity_embeddings[entity], entity_embeddings[other_entity]
            ).item()
            if semantic_score > best_semantic_score:
                best_semantic_match = other_entity
                best_semantic_score = semantic_score

        # ✅ 4.3 Choose the best match (semantic overrides fuzzy when both qualify)
        best_match = None
        if match and score >= threshold_fuzzy:
            best_match = match
        if best_semantic_match and best_semantic_score >= threshold_semantic:
            best_match = best_semantic_match

        # ✅ 4.4 If a valid match is found, group them under the longer/more formal name
        if best_match:
            canonical_name = max(entity, best_match, key=len)  # Keep the longer/more descriptive name
            alias_map[entity] = canonical_name
            alias_map[best_match] = canonical_name
            processed.add(entity)
            processed.add(best_match)

    # ✅ 4.5 Resolve chains (A -> B, B -> C) so every alias points at its final
    # canonical name. Every value stored above is also a key of alias_map, so
    # the lookups below cannot miss; `visited` guards against looping forever.
    def _resolve(name):
        visited = {name}
        while alias_map[name] != name and alias_map[name] not in visited:
            name = alias_map[name]
            visited.add(name)
        return name

    return {alias: _resolve(alias) for alias in alias_map}

print("✅ 5. Generate the alias mapping dynamically")
# entity_aliases is the module-level lookup that normalize_entity reads
entity_aliases = build_alias_map(all_entities)

print("✅ 6. Function to apply alias mapping")
def normalize_entity(entity_name):
    """Return the canonical name recorded for ``entity_name`` in the
    module-level ``entity_aliases`` mapping, or the name unchanged when no
    alias is recorded."""
    if entity_name in entity_aliases:
        return entity_aliases[entity_name]
    return entity_name

print("✅ 7. Apply alias mapping to NER entities")
# Each stored cell is parsed back into a list of (name, type, confidence)
# tuples and only the name is replaced by its canonical form.
# NOTE(review): eval() executes whatever expression is stored in the
# spreadsheet cell. Only run this against files this notebook produced.
df["Normalized_NER_Entities"] = df["Final_NER_Entities"].apply(
    lambda entities: [
        (normalize_entity(name), entity_type, confidence)
        for name, entity_type, confidence in eval(entities)
    ]
)

print("✅ 8. Save standardized entity results to a new file")
df.to_excel("generalized_normalized_ner_with_embeddings.xlsx", index=False)
print("✅ Auto-aliasing with word embeddings & fuzzy matching complete!")

# -------------------------------------------------------------
# ✅ RELATIONSHIP EXTRACTION (AFTER ENTITY STANDARDIZATION)
# -------------------------------------------------------------

# ✅ 9. Function to extract relationships while ensuring standardization
def extract_relationships(text, entities):
    """
    Pair up the distinct entities that occur in the same excerpt.

    - Uses the (standardized) entity names as given in ``entities``.
    - Names are de-duplicated first: after standardization several mentions
      can share one canonical name, which would otherwise produce
      self-pairs such as ``(X, X)`` and repeated pairs.

    Args:
        text: The excerpt itself. Currently unused; kept so existing callers
            keep working.
        entities: Iterable of ``(name, entity_type, confidence)`` tuples.

    Returns:
        A list of ``(entity1, entity2)`` tuples, one per unordered pair of
        distinct names, in first-seen order.
    """
    # dict.fromkeys drops repeats while preserving first-seen order
    unique_names = list(dict.fromkeys(name for name, _, _ in entities))

    # ✅ 9.1 Generate simple entity-entity relationships (for each entity pair)
    return [
        (first, second)
        for position, first in enumerate(unique_names)
        for second in unique_names[position + 1:]
    ]

# ✅ 10. Apply relationship extraction using normalized entity names
df["Relationships"] = df.apply(
    lambda row: extract_relationships(row["Text"], row["Normalized_NER_Entities"]),
    axis=1
)

# ✅ 11. Aggregate relationships across all excerpts
relationship_dict = defaultdict(set)

for relationships in df["Relationships"]:
    for entity1, entity2 in relationships:
        # Two mentions can normalize to the same name; an entity is not
        # recorded as related to itself.
        if entity1 == entity2:
            continue
        relationship_dict[entity1].add(entity2)
        relationship_dict[entity2].add(entity1)  # Bidirectional relationship

# ✅ 12. Convert relationships into a DataFrame for better visualization.
# sorted() rather than list(): iteration order of a set of strings can differ
# between kernel sessions, which would make the saved file change from run to run.
relationship_df = pd.DataFrame(
    [(k, sorted(v)) for k, v in relationship_dict.items()],
    columns=["Entity", "Related Entities"]
)

# ✅ 13. Save relationships to a new Excel file
relationship_df.to_excel("aggregated_relationships.xlsx", index=False)
print("✅ Relationship extraction complete! Results saved to 'aggregated_relationships.xlsx'.")

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from fuzzywuzzy import process, fuzz
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering

# Load the dataset produced by the de-duplicating NER cell
file_path = "cleaned_ner_results.xlsx"  # Update this with the actual file path
df = pd.read_excel(file_path)

# ✅ 1. Load the sentence transformer model used to embed entity names
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ 2. Extract all unique entity names dynamically from the dataset
all_entities = set()
for entity_list in df["Final_NER_Entities"]:  # Change this if your column name is different
    # NOTE(review): eval() executes whatever expression is stored in the
    # spreadsheet cell. Only run this against files this notebook produced.
    for name, entity_type, _ in eval(entity_list):  # Convert the cell's string back into a list of tuples
        all_entities.add(name)

# ✅ 3. Compute embeddings for all entity names in a single batched encode() call.
# entity_list fixes an order so that row i of entity_embeddings belongs to
# entity_list[i]. Note that this rebinds entity_list, which was the loop
# variable above.
entity_list = list(all_entities)
entity_embeddings = embedding_model.encode(entity_list, convert_to_tensor=True)

# ✅ 4. Use clustering to pre-group similar entities (Agglomerative Clustering)
def cluster_entities(entities, embeddings, threshold=0.85):
    """
    Group similar entity names and map each one to a canonical name.

    - Uses Agglomerative Clustering (average linkage, cosine metric, no fixed
      number of clusters) with a distance cut-off of ``1 - threshold``.
    - Within each cluster the longest name is canonical; ties on length are
      broken alphabetically so the result does not depend on input order.

    Args:
        entities: Sequence of entity names, index-aligned with ``embeddings``.
        embeddings: Object exposing ``.cpu().numpy()``; not touched when
            fewer than two entities are given.
        threshold: Similarity level used to derive the clustering distance
            cut-off.

    Returns:
        dict mapping every input name to its canonical name.
    """
    # Nothing to cluster with fewer than two names. The estimator is expected
    # to reject such input, so each name simply stands for itself.
    if len(entities) < 2:
        return {name: name for name in entities}

    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=1 - threshold,
        linkage='average',
        metric='cosine',
    )
    labels = clustering.fit_predict(embeddings.cpu().numpy())

    # Collect the members of each cluster label
    members_by_label = {}
    for idx, label in enumerate(labels):
        members_by_label.setdefault(label, []).append(entities[idx])

    # Pick the most descriptive (longest) name from each cluster
    alias_map = {}
    for members in members_by_label.values():
        canonical_name = min(members, key=lambda name: (-len(name), name))
        for member in members:
            alias_map[member] = canonical_name

    return alias_map

# ✅ 5. Generate the alias mapping using clustering.
# entity_aliases is the module-level lookup that normalize_entity reads.
entity_aliases = cluster_entities(entity_list, entity_embeddings)

# ✅ 6. Function to normalize entity names using the alias mapping
def normalize_entity(entity_name):
    """Look ``entity_name`` up in the module-level ``entity_aliases`` mapping.

    Returns the recorded canonical name, or ``entity_name`` itself when the
    mapping has no entry for it.
    """
    has_alias = entity_name in entity_aliases
    return entity_aliases[entity_name] if has_alias else entity_name

# ✅ 7. Apply alias mapping to NER entities.
# Each stored cell is parsed back into a list of (name, type, confidence)
# tuples and only the name is replaced by its canonical form.
# NOTE(review): eval() executes whatever expression is stored in the
# spreadsheet cell. Only run this against files this notebook produced.
df["Normalized_NER_Entities"] = df["Final_NER_Entities"].apply(
    lambda entities: [
        (normalize_entity(name), entity_type, confidence)
        for name, entity_type, confidence in eval(entities)
    ]
)

# ✅ 8. Save standardized entity results to a new file
df.to_excel("optimized_normalized_ner.xlsx", index=False)
print("✅ Auto-aliasing complete! Results saved to 'optimized_normalized_ner.xlsx'.")

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from collections import defaultdict
from fuzzywuzzy import process, fuzz
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import AgglomerativeClustering
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Load the dataset (path is relative to the notebook's working directory).
# The rest of this cell expects a "Text" column in the sheet.
file_path = "news_excerpts_parsed.xlsx"  # Update this with the actual file path
df = pd.read_excel(file_path)

# ########################################################
# ✅ 1. Named Entity Recognition (NER) Extraction
# ########################################################

# Load the pre-trained NER model (tokenizer and model share one checkpoint name)
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a Named Entity Recognition pipeline.
# With aggregation_strategy="simple" the extraction function in this cell reads
# the 'entity_group' key from each result.
nlp_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# ✅ Function to perform NER extraction & merge sub-word pieces ("##"-prefixed tokens)
def extract_ner_entities(text):
    """Run NER on ``text`` and glue ``##``-prefixed pieces onto the previous entity.

    A pipeline result whose word starts with ``"##"`` is treated as a
    continuation: the prefix is stripped and the remainder is appended to the
    entity currently being built (e.g. ``'U'`` + ``'##OB'`` -> ``'UOB'``). The
    merged entity keeps the type of its first piece and the highest
    confidence among its pieces. A continuation piece with no preceding
    entity is dropped.

    Args:
        text: The excerpt to analyse. Anything that is not a ``str`` is
            skipped.

    Returns:
        A list of ``(name, entity_type, confidence)`` tuples in pipeline
        order, where ``confidence`` is a built-in ``float`` rounded to 4
        decimals; an empty list for non-string input.
    """
    if not isinstance(text, str):
        return []

    merged_entities = []
    current_word = None      # entity text being assembled, None before the first one
    current_label = None
    max_confidence = None

    for entity in nlp_pipeline(text):
        word = entity['word']
        # Built-in float so the value prints as a plain number when the
        # column is written out.
        confidence = round(float(entity['score']), 4)

        if word.startswith("##"):
            # Continuation piece: extend the entity in progress, if any
            if current_word is not None:
                current_word += word[2:]
                # Track the best score across all merged pieces
                max_confidence = max(max_confidence, confidence)
            continue

        # A fresh word closes the entity in progress and starts a new one
        if current_word is not None:
            merged_entities.append((current_word, current_label, max_confidence))
        current_word = word
        current_label = entity['entity_group']
        max_confidence = confidence

    # Add the last entity if present
    if current_word is not None:
        merged_entities.append((current_word, current_label, max_confidence))

    return merged_entities

# Apply NER extraction to each text entry.
# The column holds real Python lists here (nothing is round-tripped through
# Excel in this cell), so the loops below iterate over them directly.
df["NER_Entities"] = df["Text"].apply(extract_ner_entities)

# ########################################################
# ✅ 2. Entity Standardization (embedding-based clustering)
# NOTE: fuzzywuzzy is imported by this cell but not used in this section.
# ########################################################

# ✅ Load the sentence transformer model for entity similarity
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# ✅ Extract all unique entity names dynamically from the dataset
all_entities = set()
for entity_list in df["NER_Entities"]:
    for name, entity_type, _ in entity_list:
        all_entities.add(name)

# ✅ Compute embeddings for all entity names (one batched encode() call).
# entity_list fixes an order so that row i of entity_embeddings belongs to
# entity_list[i]. Note that this rebinds entity_list, which was the loop
# variable above.
entity_list = list(all_entities)
entity_embeddings = embedding_model.encode(entity_list, convert_to_tensor=True)

# ✅ Function to cluster entities and standardize names
def cluster_entities(entities, embeddings, threshold=0.85):
    """
    Cluster entity names by embedding similarity and pick one canonical name
    per cluster.

    - Uses Agglomerative Clustering (average linkage, cosine metric, no fixed
      number of clusters) with a distance cut-off of ``1 - threshold``.
    - The longest name in a cluster becomes canonical; equal-length names are
      ordered alphabetically so the choice is independent of input order.

    Args:
        entities: Sequence of entity names, index-aligned with ``embeddings``.
        embeddings: Object exposing ``.cpu().numpy()``; not touched when
            fewer than two entities are given.
        threshold: Similarity level used to derive the clustering distance
            cut-off.

    Returns:
        dict mapping every input name to its canonical name.
    """
    # With zero or one name there is nothing to cluster (the estimator is
    # expected to reject such input), so each name maps to itself.
    if len(entities) < 2:
        return {name: name for name in entities}

    clustering = AgglomerativeClustering(
        n_clusters=None, distance_threshold=1 - threshold, linkage='average', metric='cosine'
    )
    labels = clustering.fit_predict(embeddings.cpu().numpy())

    # Bucket the names by the cluster label assigned to them
    buckets = {}
    for position, label in enumerate(labels):
        buckets.setdefault(label, []).append(entities[position])

    # Longest name wins; alphabetical order settles length ties
    alias_map = {}
    for bucket in buckets.values():
        canonical_name = sorted(bucket, key=lambda name: (-len(name), name))[0]
        alias_map.update((name, canonical_name) for name in bucket)

    return alias_map

# ✅ Generate the alias mapping dynamically.
# entity_aliases is the module-level lookup that normalize_entity reads.
entity_aliases = cluster_entities(entity_list, entity_embeddings)

# ✅ Function to normalize entity names using alias mapping
def normalize_entity(entity_name):
    """Map ``entity_name`` to its canonical form via the module-level
    ``entity_aliases`` dict; names without an entry are returned as-is."""
    if entity_name not in entity_aliases:
        return entity_name
    return entity_aliases[entity_name]

# ✅ Apply alias mapping to NER entities.
# Only the name of each (name, type, confidence) tuple is replaced; type and
# confidence are carried over unchanged.
df["Normalized_NER_Entities"] = df["NER_Entities"].apply(
    lambda entities: [
        (normalize_entity(name), entity_type, confidence)
        for name, entity_type, confidence in entities
    ]
)

# ✅ Save standardized entity results to a new file.
# NOTE: the clustering-only cell earlier in this notebook writes to the same
# file name, so whichever runs last wins.
df.to_excel("optimized_normalized_ner.xlsx", index=False)
print("✅ Auto-aliasing complete! Results saved to 'optimized_normalized_ner.xlsx'.")

# ########################################################
# ✅ 3. Relationship Extraction (After Entity Standardization)
# ########################################################

# ✅ Function to extract relationships while ensuring standardization
def extract_relationships(text, entities):
    """
    Build co-occurrence pairs for the distinct entities of one excerpt.

    - Uses the (standardized) entity names as given in ``entities``.
    - Repeated names are collapsed before pairing: standardization can give
      several mentions the same canonical name, and pairing those directly
      would yield self-pairs like ``(X, X)`` as well as duplicate pairs.

    Args:
        text: The excerpt itself. Currently unused; kept so existing callers
            keep working.
        entities: Iterable of ``(name, entity_type, confidence)`` tuples.

    Returns:
        A list of ``(entity1, entity2)`` tuples, one per unordered pair of
        distinct names, in first-seen order.
    """
    # Collapse repeats but keep the order in which names first appeared
    seen = set()
    distinct_names = []
    for name, _, _ in entities:
        if name not in seen:
            seen.add(name)
            distinct_names.append(name)

    # ✅ Generate simple entity-entity relationships (for each entity pair)
    relationships = []
    for i, left in enumerate(distinct_names):
        for right in distinct_names[i + 1:]:
            relationships.append((left, right))  # (Entity1, Entity2)

    return relationships

# ✅ Apply relationship extraction using normalized entity names
df["Relationships"] = df.apply(
    lambda row: extract_relationships(row["Text"], row["Normalized_NER_Entities"]),
    axis=1
)

# ✅ Aggregate relationships across all excerpts
relationship_dict = defaultdict(set)

for relationships in df["Relationships"]:
    for entity1, entity2 in relationships:
        # Two mentions can normalize to the same name; an entity is not
        # recorded as related to itself.
        if entity1 == entity2:
            continue
        relationship_dict[entity1].add(entity2)
        relationship_dict[entity2].add(entity1)  # Bidirectional relationship

# ✅ Convert relationships into a DataFrame for better visualization.
# sorted() rather than list(): iteration order of a set of strings can differ
# between kernel sessions, which would make the saved file change from run to run.
relationship_df = pd.DataFrame(
    [(k, sorted(v)) for k, v in relationship_dict.items()],
    columns=["Entity", "Related Entities"]
)

# ✅ Save relationships to a new Excel file
relationship_df.to_excel("aggregated_relationships.xlsx", index=False)
print("✅ Relationship extraction complete! Results saved to 'aggregated_relationships.xlsx'.")