<a href="https://colab.research.google.com/github/Shriyatha/Named_Entity_Recognition/blob/main/DATASET_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP PROJECT: NAMED ENTITY RECOGNITION ON THE CONLL-2003 DATASET

In [None]:
!pip install datasets transformers seqeval evaluate

# NOW LOADING THE DATASET

In [None]:
from datasets import load_dataset

# Fetch all splits of the CoNLL-2003 dataset via the `datasets` library
dataset = load_dataset("conll2003")

# String names of the NER classes; a tag's integer id is its index in this list
ner_feature = dataset["train"].features["ner_tags"].feature
label_names = ner_feature.names

# Translate integer NER tag ids into their string label names
def map_tags_to_names(tags, label_names):
    """Return the label name for every tag id in *tags*.

    Args:
        tags: Iterable of integer tag ids.
        label_names: Sequence in which position ``i`` holds the name of tag ``i``.

    Returns:
        A new list of label names, in the same order as *tags*.

    Raises:
        IndexError: If a tag id falls outside ``label_names``.
    """
    lookup = label_names.__getitem__
    return list(map(lookup, tags))

# Example: show the first training sentence with human-readable NER labels.
# Index the row first: dataset["train"][0] reads a single example, whereas
# dataset["train"]["ner_tags"][0] has to go through the whole column just to
# pick out one element.
first_example = dataset["train"][0]
tokens = first_example["tokens"]
ner_tags_numerical = first_example["ner_tags"]
ner_tags_features = map_tags_to_names(ner_tags_numerical, label_names)

# Print each token next to its NER label
for token, tag in zip(tokens, ner_tags_features):
    print(f"{token}: {tag}")

# ANALYSING THE DATASET

### Calculate and plot the length distribution (in tokens) of entities (e.g., "PER", "ORG") in the NER dataset.

In [None]:
from collections import defaultdict
import matplotlib.pyplot as plt

# Map each entity type (e.g. "PER") to the list of its entity lengths in tokens
entity_lengths = defaultdict(list)

# Walk every training sentence and measure each contiguous entity span
for tags in dataset["train"]["ner_tags"]:
    current_entity = None  # type of the entity currently being measured
    current_length = 0
    for tag in tags:
        # "B-PER" -> ("B", "PER"); "O" -> ("O", "")
        prefix, _, entity_type = label_names[tag].partition("-")

        # An I- tag of the same type extends the open entity
        if prefix == "I" and entity_type == current_entity:
            current_length += 1
            continue

        # Anything else ends the open entity (if there is one)
        if current_entity is not None:
            entity_lengths[current_entity].append(current_length)

        if prefix in ("B", "I"):
            # B- starts a new entity. An I- tag that does not continue the open
            # entity (no open entity, or a different type) is treated as a
            # start as well, so such tokens are not silently dropped.
            current_entity = entity_type
            current_length = 1
        else:
            # "O" tag: no entity is open any more
            current_entity = None
            current_length = 0

    # Flush an entity that runs to the end of the sentence
    if current_entity is not None:
        entity_lengths[current_entity].append(current_length)

# Plot the distribution of entity lengths, one translucent histogram per type
for entity, lengths in entity_lengths.items():
    # Bin edges 1, 2, ..., max + 1 give one bin per integer length. Stopping
    # the edges at max would put the two longest lengths in the same bin,
    # because the final histogram bin includes its right edge.
    length_bins = range(1, max(lengths) + 2)
    plt.hist(lengths, bins=length_bins, alpha=0.5, label=entity)
plt.xlabel("Entity Length")
plt.ylabel("Frequency")
plt.title("Length Distribution of Entities")
plt.legend()
plt.show()

# Most Frequent Entities

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Gather the tag ids of all training sentences into one flat list, then
# convert them to label names
all_tags = []
for sentence_tags in dataset["train"]["ner_tags"]:
    all_tags.extend(sentence_tags)
all_tags_features = map_tags_to_names(all_tags, label_names)

# Tally tagged tokens per entity type. The B-/I- prefix is stripped, so this
# counts tokens carrying each type, and "O" tokens are skipped.
kept_types = {"PER", "LOC", "ORG", "MISC"}
entity_frequencies = Counter()
for label in all_tags_features:
    if label == "O":
        continue
    entity_type = label[2:]
    if entity_type in kept_types:
        entity_frequencies[entity_type] += 1

# Report the tallies, largest first
print("Most frequent entities:")
print(entity_frequencies.most_common())

# Split the tallies into parallel lists for plotting
entities = []
counts = []
for entity_type, total in entity_frequencies.items():
    entities.append(entity_type)
    counts.append(total)

# Bar chart with one bar per entity type
bar_colors = ['blue', 'green', 'red', 'purple']
plt.figure(figsize=(10, 6))
plt.bar(entities, counts, color=bar_colors)

# Title and axis labels
plt.title('Frequency Distribution of Entity Types in CoNLL Dataset')
plt.xlabel('Entity Types')
plt.ylabel('Frequency')

# Render the figure
plt.show()

# Co-occurrence of entity pairs in sentences from a CoNLL-formatted dataset.


In [None]:
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from itertools import combinations

# Entity types included in the co-occurrence analysis
tracked_types = {"PER", "LOC", "ORG", "MISC"}

# Maps a sorted (type_a, type_b) pair to the number of sentences containing both
co_occurrence = defaultdict(int)

# Iterate through the training sentences
for tags in dataset["train"]["ner_tags"]:
    # Distinct entity types present in this sentence ("O" tags ignored,
    # B-/I- prefix stripped)
    entities_in_sentence = {label_names[tag][2:] for tag in tags if tag != 0}
    entities_in_sentence &= tracked_types
    # Sorting first makes combinations() yield every unordered pair exactly
    # once, already in canonical (alphabetical) order
    for pair in combinations(sorted(entities_in_sentence), 2):
        co_occurrence[pair] += 1

# Print the most frequent co-occurring entity pairs
print("Most frequent co-occurring entity pairs:")
for pair, count in sorted(co_occurrence.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{pair}: {count}")

# Visualization: co-occurrence heatmap.
# Pairs are stored sorted, so only one triangle of the matrix is populated;
# combinations that never occur are shown as 0.
co_occurrence_df = pd.DataFrame(list(co_occurrence.items()), columns=["Pair", "Count"])
co_occurrence_df[["Entity1", "Entity2"]] = pd.DataFrame(co_occurrence_df["Pair"].tolist(), index=co_occurrence_df.index)
co_occurrence_matrix = (
    co_occurrence_df.pivot(index="Entity1", columns="Entity2", values="Count")
    .fillna(0)
    .astype(int)  # pivot introduces NaN -> float; counts are whole numbers
)

plt.figure(figsize=(8, 6))
# fmt="d" annotates cells as plain integers; fmt="f" rendered e.g. "123.000000"
sns.heatmap(co_occurrence_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Co-occurrence of Entity Pairs")
plt.show()

# Context Analysis

In [None]:
import nltk
from nltk.corpus import stopwords

# Download NLTK stopwords (if not already downloaded)
nltk.download("stopwords")

# English stopwords as a set for fast membership tests
stopwords_set = set(stopwords.words("english"))

# Collect a token window around every ORG entity in the training split
context_window = 2  # Number of words to consider on each side
org_contexts = []

for tokens, tags in zip(dataset["train"]["tokens"], dataset["train"]["ner_tags"]):
    tags_features = map_tags_to_names(tags, label_names)
    i = 0
    while i < len(tags_features):
        if tags_features[i] != "B-ORG":
            i += 1
            continue

        # Advance j to the last token of this ORG entity
        j = i
        while j + 1 < len(tags_features) and tags_features[j + 1] == "I-ORG":
            j += 1

        # Window of context_window tokens on each side, clamped to the sentence.
        # NOTE: the slice also contains the entity's own tokens, not only its
        # neighbours.
        start = max(0, i - context_window)
        end = min(len(tokens), j + context_window + 1)

        # Keep alphanumeric, non-stopword tokens only
        context = [
            word
            for word in tokens[start:end]
            if word.lower() not in stopwords_set and word.isalnum()
        ]
        org_contexts.append(context)
        i = j + 1  # Resume scanning after the ORG entity

# Print the first 10 contexts
for context in org_contexts[:10]:
    print(" ".join(context))

# Save contexts to a file, one context per line. The encoding is set explicitly
# so the output does not depend on the platform's default.
with open("org_contexts.txt", "w", encoding="utf-8") as f:
    f.writelines(" ".join(context) + "\n" for context in org_contexts)

# Find the most common words surrounding a specific entity

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# Extra words to drop on top of the NLTK stopwords
custom_stopwords = {"said", "told", "reuters", "newsroom", "st", "inc"}

# Lowercase every context word; keep the purely alphabetic ones that are not
# custom stopwords
all_context_words = []
filtered_words = []
for context in org_contexts:
    for word in context:
        lowered = word.lower()
        all_context_words.append(lowered)
        if lowered.isalpha() and lowered not in custom_stopwords:
            filtered_words.append(lowered)

# Frequency table of the retained words
word_frequencies = Counter(filtered_words)

# The 20 most frequent words, reused for both the printout and the plot
top_20 = word_frequencies.most_common(20)

print("Most common words in the context of ORG entities:")
for word, count in top_20:
    print(f"{word}: {count}")

# Separate the (word, count) pairs into two parallel tuples for plotting
top_words, top_counts = zip(*top_20)

# Bar chart of the most common words
plt.figure(figsize=(10, 6))
plt.bar(top_words, top_counts, color='skyblue')
plt.title('Top 20 Most Common Words in the Context of ORG Entities')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


# Entity boundary analysis

In [None]:
from collections import defaultdict
import matplotlib.pyplot as plt

# Per entity type, count how many tokens carry a "B-" tag and how many an "I-" tag
boundary_counts = defaultdict(lambda: {"B": 0, "I": 0})

# Iterate through the training sentences
for tags in dataset["train"]["ner_tags"]:
    for tag in tags:
        if tag == 0:  # Ignore "O" tags
            continue
        tag_name = label_names[tag]
        boundary = tag_name[0]  # "B" or "I"
        entity_type = tag_name[2:]  # Remove "B-" or "I-"
        if entity_type in {"PER", "ORG", "LOC", "MISC"}:  # Filter specific entity types
            boundary_counts[entity_type][boundary] += 1

# Print boundary counts
for entity, counts in boundary_counts.items():
    print(f"{entity}: B={counts['B']}, I={counts['I']}")

# Visualization: grouped bar plot of boundary counts
entities = list(boundary_counts.keys())
b_counts = [boundary_counts[entity]["B"] for entity in entities]
i_counts = [boundary_counts[entity]["I"] for entity in entities]

x = range(len(entities))
bar_width = 0.4
# Shift the two series half a bar width either side of the tick so they sit
# side by side. Drawing both at x (one centred, one edge-aligned) made the
# "I" bar cover half of the "B" bar.
b_positions = [pos - bar_width / 2 for pos in x]
i_positions = [pos + bar_width / 2 for pos in x]

plt.figure(figsize=(10, 6))
plt.bar(b_positions, b_counts, width=bar_width, label="B", color="blue")
plt.bar(i_positions, i_counts, width=bar_width, label="I", color="orange")
plt.xlabel('Entity Types')
plt.ylabel('Counts')
plt.title('Counts of "B" and "I" Tags for Each Entity Type')
plt.xticks(list(x), entities)
plt.legend()
plt.show()

### TOKEN LEVEL ANALYSIS: most common tokens for each entity type

In [None]:
from collections import defaultdict, Counter
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))  # Load common stopwords
ignore_list = {"new", "open", "league", "cup", "party", "st", "united"}  # Manually add entity-like common words

# Maps each entity type to the (lowercased) tokens tagged with that type
entity_tokens = defaultdict(list)

# Extract tokens per entity type
for tokens, tags in zip(dataset["train"]["tokens"], dataset["train"]["ner_tags"]):
    for token, tag in zip(tokens, tags):
        # Skip ids outside the label table and the "O" (non-entity) label
        if not 0 <= tag < len(label_names) or label_names[tag] == "O":
            continue
        entity = label_names[tag][2:]  # Remove "B-" or "I-"
        token = token.lower().strip()  # Normalize token

        if token not in stop_words and token not in ignore_list:  # Filter out common words
            entity_tokens[entity].append(token)

# Top 10 tokens per entity type, computed once and reused below
top_tokens_by_entity = {
    entity: Counter(type_tokens).most_common(10)
    for entity, type_tokens in entity_tokens.items()
}

# Print the most common tokens for each entity type
for entity, top_tokens in top_tokens_by_entity.items():
    print(f"Most common tokens for {entity}:")
    print(top_tokens)

# Invert the mapping: token -> set of entity types for which it is a top token
custom_entities = defaultdict(set)

for entity, top_tokens in top_tokens_by_entity.items():
    for token, _ in top_tokens:
        custom_entities[token].add(entity)  # Store as a set to handle multi-label cases

# Print updated custom entities
print("\nUpdated Custom Entities:")
for token, entity_types in custom_entities.items():
    # Sets have no stable iteration order; sort so the output is reproducible
    print(f"{token}: {', '.join(sorted(entity_types))}")


### Sentence Length Distribution

In [None]:
import matplotlib.pyplot as plt

# Number of tokens in each training sentence
sentence_lengths = list(map(len, dataset["train"]["tokens"]))

# Histogram of the sentence lengths using 50 bins
plt.hist(sentence_lengths, bins=50)
plt.title("Distribution of Sentence Lengths")
plt.xlabel("Sentence Length")
plt.ylabel("Frequency")
plt.show()

### Out-of-Vocabulary (OOV) Analysis

###### Here, tokens that appear only once in the training split are treated as potential out-of-vocabulary (OOV) tokens: rare words that the model might struggle to learn.

In [None]:
from collections import Counter

# Collect the tokens of every training sentence into one flat list
all_tokens = []
for sentence in dataset["train"]["tokens"]:
    all_tokens.extend(sentence)

# How often each distinct token occurs
token_frequencies = Counter(all_tokens)

# Tokens seen at most once are treated as potential OOV tokens
oov_tokens = [
    token for token in token_frequencies if token_frequencies[token] <= 1
]
print(f"Number of OOV tokens: {len(oov_tokens)}")
print(f"Example OOV tokens: {oov_tokens[:30]}")

In [None]:
# Minimum-frequency cut-off: a word must occur more than this many times to
# enter the vocabulary
vocab_threshold = 1

# Build the vocabulary from the words whose count exceeds the threshold
vocab = set()
for word, count in token_frequencies.items():
    if count > vocab_threshold:
        vocab.add(word)

# Copy the training sentences, substituting "[UNK]" for every token that is
# not in the vocabulary
updated_tokens = []
for sublist in dataset["train"]["tokens"]:
    replaced = [token if token in vocab else "[UNK]" for token in sublist]
    updated_tokens.append(replaced)

# Print sample
print(updated_tokens[:4])  # Show the first 4 sentences after replacement


# Dataset split analysis

In [None]:
# For each split, tally how many tokens carry each entity type (B-/I- prefix
# stripped, "O" tokens skipped) and print the tallies
for split in ("train", "validation", "test"):
    all_tags = []
    for sentence_tags in dataset[split]["ner_tags"]:
        all_tags.extend(sentence_tags)

    all_entities = []
    for tag in all_tags:
        if tag != 0:
            all_entities.append(label_names[tag][2:])

    entity_frequencies = Counter(all_entities)
    print(f"Entity frequencies in {split} split:")
    print(entity_frequencies.most_common())