# Part 1: POS Tagging

## Task 1

In [3]:
import nltk
from nltk import word_tokenize

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

text = """The Cambridge Analytica scandal is more than a “breach,” as Facebook executives have defined it.
It exemplifies the possibility of using online data to algorithmically predict and influence human behavior
in a manner that circumvents users’ awareness of such influence. Using an intermediary app, Cambridge Analytica
was able to harvest large data volumes—over 50 million raw profiles—and use big data analytics to create
psychographic profiles in order to subsequently target users with customized digital ads and other manipulative
information. According to some observers, this massive data analytics tactic might have been used to purposively
swing election campaigns around the world. The reports are still incomplete and more is likely to come to light
in the next days."""

# Tokenizing the text
tokens = word_tokenize(text)

# Tagging the tokens using the Brown tagset via the universal tagset mapping in NLTK
tagged = nltk.pos_tag(tokens, tagset='universal')

# Display the tagged tokens
tagged


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


[('The', 'DET'),
 ('Cambridge', 'NOUN'),
 ('Analytica', 'NOUN'),
 ('scandal', 'NOUN'),
 ('is', 'VERB'),
 ('more', 'ADJ'),
 ('than', 'ADP'),
 ('a', 'DET'),
 ('“', 'ADJ'),
 ('breach', 'NOUN'),
 (',', '.'),
 ('”', 'CONJ'),
 ('as', 'ADP'),
 ('Facebook', 'NOUN'),
 ('executives', 'NOUN'),
 ('have', 'VERB'),
 ('defined', 'VERB'),
 ('it', 'PRON'),
 ('.', '.'),
 ('It', 'PRON'),
 ('exemplifies', 'VERB'),
 ('the', 'DET'),
 ('possibility', 'NOUN'),
 ('of', 'ADP'),
 ('using', 'VERB'),
 ('online', 'ADJ'),
 ('data', 'NOUN'),
 ('to', 'PRT'),
 ('algorithmically', 'ADV'),
 ('predict', 'VERB'),
 ('and', 'CONJ'),
 ('influence', 'VERB'),
 ('human', 'ADJ'),
 ('behavior', 'NOUN'),
 ('in', 'ADP'),
 ('a', 'DET'),
 ('manner', 'NOUN'),
 ('that', 'DET'),
 ('circumvents', 'VERB'),
 ('users', 'NOUN'),
 ('’', 'NOUN'),
 ('awareness', 'NOUN'),
 ('of', 'ADP'),
 ('such', 'ADJ'),
 ('influence', 'NOUN'),
 ('.', '.'),
 ('Using', 'VERB'),
 ('an', 'DET'),
 ('intermediary', 'ADJ'),
 ('app', 'NOUN'),
 (',', '.'),
 ('Cambridge'

“Analytica” is repeatedly tagged as NOUN, which is correct given its usage, but it might also have been tagged as a proper noun (PROPN in some tag sets) because it's part of the name "Cambridge Analytica." Quotation marks and special characters like “ and ” have inconsistent tags such as ADJ and CONJ. These should ideally be tagged as punctuation (PUNCT) or not be assigned misleading categorical tags. "Facebook" is tagged as NOUN which is correct, but it could be more specifically tagged as a proper noun (PROPN) because it is a named entity. The dash in "volumes—over" causes a split issue, with "volumes—over" tagged as ADV, which is incorrect. This indicates a tokenization error where the tagger fails to properly handle punctuation.

## Task 2

In [49]:
import nltk
from nltk.tag import HiddenMarkovModelTagger
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize

nltk.download('treebank')
nltk.download('punkt')

# Load the Penn Treebank tagged sentences
sentences = treebank.tagged_sents()

# Train the HMM tagger
tagger = HiddenMarkovModelTagger.train(sentences)

# Text to be tagged
text = """The Cambridge Analytica scandal is more than a “breach,” as Facebook executives
have defined it. It exemplifies the possibility of using online data to algorithmically predict and
influence human behavior in a manner that circumvents users’ awareness of such influence.
Using an intermediary app, Cambridge Analytica was able to harvest large data volumes—over
50 million raw profiles—and use big data analytics to create psychographic profiles in order to
subsequently target users with customized digital ads and other manipulative information.
According to some observers, this massive data analytics tactic might have been used to
purposively swing election campaigns around the world. The reports are still incomplete and
more is likely to come to light in the next days."""

# Tokenize the text
tokens = word_tokenize(text)

# Tag the tokenized text using the trained HMM tagger
tagged_text = tagger.tag(tokens)

# Print the tagged text
print(tagged_text)

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('The', 'DT'), ('Cambridge', 'NNP'), ('Analytica', 'POS'), ('scandal', 'NN'), ('is', 'VBZ'), ('more', 'JJR'), ('than', 'IN'), ('a', 'DT'), ('“', 'JJ'), ('breach', 'NNS'), (',', ','), ('”', "''"), ('as', 'IN'), ('Facebook', 'PRP$'), ('executives', 'NNS'), ('have', 'VBP'), ('defined', 'VBN'), ('it', 'PRP'), ('.', '.'), ('It', "''"), ('exemplifies', 'VBZ'), ('the', 'DT'), ('possibility', 'NN'), ('of', 'IN'), ('using', 'VBG'), ('online', 'PRP$'), ('data', 'NNS'), ('to', 'TO'), ('algorithmically', 'VB'), ('predict', 'NNS'), ('and', 'CC'), ('influence', 'NN'), ('human', 'JJ'), ('behavior', 'NN'), ('in', 'IN'), ('a', 'DT'), ('manner', 'NN'), ('that', 'IN'), ('circumvents', 'PRP$'), ('users', 'NNS'), ('’', '.'), ('awareness', "''"), ('of', 'IN'), ('such', 'JJ'), ('influence', 'NN'), ('.', '.'), ('Using', "''"), ('an', 'DT'), ('intermediary', 'JJ'), ('app', 'NNS'), (',', ','), ('Cambridge', 'NNP'), ('Analytica', 'NNP'), ('was', 'VBD'), ('able', 'JJ'), ('to', 'TO'), ('harvest', 'VB'), ('large',

Common nouns and proper nouns are mostly correctly identified, e.g., "Cambridge" as NNP and "scandal" as NN. 
Basic parts of speech such as determiners, prepositions, and conjunctions are mostly accurate, e.g., "The" as DT, "in" as IN, and "and" as CC.
Verbs in various forms are generally well tagged, e.g., "is" as VBZ, "defined" as VBN.
"Analytica" is tagged as POS (possessive ending) which is incorrect; it should be tagged as NNP (proper noun) consistently.
"Facebook" is incorrectly tagged as PRP (possessive pronoun), it should be NNP.
"online" is tagged as PRP$, which is incorrect; it should be JJ (adjective) or NN (noun) depending on context.
Punctuation and special characters like quotes and dashes (“, —) are incorrectly tagged.
"analytics" is sometimes tagged as . (period), indicating a failure to recognize it as a noun.

# Part 2: Named Entity Recognition

## Task 1

In [7]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import ne_chunk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [10]:
# Text to be analyzed
text = "Atlas Honda is expected to achieve sales of 1.1 million units by end of its financial year ending March 31, while it aims to hit sales of 1.3m bikes in its next financial year, a Honda dealer said."

# Tokenize and POS tag the text
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

In [11]:
# Chunk and recognize named entities
named_entities = ne_chunk(pos_tags)

In [12]:
# Function to extract and display named entities
def extract_named_entities(ne_tree):
    ne_in_sent = []
    for subtree in ne_tree:
        if hasattr(subtree, 'label'):
            entity = " ".join(c[0] for c in subtree.leaves())  # Joining word tokens back together
            ne_in_sent.append((entity, subtree.label()))
    return ne_in_sent

# Extract named entities
named_entity_list = extract_named_entities(named_entities)
print(named_entity_list)

[('Atlas', 'PERSON'), ('Honda', 'ORGANIZATION'), ('Honda', 'GPE')]


## Task 2

In [13]:
!pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Obtaining dependency information for spacy from https://files.pythonhosted.org/packages/92/fb/d1f0605e1e8627226c6c96053fe1632e9a04a3fbcd8b5d715528cb95eb97/spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata
  Downloading spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Obtaining dependency information for spacy-legacy<3.1.0,>=3.0.11 from https://files.pythonhosted.org/packages/c3/55/12e842c70ff8828e34e543a2c7176dac4da006ca6901c9e8b43efab8bc6b/spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Obtaining dependency information for spacy-loggers<2.0.0,>=1.0.0 from https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl.metadata
 

In [14]:
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 217.9 kB/s eta 0:00:59
     --------------------------------------- 0.0/12.8 MB 186.2 kB/s eta 0:01:09
     --------------------------------------- 0.0/12.8 MB 196.9 kB/s eta 0:01:05
     --------------------------------------- 0.1/12.8 MB 233.8 kB/s eta 0:00:55
     --------------------------------------- 0.1/12.8 MB 286.7 kB/s eta 0:00:45
     --------------------------------------- 0.1/12.8 MB 344.8 kB/s eta 0:00:37
     --------------------------------------- 0.2/12.8 MB 426.7 kB/s eta 0:00:30
      ------------------------------------

In [17]:
import spacy

# Loading the pre-trained spaCy model
nlp = spacy.load('en_core_web_sm')

text = "Atlas Honda is expected to achieve sales of 1.1 million units by end of its financial year ending March 31, while it aims to hit sales of 1.3m bikes in its next financial year, a Honda dealer said."

# Processing the text
doc = nlp(text)

# Extract and display named entities
print("Named Entities, Phrases, Labels:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")


Named Entities, Phrases, Labels:
Atlas Honda (PERSON)
1.1 million (CARDINAL)
year ending March 31 (DATE)
1.3 (CARDINAL)
next financial year (DATE)
Honda (ORG)


## Task 3

In [24]:
import spacy
from spacy.tokens import DocBin, Span

# Initializing a blank English spaCy model
nlp = spacy.blank('en')

def load_data(file_path):
    with open(file_path, encoding='utf-8') as file:
        lines = file.readlines()
    
    data = []
    sentence = []
    for line in lines:
        line = line.strip()
        if line == "":
            if sentence:
                data.append(sentence)
                sentence = []
        elif line.startswith("-DOCSTART-"):
            continue
        else:
            parts = line.split()
            word, pos, chunk, ner = parts
            sentence.append((word, ner))
    if sentence:
        data.append(sentence)
    
    return data

def create_spacy_data(data, output_file):
    docbin = DocBin()
    
    for sent in data:
        words = [token[0] for token in sent]
        doc = Doc(nlp.vocab, words=words)
        entities = []
        current_pos = 0
        
        for token, label in sent:
            start = current_pos
            end = start + len(token)
            if label != 'O':
                entity_label = label.split('-')[-1]  # Get only the entity type (e.g., PER, ORG)
                entities.append((start, end, entity_label))
            current_pos = end + 1  # +1 for the space character after each token

    
        spans = []
        for start, end, label in entities:
            if end <= len(doc.text):  
                span = doc.char_span(start, end, label=label, alignment_mode="strict")
                if span:
                    spans.append(span)
        
        doc.ents = spans
        docbin.add(doc)
    
    docbin.to_disk(output_file)

train_data = load_data(r"C:\Users\Hamza\Desktop\NLP\CONLL-2003 NER shared task dataset\train.txt")
create_spacy_data(train_data, 'train.spacy')

valid_data = load_data(r"C:\Users\Hamza\Desktop\NLP\CONLL-2003 NER shared task dataset\valid.txt")
create_spacy_data(valid_data, 'valid.spacy')


In [51]:
import spacy
from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.tokens import DocBin
import random

# Load a blank English model
nlp = spacy.blank("en")

# Adding a new entity recognizer to the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner', last=True)

# Loading training and validation data
train_data = DocBin().from_disk("./train.spacy")
dev_data = DocBin().from_disk("./valid.spacy")

train_docs = list(train_data.get_docs(nlp.vocab))
dev_docs = list(dev_data.get_docs(nlp.vocab))

# Preparing training examples
train_examples = [Example.from_dict(doc, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}) for doc in train_docs]

# Initializing the model and optimizer
optimizer = nlp.initialize()

# Training loop
for i in range(5):
    losses = {}
    # Batch up the examples using spaCy's minibatch
    random.shuffle(train_examples)  
    batches = minibatch(train_examples, size=compounding(4., 32., 1.001))
    for batch in batches:
        nlp.update(batch, drop=0.5, losses=losses, sgd=optimizer)
    print("Losses", losses)


Losses {'ner': 23415.508204853973}
Losses {'ner': 13512.84387119233}
Losses {'ner': 10606.527916079274}
Losses {'ner': 8994.433092137588}
Losses {'ner': 8036.647725338172}


In [52]:
# Evaluate the model on the validation data
eval_examples = [Example.from_dict(doc, {"entities": [(e.start_char, e.end_char, e.label_) for e in doc.ents]}) for doc in dev_docs]
scores = nlp.evaluate(eval_examples)

precision = scores["ents_p"]
recall = scores["ents_r"]
f1_score = scores["ents_f"]

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")

nlp.to_disk("./trained_model")

Precision: 0.981
Recall: 1.000
F1 Score: 0.991


In [53]:
# Load the trained model
trained_nlp = spacy.load("./trained_model")

# Inference
test_text = "Atlas Honda is expected to achieve sales of 1.1 million units by end of its financial year ending March 31, while it aims to hit sales of 1.3m bikes in its next financial year, a Honda dealer said."
doc = trained_nlp(test_text)
print("Named Entities in the Sentence:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")


Named Entities in the Sentence:
Atlas (PER)
Honda (PER)
Honda (ORG)
