# Chapter-5 Assignments

**Installed the required Python prerequisite packages and libraries.**

Exercise 1: Part-of-Speech (POS) Tagging

In [None]:
import nltk
from nltk import word_tokenize, pos_tag

# Download the NLTK resources this cell relies on (same three, same order)
for resource in ("punkt", "punkt_tab", "averaged_perceptron_tagger_eng"):
    nltk.download(resource)

# Sentence to analyse
text = "The quick brown fox jumps over the lazy dog."

# Split the sentence into tokens, then attach a POS tag to every token
tokens = word_tokenize(text)
pos_tags = pos_tag(tokens)

# Heading on one line, the list of (token, tag) pairs on the next
print("POS Tags:", pos_tags, sep="\n")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 2: Named Entity Recognition (NER)

In [None]:
import spacy

# Pre-trained English pipeline used for entity recognition
nlp = spacy.load('en_core_web_sm')

# Sentence to analyse, run through the full pipeline
text = "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii."
doc = nlp(text)

# One line per detected entity: its surface text followed by its label
print("Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 3: Training a Custom NER Model

In [None]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding

# Create a blank English model
nlp = spacy.blank("en")

# Create a new NER component and add it to the pipeline
ner = nlp.add_pipe("ner")

# Add labels to the NER component
ner.add_label("GADGET")

# Sample training data: (text, {"entities": [(start_char, end_char, label)]}).
# Offsets are character positions with an exclusive end, and they must fall
# exactly on token boundaries or the annotation cannot be used for training.
TRAIN_DATA = [
    # text[25:31] == "iPhone"
    ("Apple is releasing a new iPhone.", {"entities": [(25, 31, "GADGET")]}),
    # text[8:16] == "iPad Pro"
    ("The new iPad Pro is amazing.", {"entities": [(8, 16, "GADGET")]}),
]

# Convert the training data to spaCy's format:
# build one Example object per (text, annotations) pair
examples = []
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    # Fail loudly on misaligned offsets instead of silently training on
    # examples whose entity annotation was dropped.
    for start, end, label in annotations["entities"]:
        if doc.char_span(start, end, label=label) is None:
            raise ValueError(
                f"Entity offsets ({start}, {end}) = {text[start:end]!r} "
                f"do not align with token boundaries in: {text!r}"
            )
    example = Example.from_dict(doc, annotations)
    examples.append(example)


# Train the NER model.
# nlp.initialize() replaces the deprecated nlp.begin_training(); handing it the
# examples lets the pipeline set itself up from the actual training data.
optimizer = nlp.initialize(lambda: examples)
for epoch in range(10):
    losses = {}
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        # Each batch is a list of Example objects; use the optimizer created above
        nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
    print("Losses", losses)

# Test the trained model
doc = nlp("I just bought a new iPhone.")
print("Named Entities:", [(ent.text, ent.label_) for ent in doc.ents])

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 4: Dependency Parsing

In [None]:
import spacy
from spacy import displacy

# Pre-trained English pipeline (includes a dependency parser)
nlp = spacy.load('en_core_web_sm')

# Sentence to analyse, run through the full pipeline
text = "She enjoys reading books."
doc = nlp(text)

# One line per token: the token, its dependency label, and its syntactic head
print("Dependency Parsing:")
for token in doc:
    print(f"{token.text} ({token.dep_}): {token.head.text}")

# Draw the dependency tree inline (needs a Jupyter-style front end)
displacy.render(doc, style="dep", jupyter=True)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 5: Training a Custom Dependency Parser

In [None]:
import spacy  # imported here so the cell does not depend on earlier cells having run
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding

# Create a blank English model
nlp = spacy.blank("en")

# Create a new parser component and add it to the pipeline
parser = nlp.add_pipe("parser")

# Define labels for the parser
parser.add_label("nsubj")
parser.add_label("dobj")
parser.add_label("prep")
parser.add_label("xcomp")  # clausal complement: 'playing' / 'reading' depend on the main verb
parser.add_label("punct")  # sentence-final '.'

# Sample training data.
# Every token needs one head index and one label, and the final '.' is its own
# token, so each of these sentences has 5 entries:
#   subject -> verb, verb = ROOT (its own head), gerund -> verb (xcomp),
#   object -> gerund (dobj), '.' -> verb (punct)
TRAIN_DATA = [
    ("She enjoys playing tennis.", {"heads": [1, 1, 1, 2, 1], "deps": ["nsubj", "ROOT", "xcomp", "dobj", "punct"]}),
    ("I like reading books.", {"heads": [1, 1, 1, 2, 1], "deps": ["nsubj", "ROOT", "xcomp", "dobj", "punct"]}),
]

# Convert the training data to spaCy's format
examples = []
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    # Ensure annotations match the tokenization
    if len(doc) == len(annotations["heads"]) and len(doc) == len(annotations["deps"]):
        example = Example.from_dict(doc, annotations)
        examples.append(example)
    else:
        print(f"Skipping example due to length mismatch: {text}")
        print(f"Doc length: {len(doc)}, Heads length: {len(annotations['heads'])}, Deps length: {len(annotations['deps'])}")

# Stop with a clear message rather than failing obscurely inside spaCy
if not examples:
    raise ValueError("No usable training examples: every annotation mismatched its tokenization.")

# Train the parser.
# nlp.initialize() replaces the deprecated nlp.begin_training(); handing it the
# examples lets the pipeline set itself up from the actual training data.
optimizer = nlp.initialize(lambda: examples)
for epoch in range(15): # Increased epochs for potentially better training
    losses = {}
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        # Use the optimizer created above
        nlp.update(batch, drop=0.5, sgd=optimizer, losses=losses)
    print("Losses", losses)

# Test the trained model
doc = nlp("She enjoys reading books.")
for token in doc:
    print(f"{token.text} ({token.dep_}): {token.head.text}")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  