# 5.1 Parts of Speech (POS) Tagging

5.1.2 Implementing POS Tagging in Python

In [None]:
!pip install nltk

In [None]:
import nltk
from nltk import word_tokenize, pos_tag

# Tokenizer and tagger resources. Recent NLTK releases (3.9+) look these
# resources up under the names 'punkt_tab' and 'averaged_perceptron_tagger_eng',
# while older releases use 'punkt' and 'averaged_perceptron_tagger'.
# Requesting both keeps the cell runnable regardless of the installed version.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Sample text
text = "Natural Language Processing with Python is fascinating."

# Tokenize the text into words
tokens = word_tokenize(text)

# Perform POS tagging: each token is paired with its predicted tag
pos_tags = pos_tag(tokens)

print("POS Tags:")
print(pos_tags)

5.1.3 Evaluating POS Taggers

In [None]:
import nltk
from nltk import pos_tag
from nltk.corpus import treebank
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Download everything this cell needs so that it does not depend on an earlier
# cell having fetched the tagger model. Recent NLTK releases (3.9+) load the
# pre-trained tagger from 'averaged_perceptron_tagger_eng'; older releases use
# 'averaged_perceptron_tagger'.
for resource in ('treebank', 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

# Load the held-out portion of the treebank sample and split every sentence
# into its words (tagger input) and its gold tags (reference output).
test_data = treebank.tagged_sents()[3000:]
test_sentences = [[word for word, tag in sent] for sent in test_data]
gold_standard = [[tag for word, tag in sent] for sent in test_data]

# Tag the test sentences using a pre-trained tagger and keep only the tags
tagger = nltk.PerceptronTagger()
predicted_tags = [[tag for word, tag in tagger.tag(sent)] for sent in test_sentences]

# Flatten the per-sentence lists so the metrics are computed per token
gold_standard_flat = [tag for sent in gold_standard for tag in sent]
predicted_tags_flat = [tag for sent in predicted_tags for tag in sent]

# Compute evaluation metrics. 'weighted' averages the per-tag scores by tag
# frequency; zero_division=0 makes the score of an undefined per-tag
# precision/recall an explicit 0 instead of relying on a warning-and-default.
accuracy = accuracy_score(gold_standard_flat, predicted_tags_flat)
precision = precision_score(gold_standard_flat, predicted_tags_flat, average='weighted', zero_division=0)
recall = recall_score(gold_standard_flat, predicted_tags_flat, average='weighted', zero_division=0)
f1 = f1_score(gold_standard_flat, predicted_tags_flat, average='weighted', zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

5.1.4 Training Custom POS Taggers

In [None]:
import nltk  # required by nltk.download below; do not rely on an earlier cell
from nltk.tag import UnigramTagger, BigramTagger
from nltk.corpus import treebank

nltk.download('treebank')


def tagging_accuracy(tagger, gold_sents):
    """Return the token-level accuracy of `tagger` on `gold_sents`.

    Newer NLTK releases expose this as `accuracy()` and deprecate
    `evaluate()`; older releases only provide `evaluate()`. The newer
    method is used when it exists, otherwise the legacy one is called.
    """
    score = getattr(tagger, "accuracy", None) or tagger.evaluate
    return score(gold_sents)


# Load the treebank corpus and split it into training and test sentences
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# Train a UnigramTagger
unigram_tagger = UnigramTagger(train_data)

# Evaluate the tagger
unigram_accuracy = tagging_accuracy(unigram_tagger, test_data)
print("Unigram Tagger Accuracy:", unigram_accuracy)

# Train a BigramTagger backed by the UnigramTagger
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)

# Evaluate the tagger
bigram_accuracy = tagging_accuracy(bigram_tagger, test_data)
print("Bigram Tagger Accuracy:", bigram_accuracy)

# 5.2 Named Entity Recognition (NER)

5.2.2 Implementing NER in Python

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import spacy

# Pre-trained English pipeline used to annotate the sample sentence
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse
text = "Apple is looking at buying U.K. startup for $1 billion."

# Running the pipeline yields a Doc whose `ents` are the detected entity spans
doc = nlp(text)

# Collect (surface text, label) pairs first, then report one pair per line
entity_rows = [(ent.text, ent.label_) for ent in doc.ents]

print("Named Entities:")
for surface, label in entity_rows:
    print(surface, label)

5.2.3 Evaluating NER Systems

In [None]:
from collections import Counter


def entity_level_scores(gold_entities, system_entities):
    """Compute entity-level precision, recall and F1.

    An entity counts as correct only when the system produced exactly the
    same string as the annotation. Both arguments are treated as multisets,
    so the two lists may have different lengths (missed or spurious
    entities) and may contain repeated entities.

    Returns a `(precision, recall, f1)` tuple of floats; a score whose
    denominator would be zero is reported as 0.0.
    """
    gold_counts = Counter(gold_entities)
    system_counts = Counter(system_entities)
    # Multiset intersection = number of predicted entities that match the gold ones
    true_positives = sum((gold_counts & system_counts).values())
    n_gold = sum(gold_counts.values())
    n_system = sum(system_counts.values())

    precision = true_positives / n_system if n_system else 0.0
    recall = true_positives / n_gold if n_gold else 0.0
    f1 = 2 * true_positives / (n_gold + n_system) if (n_gold + n_system) else 0.0
    return precision, recall, f1


# True entities in the text (manually annotated)
true_entities = ["Apple", "U.K.", "startup", "$1 billion"]

# Entities identified by the NER system
predicted_entities = ["Apple", "UK", "startup", "$1B"]

# Calculate precision, recall, and F1 score at the entity level
precision, recall, f1 = entity_level_scores(true_entities, predicted_entities)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

5.2.4 Training Custom NER Models

In [None]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding

# Create a blank English model
nlp = spacy.blank("en")

# Create a new NER component and add it to the pipeline
ner = nlp.add_pipe("ner")

# Add labels to the NER component
ner.add_label("GADGET")

# Sample training data. Entity annotations are (start_char, end_char, label)
# with an exclusive end offset, e.g. "iPhone" occupies characters 25-31 of
# the first sentence.
TRAIN_DATA = [
    ("Apple is releasing a new iPhone.", {"entities": [(25, 31, "GADGET")]}),
    ("The new iPad Pro is amazing.", {"entities": [(8, 16, "GADGET")]}),
]

# Convert the training data to spaCy's format
doc_bin = DocBin()
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    # Fail early on offsets that do not line up with token boundaries:
    # char_span returns None for such spans.
    for start, end, label in annotations["entities"]:
        assert doc.char_span(start, end) is not None, (
            f"Entity offsets {(start, end)} do not align with tokens in {text!r}"
        )
    example = Example.from_dict(doc, annotations)
    doc_bin.add(example.reference)

# Load the training data. nlp.update() expects Example objects, so each
# annotated (reference) Doc is paired with a fresh unannotated Doc of the same
# text. A list (not a generator) is built so it can be iterated every epoch.
examples = [
    Example(nlp.make_doc(reference.text), reference)
    for reference in doc_bin.get_docs(nlp.vocab)
]

# Train the NER model. initialize() replaces the deprecated begin_training()
# and receives a callback returning the examples.
optimizer = nlp.initialize(lambda: examples)
for epoch in range(10):
    losses = {}
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print("Losses", losses)

# Test the trained model
doc = nlp("I just bought a new iPhone.")
print("Named Entities:", [(ent.text, ent.label_) for ent in doc.ents])

# 5.3 Dependency Parsing

5.3.2 Dependency Parsing with spaCy

In [None]:
import spacy
from spacy import displacy

# Pre-trained English pipeline that provides the dependency parser
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse
text = "The cat sat on the mat."

# Running the pipeline attaches a head and a dependency label to every token
doc = nlp(text)

# One line per token: the token, its relation label, and the token it attaches to
print("Dependency Parsing:")
for token in doc:
    relation = token.dep_
    governor = token.head.text
    print(f"{token.text} ({relation}): {governor}")

# Draw the dependency tree inline (requires Jupyter Notebook or similar environment)
displacy.render(doc, style="dep", jupyter=True)

5.3.4 Training Custom Dependency Parsers

In [None]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding

# Create a blank English model
nlp = spacy.blank("en")

# Create a new parser component and add it to the pipeline
parser = nlp.add_pipe("parser")

# Sample training data. "heads" holds, for every token (including the final
# punctuation mark), the index of its syntactic head; the root points to
# itself. "deps" holds the matching dependency label for every token, so both
# lists must have exactly one entry per token.
TRAIN_DATA = [
    ("She enjoys playing tennis.", {"heads": [1, 1, 1, 2, 1], "deps": ["nsubj", "ROOT", "xcomp", "dobj", "punct"]}),
    ("I like reading books.", {"heads": [1, 1, 1, 2, 1], "deps": ["nsubj", "ROOT", "xcomp", "dobj", "punct"]}),
]

# Define labels for the parser: every non-root label used in TRAIN_DATA
for label in ("nsubj", "xcomp", "dobj", "punct"):
    parser.add_label(label)

# Convert the training data to spaCy's format
doc_bin = DocBin()
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    # Fail early with a readable message when an annotation does not cover
    # every token of the sentence.
    assert len(annotations["heads"]) == len(doc) == len(annotations["deps"]), (
        f"Expected {len(doc)} heads/deps for {text!r}"
    )
    example = Example.from_dict(doc, annotations)
    doc_bin.add(example.reference)

# Load the training data. nlp.update() expects Example objects, so each
# annotated (reference) Doc is paired with a fresh unannotated Doc of the same
# text. A list (not a generator) is built so it can be iterated every epoch.
examples = [
    Example(nlp.make_doc(reference.text), reference)
    for reference in doc_bin.get_docs(nlp.vocab)
]

# Train the parser. initialize() replaces the deprecated begin_training()
# and receives a callback returning the examples.
optimizer = nlp.initialize(lambda: examples)
for epoch in range(10):
    losses = {}
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print("Losses", losses)

# Test the trained model
doc = nlp("She enjoys reading books.")
for token in doc:
    print(f"{token.text} ({token.dep_}): {token.head.text}")

# Chapter-5 Assignments

Exercise 1: Parts of Speech (POS) Tagging

In [None]:
import nltk
from nltk import word_tokenize, pos_tag

# Tokenizer and tagger resources. Recent NLTK releases (3.9+) look these
# resources up under the names 'punkt_tab' and 'averaged_perceptron_tagger_eng',
# while older releases use 'punkt' and 'averaged_perceptron_tagger'.
# Requesting both keeps the cell runnable regardless of the installed version.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Sample text
text = "The quick brown fox jumps over the lazy dog."

# Tokenize the text into words
tokens = word_tokenize(text)

# Perform POS tagging: each token is paired with its predicted tag
pos_tags = pos_tag(tokens)

print("POS Tags:")
print(pos_tags)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 2: Named Entity Recognition (NER)

In [None]:
import spacy

# Pre-trained English pipeline used to annotate the sample sentence
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse
text = "Barack Obama was born on August 4, 1961, in Honolulu, Hawaii."

# Running the pipeline yields a Doc whose `ents` are the detected entity spans
doc = nlp(text)

# Collect (surface text, label) pairs first, then report one pair per line
entity_rows = [(ent.text, ent.label_) for ent in doc.ents]

print("Named Entities:")
for surface, label in entity_rows:
    print(surface, label)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 3: Training a Custom NER Model

In [None]:
import spacy
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding

# Create a blank English model
nlp = spacy.blank("en")

# Create a new NER component and add it to the pipeline
ner = nlp.add_pipe("ner")

# Add labels to the NER component
ner.add_label("GADGET")

# Sample training data. Entity annotations are (start_char, end_char, label)
# with an exclusive end offset, e.g. "iPhone" occupies characters 25-31 of
# the first sentence.
TRAIN_DATA = [
    ("Apple is releasing a new iPhone.", {"entities": [(25, 31, "GADGET")]}),
    ("The new iPad Pro is amazing.", {"entities": [(8, 16, "GADGET")]}),
]

# Convert the training data to spaCy's format
doc_bin = DocBin()
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    # Fail early on offsets that do not line up with token boundaries:
    # char_span returns None for such spans.
    for start, end, label in annotations["entities"]:
        assert doc.char_span(start, end) is not None, (
            f"Entity offsets {(start, end)} do not align with tokens in {text!r}"
        )
    example = Example.from_dict(doc, annotations)
    doc_bin.add(example.reference)

# Load the training data. nlp.update() expects Example objects, so each
# annotated (reference) Doc is paired with a fresh unannotated Doc of the same
# text. A list (not a generator) is built so it can be iterated every epoch.
examples = [
    Example(nlp.make_doc(reference.text), reference)
    for reference in doc_bin.get_docs(nlp.vocab)
]

# Train the NER model. initialize() replaces the deprecated begin_training()
# and receives a callback returning the examples.
optimizer = nlp.initialize(lambda: examples)
for epoch in range(10):
    losses = {}
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print("Losses", losses)

# Test the trained model
doc = nlp("I just bought a new iPhone.")
print("Named Entities:", [(ent.text, ent.label_) for ent in doc.ents])

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 4: Dependency Parsing

In [None]:
import spacy
from spacy import displacy

# Pre-trained English pipeline that provides the dependency parser
nlp = spacy.load("en_core_web_sm")

# Sentence to analyse
text = "She enjoys reading books."

# Running the pipeline attaches a head and a dependency label to every token
doc = nlp(text)

# One line per token: the token, its relation label, and the token it attaches to
print("Dependency Parsing:")
for token in doc:
    relation = token.dep_
    governor = token.head.text
    print(f"{token.text} ({relation}): {governor}")

# Draw the dependency tree inline (requires jupyter notebook or similar environment)
displacy.render(doc, style="dep", jupyter=True)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  

Exercise 5: Training a Custom Dependency Parser

In [None]:
import spacy  # needed for spacy.blank below; do not rely on an earlier cell
from spacy.tokens import DocBin
from spacy.training import Example
from spacy.util import minibatch, compounding

# Create a blank English model
nlp = spacy.blank("en")

# Create a new parser component and add it to the pipeline
parser = nlp.add_pipe("parser")

# Sample training data. "heads" holds, for every token (including the final
# punctuation mark), the index of its syntactic head; the root points to
# itself. "deps" holds the matching dependency label for every token, so both
# lists must have exactly one entry per token.
TRAIN_DATA = [
    ("She enjoys playing tennis.", {"heads": [1, 1, 1, 2, 1], "deps": ["nsubj", "ROOT", "xcomp", "dobj", "punct"]}),
    ("I like reading books.", {"heads": [1, 1, 1, 2, 1], "deps": ["nsubj", "ROOT", "xcomp", "dobj", "punct"]}),
]

# Define labels for the parser: every non-root label used in TRAIN_DATA
for label in ("nsubj", "xcomp", "dobj", "punct"):
    parser.add_label(label)

# Convert the training data to spaCy's format
doc_bin = DocBin()
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    # Fail early with a readable message when an annotation does not cover
    # every token of the sentence.
    assert len(annotations["heads"]) == len(doc) == len(annotations["deps"]), (
        f"Expected {len(doc)} heads/deps for {text!r}"
    )
    example = Example.from_dict(doc, annotations)
    doc_bin.add(example.reference)

# Load the training data. nlp.update() expects Example objects, so each
# annotated (reference) Doc is paired with a fresh unannotated Doc of the same
# text. A list (not a generator) is built so it can be iterated every epoch.
examples = [
    Example(nlp.make_doc(reference.text), reference)
    for reference in doc_bin.get_docs(nlp.vocab)
]

# Train the parser. initialize() replaces the deprecated begin_training()
# and receives a callback returning the examples.
optimizer = nlp.initialize(lambda: examples)
for epoch in range(10):
    losses = {}
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        nlp.update(batch, sgd=optimizer, drop=0.5, losses=losses)
    print("Losses", losses)

# Test the trained model
doc = nlp("She enjoys reading books.")
for token in doc:
    print(f"{token.text} ({token.dep_}): {token.head.text}")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  