In [1]:
# Import necessary libraries for POS tagging
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from textblob import TextBlob
import re
import pandas as pd
from IPython.display import display

# Download necessary NLTK resources
# - 'punkt' backs word_tokenize(); without it a fresh environment fails with a
#   LookupError as soon as the text is tokenized in the next cell.
# - 'averaged_perceptron_tagger' backs pos_tag().
# NOTE(review): recent NLTK releases may look these up under the names
# 'punkt_tab' / 'averaged_perceptron_tagger_eng' - confirm against the
# installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Define function to load text data from a file
def load_text(file_path):
    """Read a UTF-8 text file and return its entire contents.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a single string, or None if the file could not
        be opened or read. In the failure case an error message naming the
        path is printed instead of the exception being propagated.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            contents = handle.read()
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error reading file {file_path}: {err}")
        return None
    return contents

# File path for Data_2.txt
file_path_2 = "Data_2.txt"

# Load Data_2.txt content
data_2_text = load_text(file_path_2)

# load_text() signals failure by printing the error and returning None.
# Stop here with a clear message instead of letting word_tokenize(None)
# fail further down with an unrelated-looking TypeError.
if data_2_text is None:
    raise RuntimeError(
        f"Could not load {file_path_2}; see the error message printed above."
    )

print("\nData_2.txt:")
print(data_2_text)

# Tokenize sentence
tokens = word_tokenize(data_2_text)

print("\nTokenized Sentence:")
print(tokens)


Data_2.txt:
The big black dog barked at the white cat and chased away.

Tokenized Sentence:
['The', 'big', 'black', 'dog', 'barked', 'at', 'the', 'white', 'cat', 'and', 'chased', 'away', '.']


In [4]:
# 1. POS Tagging using NLTK
pos_nltk = pos_tag(tokens)

print("\nPOS Tagging Results using NLTK:")
print(pos_nltk)

# 2. POS Tagging using TextBlob
# TextBlob tokenizes the raw text itself, so it receives the string rather
# than the NLTK token list.
blob = TextBlob(data_2_text)
pos_textblob = blob.tags

print("\nPOS Tagging Results using TextBlob:")
print(pos_textblob)

# 3. POS Tagging using Regular Expression Tagger
# Define a simple regex-based POS tagging pattern.
# Patterns are tried in order and the first match wins, so the catch-all
# noun rule must stay last.
patterns = [
    (r'.*ing$', 'VBG'),  # Gerunds
    (r'.*ed$', 'VBD'),   # Past tense verbs
    (r'.*es$', 'VBZ'),   # Third-person singular present
    (r'.*ly$', 'RB'),    # Adverbs
    (r'.*ous$', 'JJ'),   # Adjectives
    (r'^[A-Z].*$', 'NNP'),  # Proper nouns
    (r'.*', 'NN')        # Default to noun
]

regex_tagger = nltk.RegexpTagger(patterns)
pos_regex = regex_tagger.tag(tokens)

print("\nPOS Tagging Results using Regular Expression Tagger:")
print(pos_regex)

# Store POS tagging results for comparison
pos_results = {
    "NLTK POS Tagger": pos_nltk,
    "TextBlob POS Tagger": pos_textblob,
    "Regex POS Tagger": pos_regex
}

# TextBlob's .tags omits punctuation tokens (the final '.' is absent from its
# result), so its list can be shorter than the other two. Laid out purely by
# position, every TextBlob row after the first dropped token would sit next to
# the wrong NLTK/regex token. Re-align TextBlob's tags to `tokens`, leaving
# None where TextBlob produced no tag for that token.
textblob_aligned = []
next_tb = 0
for token in tokens:
    if next_tb < len(pos_textblob) and pos_textblob[next_tb][0] == token:
        textblob_aligned.append(pos_textblob[next_tb])
        next_tb += 1
    else:
        textblob_aligned.append(None)
if next_tb != len(pos_textblob):
    # The two tokenizations disagree on more than dropped tokens; fall back to
    # the plain positional layout rather than hiding TextBlob's remaining tags.
    textblob_aligned = list(pos_textblob)

# Convert to DataFrame for visualization (one row per token position, one
# column per tagger; each cell holds the (token, tag) pair).
df_pos = pd.DataFrame.from_dict(
    {**pos_results, "TextBlob POS Tagger": textblob_aligned},
    orient="index",
).transpose()
df_pos.columns = ["NLTK POS", "TextBlob POS", "Regex POS"]

# Display DataFrame in Jupyter Notebook
# Limit the view to the first 20 rows so the output matches the heading and
# stays short for longer input texts.
print("\nPOS Tagging Comparison (First 20 Tokens):")
display(df_pos.head(20))



POS Tagging Results using NLTK:
[('The', 'DT'), ('big', 'JJ'), ('black', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('white', 'JJ'), ('cat', 'NN'), ('and', 'CC'), ('chased', 'VBD'), ('away', 'RB'), ('.', '.')]

POS Tagging Results using TextBlob:
[('The', 'DT'), ('big', 'JJ'), ('black', 'JJ'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('white', 'JJ'), ('cat', 'NN'), ('and', 'CC'), ('chased', 'VBD'), ('away', 'RB')]

POS Tagging Results using Regular Expression Tagger:
[('The', 'NNP'), ('big', 'NN'), ('black', 'NN'), ('dog', 'NN'), ('barked', 'VBD'), ('at', 'NN'), ('the', 'NN'), ('white', 'NN'), ('cat', 'NN'), ('and', 'NN'), ('chased', 'VBD'), ('away', 'NN'), ('.', 'NN')]

POS Tagging Comparison (First 20 Tokens):


Unnamed: 0,NLTK POS,TextBlob POS,Regex POS
0,"(The, DT)","(The, DT)","(The, NNP)"
1,"(big, JJ)","(big, JJ)","(big, NN)"
2,"(black, JJ)","(black, JJ)","(black, NN)"
3,"(dog, NN)","(dog, NN)","(dog, NN)"
4,"(barked, VBD)","(barked, VBD)","(barked, VBD)"
5,"(at, IN)","(at, IN)","(at, NN)"
6,"(the, DT)","(the, DT)","(the, NN)"
7,"(white, JJ)","(white, JJ)","(white, NN)"
8,"(cat, NN)","(cat, NN)","(cat, NN)"
9,"(and, CC)","(and, CC)","(and, NN)"


In [7]:
from nltk import RegexpParser

# Chunk grammar applied to the NLTK-tagged tokens. The rules are listed in
# stages: NP, P and V are built from POS tags, then PP and VP are built from
# those chunk labels.
chunker = RegexpParser("""
    NP: {<DT>?<JJ>*<NN>}  # Noun Phrase: Optional determiner, adjectives, and noun
    P: {<IN>}              # Preposition
    V: {<V.*>}             # Verb: Any verb form
    PP: {<P><NP>}          # Prepositional Phrase: Preposition + Noun Phrase
    VP: {<V><NP|PP>*}      # Verb Phrase: Verb + Noun Phrase or Prepositional Phrase
""")

# Opt-in switch for NLTK's interactive tree viewer. Tree.draw() opens a
# separate Tk window and does not return until that window is closed, so it
# stalls "Run All" and fails where no display is available. Leave this False
# for reproducible runs.
SHOW_TREE_WINDOW = False

output = chunker.parse(pos_nltk)
print("After Extracting\n", output)

# Render the tree as text inside the notebook output; this needs no GUI.
output.pretty_print()

if SHOW_TREE_WINDOW:
    output.draw()

After Extracting
 (S
  (NP The/DT big/JJ black/JJ dog/NN)
  (VP (V barked/VBD) (PP (P at/IN) (NP the/DT white/JJ cat/NN)))
  and/CC
  (VP (V chased/VBD))
  away/RB
  ./.)
