<a href="https://colab.research.google.com/github/kushbajpai/collab_notebooks/blob/main/ML_C69_Syntactic_Processing_Live_34.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk

In [None]:
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('averaged_perceptron_tagger') # needed for pos tagging

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Sample Text
text = "Alice loves painting in the quiet evenings."

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Tokenize the text
tokens = word_tokenize(text)
tokens

['Alice', 'loves', 'painting', 'in', 'the', 'quiet', 'evenings', '.']

In [None]:
# Use pre-trained POS tager
pos_tags = nltk.pos_tag(tokens)
print(pos_tags)

[('Alice', 'NNP'), ('loves', 'VBZ'), ('painting', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('quiet', 'JJ'), ('evenings', 'NNS'), ('.', '.')]


 - NNP: Proper noun, sigular
 - VBX: 3rd person sigular present
 - VBG: Present particple
 - IN: Preposition
 - DT: Determiner
 - JJ: Adjective
 - NNS: Plural noun
<br/>

 - Use pre-trained POS tagger
 - Customise this pre-trained POS tagger only

 <br/>

 **Reference:** https://spotintelligence.com/2023/01/24/part-of-speech-pos-tagging-in-nlp-python/

In [22]:
# Define custom rules
def custom_tagger(pos_tags):
  modified_tags = []
  for word, tag in pos_tags:
    # Rule 1: "painting" should always be a noun
    if word.lower() == "painting":
      modified_tags.append((word, "NN"))
    # Rule 2: "evenings" should always be singular noun
    elif word.lower() == "evenings":
      modified_tags.append((word, "NN"))
    else:
      # Default case: Use the tag from the pre-trained tagger
      modified_tags.append((word, tag))
    return modified_tags

In [23]:
# Apply the custom rule-based tagger
custom_pos_tags = custom_tagger(pos_tags)
print("Custom POS Tags:", custom_pos_tags)

Custom POS Tags: [('Alice', 'NNP')]


In [8]:
from nltk.corpus import treebank
from nltk.tag import UnigramTagger # Rule Based Tagger

In [9]:
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [10]:
# Load tagged sentences from the Treebank corpus
tagged_sentences = treebank.tagged_sents()
print("Sample Tagged Sentences:", tagged_sentences)

Sample Tagged Sentences: [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...]


In [11]:
len(tagged_sentences)

3914

In [24]:
# Split date into 80% training and 20% testing
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(tagged_sentences, test_size=0.2, random_state=42)

print("Number of Training Sentences:", len(train_data))
print("Number of Testing Sentences:", len(test_data))

Number of Training Sentences: 3131
Number of Testing Sentences: 783


"painting" (40) -> NNP: 32VBZ: 8 Final Prediction -> Painting -> NNP(most frequent tag)

In [25]:
# Train a Unigram POS Tagger
unigram_tagger = UnigramTagger(train_data)

In [27]:
accuracy = unigram_tagger.accuracy(test_data)
print("Uniram Tagger Accuracy:", round(100*accuracy, 2), "%")

Uniram Tagger Accuracy: 88.27 %


In [28]:
from nltk.tag import BigramTagger

In [29]:
# Train a Bigram POS Tagger with Unigram Tagger as backoff
bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)

In [30]:
# Evaluate the bigram tagger
accuracy = bigram_tagger.accuracy(test_data)
print("Bigram Tagger Accuracy:", round(100*accuracy, 2), "%")

Bigram Tagger Accuracy: 89.12 %


In [31]:
from nltk.tag import TrigramTagger

In [32]:
# Train a Trigram POS Tagger with Bigram as backoff
trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)

In [33]:
# Evaluate the trigram tagger
accuracy = trigram_tagger.accuracy(test_data)
print("Trigram Tagger Accuracy:", round(100*accuracy, 2), "%")

Trigram Tagger Accuracy: 89.15 %


- Pre-trained models
- Pre-trained models + Manual Rule Based Adjustments(This can be done for few cases) ( Customization)
- Customised tained models with non-machine learning logic ( Unigram Tageer, Bigram Tageer, Trigram Taeer) - Fequency Based
- Customised trained model with machine learning logic(Classifer Based POS Tagger)

In [34]:
sent  = "The capital of India is New Delhi"

In [35]:
# Tokenize and tag using th trained tagger
tokens = nltk.word_tokenize(sent)

In [36]:
tags = unigram_tagger.tag(tokens)
print("Tagged Sentences", tags)

Tagged Sentences [('The', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('India', 'NNP'), ('is', 'VBZ'), ('New', 'NNP'), ('Delhi', None)]


In [37]:
for i in tagged_sentences:
  for j in i:
    if j[0] == "Delhi":
      print(j)

In [38]:
from nltk.tag.sequential import ClassifierBasedPOSTagger

In [39]:
# Train a Classifier-Based POS Tagger
classifier_tagger = ClassifierBasedPOSTagger(train=train_data)

In [40]:
# Evaluate the classifier tagger
accuracy = classifier_tagger.accuracy(test_data)
print("Classifier-Bassed Tagger Accuracy:", round(100*accuracy, 2), "%")

Classifier-Bassed Tagger Accuracy: 93.75 %


In [41]:
tags = classifier_tagger.tag(tokens)
print("Tagged Sentences", tags)

Tagged Sentences [('The', 'DT'), ('capital', 'NN'), ('of', 'IN'), ('India', 'NNP'), ('is', 'VBZ'), ('New', 'NNP'), ('Delhi', 'NNP')]
