<a href="https://colab.research.google.com/github/kabilan-39/11239A039-DAA/blob/main/NLP_observation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # Download required data once

lemmatizer = WordNetLemmatizer()

words = ['running', 'cats', 'better', 'wolves', 'was']

# Convert words to their base form
lemmas = [lemmatizer.lemmatize(word) for word in words]

print("Original:", words)
print("Lemmatized:", lemmas)

[nltk_data] Downloading package wordnet to /root/nltk_data...


Original: ['running', 'cats', 'better', 'wolves', 'was']
Lemmatized: ['running', 'cat', 'better', 'wolf', 'wa']


In [None]:
#Tokenization
import nltk
from nltk.tokenize import word_tokenize

# Download the necessary resource
nltk.download('punkt')
nltk.download('punkt_tab')

# Example text
text = "Hello! How are you doing today? Let's tokenize this sentence."

# Tokenize the text into words
tokens = word_tokenize(text)

print(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['Hello', '!', 'How', 'are', 'you', 'doing', 'today', '?', 'Let', "'s", 'tokenize', 'this', 'sentence', '.']


In [None]:
#Stemming
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# Sample words
words = ["running", "flies", "cats", "better", "studies"]

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print("Word\tStem\tLemma")
for word in words:
    print(f"{word}\t{stemmer.stem(word)}\t{lemmatizer.lemmatize(word)}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Word	Stem	Lemma
running	run	running
flies	fli	fly
cats	cat	cat
better	better	better
studies	studi	study


In [None]:
#Morphology
import nltk
from nltk.stem import PorterStemmer

# Initialize the stemmer
stemmer = PorterStemmer()

# Example words
words = ['running', 'runs', 'ran', 'easily', 'fairly']

# Stem the words
for word in words:
    stem = stemmer.stem(word)
    print(f"{word} -> {stem}")

running -> run
runs -> run
ran -> ran
easily -> easili
fairly -> fairli


In [None]:
#Spelling correction
from textblob import TextBlob

# Example text with a misspelled word
text = "I havv a speling error."

# Create a TextBlob object
blob = TextBlob(text)

# Correct the text
corrected_text = blob.correct()

print(f"Original: {text}")
print(f"Corrected: {corrected_text}")

Original: I havv a speling error.
Corrected: I have a spelling error.


In [None]:
#Unigram model
text = "this is a simple unigram model this is simple"
words = text.split()
total = len(words)
freq = {}

for w in words:
    freq[w] = freq.get(w, 0) + 1

for w in freq:
    print(w, freq[w] / total)

this 0.2222222222222222
is 0.2222222222222222
a 0.1111111111111111
simple 0.2222222222222222
unigram 0.1111111111111111
model 0.1111111111111111


In [None]:
#Bigram model
import random
text = "I love NLP".lower().split()
bigrams = [(text[i], text[i+1]) for i in range(len(text)-1)]
model = {w: [] for w, _ in bigrams}
for w1, w2 in bigrams: model[w1].append(w2)
w = random.choice(list(model.keys()))
for _ in range(10):
    print(w, end=' ')
    w = random.choice(model.get(w, ['.']))

love nlp . . . . . . . . 

In [None]:
#Tirgram model
text = "this is a simple trigram model"
words = text.split()

# Count bigrams and trigrams
bi = {}
tri = {}
for i in range(len(words) - 1):
    bi[(words[i], words[i+1])] = bi.get((words[i], words[i+1]), 0) + 1
for i in range(len(words) - 2):
    tri[(words[i], words[i+1], words[i+2])] = tri.get((words[i], words[i+1], words[i+2]), 0) + 1

# Print trigram probabilities: P(w3|w1,w2) = count(w1,w2,w3) / count(w1,w2)
for (w1, w2, w3), count in tri.items():
    print(f"P({w3}|{w1},{w2}) = {count / bi[(w1, w2)]:.2f}")

P(a|this,is) = 1.00
P(simple|is,a) = 1.00
P(trigram|a,simple) = 1.00
P(model|simple,trigram) = 1.00


In [None]:
#Smoothening N-Gram
from collections import Counter

def ngram_smooth(text, n):
    tokens = text.split()
    ngrams = [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
    counts = Counter(ngrams)
    prefix_counts = Counter([ng[:-1] for ng in ngrams])
    V = len(set(tokens))
    return {ng: (counts[ng]+1)/ (prefix_counts[ng[:-1]] + V) for ng in counts}

# Example
text = "the bird is flying"
probs = ngram_smooth(text, 2)
print(probs)

{('the', 'bird'): 0.4, ('bird', 'is'): 0.4, ('is', 'flying'): 0.4}


In [None]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Download necessary resources (only needed once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')


# Input text
text = "The quick brown fox jumps over the lazy dog."

# Tokenize the text into words
tokens = word_tokenize(text)

# Perform POS tagging
pos_tags = nltk.pos_tag(tokens)

# Print results
print("Tokens and their POS tags:")
for word, tag in pos_tags:
    print(f"{word:10} --> {tag}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


Tokens and their POS tags:
The        --> DT
quick      --> JJ
brown      --> NN
fox        --> NN
jumps      --> VBZ
over       --> IN
the        --> DT
lazy       --> JJ
dog        --> NN
.          --> .


In [None]:
#HMM POS Tagging
import nltk
from nltk.corpus import treebank
from nltk.tag import hmm

# Download the Treebank corpus (for training data)
nltk.download('treebank')

# Load tagged sentences
data = treebank.tagged_sents()

# Split into training and testing data
train_data = data[:3000]
test_data = data[3000:]

# Train an HMM POS tagger
trainer = hmm.HiddenMarkovModelTrainer()
hmm_tagger = trainer.train_supervised(train_data)

# Test the tagger on a sentence
sentence = ["The", "quick", "brown", "fox", "jumps"]
tags = hmm_tagger.tag(sentence)

# Print the results
print("HMM POS Tagging:")
for word, tag in tags:
    print(f"{word:10} --> {tag}")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
  X[i, j] = self._transitions[si].logprob(self._states[j])
  O[i, k] = self._output_logprob(si, self._symbols[k])
  P[i] = self._priors.logprob(si)


HMM POS Tagging:
The        --> DT
quick      --> JJ
brown      --> NNP
fox        --> NNP
jumps      --> NNP


  O[i, k] = self._output_logprob(si, self._symbols[k])


In [None]:
#Bigram
import nltk
from nltk.corpus import treebank
nltk.download('treebank'); nltk.download('universal_tagset')

# Load data
data = treebank.tagged_sents(tagset='universal')
train, test = data[:3500], data[3500:]

# Build taggers
default = nltk.DefaultTagger('NOUN')
uni = nltk.UnigramTagger(train, backoff=default)
bi = nltk.BigramTagger(train, backoff=uni)

# Blend function (prefer bigram if available)
def blend_tag(sentence):
    u, b = uni.tag(sentence), bi.tag(sentence)
    return [(w, bt or ut) for ((w, bt), (_, ut)) in zip(b, u)]

# Test
sent = ["The", "cat", "sits", "on", "the", "mat"]
print("Blended POS Tags:", blend_tag(sent))
print("Accuracy:", bi.evaluate(test))

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


Blended POS Tags: [('The', 'DET'), ('cat', 'NOUN'), ('sits', 'NOUN'), ('on', 'ADP'), ('the', 'DET'), ('mat', 'NOUN')]
Accuracy: 0.9391034950140381


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print("Accuracy:", bi.evaluate(test))
