Practical 1 - Write and demonstrate an NLP program for word and text
analysis.


In [None]:
!pip install nltk
nltk.download('punkt')
nltk.download('stopwords')

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# Sample text
text = "Natural Language Processing is fascinating. Natural language is very interesting"

# Tokenization
tokens = word_tokenize(text)

# Convert to lower case
tokens = [word.lower() for word in tokens]

# Remove punctuation and numbers
words = [word for word in tokens if word.isalpha()]

# Remove stopwords
stop_words = set(stopwords.words('english'))
words = [word for word in words if not word in stop_words]

# Frequency distribution
freq_dist = nltk.FreqDist(words)

# Print 10 most common words
for word, frequency in freq_dist.most_common(10):
  print(f"{word}: {frequency}")


natural: 2
language: 2
processing: 1
fascinating: 1
interesting: 1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Practical 2 - Write a program to implement word generation using NLP
techniques.

In [None]:
!pip install nltk
import nltk
import random

nltk.download("words")

from nltk.corpus import words

class WordGenerator:
  """Pick random dictionary words of a requested length from the NLTK `words` corpus."""

  def __init__(self):
    # De-duplicated vocabulary taken from the NLTK `words` corpus.
    self.word_list = set(words.words())

  def generate_word(self, length=6):
    """Return a random vocabulary word that has exactly `length` characters.

    Every word of the requested length is equally likely to be chosen.

    Args:
      length: required number of characters in the returned word.

    Raises:
      ValueError: if the vocabulary holds no word of that length. This
        replaces retrying forever on an impossible request.
    """
    # Filter once instead of copying the whole vocabulary on every retry.
    # Sorting fixes the candidate order, so a seeded `random` is reproducible.
    candidates = sorted(word for word in self.word_list if len(word) == length)
    if not candidates:
      raise ValueError(f"no word of length {length} in the vocabulary")
    return random.choice(candidates)

# Build the generator once: its constructor loads the whole word corpus into a set.
generator = WordGenerator()

# Print six random words, each using the default length of 6 characters.
for _ in range(6):
  print(generator.generate_word())



[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Vannic
scutty
Wapato
bezzle
yieldy
beelol


Practical 3 - Write a program to understand the morphology of a word by
the use of an Add-Delete table. (Morphology is the study of the structure and formation of words in a language.)

In [None]:
# Add-delete table: each rule is an (add, delete) pair of suffixes, in the
# order apply_morphology_rule unpacks them. A rule with an empty `delete`
# appends `add` to the unchanged word; otherwise `delete` is stripped from
# the end of the word before `add` is attached.
# NOTE(review): no rule here both deletes and adds, so "dance" yields
# "danceing" rather than "dancing" -- confirm whether combined rules such as
# ("ing", "e") were intended.
add_delete_table = [
    ("ing", ""),
    ("ed", ""),
    ("", "e"),
    ("er", ""),
    ("est", ""),
]

def apply_morphology_rule(word, add_delete_table):
    """Apply every rule of an add-delete table to `word`.

    Args:
        word: the base word to transform.
        add_delete_table: iterable of (add, delete) suffix pairs.

    Returns:
        A list with one derived form per applicable rule, in table order.
        A rule with an empty `delete` always applies and appends `add`.
        A rule with a non-empty `delete` applies only when `word` ends with
        that suffix, which is then replaced by `add`.
    """
    derived_forms = []

    for suffix_to_add, suffix_to_drop in add_delete_table:
        if not suffix_to_drop:
            # Nothing to strip: plain concatenation.
            derived_forms.append(word + suffix_to_add)
            continue

        if word.endswith(suffix_to_drop):
            stem = word[:len(word) - len(suffix_to_drop)]
            derived_forms.append(stem + suffix_to_add)

    return derived_forms

# Demo: run every rule of the table against one sample word and show all derived forms.
word = "dance"
print(f"Morphed words for '{word}' : {apply_morphology_rule(word, add_delete_table)}")

Morphed words for 'dance' : ['danceing', 'danceed', 'danc', 'danceer', 'danceest']


Practical 4 - Write a program to calculate bigrams from a given corpus and to
calculate the probability of a sentence. (A bigram is a combination of two adjacent words in a sentence, document, or corpus.
A corpus is a large collection of texts used for studying language patterns and characteristics.)

In [None]:
from collections import defaultdict
import re

def calculate_bigrams(text):
    """Count adjacent word pairs in `text`.

    The text is lower-cased and every run of non-word characters is replaced
    by a single space before splitting, so punctuation (sentence boundaries
    included) does not separate pairs.

    Returns:
        A defaultdict(int) mapping (first_word, second_word) to its count.
    """
    # Normalise and tokenise in one go
    tokens = re.sub(r'\W+', ' ', text.lower()).split()

    # Pair every token with its successor and tally the pairs
    pair_counts = defaultdict(int)
    for pair in zip(tokens, tokens[1:]):
        pair_counts[pair] += 1

    return pair_counts

def calculate_sentence_probability(sentence, bigrams):
    """Estimate the probability of `sentence` from a table of bigram counts.

    The sentence is lower-cased and split on non-word characters. The result
    is the product, over each adjacent word pair, of
    count(w1, w2) / (total count of bigrams that start with w1).

    Args:
        sentence: the text to score.
        bigrams: mapping of (first_word, second_word) to count. Any mapping
            works (dict, defaultdict, Counter) and it is never modified.

    Returns:
        The probability as a float. It is 0.0 as soon as a pair is unseen or
        a word never starts a bigram, and 1.0 for a sentence with fewer than
        two words (the product is empty).
    """
    # Preprocess the sentence
    tokens = re.sub(r'\W+', ' ', sentence.lower()).split()

    # Sum the counts per leading word once, rather than rescanning the whole
    # table for every word of the sentence.
    context_totals = defaultdict(int)
    for (first, _second), count in bigrams.items():
        context_totals[first] += count

    probability = 1.0
    for pair in zip(tokens, tokens[1:]):
        context_total = context_totals.get(pair[0], 0)
        if context_total == 0:
            # The leading word never starts a bigram: the product is zero.
            return 0.0
        # .get() keeps unseen pairs from being inserted into a defaultdict
        # and from raising KeyError on a plain dict.
        probability *= bigrams.get(pair, 0) / context_total
    return probability

# Test the functions with a corpus and a sentence.
# The bigram counts come from `corpus`; `sentence` is then scored against them.
corpus = "The cat sat on the mat. The cat ate the rat. The rat ran away."
sentence = "The cat sat on the mat"
bigrams = calculate_bigrams(corpus)
probability = calculate_sentence_probability(sentence, bigrams)
print(f"Probability of sentence '{sentence}': {probability}")


Probability of sentence 'The cat sat on the mat': 0.04000000000000001


Practical 5 - Design and demonstrate a program to learn how to apply add-one
smoothing on a sparse bigram table. (Smoothing adjusts probabilities in models to avoid zero probabilities for unseen or low-frequency events, improving model reliability.)

In [None]:
from collections import Counter
from nltk import bigrams
import re

def preprocess_text(text):
    """Lower-case `text` and drop everything except a-z letters and whitespace.

    Digits, punctuation and non-ASCII letters are removed outright (not
    replaced by a space); existing whitespace is kept as it is.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z\s]', '', lowered)

def calculate_bigrams(corpus):
    """Count the bigrams of an already preprocessed corpus.

    Args:
        corpus: text that is split on whitespace into words.

    Returns:
        A Counter mapping each (first_word, second_word) pair produced by
        NLTK's `bigrams` helper to the number of times it occurs.
    """
    tokens = corpus.split()
    # Counter consumes the pairs straight from NLTK's bigram generator.
    return Counter(bigrams(tokens))

def apply_add_one_smoothing(bigram_counts):
    """Return the add-one (Laplace) smoothed version of a sparse bigram table.

    The vocabulary is every word that appears in any observed bigram. The
    result holds one entry for every ordered pair of vocabulary words, so
    bigrams that never occurred get a count of 1 instead of being absent,
    and observed bigrams get their count plus 1.

    Args:
        bigram_counts: mapping of (first_word, second_word) to observed count.
            It is not modified.

    Returns:
        A Counter with V * V entries (V = vocabulary size) mapping each
        possible bigram to `observed count + 1`. An empty input gives an
        empty Counter.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # table is laid out deterministically.
    vocabulary = list(dict.fromkeys(
        word for bigram in bigram_counts for word in bigram
    ))

    # Fill the full V x V table; .get() supplies 0 for unseen bigrams.
    return Counter({
        (first, second): bigram_counts.get((first, second), 0) + 1
        for first in vocabulary
        for second in vocabulary
    })

def main():
    """Print the bigram counts of a sample corpus before and after smoothing."""

    def show(title, table):
        # One header line, then one "bigram: count" line per table entry.
        print(title)
        for pair, total in table.items():
            print(f"{pair}: {total}")

    # Example corpus
    sample_text = "This is a simple example sentence. Another example is also given for demonstration purposes."

    # Clean the text, then count its bigrams
    observed = calculate_bigrams(preprocess_text(sample_text))
    show("Original Bigrams and their counts:", observed)

    # Smooth the counts and print the resulting table
    smoothed = apply_add_one_smoothing(observed)
    show("\nBigrams and their counts after Add-One Smoothing:", smoothed)


if __name__ == "__main__":
    main()


Original Bigrams and their counts:
('this', 'is'): 1
('is', 'a'): 1
('a', 'simple'): 1
('simple', 'example'): 1
('example', 'sentence'): 1
('sentence', 'another'): 1
('another', 'example'): 1
('example', 'is'): 1
('is', 'also'): 1
('also', 'given'): 1
('given', 'for'): 1
('for', 'demonstration'): 1
('demonstration', 'purposes'): 1

Bigrams and their counts after Add-One Smoothing:
('this', 'is'): 2
('is', 'a'): 2
('a', 'simple'): 2
('simple', 'example'): 2
('example', 'sentence'): 2
('sentence', 'another'): 2
('another', 'example'): 2
('example', 'is'): 2
('is', 'also'): 2
('also', 'given'): 2
('given', 'for'): 2
('for', 'demonstration'): 2
('demonstration', 'purposes'): 2
None: 13


Practical 6 - Write a program to find the POS tags of the words in a sentence
using Viterbi decoding.

In [None]:
#!pip install nltk
import nltk
nltk.download('treebank')
nltk.download('punkt')
from nltk.corpus import treebank

# Training the Hidden Markov Model (HMM) on the Penn Treebank corpus.
# Only the first 3000 tagged sentences of the downloaded `treebank` corpus are used.
train_data = treebank.tagged_sents()[:3000]  # Using a subset for training
hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_data)

def viterbi_decode(sentence, hmm_tagger):
    """Tokenize `sentence` and tag its tokens with `hmm_tagger`.

    Args:
        sentence: raw text, split into tokens with nltk.word_tokenize.
        hmm_tagger: a trained tagger; its tag() method is given the tokens.

    Returns:
        Whatever hmm_tagger.tag() returns for the token list.
    """
    # The tagger's tag() call carries out the decoding over the whole token
    # sequence.
    return hmm_tagger.tag(nltk.word_tokenize(sentence))

# Example usage: tag one sample sentence with the tagger trained above.
input_sentence = "This is a sample sentence."
result = viterbi_decode(input_sentence, hmm_tagger)
# Print the result
print(result)


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('sample', 'JJ'), ('sentence', 'NNS'), ('.', '.')]


Practical 7 - Calculate the emission and transition matrices, which will be helpful
for tagging parts of speech using a Hidden Markov Model.
Emission example: the probability of observing the word "run" given its POS tag as a verb might be higher compared to its probability as a noun.
Transition example: the probability of a noun being followed by a verb might be higher than being followed by an adjective.

In [None]:
from collections import defaultdict

# Example tagged sentences: each sentence is a list of (word, POS tag) pairs.
# Words keep their original case, so 'The' and 'the' are distinct entries.
tagged_sentences = [
    [('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('on', 'IN'), ('the', 'DT'), ('mat', 'NN')],
    [('A', 'DT'), ('dog', 'NN'), ('chases', 'VBZ'), ('the', 'DT'), ('cat', 'NN')]
]

# Calculate emission matrix
def calculate_emission_matrix(tagged_sentences):
    """Estimate P(word | tag) from tagged sentences.

    Args:
        tagged_sentences: iterable of sentences, each an iterable of
            (word, tag) pairs.

    Returns:
        A tuple (emission_matrix, word_counts). emission_matrix[tag][word]
        is the share of that tag's occurrences that emitted `word`, and
        word_counts[word] is how often `word` occurs overall. Both are
        defaultdicts, so unknown keys read as 0.
    """
    emission_matrix = defaultdict(lambda: defaultdict(int))
    word_counts = defaultdict(int)

    # First pass: raw counts over the flattened stream of (word, tag) pairs.
    for word, tag in (pair for sentence in tagged_sentences for pair in sentence):
        emission_matrix[tag][word] += 1
        word_counts[word] += 1

    # Second pass: rescale each tag's row so that it sums to 1.
    for row in emission_matrix.values():
        row_total = sum(row.values())
        row.update({word: count / row_total for word, count in row.items()})

    return emission_matrix, word_counts

# Calculate transition matrix
def calculate_transition_matrix(tagged_sentences):
    """Estimate P(next tag | current tag) from tagged sentences.

    Transitions are counted inside each sentence only; the last tag of one
    sentence is never linked to the first tag of the next.

    Args:
        tagged_sentences: iterable of sentences, each an iterable of
            (word, tag) pairs.

    Returns:
        A defaultdict where transition_matrix[current][following] is the
        share of `current`'s outgoing transitions that lead to `following`.
    """
    transition_matrix = defaultdict(lambda: defaultdict(int))

    for sentence in tagged_sentences:
        tags = [tag for _, tag in sentence]
        # Walk over each tag together with its successor.
        for current, following in zip(tags, tags[1:]):
            if current is None:
                # A None tag is treated as "no previous tag" and starts no transition.
                continue
            transition_matrix[current][following] += 1

    # Rescale each row of counts so that it sums to 1.
    for row in transition_matrix.values():
        row_total = sum(row.values())
        row.update({tag: count / row_total for tag, count in row.items()})

    return transition_matrix

# Example usage: build both matrices from the sample sentences.
# word_counts is returned as well but is not used below.
emission_matrix, word_counts = calculate_emission_matrix(tagged_sentences)
transition_matrix = calculate_transition_matrix(tagged_sentences)

# Print emission matrix: one heading per tag, then each word with its
# probability to four decimal places.
print("Emission Matrix:")
for tag, emissions in emission_matrix.items():
    print(tag)
    for word, prob in emissions.items():
        print(f"  {word}: {prob:.4f}")

# Print transition matrix: one heading per preceding tag, then each following
# tag with its probability to four decimal places.
print("\nTransition Matrix:")
for prev_tag, transitions in transition_matrix.items():
    print(prev_tag)
    for tag, prob in transitions.items():
        print(f"  {tag}: {prob:.4f}")


Emission Matrix:
DT
  The: 0.2500
  the: 0.5000
  A: 0.2500
NN
  cat: 0.5000
  mat: 0.2500
  dog: 0.2500
VBZ
  is: 0.5000
  chases: 0.5000
IN
  on: 1.0000

Transition Matrix:
DT
  NN: 1.0000
NN
  VBZ: 1.0000
VBZ
  IN: 0.5000
  DT: 0.5000
IN
  DT: 1.0000


Practical 8 - Design and demonstrate a program to show the importance of
selecting proper features for training a model, and of the size of the
training corpus, in learning how to do chunking (identifying and labelling chunks of a sentence).

In [None]:
import nltk
nltk.download('conll2000')
from nltk.corpus import conll2000
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.chunk import ChunkParserI

# Define the feature selection function
def features(sentence, index):
    """Return the feature dict for the token at position `index`.

    Each item of `sentence` is indexed positionally: element 0 is used as
    the word and element 1 as its POS tag.
    """
    token = sentence[index]
    return dict(word=token[0], postag=token[1])

# Define the Chunker class
class Chunker(ChunkParserI):
    """Chunk parser that predicts a chunk tag for each POS tag with a trigram tagger."""

    def __init__(self, train_sents):
        """Train the tagger on (POS tag, chunk tag) sequences.

        Each training tree is flattened with tree2conlltags and the words
        are dropped, so the tagger sees POS tags only.
        """
        pos_chunk_sequences = []
        for tree in train_sents:
            triples = tree2conlltags(tree)
            pos_chunk_sequences.append([(pos, chunk) for _word, pos, chunk in triples])
        self.tagger = nltk.TrigramTagger(pos_chunk_sequences)

    def parse(self, sentence):
        """Chunk a sentence given as (word, POS tag) pairs and return a tree."""
        pos_sequence = [pos for _word, pos in sentence]
        predicted = self.tagger.tag(pos_sequence)
        # Re-attach the words to the predicted chunk tags.
        conll_triples = [
            (word, pos, chunk)
            for (word, pos), (_pos, chunk) in zip(sentence, predicted)
        ]
        return conlltags2tree(conll_triples)

# Load the training and testing data, keeping only noun-phrase (NP) chunks
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

# Train the model
chunker = Chunker(train_sents)

# Test the model. accuracy() is the replacement NLTK names for the deprecated
# evaluate(), which only emits a deprecation warning before scoring.
print(chunker.accuracy(test_sents))


[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Unzipping corpora/conll2000.zip.
  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  print(chunker.evaluate(test_sents))


ChunkParse score:
    IOB Accuracy:  93.3%%
    Precision:     82.5%%
    Recall:        86.8%%
    F-Measure:     84.6%%
