<a href="https://colab.research.google.com/github/koushik1904/Natural-language-processing-NLB-/blob/main/NLP_LAB_08_2403a52057.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Natural Language Toolkit for NLP tasks like tokenization
import nltk

# Regular expressions for cleaning text
import re

# Mathematical functions for probability and perplexity
import math

# Numerical computations
import numpy as np

# Data handling and table display
import pandas as pd

# Counter for counting words and N-grams
from collections import Counter

# Tokenizers for sentences and words
from nltk.tokenize import sent_tokenize, word_tokenize

# Download required NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [7]:
import nltk

# Download required tokenizer resources
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Load text corpus (must contain at least 1500 words)
with open("/content/corpus.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Display sample text
print(text[:500])

Natural language processing is a field of artificial intelligence that focuses on the interaction between computers and human language. It allows machines to read, understand, and generate text in a meaningful way. Language models are an important part of natural language processing because they assign probabilities to sequences of words. These models help systems predict the next word in a sentence based on previous words. Applications of language models include machine translation, speech reco


In [22]:
def preprocess_text(text):
    # Tokenize text into sentences first to preserve sentence boundaries
    sentences = sent_tokenize(text)

    processed_sentences = []

    for sentence in sentences:
        # Convert sentence to lowercase
        sentence = sentence.lower()

        # Remove punctuation and numbers from the sentence
        # Now this will apply to individual sentences without affecting tokenization
        sentence = re.sub(r'[^a-z\s]', '', sentence)

        # Tokenize sentence into words
        words = word_tokenize(sentence)

        # Add start and end tokens
        words = ['<s>'] + words + ['</s>']
        processed_sentences.append(words)

    return processed_sentences

In [26]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Keep dots for sentence splitting
    text = re.sub(r'[^a-z\s.]', '', text)

    # Split sentences manually (safe)
    sentences = text.split('.')

    processed_sentences = []

    for sentence in sentences:
        words = sentence.strip().split()
        if len(words) > 0:
            processed_sentences.append(['<s>'] + words + ['</s>'])

    return processed_sentences


In [27]:
def build_unigram(data):
    return Counter([word for sentence in data for word in sentence])


In [28]:
def build_bigram(data):
    return Counter([
        (sentence[i], sentence[i+1])
        for sentence in data
        for i in range(len(sentence)-1)
    ])


In [29]:
def build_trigram(data):
    return Counter([
        (sentence[i], sentence[i+1], sentence[i+2])
        for sentence in data
        for i in range(len(sentence)-2)
    ])


In [30]:
# Build models
unigrams = build_unigram(train_data)
bigrams = build_bigram(train_data)
trigrams = build_trigram(train_data)

vocab_size = len(unigrams)


In [31]:
# Unigram table
pd.DataFrame(unigrams.most_common(10), columns=["Word", "Count"])


Unnamed: 0,Word,Count
0,language,56
1,<s>,48
2,</s>,48
3,of,32
4,and,32
5,models,32
6,a,24
7,in,24
8,natural,16
9,processing,16


In [32]:
# Bigram table
pd.DataFrame(bigrams.most_common(10), columns=["Bigram", "Count"])


Unnamed: 0,Bigram,Count
0,"(language, models)",24
1,"(natural, language)",16
2,"(language, processing)",16
3,"(human, language)",16
4,"(in, a)",16
5,"(words, </s>)",16
6,"(<s>, natural)",8
7,"(processing, is)",8
8,"(is, a)",8
9,"(a, field)",8


In [15]:
# Trigram table
pd.DataFrame(trigrams.most_common(10), columns=["Trigram", "Count"])


Unnamed: 0,Trigram,Count


In [33]:
def unigram_probability(word):
    return (unigrams[word] + 1) / (sum(unigrams.values()) + vocab_size)

def bigram_probability(w1, w2):
    return (bigrams[(w1, w2)] + 1) / (unigrams[w1] + vocab_size)

def trigram_probability(w1, w2, w3):
    return (trigrams[(w1, w2, w3)] + 1) / (bigrams[(w1, w2)] + vocab_size)


In [34]:
def sentence_probability(sentence, model):
    words = ['<s>'] + word_tokenize(sentence.lower()) + ['</s>']
    probability = 1

    if model == "unigram":
        for w in words:
            probability *= unigram_probability(w)

    elif model == "bigram":
        for i in range(len(words)-1):
            probability *= bigram_probability(words[i], words[i+1])

    elif model == "trigram":
        for i in range(len(words)-2):
            probability *= trigram_probability(words[i], words[i+1], words[i+2])

    return probability


In [35]:
sentences = [
    "language models are important",
    "this is a simple test",
    "n gram models predict words",
    "the system learns probabilities",
    "this sentence is unseen"
]


In [36]:
# Check if vocab_size is zero, indicating empty training data
if vocab_size == 0:
    print("Error: Vocabulary size is 0. This typically means the training data (train_data) is empty.")
    print("Please check the data splitting logic in cell 'ngGwxXL99JET' and ensure 'train_data' contains sentences.")
else:
    for s in sentences:
        print("\nSentence:", s)
        # If vocab_size is not zero, the denominators in probability functions should be non-zero
        # as they all add `vocab_size` to their respective counts.
        print("Unigram Probability:", sentence_probability(s, "unigram"))
        print("Bigram Probability:", sentence_probability(s, "bigram"))
        print("Trigram Probability:", sentence_probability(s, "trigram"))


Sentence: language models are important
Unigram Probability: 6.527560163828783e-10
Bigram Probability: 2.4373515201562826e-07
Trigram Probability: 2.2416153622716095e-06

Sentence: this is a simple test
Unigram Probability: 1.0616313591626366e-15
Bigram Probability: 3.5290382249500016e-11
Trigram Probability: 6.153904564814774e-10

Sentence: n gram models predict words
Unigram Probability: 2.3823007699609565e-14
Bigram Probability: 4.9645538796280764e-11
Trigram Probability: 6.877893337145923e-10

Sentence: the system learns probabilities
Unigram Probability: 6.554936664013784e-13
Bigram Probability: 2.920325811545927e-10
Trigram Probability: 4.676967469259228e-08

Sentence: this sentence is unseen
Unigram Probability: 3.4702605868308264e-13
Bigram Probability: 3.227728528550761e-10
Trigram Probability: 4.676967469259228e-08


In [37]:
def perplexity(sentence, model):
    words = ['<s>'] + word_tokenize(sentence.lower()) + ['</s>']
    N = len(words)
    log_prob = 0

    if model == "unigram":
        for w in words:
            log_prob += math.log(unigram_probability(w))

    elif model == "bigram":
        for i in range(len(words)-1):
            log_prob += math.log(bigram_probability(words[i], words[i+1]))

    elif model == "trigram":
        for i in range(len(words)-2):
            log_prob += math.log(trigram_probability(words[i], words[i+1], words[i+2]))

    return math.exp(-log_prob / N)


In [38]:
for s in sentences:
    print("\nSentence:", s)
    print("Unigram Perplexity:", perplexity(s, "unigram"))
    print("Bigram Perplexity:", perplexity(s, "bigram"))
    print("Trigram Perplexity:", perplexity(s, "trigram"))



Sentence: language models are important
Unigram Perplexity: 33.95274195450601
Bigram Perplexity: 12.652615347242236
Trigram Perplexity: 8.741242144402024

Sentence: this is a simple test
Unigram Perplexity: 137.7674464643477
Bigram Perplexity: 31.13091948464274
Trigram Perplexity: 20.693578848013384

Sentence: n gram models predict words
Unigram Perplexity: 88.33714196896518
Bigram Perplexity: 29.6494819953641
Trigram Perplexity: 20.367369280238847

Sentence: the system learns probabilities
Unigram Perplexity: 107.29313044782475
Bigram Perplexity: 38.82350867225823
Trigram Perplexity: 16.65990837100792

Sentence: this sentence is unseen
Unigram Perplexity: 119.29062629305031
Bigram Perplexity: 38.18128142022645
Trigram Perplexity: 16.65990837100792
