<a href="https://colab.research.google.com/github/kryptobolt07/Paytm/blob/main/Ml%20programs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Part 1: Hidden Markov Model (HMM) for Part-of-Speech Tagging using Viterbi Algorithm
# Note: The model is trained on the fixed NLTK Treebank corpus; only the sentences used
# for the test run are sampled at random (with a fixed seed, so the same ones are drawn each run).

import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import time

# Fetch the Treebank sample and the universal tagset mapping used below
nltk.download('treebank')
nltk.download('universal_tagset')

# Every Treebank sentence as a list of (word, universal_tag) tuples
corpus_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

print("\n--- Sample Tagged Sentences from Treebank ---")
print(corpus_data[:2])

# The same two sentences again, printed one (word, tag) tuple per line
print("\n--- Individual Word/Tag Pairs for First Two Sentences ---")
for word_tag_pair in [pair for sentence in corpus_data[:2] for pair in sentence]:
    print(word_tag_pair)

# 80/20 train/test split at sentence level; the fixed random_state keeps it repeatable
train_set, test_set = train_test_split(corpus_data, train_size=0.80, test_size=0.20, random_state=101)

# Flatten each split from a list of sentences into one long list of (word, tag) tuples
train_tagged_words = []
for sentence in train_set:
    train_tagged_words.extend(sentence)

test_tagged_words_flat = []  # "flat" distinguishes it from the words-only list built later
for sentence in test_set:
    test_tagged_words_flat.extend(sentence)

print(f"\nTotal tagged words in training data: {len(train_tagged_words)}")
print(f"Total tagged words in testing data: {len(test_tagged_words_flat)}")

# Peek at the start of the flattened training data
print(f"\nFirst 5 tagged words in training data: {train_tagged_words[:5]}")

# Distinct POS tags seen in the training data
pos_tags = set(tag for _, tag in train_tagged_words)
print(f"\nNumber of unique tags: {len(pos_tags)}")
print(f"Unique POS tags: {pos_tags}")

# Distinct word forms (the vocabulary) seen in the training data
vocabulary = set(word for word, _ in train_tagged_words)

print(f"Total unique words in vocabulary: {len(vocabulary)}")


# Function to calculate the probability of a word given a tag (Emission Probability P(word | tag))
def word_given_tag_prob(word, tag, train_data=None):
    """Return the counts needed for the emission probability P(word | tag).

    Despite the name, this returns raw counts rather than a probability; the
    caller divides them (and must handle a zero tag count).

    Args:
        word: The word to look for.
        tag: The tag to condition on.
        train_data: Iterable of (word, tag) pairs. Defaults to the
            module-level ``train_tagged_words`` as it is at call time.

    Returns:
        A tuple ``(count_word_given_tag, count_tag)``: how many pairs carry
        both ``word`` and ``tag``, and how many pairs carry ``tag`` at all.
    """
    # Resolve the default lazily so a re-split of the corpus is picked up
    # without having to redefine this function.
    if train_data is None:
        train_data = train_tagged_words

    count_tag = 0
    count_word_given_tag = 0
    # Single pass, no intermediate filtered lists
    for pair in train_data:
        if pair[1] == tag:
            count_tag += 1
            if pair[0] == word:
                count_word_given_tag += 1

    return (count_word_given_tag, count_tag)


# Function to calculate the transition count from tag t1 to t2
def tag_transition_counts(t1, t2, train_data=None):
    """Return the counts needed for the transition probability P(t2 | t1).

    Transitions are counted over the flat sequence of tags in ``train_data``,
    i.e. between each pair and the one immediately following it.

    Args:
        t1: The preceding tag.
        t2: The following tag.
        train_data: Iterable of (word, tag) pairs. Defaults to the
            module-level ``train_tagged_words`` as it is at call time.

    Returns:
        A tuple ``(count_t2_given_t1, count_t1)``: how many times ``t1`` is
        immediately followed by ``t2``, and how many times ``t1`` occurs.
    """
    # Resolve the default lazily so a re-split of the corpus is picked up
    # without having to redefine this function.
    if train_data is None:
        train_data = train_tagged_words

    # Only the tags matter here; drop the words
    tag_sequence = [pair[1] for pair in train_data]

    count_t1 = tag_sequence.count(t1)

    # Walk adjacent tag pairs; zip stops one short of the end on its own
    count_t2_given_t1 = sum(
        1
        for first, second in zip(tag_sequence, tag_sequence[1:])
        if first == t1 and second == t2
    )

    return (count_t2_given_t1, count_t1)


# Calculate the Transition Probability Matrix P(t2 | t1)
# Sort the tags so rows and columns appear in the same order on every run;
# iterating the pos_tags set directly gives no such guarantee.
tag_list = sorted(pos_tags)
tags_matrix = np.zeros((len(tag_list), len(tag_list)), dtype='float32')

# Row i is the preceding tag t1, column j is the following tag t2
for i, t1 in enumerate(tag_list):
    for j, t2 in enumerate(tag_list):
        count_t2_t1, count_t1 = tag_transition_counts(t1, t2)

        # P(t2|t1) = Count(t1, t2) / Count(t1); left at 0 when t1 never occurs
        tags_matrix[i, j] = count_t2_t1 / count_t1 if count_t1 != 0 else 0

print("\n--- Transition Probability Matrix (Numpy) ---")
print(tags_matrix)

# Label rows and columns with the tags so probabilities can be looked up
# as tags_df.loc[t1, t2] instead of by position
tags_df = pd.DataFrame(tags_matrix, columns=tag_list, index=tag_list)

print("\n--- Transition Probability Matrix (DataFrame) ---")
print(tags_df)


# Viterbi Algorithm implementation for POS Tagging
def Viterbi_Tagger(word_sequence, train_data=None):
    """Tag a sequence of words with POS tags, one word at a time.

    Despite the name, decoding is greedy: each word receives the tag that
    maximises ``P(word | tag) * P(tag | previously predicted tag)`` and
    earlier choices are never revisited. The first word is scored as if it
    followed the '.' tag.

    Emission probabilities are estimated from ``train_data``. Transition
    probabilities are read from the module-level ``tags_df`` table, which is
    NOT rebuilt from ``train_data``; every tag in ``train_data`` (and '.')
    must therefore be a label of ``tags_df``.

    When every combined score for a word is zero, the tag is chosen by
    emission probability alone if the word was seen with some tag, and by
    transition probability alone otherwise (unseen word), instead of
    defaulting to whichever tag happens to come first.

    Args:
        word_sequence: Iterable of word strings to tag.
        train_data: Iterable of (word, tag) pairs. Defaults to the
            module-level ``train_tagged_words`` as it is at call time.

    Returns:
        A list of ``(word, predicted_tag)`` tuples in input order.
    """
    from collections import Counter

    if train_data is None:
        train_data = train_tagged_words

    # Count once up front instead of rescanning the training data for every
    # (word, tag) combination.
    tag_counts = Counter()
    word_tag_counts = Counter()
    for pair in train_data:
        tag_counts[pair[1]] += 1
        word_tag_counts[(pair[0], pair[1])] += 1

    # Sorted so that ties are broken the same way on every run
    all_tags = sorted(tag_counts)

    tagged_sequence = []
    # '.' ends sentences in this corpus, so it doubles as the start state
    previous_tag = '.'

    for current_word in word_sequence:
        transition_row = tags_df.loc[previous_tag]
        transitions = [transition_row[tag] for tag in all_tags]
        # Missing (word, tag) keys count as 0; tag_counts[tag] is never 0
        # because all_tags is taken from its keys.
        emissions = [
            word_tag_counts[(current_word, tag)] / tag_counts[tag]
            for tag in all_tags
        ]

        scores = [
            emission_prob * transition_prob
            for emission_prob, transition_prob in zip(emissions, transitions)
        ]
        if max(scores) == 0:
            # No tag is supported by both models: fall back to the one
            # source of evidence that is available.
            scores = emissions if any(emissions) else transitions

        best_tag = all_tags[scores.index(max(scores))]
        tagged_sequence.append((current_word, best_tag))
        previous_tag = best_tag

    return tagged_sequence


# Testing the tagger on a sample of the test set
random.seed(1234)  # fixed seed so the same sentences are drawn on every run

# Draw 10 sentence indices from the test set (with replacement, so repeats are possible)
random_indices = [random.randint(0, len(test_set) - 1) for _ in range(10)]

# The sampled sentences, still as lists of (word, tag) tuples
test_sentences = [test_set[i] for i in random_indices]

# Gold-standard (word, tag) pairs, flattened for element-wise comparison
test_base_flat = [pair for sentence in test_sentences for pair in sentence]

# Words only: this is what the tagger sees
input_words = [pair[0] for sentence in test_sentences for pair in sentence]

print(f"\nTotal words in the test run sample: {len(input_words)}")

# Time the tagging run; perf_counter is the clock intended for measuring intervals
start_time = time.perf_counter()
predicted_sequence = Viterbi_Tagger(input_words)
end_time = time.perf_counter()
time_elapsed = end_time - start_time

print("Time taken for Viterbi tagging (seconds): ", time_elapsed)

# A prediction is correct when the whole (word, tag) pair matches the gold pair
correct_predictions = [1 for predicted, actual in zip(predicted_sequence, test_base_flat) if predicted == actual]

# Guard against an empty sample rather than dividing by zero
accuracy = len(correct_predictions) / len(predicted_sequence) if predicted_sequence else 0.0
print('Viterbi HMM Tagger Accuracy on Sample: {:.2f}%'.format(accuracy * 100))

# Show up to the first 15 words; slicing keeps this safe for shorter samples
print("\n--- Sample Predicted vs Actual Tagging ---")
for predicted, actual in zip(predicted_sequence[:15], test_base_flat[:15]):
    print(f"Word: {predicted[0]}, Predicted Tag: {predicted[1]}, Actual Tag: {actual[1]}")

In [None]:
# Part 2: Sentence Similarity Metrics (Jaccard, TF-IDF Cosine, Word2Vec Cosine)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Ensure the Punkt tokenizer data used by word_tokenize is available.
# Recent NLTK releases load it from the 'punkt_tab' package, older ones from
# 'punkt'; requesting both keeps the cell working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function for Jaccard Similarity (Set-based similarity)
def calculate_jaccard_similarity(text_a, text_b):
    """Return the Jaccard similarity of the word sets of two texts.

    Each text is lowercased and split on whitespace; punctuation stays
    attached to the word it touches, so "ice." and "ice" are different items.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        ``|A & B| / |A | B|`` as a float in [0, 1]. When neither text
        contains any token the sets are identical (both empty) and 1.0 is
        returned instead of dividing by zero.
    """
    set_a = set(text_a.lower().split())
    set_b = set(text_b.lower().split())

    union = set_a | set_b
    if not union:
        # Both texts are empty or whitespace-only
        return 1.0

    return len(set_a & set_b) / len(union)

# Function for Cosine Similarity using TF-IDF vectors
def calculate_tfidf_cosine_similarity(text_a, text_b):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    The vectorizer is fitted on just these two texts, so the vocabulary and
    the IDF weights come from the pair itself.
    """
    documents = [text_a, text_b]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)

    # Keep each row two-dimensional (1 x n_terms), as cosine_similarity expects
    row_a = tfidf_matrix[0:1]
    row_b = tfidf_matrix[1:2]

    # The result is a 1 x 1 matrix; unwrap its single entry
    similarity = cosine_similarity(row_a, row_b)
    return similarity[0][0]

# Function for Cosine Similarity using Word2Vec sentence embeddings
def calculate_word2vec_similarity(text_a, text_b):
    """Return the cosine similarity of averaged Word2Vec vectors of two texts.

    A Word2Vec model is trained from scratch on the two tokenized texts only;
    each text is then represented by the mean of its token vectors.
    """
    # Lowercase and tokenize both texts (text_a first, then text_b)
    tokenized = [word_tokenize(text.lower()) for text in (text_a, text_b)]
    tokens_a, tokens_b = tokenized

    # min_count=1 keeps every token, so each one has a vector to look up below
    word2vec_model = Word2Vec(tokenized, vector_size=100, window=3, min_count=1, workers=1)

    def mean_vector(tokens):
        # Sentence embedding: plain average of the token vectors
        return sum(word2vec_model.wv[word] for word in tokens) / len(tokens)

    vec_a = mean_vector(tokens_a)
    vec_b = mean_vector(tokens_b)

    # cosine_similarity works on 2-D inputs and returns a 1 x 1 matrix here
    return cosine_similarity([vec_a], [vec_b])[0][0]

# The two sentences to compare
sentence_a = "The ancient scroll describes a dragon that breathes fire and ice."
sentence_b = "An old parchment speaks of a serpent that exhales heat and frost."

print("\n--- Comparing Two Sentences ---")
print(f"Sentence A: {sentence_a}")
print(f"Sentence B: {sentence_b}")

# Run each metric on the same pair and report its score
for label, similarity_fn in (
    ("\nJaccard Similarity:", calculate_jaccard_similarity),
    ("TF-IDF Cosine Similarity:", calculate_tfidf_cosine_similarity),
    ("Word2Vec Similarity:", calculate_word2vec_similarity),
):
    print(label, similarity_fn(sentence_a, sentence_b))

In [None]:
# Part 3: N-Gram Language Model using Maximum Likelihood Estimation (MLE)

import pandas as pd
import re
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.util import ngrams

# Small in-line corpus used to train the language models below
sample_text = """
The cat sat on the mat.
The dog ran past the cat.
The mat is old.
The dog is fast.
The cat is black.
"""

# Tokenize: take every run of word characters (punctuation is dropped),
# then lowercase each token
tokens = list(map(str.lower, re.findall(r"\b\w+\b", sample_text)))
print(f"\nTokens used for training: {tokens}")

print("\n==========================")

## Bigram (N=2) language model
N_BIGRAM = 2
# Build the padded training n-grams and the vocabulary stream.
# Note: the whole corpus is passed as ONE token sequence ([tokens]), so padding
# is applied around the corpus as a whole rather than around each line.
train_data_bigram, vocab_bigram = padded_everygram_pipeline(N_BIGRAM, [tokens])

# Plain maximum-likelihood model, i.e. no smoothing
lm_bigram = MLE(N_BIGRAM)
lm_bigram.fit(train_data_bigram, vocab_bigram)

print(f"N={N_BIGRAM} (Bigram Model) Trained")

# For each distinct token (alphabetical order), the counts of the words seen right after it
bigram_counts = {
    token: dict(lm_bigram.counts[(token,)])
    for token in sorted(set(tokens))
}
print("Bigram successor counts (P(w_i | w_{i-1})):", bigram_counts)

print("\n--- Word Prediction with Bigram Model ---")
# Extend the seed word 'the' by up to three words, always taking the most
# frequent successor of the latest word
context_word = 'the'
predicted_words_bigram = [context_word]

for _ in range(3):
    successors = lm_bigram.counts[(context_word,)]
    if not successors:
        # Nothing was ever observed after this word: stop early
        break
    # Highest successor count wins
    next_word = max(successors.items(), key=lambda item: item[1])[0]
    predicted_words_bigram.append(next_word)
    context_word = next_word  # the new word is the context for the next step
print(f"Bigram prediction (start='the', max 3 words): {' '.join(predicted_words_bigram)}")

print("\n==========================")

## Trigram (N=3) language model
N_TRIGRAM = 3
# As above, the corpus is passed as a single token sequence
train_data_trigram, vocab_trigram = padded_everygram_pipeline(N_TRIGRAM, [tokens])

lm_trigram = MLE(N_TRIGRAM)
lm_trigram.fit(train_data_trigram, vocab_trigram)

print(f"N={N_TRIGRAM} (Trigram Model) Trained")

# Every two-word context occurring in the token stream ...
bigrams_in_text = list(ngrams(tokens, N_TRIGRAM - 1))
# ... mapped (in sorted order) to the counts of the words seen right after it
trigram_counts = {
    bigram_context: dict(lm_trigram.counts[bigram_context])
    for bigram_context in sorted(set(bigrams_in_text))
}
print("Trigram successor counts (P(w_i | w_{i-2} w_{i-1})):", trigram_counts)

print("\n--- Word Prediction with Trigram Model ---")
# Extend the seed 'the cat' by up to three words; a trigram model conditions
# on the two most recent words
context_list = ['the', 'cat']
predicted_words_trigram = list(context_list)

for _ in range(3):
    context_tuple = tuple(context_list)
    successors = lm_trigram.counts[context_tuple]
    if not successors:
        # This two-word context was never followed by anything: stop early
        break
    # Highest successor count wins
    next_word = max(successors.items(), key=lambda item: item[1])[0]
    predicted_words_trigram.append(next_word)
    # Slide the window: keep the latest context word plus the new word
    context_list = [context_list[-1], next_word]
print(f"Trigram prediction (start='the cat', max 3 words): {' '.join(predicted_words_trigram)}")