<a href="https://colab.research.google.com/github/jeetnsinha/jeet-phd-aiprojects/blob/main/Viterbi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import math
import re
from collections import Counter

# Function to segment a string without spaces into its component words
def segment_text(text, *, prob_fn=None, max_len=None):
    """
    Segments a string with no spaces into a list of words using a unigram word model
    and dynamic programming (Viterbi algorithm).

    Scores are accumulated as sums of log-probabilities rather than products of
    probabilities. A plain product underflows to 0.0 on long inputs, after which
    no split ever looks better than another and the whole input is returned as
    one "word".

    Args:
        text (str): The input string without spaces.
        prob_fn (callable, optional): Maps a candidate word (str) to its
            probability. Defaults to the module-level ``word_prob``.
        max_len (int, optional): Longest candidate word to consider. Defaults
            to the module-level ``max_word_length``.

    Returns:
        list: A list of segmented words (empty for empty input).

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    # Resolve the defaults at call time so the module-level model is only
    # required when the caller does not supply its own.
    if prob_fn is None:
        prob_fn = word_prob
    if max_len is None:
        max_len = max_word_length
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    neg_inf = float("-inf")

    # Viterbi dynamic programming arrays:
    log_probs = [0.0]  # log-probability of the best segmentation of text[:i]; log(1) == 0
    lasts = [0]        # start index of the last word in that best segmentation

    for i in range(1, len(text) + 1):
        window_start = max(0, i - max_len)
        best_score = neg_inf
        # Fallback when no candidate has a positive probability: take the
        # longest allowed chunk so backtracking still makes progress.
        best_split = window_start

        # Evaluate all possible last words text[j:i] ending at position i
        for j in range(window_start, i):
            prob = prob_fn(text[j:i])
            # log() is undefined for non-positive values, and a prefix with
            # score -inf has no valid segmentation to extend.
            if prob <= 0 or log_probs[j] == neg_inf:
                continue
            score = log_probs[j] + math.log(prob)
            if score > best_score:  # strict '>' keeps the earliest split on ties
                best_score = score
                best_split = j

        log_probs.append(best_score)
        lasts.append(best_split)

    # Backtrack along the recorded split points to recover the words
    segments = []
    i = len(text)
    while i > 0:
        segments.append(text[lasts[i]:i])
        i = lasts[i]
    segments.reverse()  # collected right-to-left, so restore reading order
    return segments

# Function to calculate the probability of a word (Unigram model)
def word_prob(word):
    """
    Looks up the unigram score of a candidate word.

    Args:
        word (str): The candidate word; it is lower-cased before lookup.

    Returns:
        float: The word's count in ``dictionary`` divided by ``total``. Words
        missing from the dictionary use a pseudo-count of 1e-6. A single
        character that the word pattern does not match returns 1 instead.
    """
    lowered = word.lower()
    starts_alphabetic = wordPattern.match(lowered) is not None

    # A lone non-alphabetic character (e.g. punctuation) scores 1, so it adds
    # no cost to a segmentation that splits it off.
    if len(word) == 1 and not starts_alphabetic:
        return 1

    # Known words use their corpus count; unseen words get a tiny pseudo-count.
    count = dictionary.get(lowered, 1e-6)
    return count / total

# Helper function to extract words from text
def words(text):
    """
    Tokenizes text into lower-case alphabetic words.

    Args:
        text (str): Input text.

    Returns:
        list: Every maximal run of the letters a-z in the lower-cased text,
        in order of appearance.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer('[a-z]+', lowered)]

# Regular expression pattern to match alphabetic words
wordPattern = re.compile('[a-z]+')

# Tiny demonstration corpus; its word counts form the unigram model
sample_corpus = """
the longest list of the stuff at domain name long last
"""

# Word -> number of occurrences in the corpus
dictionary = Counter(words(sample_corpus))

# The longest known word bounds how far back the segmenter looks for a split
max_word_length = len(max(dictionary, key=len))

# Sum of all counts, kept as a float; used to normalize counts
total = float(sum(dictionary.values()))

# Demo: split a space-free string back into words and show the result
input_text = "thelongestlistofthelongeststuffatthelongestdomainnameatlonglast.com"
segmented_words = segment_text(input_text)
print(segmented_words)


['the', 'longest', 'list', 'of', 'the', 'longest', 'stuff', 'at', 'the', 'longest', 'domain', 'name', 'at', 'long', 'last', '.com']
