<a href="https://colab.research.google.com/github/jeetnsinha/jeet-phd-aiprojects/blob/main/ViterbiWordFreq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import math

# Function to compute the cost (negative log-probability) of a word
def calculate_word_probability(word, total_word_count, word_statistics):
    """Return the negative natural log of the unigram probability of ``word``.

    Despite the name, the value returned is a *cost*: lower means more
    probable, and costs of consecutive words can be added together.

    Args:
        word: Candidate word to look up.
        total_word_count: Sum of all counts, used as the denominator.
        word_statistics: Mapping of word -> occurrence count.

    Returns:
        ``-log(count / total_word_count)`` for a word with a positive count,
        or ``float('inf')`` when the word is unknown, its count is not
        positive, or ``total_word_count`` is not positive.
    """
    count = word_statistics.get(word, 0)
    # A zero count would hit log(0) (ValueError) and a zero total would divide
    # by zero; both mean "this word cannot occur", i.e. infinite cost.
    if count <= 0 or total_word_count <= 0:
        return float('inf')  # Return high cost for unknown words
    return -math.log(count / total_word_count)

# Viterbi algorithm for segmenting text
def segment_text_viterbi(input_text, word_statistics):
    """Split ``input_text`` into the most probable sequence of known words.

    Each word costs ``-log(count / total)``; dynamic programming over prefix
    lengths finds the split whose summed cost is minimal.

    Args:
        input_text: String without separators to be segmented.
        word_statistics: Mapping of word -> occurrence count. Counts are
            assumed to be non-negative; entries with a count of zero are
            treated as unknown words.

    Returns:
        List of words, in order, whose concatenation equals ``input_text``.
        An empty ``input_text`` yields an empty list.

    Raises:
        ValueError: If ``input_text`` cannot be written as a concatenation of
            words that have a positive count in ``word_statistics``.
    """
    text_length = len(input_text)
    total_word_count = sum(word_statistics.values())  # Denominator for the probabilities

    # Compute every word's cost once instead of once per (start, end) pair.
    # Zero-count words are left out: log(0) is undefined and they can never
    # be part of a finite-cost segmentation.
    word_costs = {
        word: -math.log(count / total_word_count)
        for word, count in word_statistics.items()
        if count > 0
    }
    # No segment longer than the longest known word can match, so the inner
    # loop never needs to look further back than this.
    longest_word = max(map(len, word_costs), default=0)

    unreachable = float('inf')
    optimal_segmentation = [None] * (text_length + 1)
    optimal_score = [unreachable] * (text_length + 1)

    # Base case - zero cost for empty prefix
    optimal_score[0] = 0

    for end in range(1, text_length + 1):
        for start in range(max(0, end - longest_word), end):
            if optimal_score[start] == unreachable:
                continue  # The prefix ending at `start` has no valid split
            segment = input_text[start:end]
            cost = word_costs.get(segment)
            if cost is None:
                continue  # Not a known word
            score = optimal_score[start] + cost
            # Strict comparison: on ties the earliest start (longest word) wins.
            if score < optimal_score[end]:
                optimal_score[end] = score
                optimal_segmentation[end] = (start, segment)

    if text_length > 0 and optimal_segmentation[text_length] is None:
        raise ValueError(
            f"cannot segment {input_text!r} using the given word_statistics"
        )

    # Reconstruct the optimal word segmentation by walking the back-pointers
    index = text_length
    final_segments = []
    while index > 0:
        start, segment = optimal_segmentation[index]
        final_segments.append(segment)
        index = start

    return final_segments[::-1]

# Lowercase text with no separators that the segmenter has to split into words
input_url = "thelongestlistofthelongeststuffatthelongestdomainnameatlonglastcom"

# Unigram counts (word -> number of occurrences) that drive the word costs
word_statistics = {
    "the": 1000,
    "longest": 500,
    "list": 500,
    "of": 1000,
    "stuff": 100,
    "at": 1000,
    "domain": 100,
    "name": 500,
    "long": 200,
    "last": 200,
    "com": 1000,
}

# Segment the URL with the counts above and display the resulting word list
segmented_result = segment_text_viterbi(input_url, word_statistics)
print(segmented_result)


['the', 'longest', 'list', 'of', 'the', 'longest', 'stuff', 'at', 'the', 'longest', 'domain', 'name', 'at', 'long', 'last', 'com']
