In [33]:
import random
import re
from collections import defaultdict

In [34]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    return words

In [35]:
def build_markov_chain(words, chain_length):
    transitions = defaultdict(lambda: defaultdict(int))

    for i in range(len(words) - chain_length):
        current_state = tuple(words[i:i + chain_length])
        next_state = words[i + chain_length]
        transitions[current_state][next_state] += 1

    markov_chain = defaultdict(dict)
    for current_state, next_states in transitions.items():
        total_transitions = sum(next_states.values())
        for next_state, count in next_states.items():
            markov_chain[current_state][next_state] = count / total_transitions

    return markov_chain

In [36]:
def generate_sentence(markov_chain, start_words, num_generated):
    current_state = tuple(start_words)
    generated_sentence = list(current_state)

    for _ in range(num_generated):
        next_states = markov_chain.get(current_state, {})
        if not next_states:
            break
        next_state = random.choices(list(next_states.keys()), list(next_states.values()))[0]
        generated_sentence.append(next_state)
        current_state = tuple(generated_sentence[-len(start_words):])

    return ' '.join(generated_sentence)

In [37]:
def generate_from_corpus(text, start_words, chain_length, num_generated):
    words = preprocess_text(text)
    markov_chain = build_markov_chain(words, chain_length)
    generated_sentence = generate_sentence(markov_chain, start_words, num_generated)
    return generated_sentence

In [46]:
# Test Case 1: Basic Test
text_corpus_1 = """
This is a sample text corpus. It contains some words that we can use to build a Markov chain model.
The Markov chain model will generate sentences based on the structure and patterns of this text corpus.
"""
start_words_1 = ['this', 'is']
chain_length_1 = 2
num_generated_1 = 10
generated_sentence_1 = generate_from_corpus(text_corpus_1, start_words_1, chain_length_1, num_generated_1)
print("Generated Sentence (Test Case 1):", generated_sentence_1)


Generated Sentence (Test Case 1): this is a sample text corpus it contains some words that we


In [47]:
# Test Case 2: Longer Chain Length
text_corpus_2 = """
This is a sample text corpus. It contains some words that we can use to build a Markov chain model.
The Markov chain model will generate sentences based on the structure and patterns of this text corpus.
"""
start_words_2 = ['this', 'is']
chain_length_2 = 3
num_generated_2 = 10
generated_sentence_2 = generate_from_corpus(text_corpus_2, start_words_2, chain_length_2, num_generated_2)
print("Generated Sentence (Test Case 2):", generated_sentence_2)


Generated Sentence (Test Case 2): this is


In [48]:
# Test Case 3: Different Start Words
text_corpus_3 = """
This is a sample text corpus. It contains some words that we can use to build a Markov chain model.
The Markov chain model will generate sentences based on the structure and patterns of this text corpus.
"""
start_words_3 = ['sample', 'text']
chain_length_3 = 2
num_generated_3 = 10
generated_sentence_3 = generate_from_corpus(text_corpus_3, start_words_3, chain_length_3, num_generated_3)
print("Generated Sentence (Test Case 3):", generated_sentence_3)


Generated Sentence (Test Case 3): sample text corpus it contains some words that we can use to


In [49]:
# Test Case 4: Varying Number of Words Generated
text_corpus_4 = """
This is a sample text corpus. It contains some words that we can use to build a Markov chain model.
The Markov chain model will generate sentences based on the structure and patterns of this text corpus.
"""
start_words_4 = ['this', 'is']
chain_length_4 = 2
num_generated_15 = 15
num_generated_5 = 5
generated_sentence_4 = generate_from_corpus(text_corpus_4, start_words_4, chain_length_4, num_generated_15)
generated_sentence_5 = generate_from_corpus(text_corpus_4, start_words_4, chain_length_4, num_generated_5)
print("Generated Sentence (Test Case 4 - 15 words):", generated_sentence_4)
print("Generated Sentence (Test Case 4 - 5 words):", generated_sentence_5)


Generated Sentence (Test Case 4 - 15 words): this is a sample text corpus it contains some words that we can use to build a
Generated Sentence (Test Case 4 - 5 words): this is a sample text corpus it
