In [4]:
import glob
from bs4 import BeautifulSoup

In [2]:
# glob returns matches in an arbitrary, filesystem-dependent order. Sorting makes
# file_list (and therefore which file ends up first in all_texts) the same on
# every run and every machine.
file_list = sorted(glob.glob('reut2-*.sgm'))
all_texts = []

In [3]:
print(f"Found {len(file_list)} files matching pattern 'reut2-*.sgm'.")

# Start from an empty list inside this cell so that re-running it never appends
# a second copy of every file to all_texts.
all_texts = []
for file in file_list:
    # errors='ignore' silently drops any bytes that are not valid UTF-8.
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        all_texts.append(f.read())

print(f"Read {len(all_texts)} files.")

Found 22 files matching pattern 'reut2-*.sgm'.
Read 22 files.


In [5]:
# Only one file is analysed from here on: the first entry of all_texts
# (whichever file happened to be read first).
first_file_content = all_texts[0]

In [7]:
# Parse the raw file text with BeautifulSoup, using the 'html.parser' backend.
soup = BeautifulSoup(first_file_content, 'html.parser')

In [8]:
# Collect every element tagged 'reuters' from the parsed document.
articles = soup.find_all('reuters')

In [None]:
# Pull the text out of every collected element and glue the pieces together.
# NOTE(review): get_text() is called on each whole 'reuters' element, so text
# from all of its nested tags is included, not only the body the label below
# mentions. Confirm whether that is intended.
all_article_text = " ".join(map(lambda node: node.get_text(), articles))
# Splitting on whitespace and re-joining collapses newlines/tabs/repeated
# spaces into single spaces.
clean_text = " ".join(all_article_text.split())
print(f"Total characters in the article body: {len(clean_text)}")

Total characters in the article body: 937333


In [None]:
# Tokenise by whitespace only; str.split() does not separate punctuation, so it
# stays attached to the neighbouring word.
tokens = clean_text.split()
print(f"\nTotal tokens in the article body: {len(tokens)}")


Total tokens in the article body: 152991


In [None]:
import random
from collections import Counter

def next_word_freq(token_array, input_sentence):
    """Count which tokens follow each occurrence of a phrase in a token list.

    The phrase is split on whitespace and compared, case-insensitively, with
    every run of the same number of consecutive tokens in ``token_array``.

    Args:
        token_array: Sequence of string tokens to search.
        input_sentence: Phrase to look for. Leading, trailing and repeated
            whitespace is ignored. A phrase with no words matches at every
            position, so the result is then the frequency of every token.

    Returns:
        dict mapping each token found immediately after a match to the number
        of times it followed the phrase, ordered by first occurrence. Empty
        when the phrase never occurs with a token after it.
    """
    phrase_tokens = input_sentence.split()
    phrase_length = len(phrase_tokens)
    # Rebuild the phrase from its tokens so stray whitespace in input_sentence
    # cannot prevent a match, and lower-case it once instead of per iteration.
    target = ' '.join(phrase_tokens).lower()

    followers = Counter()
    # The range stops phrase_length short of the end, so the token right after
    # each window always exists and no extra bounds check is needed.
    for start in range(len(token_array) - phrase_length):
        end = start + phrase_length
        window = ' '.join(token_array[start:end])
        if window.lower() == target:
            followers[token_array[end]] += 1

    return dict(followers)

✓ next_word_freq function defined


In [None]:
def calculate_cdf(frequency_dict):
    """Turn a word -> frequency mapping into a word -> cumulative probability mapping.

    Words are processed in the mapping's iteration order; each word's value is
    the share of the total frequency covered by that word and all words before
    it.

    Args:
        frequency_dict: Mapping of word to its frequency (count).

    Returns:
        dict with the same keys in the same order, holding non-decreasing
        cumulative probabilities (for non-negative frequencies). With integer
        counts the last value is exactly 1.0. An empty or missing input gives
        an empty dict.

    Raises:
        ZeroDivisionError: if the frequencies sum to zero.
    """
    if not frequency_dict:
        return {}

    total_frequency = sum(frequency_dict.values())
    running_total = 0
    cdf_dict = {}

    for word, frequency in frequency_dict.items():
        # Accumulate the raw counts and divide once per word. Adding up
        # already-rounded float probabilities drifts (e.g. ten 0.1 steps end at
        # 0.9999999999999999 instead of 1.0).
        running_total += frequency
        cdf_dict[word] = running_total / total_frequency

    return cdf_dict

✓ calculate_cdf function defined


In [None]:
def predict_next_word(cdf_dict):
    """Sample one word from a cumulative-probability mapping.

    A single number is drawn with ``random.uniform(0, 1)`` and the mapping is
    walked in iteration order; the first word whose cumulative value is at
    least the drawn number is returned.

    Args:
        cdf_dict: Mapping of word to cumulative probability.

    Returns:
        The selected word, the last word of the mapping if no cumulative value
        reaches the drawn number, or None when the mapping is empty or missing.
    """
    if not cdf_dict:
        return None

    draw = random.uniform(0, 1)

    chosen = None
    for candidate in cdf_dict:
        # Remember the most recent key so that, if the loop runs to completion,
        # the final key is what gets returned.
        chosen = candidate
        if cdf_dict[candidate] >= draw:
            break

    return chosen

✓ predict_next_word function defined


In [None]:
def generate_text(starting_word, target_length=10, corpus_tokens=None):
    """Extend a seed phrase by repeatedly sampling a following word from the corpus.

    Each step looks up the words that follow the current context with
    ``next_word_freq``, converts the counts with ``calculate_cdf`` and samples
    one with ``predict_next_word``. The first lookup uses the whole seed; every
    later lookup uses only the word that was just generated.

    Args:
        starting_word: Seed text. Its whitespace-separated words count toward
            ``target_length``.
        target_length: Word count at which generation stops.
        corpus_tokens: Token list to sample from; when None, the module-level
            ``tokens`` is used.

    Returns:
        The seed followed by the sampled words, joined by single spaces.
        Generation ends early if the current context has no recorded follower.
    """
    source_tokens = tokens if corpus_tokens is None else corpus_tokens

    pieces = [starting_word]
    context = starting_word
    word_count = len(starting_word.split())

    while word_count < target_length:
        followers = next_word_freq(source_tokens, context)
        if not followers:
            # Dead end: nothing ever follows the current context.
            break

        pick = predict_next_word(calculate_cdf(followers))
        if pick is None:
            break

        pieces.append(pick)
        context = pick
        word_count += 1

    return " ".join(pieces)

✓ generate_text function defined


## Text Generation

Test the text generation system:

In [None]:
# Generate up to 10 words starting from "the". Each next word is sampled at
# random, so the printed text can differ between runs unless the random module
# was seeded beforehand.
result1 = generate_text("the", target_length=10)
print(result1)

EXAMPLE 1: Starting with 'the'
🚀 Starting text generation with: 'the'
📊 Corpus size: 152,991 tokens
🎯 Target length: 10 words
--------------------------------------------------
Step 1: Looking for words after 'the'...
   Top candidates: [('company', 225), ('U.S.', 137), ('government', 79)]
   ✅ Predicted: 'quake' (frequency: 2)
Step 2: Looking for words after 'quake'...
   Top candidates: [('jolted', 1), ('cracked', 1), ('measured', 1)]
   ✅ Predicted: 'measured' (frequency: 1)
Step 3: Looking for words after 'measured'...
   Top candidates: [('by', 1), ('6.25', 1)]
   ✅ Predicted: 'by' (frequency: 1)
Step 4: Looking for words after 'by'...
   Top candidates: [('the', 149), ('a', 41), ('Standard', 6)]
   ✅ Predicted: 'commercial' (frequency: 2)
Step 5: Looking for words after 'commercial'...
   Top candidates: [('banks', 13), ('bank', 9), ('paper', 7)]
   ✅ Predicted: 'Credit' (frequency: 4)
Step 6: Looking for words after 'Credit'...
   Top candidates: [('card', 8), ('and', 5), ('Co',

In [None]:
# Same experiment with a different seed word and a shorter target of 8 words.
result2 = generate_text("is", target_length=8)
print(result2)

EXAMPLE 2: Starting with 'is'
🚀 Starting text generation with: 'is'
📊 Corpus size: 152,991 tokens
🎯 Target length: 8 words
--------------------------------------------------
Step 1: Looking for words after 'is'...
   Top candidates: [('expected', 49), ('a', 40), ('not', 40)]
   ✅ Predicted: 'not' (frequency: 40)
Step 2: Looking for words after 'not'...
   Top candidates: [('be', 31), ('have', 12), ('to', 11)]
   ✅ Predicted: 'be' (frequency: 31)
Step 3: Looking for words after 'be'...
   Top candidates: [('used', 28), ('a', 27), ('the', 19)]
   ✅ Predicted: 'combined' (frequency: 1)
Step 4: Looking for words after 'combined'...
   Top candidates: [('pays', 8), ('sales', 3), ('with', 2)]
   ✅ Predicted: '1986' (frequency: 1)
Step 5: Looking for words after '1986'...
   Top candidates: [('and', 23), ('net', 13), ('was', 6)]
   ✅ Predicted: 'JAKARTA,' (frequency: 1)
Step 6: Looking for words after 'JAKARTA,'...
   Top candidates: [('March', 11)]
   ✅ Predicted: 'March' (frequency: 11)
Ste

In [None]:
# Two-word seed: the first lookup searches for the full phrase "the company";
# after that, generate_text conditions only on the single word it just produced.
# Both seed words count toward the target of 12.
result3 = generate_text("the company", target_length=12)
print(result3)

## Custom Generation

Experiment with different parameters:

In [None]:
# Parameters to experiment with: the seed text and the total number of words
# (seed words included) at which generation stops.
your_starting_word = "market"
desired_length = 15

# The result may be shorter than desired_length if generation reaches a word
# that has no recorded follower in the corpus.
result_custom = generate_text(your_starting_word, target_length=desired_length)
print(result_custom)