# Task 1: Third-order Letter Approximation Model
- Creating a third-order letter approximation model from five English texts sourced from Project Gutenberg. texts will be processed by:
- Removing unnecessary content (preamble, postamble).
- Retaining only ASCII letters, full stops, and spaces.
- Converting all letters to uppercase.

Count the occurrences of each trigram (a sequence of three characters) to build a model of the English language based on these texts.


# .txt files saved on local directory / using py. to read them.

# ref: https://realpython.com/read-write-files-python/
# https://www.dataquest.io/blog/read-file-python/

In [6]:
# List of file paths for the texts
import re

file_paths = [
    r"C:\Users\hemer\emerginTechnologies\Text\betrothed.txt",
    r"C:\Users\hemer\emerginTechnologies\Text\chronicles.txt",
    r"C:\Users\hemer\emerginTechnologies\Text\Frank.txt",
    r"C:\Users\hemer\emerginTechnologies\Text\school.txt",
    r"C:\Users\hemer\emerginTechnologies\Text\voyage.txt"
]

# Function to load texts
def load_texts(file_paths):
    texts = []
    for path in file_paths:
        with open(path, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    return texts

# Load all texts and display the first 500 characters of each for verification
texts = load_texts(file_paths)
print("Loaded Texts :")
for i, text in enumerate(texts):
    print(f"Text {i+1}:\n{text[:500]}\n{'-'*40}")


Loaded Texts :
Text 1:
The Project Gutenberg eBook of My betrothed and other poems
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this
----------------------------------------
Text 2:
The Project Gutenberg eBook of The chronicles of Enguerrand de Monstrelet
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,

# Remove preamble and postamble, Removes all characters except ASCII letters, spaces, and full stops andConverts all letters to uppercase.
# Ref: https://datagy.io/python-read-text-file/

In [7]:
# Function to preprocess each text
def preprocess_text(text):
    # Remove preamble and postamble
    start_marker = "START OF THIS PROJECT GUTENBERG EBOOK"
    end_marker = "END OF THIS PROJECT GUTENBERG EBOOK"
    
    start_pos = text.find(start_marker)
    end_pos = text.find(end_marker)
    
    # Slice text to keep only the main content
    if start_pos != -1:
        text = text[start_pos + len(start_marker):]
    if end_pos != -1:
        text = text[:end_pos]
    
    # Remove unwanted characters, keep only ASCII letters, full stops, and spaces
    text = re.sub(r'[^A-Za-z. ]', '', text)
    text = text.upper()  # Convert to uppercase
    return text

# Preprocess each text and display the first 500 characters after preprocessing
processed_texts = [preprocess_text(text) for text in texts]
print("\nPreprocessed Texts - First 500 Characters of Each Text")
for i, text in enumerate(processed_texts):
    print(f"Processed Text {i+1}:\n{text[:500]}\n{'-'*40}")


Preprocessed Texts - First 500 Characters of Each Text
Processed Text 1:
THE PROJECT GUTENBERG EBOOK OF MY BETROTHED AND OTHER POEMS    THIS EBOOK IS FOR THE USE OF ANYONE ANYWHERE IN THE UNITED STATES ANDMOST OTHER PARTS OF THE WORLD AT NO COST AND WITH ALMOST NO RESTRICTIONSWHATSOEVER. YOU MAY COPY IT GIVE IT AWAY OR REUSE IT UNDER THE TERMSOF THE PROJECT GUTENBERG LICENSE INCLUDED WITH THIS EBOOK OR ONLINEAT WWW.GUTENBERG.ORG. IF YOU ARE NOT LOCATED IN THE UNITED STATESYOU WILL HAVE TO CHECK THE LAWS OF THE COUNTRY WHERE YOU ARE LOCATEDBEFORE USING THIS EBOOK.TITL
----------------------------------------
Processed Text 2:
THE PROJECT GUTENBERG EBOOK OF THE CHRONICLES OF ENGUERRAND DE MONSTRELET    THIS EBOOK IS FOR THE USE OF ANYONE ANYWHERE IN THE UNITED STATES ANDMOST OTHER PARTS OF THE WORLD AT NO COST AND WITH ALMOST NO RESTRICTIONSWHATSOEVER. YOU MAY COPY IT GIVE IT AWAY OR REUSE IT UNDER THE TERMSOF THE PROJECT GUTENBERG LICENSE INCLUDED WITH THIS EBOOK OR ONLINEAT WWW.GUTENB

# Creates a function to iterate through each preprocessed text and extract trigrams

In [8]:
# Function to generate trigram model
def generate_trigram_model(texts):
    trigram_counts = {}  # Dictionary to store trigram counts

    for text in texts:
        for i in range(len(text) - 2):  # Loop through each character up to the third-to-last
            trigram = text[i:i + 3]  # Extract a trigram of three characters
            if trigram in trigram_counts:
                trigram_counts[trigram] += 1  # Increment count if trigram exists
            else:
                trigram_counts[trigram] = 1  # Initialize count if trigram is new
    
    return trigram_counts

# Generate the trigram model and display a sample of 10 items
trigram_model = generate_trigram_model(processed_texts)
print("\n10 Trigrams with Counts:")
sample_trigrams = list(trigram_model.items())[:10]
for trigram, count in sample_trigrams:
    print(f"'{trigram}': {count}")


10 Trigrams with Counts:
'THE': 22719
'HE ': 19788
'E P': 2035
' PR': 2578
'PRO': 1583
'ROJ': 445
'OJE': 445
'JEC': 597
'ECT': 1577
'CT ': 808


# Creates a function to display the most common tiagrams

In [9]:
# Display top N trigrams
def display_top_trigrams(trigram_model, top_n=10):
    # Sort trigrams by count in descending order
    sorted_trigrams = sorted(trigram_model.items(), key=lambda item: item[1], reverse=True)
    # Display the top N trigrams
    print(f"\n {top_n} Most Common Trigrams")
    for trigram, count in sorted_trigrams[:top_n]:
        print(f"'{trigram}': {count}")

# Display the 10 most common trigrams
display_top_trigrams(trigram_model, top_n=10)


 10 Most Common Trigrams
' TH': 25434
'THE': 22719
'HE ': 19788
'   ': 17800
'AND': 10688
'ND ': 10315
' AN': 10238
'ED ': 9757
' TO': 9718
' OF': 9412


# Task 2: Third-Order Letter Approximation Generation
I'll be using the trigram model generated in (Task 1) to create a string of 10,000 characters. Starting with a seed (e.g., "TH"), each new character is generated based on the previous two characters, selecting the next character probabilistically based on trigram frequencies.


In [10]:
import random

def generate_text(trigram_model, start="TH", length=10000):
    # Initialize generated text with the starting string
    generated_text = start
    print(f"Starting text generation with seed: '{start}'\n")

    # Loop until the desired length is reached
    while len(generated_text) < length:
        # Get the last two characters to use as a prefix
        prefix = generated_text[-2:]
        print(f"\nCurrent prefix: '{prefix}'")

        # Find all trigrams that start with this prefix
        candidates = {k: v for k, v in trigram_model.items() if k.startswith(prefix)}
        
        if not candidates:
            print("No matching trigrams found. Ending text generation.")
            break  # Stop generation if no matching trigrams are found

        # Extract the third character from each matching trigram and its count
        next_chars = [trigram[2] for trigram in candidates.keys()]
        weights = list(candidates.values())

        # Display candidates and their weights
        print("Candidate trigrams and weights:")
        for trigram, weight in candidates.items():
            print(f"'{trigram}': {weight}")
        
        # Choose the next character based on weighted probabilities
        next_char = random.choices(next_chars, weights=weights, k=1)[0]
        print(f"Chosen next character: '{next_char}'")

        # Append the chosen character to the generated text
        generated_text += next_char

    print("\nFinished text generation.\n")
    return generated_text

# Generate a 10,000-character text using the trigram model from Task 1
generated_string = generate_text(trigram_model, start="TH", length=10000)

# Display the beginning of the generated text to verify the output
print("\nGenerated Text Sample (First 500 Characters):")
print(generated_string[:500])


Starting text generation with seed: 'TH'


Current prefix: 'TH'
Candidate trigrams and weights:
'THE': 22719
'THI': 2995
'TH ': 2991
'THO': 1252
'THA': 3725
'THR': 542
'THU': 123
'THY': 84
'THT': 42
'THS': 135
'THW': 16
'THK': 3
'THF': 44
'THL': 19
'TH.': 95
'THQ': 2
'THM': 22
'THD': 21
'THH': 24
'THN': 4
'THB': 8
'THP': 6
'THC': 7
'THJ': 2
'THV': 11
Chosen next character: 'A'

Current prefix: 'HA'
Candidate trigrams and weights:
'HAT': 3878
'HAV': 1583
'HAP': 496
'HAS': 431
'HAD': 2410
'HAI': 111
'HAN': 1152
'HAR': 762
'HAL': 473
'HAK': 27
'HAC': 19
'HAE': 13
'HAB': 96
'HAM': 219
'HAU': 34
'HA ': 15
'HAW': 9
'HAG': 6
'HAF': 12
'HAZ': 2
Chosen next character: 'D'

Current prefix: 'AD'
Candidate trigrams and weights:
'ADI': 200
'ADE': 757
'AD ': 2768
'ADL': 53
'ADO': 125
'ADN': 24
'ADD': 160
'ADS': 78
'AD.': 55
'ADM': 123
'ADY': 343
'ADW': 17
'ADA': 78
'ADP': 11
'ADU': 13
'ADV': 142
'ADR': 29
'ADF': 148
'ADJ': 19
'ADB': 29
'ADC': 20
'ADG': 6
'ADT': 20
'ADH': 6
Chosen next character: 'A'

# Task 3: Analyze your model
It Loads a list of valid English words, splits the generated text into individual words and checks if they exist in the list and also calculates and prints the percentage of valid English words.


In [11]:
# Load English words from words.txt
def load_english_words(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return set(word.strip().lower() for word in file.readlines())

# Path to words.txt
words_file_path = r'C:\Users\hemer\emerginTechnologies\words.txt'
english_words = load_english_words(words_file_path)
print(f"Loaded {len(english_words)} English words.")

# Analyze the generated text
def analyze_generated_text(text, english_words):
    # Normalize the generated text and split into words
    words_in_text = re.findall(r'\b[a-zA-Z]+\b', text.lower())
    
    # Count valid English words
    valid_words = [word for word in words_in_text if word in english_words]
    
    # Calculate percentage of valid words
    percentage = (len(valid_words) / len(words_in_text)) * 100 if words_in_text else 0
    return percentage, len(valid_words), len(words_in_text)

# Analyze the generated string
percentage, valid_count, total_count = analyze_generated_text(generated_string, english_words)
print(f"Valid English Words: {valid_count}/{total_count} ({percentage:.2f}%)")


Loaded 45373 English words.
Valid English Words: 628/1716 (36.60%)
