In [17]:
import time
import re
import requests
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import TextBlob
import spacy
from collections import Counter

# --- SETUP: Download Required NLTK Data ---
# These downloads run every time the cell executes; NLTK reports packages that
# are already present as up-to-date (see the cell output below).
# 'punkt' / 'punkt_tab': presumably the tokenizer data behind sent_tokenize and
# word_tokenize -- TODO confirm against the installed NLTK version.
nltk.download('punkt')
# 'stopwords': the corpus read via stopwords.words('english') in run_nltk and
# run_textblob.
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Text normalization and cleaning


In [18]:
# ==========================================
# 0. HELPER: Roman to Arabic Conversion
# ==========================================
def roman_to_int(roman):
    """Return the value of a Roman numeral as a decimal string.

    The input is upper-cased first, so 'xiv' and 'XIV' give the same result.
    Characters that are not Roman symbols contribute 0; no error is raised.

    Args:
        roman: Roman numeral text, e.g. 'XIV'.

    Returns:
        str: The decimal value as a string, e.g. '14' ('0' for empty input).
    """
    symbol_values = {'I': 1, 'V': 5, 'X': 10, 'L': 50,
                     'C': 100, 'D': 500, 'M': 1000}
    digits = [symbol_values.get(symbol, 0) for symbol in roman.upper()]

    result = 0
    right_neighbour = 0
    # Walk right-to-left: a symbol smaller than the one to its right is
    # subtractive (the I in IV); anything else is additive.
    for digit in digits[::-1]:
        result += -digit if digit < right_neighbour else digit
        right_neighbour = digit
    return str(result)

def normalize_token(word):
    """Convert an upper-case Roman-numeral token to its Arabic form.

    A token is converted only when it is upper-case and the *whole* string is
    a well-formed Roman numeral. A lone "I" is left untouched so the pronoun
    is not turned into the number 1.

    Caveat: any all-caps word that happens to be a valid numeral is converted
    too (e.g. 'MIX' -> '1009', 'DIV' -> '504', 'V' -> '5').

    Args:
        word: A single token.

    Returns:
        str: The Arabic value as a string for Roman numerals, otherwise
        ``word`` unchanged.
    """
    # Strict grammar: thousands, hundreds, tens, units. No anchors are needed
    # because fullmatch() must consume the entire token.
    roman_pattern = r"M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"

    # Pronoun protection, plus the upper-case requirement (this also rejects
    # the empty string, which the pattern alone would accept).
    if not word.isupper() or word == "I":
        return word

    # fullmatch rather than match + '$': '$' also matches just before a
    # trailing newline, which would let a token like "IV\n" through.
    if re.fullmatch(roman_pattern, word) is None:
        return word

    return roman_to_int(word)

# Input file

In [19]:
# ==========================================
# 1. SETUP & UTILITIES
# ==========================================
def download_input_file(url="https://corpus.canterbury.ac.nz/descriptions/aliced29.txt",
                        timeout=30):
    """Download the input text file and return its contents as a string.

    Args:
        url: Address of the file to fetch. Defaults to the URL this notebook
            has always used.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: The response body decoded as Latin-1.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error
            (4xx/5xx) status.
        requests.exceptions.RequestException: For other request failures,
            including timeouts.
    """
    print(f"Downloading {url}...")
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status; otherwise the body of an error page is
    # returned and analysed as if it were the corpus.
    # NOTE(review): the recorded run of this notebook reports only 323 chars
    # downloaded, which looks far too short for the intended text -- confirm
    # that the default URL is correct.
    response.raise_for_status()
    return response.content.decode('latin-1')

def base_clean(text):
    """Strip symbols from ``text`` and collapse its whitespace.

    Every character that is not a word character, whitespace, a comma or a
    full stop is deleted (so apostrophes, hyphens, '!' and '?' are removed).
    Runs of whitespace are then squeezed to a single space and the ends are
    trimmed.

    Args:
        text: The raw input string.

    Returns:
        str: The cleaned, single-spaced string.
    """
    symbol_free = re.compile(r'[^\w\s,.]').sub('', text)
    single_spaced = re.compile(r'\s+').sub(' ', symbol_free)
    return single_spaced.strip()

# NLTK

In [20]:

# FRAMEWORK 1: NLTK
# ==========================================
def run_nltk(raw_text):
    """Clean, tokenize and count ``raw_text`` with NLTK, timing the whole run.

    Args:
        raw_text: The unprocessed input text.

    Returns:
        tuple: ``(elapsed_seconds, cleaned_text, sentences, clean_tokens,
        top_10)`` where ``cleaned_text`` is the kept tokens joined by spaces,
        ``sentences`` is the output of ``sent_tokenize`` on the cleaned text,
        ``clean_tokens`` is the list of kept (and Roman-converted) tokens and
        ``top_10`` is a list of ``(token, count)`` pairs.
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump, which skews short measurements.
    start_time = time.perf_counter()

    # A. Clean String
    step1_text = base_clean(raw_text)

    # B. Tokenize (sentences and words both come from the cleaned text)
    sentences = sent_tokenize(step1_text)
    tokens = word_tokenize(step1_text)

    # C. Filter Stopwords & Convert Roman Numerals
    # The stop-word test is applied to the original token, before any
    # Roman -> Arabic conversion.
    stop_words = set(stopwords.words('english'))
    clean_tokens = [
        normalize_token(w)
        for w in tokens
        if w.lower() not in stop_words and w.isalnum()
    ]

    # D. Outputs
    cleaned_text_str = " ".join(clean_tokens)
    top_10 = nltk.FreqDist(clean_tokens).most_common(10)

    elapsed = time.perf_counter() - start_time
    return elapsed, cleaned_text_str, sentences, clean_tokens, top_10

# TextBlob

In [21]:


# ==========================================
# FRAMEWORK 2: TextBlob
# ==========================================
def run_textblob(raw_text):
    """Clean, tokenize and count ``raw_text`` with TextBlob, timing the run.

    Args:
        raw_text: The unprocessed input text.

    Returns:
        tuple: ``(elapsed_seconds, cleaned_text, sentences, clean_tokens,
        top_10)`` where ``sentences`` is ``blob.sentences`` for the cleaned
        text, ``clean_tokens`` is the list of kept (and Roman-converted)
        tokens and ``top_10`` is a list of ``(token, count)`` pairs.
    """
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump, which skews short measurements.
    start_time = time.perf_counter()

    step1_text = base_clean(raw_text)
    blob = TextBlob(step1_text)

    sentences = blob.sentences
    tokens = blob.words

    # Stop words come from the NLTK corpus; the test is applied to the
    # original token, before any Roman -> Arabic conversion.
    stop_words = set(stopwords.words('english'))
    clean_tokens = [
        normalize_token(w)
        for w in tokens
        if w.lower() not in stop_words and w.isalnum()
    ]

    cleaned_text_str = " ".join(clean_tokens)
    top_10 = Counter(clean_tokens).most_common(10)

    elapsed = time.perf_counter() - start_time
    return elapsed, cleaned_text_str, sentences, clean_tokens, top_10

# spaCy

In [22]:
# ==========================================
# FRAMEWORK 3: spaCy
# ==========================================
def run_spacy(raw_text):
    """Clean, tokenize and count ``raw_text`` with spaCy, timing the run.

    The ``en_core_web_sm`` model is loaded before the timer starts, so model
    loading is not part of the reported time.

    Args:
        raw_text: The unprocessed input text.

    Returns:
        tuple: ``(elapsed_seconds, cleaned_text, sentences, clean_tokens,
        top_10)`` where ``sentences`` is ``list(doc.sents)``, ``clean_tokens``
        is the list of kept (and Roman-converted) token strings and ``top_10``
        is a list of ``(token, count)`` pairs.
    """
    nlp = spacy.load("en_core_web_sm")
    # Set the pipeline's input-length limit to 2,000,000 characters.
    nlp.max_length = 2000000

    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump, which skews short measurements.
    start_time = time.perf_counter()

    step1_text = base_clean(raw_text)
    doc = nlp(step1_text)
    sentences = list(doc.sents)

    # NOTE(review): filtering uses spaCy's own is_stop / is_alpha flags, while
    # run_nltk and run_textblob use the NLTK stop-word list and str.isalnum().
    # The kept tokens (e.g. ones containing digits) may therefore differ
    # between frameworks -- confirm this is intended.
    clean_tokens = [
        normalize_token(token.text)  # token.text is the plain string
        for token in doc
        if not token.is_stop and token.is_alpha
    ]

    cleaned_text_str = " ".join(clean_tokens)
    top_10 = Counter(clean_tokens).most_common(10)

    elapsed = time.perf_counter() - start_time
    return elapsed, cleaned_text_str, sentences, clean_tokens, top_10

# Output

In [23]:
# ==========================================
# MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    # Step 1: fetch the input text.
    raw_text = download_input_file()
    print(f"File downloaded. Length: {len(raw_text)} chars.")

    # Step 2: push the same raw text through each framework in turn.
    print("Running NLTK...")
    nltk_time, nltk_clean, nltk_sents, nltk_words, nltk_top10 = run_nltk(raw_text)

    print("Running TextBlob...")
    blob_time, blob_clean, blob_sents, blob_words, blob_top10 = run_textblob(raw_text)

    print("Running spaCy...")
    spacy_time, spacy_clean, spacy_sents, spacy_words, spacy_top10 = run_spacy(raw_text)

    # Step 3: write the report files. Only the NLTK results feed the text
    # outputs; the TextBlob and spaCy runs contribute their timings.

    # cleaned.txt: the filtered tokens as one space-separated string.
    with open("cleaned.txt", "w", encoding="utf-8") as cleaned_file:
        cleaned_file.write(nltk_clean)

    # words.txt: the first 5 sentences and the first 20 kept tokens.
    with open("words.txt", "w", encoding="utf-8") as words_file:
        sample_sentences = "".join(
            str(sentence) + "\n" for sentence in nltk_sents[:5]
        )
        words_file.write(
            "--- SENTENCES (Sample) ---\n"
            + sample_sentences
            + "\n--- TOKENS (Sample) ---\n"
            + str(nltk_words[:20])
        )

    # top10words.txt: a small ranked table of the most frequent tokens.
    with open("top10words.txt", "w", encoding="utf-8") as top10_file:
        table_header = ["Rank | Word | Frequency\n", "-" * 30 + "\n"]
        table_rows = [
            f"{rank} | {term} | {freq}\n"
            for rank, (term, freq) in enumerate(nltk_top10, 1)
        ]
        top10_file.writelines(table_header + table_rows)

    # time_compares.txt: elapsed seconds per framework, 4 decimal places.
    with open("time_compares.txt", "w", encoding="utf-8") as timing_file:
        timing_file.write(
            "Framework Performance Comparison\n"
            "================================\n"
            f"NLTK Time     : {nltk_time:.4f} seconds\n"
            f"TextBlob Time : {blob_time:.4f} seconds\n"
            f"spaCy Time    : {spacy_time:.4f} seconds\n"
        )

    print("\nDONE! Roman numerals (like II, VII) have been converted to Arabic (2, 7).")
    print("Files generated: cleaned.txt, words.txt, top10words.txt, time_compares.txt")

Downloading https://corpus.canterbury.ac.nz/descriptions/aliced29.txt...
File downloaded. Length: 323 chars.
Running NLTK...
Running TextBlob...
Running spaCy...

DONE! Roman numerals (like II, VII) have been converted to Arabic (2, 7).
Files generated: cleaned.txt, words.txt, top10words.txt, time_compares.txt
