In [None]:
import re
import os
from collections import Counter
def generate_stop_list(num_files: int = 10, top_n: int = 50) -> list:
    """
    Build a stop-list from the most frequent words in a set of article files.

    Reads ``article1.txt`` through ``article{num_files}.txt`` from the current
    working directory, lowercases the text, replaces every character other
    than ``a``-``z`` and whitespace with a space, and counts the resulting
    whitespace-separated tokens. The ``top_n`` most frequent tokens are
    written to ``stop_list.txt`` (one per line) and printed as a table.

    Note: very common function words such as "to" and "and" are deliberately
    NOT filtered out; the stop-list is purely frequency based.

    Note: because only ASCII letters are kept, digits and accented letters are
    dropped, and contractions are split at the apostrophe ("don't" becomes
    the two tokens "don" and "t").

    Args:
        num_files: Number of ``article<i>.txt`` files to look for, starting
            at 1. Missing or unreadable files are reported and skipped.
        top_n: Maximum number of words to put in the stop-list.

    Returns:
        The stop-list as a list of words, most frequent first. An empty list
        is returned when none of the input files could be read. If writing
        ``stop_list.txt`` fails, the error is printed and the computed list
        is still returned (the table is not printed in that case).
    """
    # 1. Read all text from the specified files article1..article<num_files>.txt
    print("\n--- Step 1: Reading and Aggregating Text ---")
    contents = []
    for i in range(1, num_files + 1):
        filename = f'article{i}.txt'
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                contents.append(f.read())
            print(f"Successfully read: {filename}")
        except FileNotFoundError:
            print(f"Warning: File not found: {filename}. Skipping.")
        except (OSError, UnicodeDecodeError) as e:
            # Permission problems, directories, bad encodings, ...: report
            # and carry on with the remaining files.
            print(f"Error reading {filename}: {e}")

    if not contents:
        # Report the range that was actually searched, not a fixed 1..10.
        print(f"\nERROR: Could not find any text files (article1.txt to article{num_files}.txt). Please ensure they are in the same directory.")
        return []

    # 2. Tokenize and Clean the Text
    print("\n--- Step 2: Cleaning and Tokenizing Text ---")
    # Joining with a newline keeps the last word of one file from fusing with
    # the first word of the next.
    cleaned_text = "\n".join(contents).lower()
    # Replace anything that is not an ASCII letter or whitespace with a space.
    cleaned_text = re.sub(r'[^a-z\s]', ' ', cleaned_text)
    # str.split() with no arguments never yields empty strings.
    words = cleaned_text.split()

    print(f"Total words collected and cleaned: {len(words):,}")

    # 3. Count Word Frequencies
    word_counts = Counter(words)

    # 4. Get the Top N Most Frequent Words
    print(f"\n--- Step 3: Finding the Top {top_n} Most Frequent Words ---")
    most_common_words = word_counts.most_common(top_n)
    stop_list = [word for word, _count in most_common_words]

    # 5. Save the Stop-List to a File (one word per line)
    output_filename = 'stop_list.txt'
    try:
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write('\n'.join(stop_list))
        print(f"\nSUCCESS: The top {top_n} words have been saved to '{output_filename}'")
    except OSError as e:
        print(f"Error saving file: {e}")
        return stop_list

    # 6. Print the words together with their counts
    print(f"\n--- Top {top_n} Most Frequent Words (The Stop-List) ---")
    print(f"| {'Word':<15} | {'Count':<7} |")
    print("-" * 34)
    for word, count in most_common_words:
        print(f"| {word:<15} | {count:<7} |")

    return stop_list


# Script entry point: build the stop-list with the default arguments only when
# this file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    generate_stop_list()


--- Step 1: Reading and Aggregating Text ---
Successfully read: article1.txt
Successfully read: article2.txt
Successfully read: article3.txt
Successfully read: article4.txt
Successfully read: article5.txt
Successfully read: article6.txt
Successfully read: article7.txt
Successfully read: article8.txt
Successfully read: article9.txt
Successfully read: article10.txt

--- Step 2: Cleaning and Tokenizing Text ---
Total words collected and cleaned: 3,860

--- Step 3: Finding the Top 50 Most Frequent Words ---

SUCCESS: The top 50 words have been saved to 'stop_list.txt'

--- Top 50 Most Frequent Words (The Stop-List) ---
| Word            | Count   |
----------------------------------
| the             | 247     |
| and             | 165     |
| of              | 130     |
| to              | 82      |
| in              | 75      |
| a               | 74      |
| by              | 46      |
| data            | 44      |
| is              | 43      |
| as              | 36      |
| football 