In [2]:
import re
import os
from collections import Counter

def generate_stop_list(folder_name='articles', num_files=10, top_n=50,
                       output_filename='stop_list.txt'):
    """
    Build a stop-list from the most frequent words in a set of article files.

    Reads 'article1.txt' .. 'article<num_files>.txt' from ``folder_name``,
    lowercases the text, replaces every character other than ASCII a-z and
    whitespace with a space, splits on whitespace and counts the tokens.
    The ``top_n`` most frequent words are written to ``output_filename``
    (one per line, no trailing newline) and printed as a table.

    Args:
        folder_name: Directory that holds the numbered article files.
        num_files: Highest article number to try; missing or unreadable
            files are reported and skipped.
        top_n: Maximum number of words to keep in the stop-list.
        output_filename: Path of the file the stop-list is written to.

    Returns:
        The stop-list as a list of words in descending order of frequency
        (ties keep first-seen order), or None if the folder does not exist,
        no article could be read, or the output file could not be written.
    """
    # Built once so both error messages describe the files actually requested.
    expected_files = f"'article1.txt' to 'article{num_files}.txt'"

    # 1. Read all text from the numbered files in the requested folder
    print("\n--- Step 1: Reading and Aggregating Text ---")

    if not os.path.isdir(folder_name):
        print(f"\nERROR: The folder '{folder_name}' was not found.")
        print(f"Please ensure the folder exists and contains {expected_files}.")
        return None

    contents = []
    for i in range(1, num_files + 1):
        filepath = os.path.join(folder_name, f'article{i}.txt')
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                contents.append(f.read())
        except FileNotFoundError:
            # A missing file is not fatal: warn and carry on with the rest.
            print(f"Warning: File not found: {filepath}. Skipping.")
        except (OSError, UnicodeError) as e:
            # Unreadable or wrongly encoded files are skipped the same way.
            print(f"Error reading {filepath}: {e}")
        else:
            print(f"Successfully read: {filepath}")

    if not contents:
        print(f"\nERROR: Could not find any text files. Please ensure files "
              f"{expected_files} are inside the '{folder_name}' folder.")
        return None

    # 2. Tokenize and Clean the Text
    print("\n--- Step 2: Cleaning and Tokenizing Text ---")
    # Joining with a newline keeps the last word of one file from fusing
    # with the first word of the next.
    cleaned_text = "\n".join(contents).lower()
    # Only ASCII letters survive: digits, punctuation and non-ASCII letters
    # all become spaces (so "don't" yields the tokens "don" and "t").
    cleaned_text = re.sub(r'[^a-z\s]', ' ', cleaned_text)
    # str.split() with no argument never yields empty strings.
    words = cleaned_text.split()

    print(f"Total words collected and cleaned: {len(words):,}")

    # 3. Count Word Frequencies
    word_counts = Counter(words)

    # 4. Get the Top N Most Frequent Words
    print(f"\n--- Step 3: Finding the Top {top_n} Most Frequent Words ---")
    most_common_words = word_counts.most_common(top_n)
    stop_list = [word for word, _ in most_common_words]

    # 5. Save the Stop-List to a File (one word per line)
    try:
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write('\n'.join(stop_list))
    except OSError as e:
        print(f"Error saving file: {e}")
        return None
    # Report the number actually saved; it can be smaller than top_n when
    # the articles contain fewer distinct words.
    print(f"\nSUCCESS: The top {len(stop_list)} words have been saved to '{output_filename}'")

    # 6. Print the stop-list as a table
    print(f"\n--- Top {top_n} Most Frequent Words (The Stop-List) ---")
    header = f"| {'Word':<15} | {'Count':<7} |"
    print(header)
    # Rule width follows the header so the table edges line up.
    print("-" * len(header))
    for word, count in most_common_words:
        print(f"| {word:<15} | {count:<7} |")

    return stop_list


if __name__ == "__main__":
    # Script entry point: build the stop-list from the 'articles' folder,
    # leaving every other setting at its default.
    generate_stop_list('articles')


--- Step 1: Reading and Aggregating Text ---
Successfully read: articles/article1.txt
Successfully read: articles/article2.txt
Successfully read: articles/article3.txt
Successfully read: articles/article4.txt
Successfully read: articles/article5.txt
Successfully read: articles/article6.txt
Successfully read: articles/article7.txt
Successfully read: articles/article8.txt
Successfully read: articles/article9.txt
Successfully read: articles/article10.txt

--- Step 2: Cleaning and Tokenizing Text ---
Total words collected and cleaned: 3,860

--- Step 3: Finding the Top 50 Most Frequent Words ---

SUCCESS: The top 50 words have been saved to 'stop_list.txt'

--- Top 50 Most Frequent Words (The Stop-List) ---
| Word            | Count   |
----------------------------------
| the             | 247     |
| and             | 165     |
| of              | 130     |
| to              | 82      |
| in              | 75      |
| a               | 74      |
| by              | 46      |
| data     