In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import utils
from pathlib import Path
from pprint import pprint
from importlib import reload

In [None]:
# --- Configuration ---
# Directory tree that is searched recursively for the per-book .txt files.
SOURCE_DIRECTORY = Path("data/fr/")
# Single combined corpus file produced by this cell.
OUTPUT_FILE_PATH = Path("data/all_fr.txt")

# Marker strings delimiting the book text: everything before the start banner
# and everything from the end marker onwards is dropped.
START_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK"
END_MARKER = "*** END OF THE PROJECT GUTENBERG EBOOK"
# Inserted between consecutive books in the combined file.
# You can change this separator if needed (e.g., just "\n").
FILE_SEPARATOR = "\n\n--- FILE BREAK ---\n\n"


def _strip_gutenberg_boilerplate(text: str) -> str:
    """Return *text* without the Project Gutenberg header and footer.

    * Everything from the first ``END_MARKER`` onwards is discarded.
    * If ``START_MARKER`` is present, everything up to and including the
      ``***`` that closes the start banner is discarded. When no closing
      ``***`` follows the marker, the text right after the marker is kept.
    * Text containing neither marker is returned unchanged.
    """
    body = text.partition(END_MARKER)[0]

    # partition() splits on the FIRST marker only, so a later repetition of
    # the marker inside the book cannot truncate the remaining text.
    _, found_start, after_start = body.partition(START_MARKER)
    if not found_start:
        return body

    # The start banner has the form "<marker> <title> ***"; skip past the
    # closing "***" when there is one.
    banner_rest, closing, remainder = after_start.partition("***")
    return remainder if closing else banner_rest


print(f"Starting file combination from: {SOURCE_DIRECTORY.resolve()}")

combined_text = []
file_count = 0

# Recursively find all .txt files ("**" means "this directory and all
# subdirectories"). The matches are sorted because directory listing order is
# not guaranteed, and the combined corpus should be reproducible across runs.
for file_path in sorted(SOURCE_DIRECTORY.glob('**/*.txt')):
    try:
        book_txt = file_path.read_text(encoding='utf-8')
    except (OSError, UnicodeError) as e:
        # Best effort: report the unreadable file and carry on with the rest.
        print(f"ERROR reading file {file_path}: {e}")
        continue

    combined_text.append(_strip_gutenberg_boilerplate(book_txt))
    file_count += 1

# Join all text chunks with a clear separator
final_raw_text = FILE_SEPARATOR.join(combined_text)

# Ensure the output directory exists
OUTPUT_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)

# Write the combined text to the output file
try:
    OUTPUT_FILE_PATH.write_text(final_raw_text, encoding='utf-8')
except OSError as e:
    print(f"FATAL ERROR writing to output file {OUTPUT_FILE_PATH}: {e}")
    # Re-raise so that later steps do not silently load a stale or missing
    # corpus file.
    raise

print(f"\n✅ Successfully combined {file_count} files into: {OUTPUT_FILE_PATH.resolve()}")

# --- Load the combined corpus ---
# Pass the path of the combined output file to the project's
# `utils.load_text` helper. The helper is defined outside this file;
# presumably it returns the file contents as a string — TODO confirm.
corpus_path = OUTPUT_FILE_PATH
raw_text = utils.load_text(corpus_path)