# In [2]:
import os, re, sys, csv, chardet
sys.path.append("..")
from gutenberg_cleaner import super_cleaner
from modules.gutenberg_text_utils import non_novels, remove_above_strings, remove_below_strings, publishers, short_scraps, long_scraps

# Directory whose files are processed by this script.
# Toggle between the source files and a test directory (a test_files directory
# can be created manually and populated with a few files). This is useful for
# debugging problematic files, rather than doing a full processing run.
directory = '../sources/gutenberg_source_files'

# Only include novels. When set to 'on', novels_only() deletes every file whose
# name is listed in `non_novels` (imported from modules.gutenberg_text_utils).
novels = 'on'

def novels_only(directory: str):
    """Delete every file under *directory* whose name is listed in ``non_novels``.

    Nothing happens unless the module-level ``novels`` switch equals ``'on'``.
    The directory tree is walked recursively and matching files are removed
    from disk with ``os.remove``.
    """
    if novels != 'on':
        return
    for folder, _subdirs, names in os.walk(directory):
        for name in names:
            if name in non_novels:
                os.remove(os.path.join(folder, name))

def detect_file_encodings(directory, csv_file):
    """Write a CSV report of the encoding chardet guesses for each file.

    Args:
        directory: Directory to scan. Only regular files directly inside it
            are examined (the scan is not recursive).
        csv_file: Path of the report to create or overwrite. It receives a
            ``Filename,Encoding`` header followed by one row per file, sorted
            by file name. The encoding cell is empty when chardet reports
            ``None``.
    """
    encodings = {}
    # sorted() makes the report order reproducible; os.listdir() order is arbitrary.
    for filename in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, filename)
        # os.listdir() also returns sub-directories, which cannot be opened
        # as files and would abort the whole scan.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'rb') as f:
            encodings[filename] = chardet.detect(f.read())['encoding']

    # newline='' is what the csv module requires; an explicit encoding keeps
    # the report independent of the platform's default.
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Filename', 'Encoding'])
        writer.writerows(encodings.items())

def convert_files_to_utf8(directory):
    """Rewrite ASCII/UTF-8 files directly inside *directory* as UTF-8, in place.

    Each regular file is read as bytes and passed to ``chardet.detect``. When
    the detected encoding is ``'ascii'`` or ``'utf-8'`` the file is read as
    text with that encoding and written back as UTF-8. Files with any other
    detected encoding (or none) are left untouched. Sub-directories are
    skipped; the scan is not recursive.
    """
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        # os.listdir() also returns sub-directories, which cannot be opened
        # as files and would abort the whole run.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'rb') as f:
            encoding = chardet.detect(f.read())['encoding']
        # NOTE(review): files detected as anything else (e.g. Latin-1) are not
        # converted here, although they are the ones that would need it.
        # Confirm whether that is intended before widening this condition.
        if encoding in ('ascii', 'utf-8'):
            with open(file_path, 'r', encoding=encoding) as f:
                content = f.read()
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)

def rename_files(directory):
    """Remove spaces and the substring 'complete' from file names under *directory*.

    The tree is walked recursively. For every file the cleaned name is the
    original name with surrounding whitespace stripped, all spaces removed and
    every occurrence of ``'complete'`` removed.

    A file is left untouched when its name is already clean. It is also left
    untouched, with a message on stderr, when the cleaned name would be empty
    or would collide with an existing path: ``os.rename`` silently replaces an
    existing file on POSIX, which would destroy the other file.
    """
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            new_filename = filename.strip().replace(' ', '').replace('complete', '')
            if new_filename == filename:
                continue  # already clean, nothing to rename
            old_path = os.path.join(dirpath, filename)
            new_path = os.path.join(dirpath, new_filename)
            if not new_filename or os.path.exists(new_path):
                print(
                    'rename_files: not renaming {!r}; target {!r} is empty or already exists'.format(
                        old_path, new_path
                    ),
                    file=sys.stderr,
                )
                continue
            os.rename(old_path, new_path)

def additional_gutenberg_clean(text: str):
    """Strip leftover Project Gutenberg boilerplate from *text*, line by line.

    Each line is tested against these rules, in this order:

    1. A line containing the end-of-ebook marker or a row of spaced asterisks
       ends the text: the part of the line before the marker is kept and
       everything after it is dropped. These two markers are honoured even on
       lines that rules 2 and 3 would otherwise skip.
    2. A line containing "Transcriber's note:" is replaced by an empty line
       and the 8 lines that follow it are dropped.
    3. A line containing "First published" is dropped together with the
       2 lines that follow it.
    4. A line containing "Footnotes" or "These Short Books" ends the text in
       the same way as rule 1.
    5. A line longer than 10 characters that starts or ends with 15 spaces
       (presumably a centred line) is dropped.

    Returns:
        The kept lines, each terminated by ``'\\n'`` (an empty string when
        nothing is kept).
    """
    # Markers checked before the skip counter (rule 1) ...
    hard_stops = (
        '***END OF THE PROJECT GUTENBERG EBOOK',
        '      *      *      *      *      *',
    )
    # ... and markers checked only on lines that are not being skipped (rule 4).
    soft_stops = ('Footnotes', 'These Short Books')
    padding = ' ' * 15

    kept = []  # collected in a list and joined once, instead of repeated str +=
    skip = 0   # number of upcoming lines still to drop
    for line in text.splitlines():
        marker = next((m for m in hard_stops if m in line), None)
        if marker is not None:
            kept.append(line.split(marker)[0])
            break
        if skip > 0:
            skip -= 1
            continue
        if 'Transcriber\'s note:' in line:
            # The note line itself becomes an empty line; the next 8 are dropped.
            skip = 8
            kept.append('')
            continue
        if 'First published' in line:
            # Drops this line and the next 2 lines.
            skip = 2
            continue
        marker = next((m for m in soft_stops if m in line), None)
        if marker is not None:
            kept.append(line.split(marker)[0])
            break
        # Drop lines padded with 15 spaces on either side.
        if len(line) > 10 and (line.startswith(padding) or line.endswith(padding)):
            continue
        kept.append(line)
    return ''.join(kept_line + '\n' for kept_line in kept)

def remove_indented_text(text: str, num_spaces: int):
    """Blank out every line indented by at least *num_spaces* whitespace characters.

    A line matches when it begins with ``num_spaces`` horizontal whitespace
    characters followed by at least one more character. The content of a
    matching line is removed; its line break is kept, so an empty line stays
    in its place.

    Args:
        text: The text to filter.
        num_spaces: Indentation width that marks a line for removal.

    Returns:
        The text with the matching lines emptied.
    """
    # [^\S\r\n] is "whitespace except line breaks". Plain \s also matches
    # '\n', which let the indentation part swallow a run of blank lines and
    # then delete the following, unindented line.
    pattern = r'^[^\S\r\n]{{{}}}.+'.format(num_spaces)
    return re.sub(pattern, '', text, flags=re.MULTILINE)

def remove_above_blank_lines(text: str, num_blank_lines: int):
    """Return only the text that follows the last large vertical gap in *text*.

    A gap is ``num_blank_lines`` consecutive repetitions of "newline, optional
    spaces/tabs, newline". Each repetition consumes two newlines, so the gap
    is twice as many newlines as the argument. Everything up to and including
    the last such gap is discarded. When there is no gap the whole text is
    kept.

    If the remaining text contains ``'Editorial note:'``, everything up to and
    including its first occurrence is discarded as well.

    Args:
        text: The text to trim.
        num_blank_lines: Number of repetitions that make up a gap.

    Returns:
        The trimmed text.
    """
    # Non-capturing group: only the text after the last gap is needed.
    pattern = r'(?:\n[ \t]*\n){{{}}}'.format(num_blank_lines)
    last_section = re.split(pattern, text)[-1]
    marker = 'Editorial note:'
    if marker in last_section:
        # maxsplit=1 keeps the whole remainder; an unbounded split followed by
        # [1] would cut the text off at a second occurrence of the marker.
        last_section = last_section.split(marker, 1)[1]
    return last_section

def remove_roman_numeral_lines(text: str):
    """Drop every line that starts with a Roman numeral followed by a period.

    Leading whitespace before the numeral is allowed. The surviving lines are
    returned in order, each terminated by a newline.
    """
    numeral_heading = re.compile(
        r'^\s*(?=[MDCLXVI])(M{0,3})(C[MD]|D?C{0,3})(X[CL]|L?X{0,3})(I[XV]|V?I{0,3})\.'
    )
    survivors = [row for row in text.splitlines() if numeral_heading.match(row) is None]
    return ''.join(row + '\n' for row in survivors)

def remove_bracketed_text(text: str):
    """Delete every ``[...]`` span from *text*, including the brackets.

    A span runs from a ``[`` to the nearest following ``]`` and may extend
    over several lines. An opening bracket with no closing bracket is left
    alone.
    """
    bracketed = re.compile(r'\[[^\]]*?\]', flags=re.DOTALL)
    return bracketed.sub('', text)

def remove_deleted_lines(text: str):
    """Drop every line that contains the literal marker ``[deleted]``.

    The remaining lines are returned in order, each terminated by a newline.
    """
    return ''.join(
        row + '\n'
        for row in text.splitlines()
        if '[deleted]' not in row
    )

def remove_publishers(text: str, publishers: list) -> str:
    """Return *text* with every string in *publishers* removed.

    The strings are removed one after another in list order, so a later entry
    can match text that only became contiguous after an earlier removal.
    """
    for publisher in publishers:
        text = text.replace(publisher, '')
    return text

def remove_short_scraps(text: str, short_scraps: list) -> str:
    """Return *text* with every occurrence of each string in *short_scraps* deleted.

    Entries are processed sequentially in the order given.
    """
    remaining = text
    for scrap in short_scraps:
        remaining = remaining.replace(scrap, '')
    return remaining

def remove_long_scraps(text: str, long_scraps: list) -> str:
    """Return *text* with every occurrence of each string in *long_scraps* deleted.

    Entries are processed sequentially in the order given.
    """
    stripped = text
    for passage in long_scraps:
        stripped = stripped.replace(passage, '')
    return stripped

def remove_above_strings_in_text(text: str):
    """Cut away everything that precedes each marker in ``remove_above_strings``.

    For every marker found in the text, the text is reduced to the marker
    itself followed by whatever comes after its last occurrence. Markers are
    applied one after another to the progressively shortened text.
    """
    for marker in remove_above_strings:
        if marker not in text:
            continue
        tail = text.split(marker)[-1]
        text = marker + tail
    return text

def remove_below_strings_in_text(text: str):
    """Cut away everything that follows each marker in ``remove_below_strings``.

    For every marker found in the text, the text is reduced to whatever comes
    before its first occurrence followed by the marker itself. Markers are
    applied one after another to the progressively shortened text.
    """
    for marker in remove_below_strings:
        head, found, _rest = text.partition(marker)
        # `found` is the marker when it occurs in the text, '' otherwise.
        if found:
            text = head + found
    return text

# Clean up the file names (remove spaces and the word 'complete').
rename_files(directory)

# Run the novels_only function
novels_only(directory)

# Run the detect file encodings function. This only needs to be run to ascertain the encoding of files.
# detect_file_encodings(directory, 'output/file_encodings.csv')

# Convert from ascii to UTF-8 encoding
convert_files_to_utf8(directory)

# Run the functions (the order is important). Each file is overwritten with its cleaned text.
for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    # os.listdir() also returns sub-directories, which cannot be opened as
    # files and would abort the run part-way through.
    if not os.path.isfile(filepath):
        continue
    # errors='ignore' silently drops any bytes that are not valid UTF-8.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    cleaned_text = super_cleaner(text, min_token=5, max_token=600)
    cleaned_text = remove_indented_text(cleaned_text, 6)
    cleaned_text = additional_gutenberg_clean(cleaned_text)
    cleaned_text = remove_roman_numeral_lines(cleaned_text)
    cleaned_text = remove_publishers(cleaned_text, publishers)
    cleaned_text = remove_short_scraps(cleaned_text, short_scraps)
    cleaned_text = remove_long_scraps(cleaned_text, long_scraps)
    cleaned_text = remove_above_blank_lines(cleaned_text, 15)
    cleaned_text = remove_bracketed_text(cleaned_text)
    cleaned_text = remove_deleted_lines(cleaned_text)
    cleaned_text = remove_above_strings_in_text(cleaned_text)
    cleaned_text = remove_below_strings_in_text(cleaned_text)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)


