In [1]:
# cleaning text
import os
import re
import spacy
from nltk.corpus import stopwords

In [2]:
# Load spaCy's small English pipeline once at module level; clean_text() below
# calls this shared `nlp` object for every document instead of reloading it.
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    """Reduce raw text to a space-separated string of content-word lemmas.

    Processing steps:
      1. Delete apostrophes, replace every other non-letter character with a
         space, collapse whitespace, and lowercase the result.
      2. Run the module-level spaCy pipeline ``nlp`` over the normalized text.
      3. Keep the lemma of each token that is tagged NOUN, VERB or ADJ, is
         longer than two characters, does not look like a number, and whose
         surface form *and* lemma are both absent from the stop-word set.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmas joined by single spaces (possibly an empty
        string).

    Side effects:
        Prints a warning when fewer than 20 tokens survive the filtering.
    """
    # Apostrophes (straight and typographic) are deleted rather than spaced so
    # contractions and possessives stay a single word, as they did before.
    text = re.sub(r"['\u2019]", '', text)
    # Every other non-letter becomes a space. Deleting them outright would
    # fuse words joined by hyphens or dashes ("one-eye" -> "oneeye").
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()

    # Start from NLTK's English stop words, then add domain-specific ones.
    custom_stop_words = set(stopwords.words('english'))
    custom_stop_words.update([
        'say', 'go', 'come', 'one', 'well', 'little', 'take',
        'could', 'look', 'know', 'see', 'old', 'give', 'time',
        'upon', 'said', 'would', 'should', 'must', 'may', 'might',
        'first', 'second', 'third'
    ])

    # Only these coarse part-of-speech tags count as content words.
    content_pos = ('NOUN', 'VERB', 'ADJ')

    doc = nlp(text)

    tokens = []
    for token in doc:
        if token.is_punct or token.is_space or token.like_num:
            continue
        if len(token.text) <= 2:  # Remove very short words
            continue
        if token.pos_ not in content_pos:
            continue
        lemma = token.lemma_
        # The stop list mostly holds base forms ('say', 'look', 'take', ...).
        # Checking only the surface form would let inflected variants such as
        # "looked" or "took" through and emit the very lemma the list bans,
        # so both the surface form and the lemma are tested.
        if token.text in custom_stop_words or lemma.lower() in custom_stop_words:
            continue
        tokens.append(lemma)

    # Validate the cleaned text
    if len(tokens) < 20:  # Minimum threshold for meaningful analysis
        print(f"Warning: Text has only {len(tokens)} tokens after cleaning")

    return ' '.join(tokens)
    

In [3]:
# Add validation function
def validate_cleaned_texts(directory, min_tokens=20, min_unique=10):
    """Check that every cleaned ``.txt`` file in ``directory`` is usable for LDA.

    Each file is split on whitespace. A file is reported when it has fewer
    than ``min_tokens`` tokens; otherwise it is reported when it has fewer
    than ``min_unique`` distinct tokens. A summary is printed to stdout.

    Args:
        directory: Folder containing the cleaned text files.
        min_tokens: Minimum number of tokens a document must contain.
        min_unique: Minimum number of distinct tokens a document must contain.

    Returns:
        bool: True when no file was reported (including when the folder holds
        no ``.txt`` files), False otherwise.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
    """
    problems = []
    # os.listdir() returns names in arbitrary order; sorting keeps the printed
    # report stable from run to run.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".txt"):
            continue

        with open(os.path.join(directory, filename), "r", encoding="utf-8") as file:
            tokens = file.read().split()

        # Check for potential issues; the vocabulary check only applies to
        # documents that already passed the length check.
        if len(tokens) < min_tokens:
            problems.append(f"{filename}: Only {len(tokens)} tokens")
            continue

        unique_count = len(set(tokens))
        if unique_count < min_unique:
            problems.append(f"{filename}: Only {unique_count} unique tokens")

    if problems:
        print("\nPotential problems found:")
        for problem in problems:
            print(problem)
    else:
        print("\nAll documents passed validation")

    return not problems

In [4]:
# Project root. Set the TOPIC_MODELING_DIR environment variable to run the
# notebook on another machine without editing this cell; when it is unset the
# original location is used, so the resulting paths are unchanged.
project_dir = os.environ.get(
    "TOPIC_MODELING_DIR", "/Users/jinalee/Desktop/Topic_modeling"
)
input_folder = os.path.join(project_dir, "raw_books")        # raw .txt books to clean
output_folder = os.path.join(project_dir, "cleaned_books")   # cleaned texts are written here
# Create the output folder up front; exist_ok keeps re-running this cell harmless.
os.makedirs(output_folder, exist_ok=True)

In [5]:
# Clean every raw .txt book and store the result under the same filename
# in output_folder; documents that end up too short are skipped.
for filename in os.listdir(input_folder):
    if not filename.endswith(".txt"):
        continue

    # Read the whole book, then release the file handle before the slow cleaning step.
    with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as file:
        raw_text = file.read()

    cleaned_text = clean_text(raw_text)

    # Too little content left after cleaning: report it and write nothing.
    if len(cleaned_text.split()) < 20:
        print(f"Warning: Skipped {filename} due to insufficient content")
        continue

    with open(os.path.join(output_folder, filename), 'w', encoding='utf-8') as out_file:
        out_file.write(cleaned_text)
    print(f"Processed and saved: {filename}")

Processed and saved: Mother holle.txt
Processed and saved: The elderbush.txt
Processed and saved: The six swans.txt
Processed and saved: Little red cap.txt
Processed and saved: The happy family.txt
Processed and saved: Little snow white.txt
Processed and saved: The little match girl.txt
Processed and saved: The golden goose.txt
Processed and saved: Cinderella.txt
Processed and saved: The three little men in the wood.txt
Processed and saved: Faithful John.txt
Processed and saved: The old house.txt
Processed and saved: The frog prince.txt
Processed and saved: The emperor's new clothes.txt
Processed and saved: The water of life.txt
Processed and saved: Briar rose.txt
Processed and saved: The shoes of fortune.txt
Processed and saved: Hansel and grethel.txt
Processed and saved: The story of a mother.txt
Processed and saved: Little one-eye, two-eyes and three-eyes.txt
Processed and saved: The swinherd.txt
Processed and saved: The false collar.txt
Processed and saved: The snow queen.txt
Proce

In [6]:
# Run validation on the files written to output_folder. The call is the cell's
# last expression, so its boolean result (True when no problems were reported)
# is displayed as the cell output.
print("\nValidating cleaned texts...")
validate_cleaned_texts(output_folder)


Validating cleaned texts...

All documents passed validation


True