In [89]:
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

txt_files = [
    "Alan_Weisman conv.txt",
    "Amy Godine conv.txt",
    "Anne_Therese_Gennari conv.txt",
    "Barbara_Kingsolver conv.txt",
    "Camille_T_Dungy conv.txt",
    "David_Goulson conv.txt",
    "Dina_Gilio_Whitaker conv.txt",
    "Greta_Thunberg conv.txt",
    "Jane_Goodall conv.txt",
    "Rachel_Carson conv.txt",
    "Val_Plumwood conv.txt",
    "_OceanofPDF.com_Braiding_Sweetgrass_-_Robin_Wall_Kimmerer conv.txt"
]

guten_dict = {
    "https://www.gutenberg.org/files/17748/17748-h/17748-h.htm": {
        "title": "The Extermination of the American Bison",
        "author": "William T. Hornaday"
    },
    "https://www.gutenberg.org/files/205/205-h/205-h.htm": {
        "title": "Walden", 
        "author": "Henry David Thoreau"
    },
    "https://www.gutenberg.org/cache/epub/49270/pg49270-images.html": {
        "title": "African Nature Notes and Reminiscences",
        "author": "Frederick Courteney Selous"
    },
    "https://www.gutenberg.org/cache/epub/66546/pg66546-images.html": {
        "title": "The Ivory King: A Popular History of the Elephant and Its Allies",
        "author": "Charles Fredrick Holde"
    },
    "https://www.gutenberg.org/ebooks/73563": {
        "title": "Tropical Nature, and Other Essays", 
        "author": "Alfred Russel Wallace"
    },
    "https://www.gutenberg.org/cache/epub/44764/pg44764-images.html": {
        "title": "California: The Land of the Sun",
        "author": "Mary Austin"
    }
}


author_genders = {
    # Local file authors
    "Alan_Weisman": "M",
    "Amy Godine": "F",
    "Anne_Therese_Gennari": "F",
    "Barbara_Kingsolver": "F",
    "Camille_T_Dungy": "F", 
    "David_Goulson": "M",
    "Dina_Gilio_Whitaker": "F",
    "Greta_Thunberg": "F",
    "Jane_Goodall": "F",
    "Rachel_Carson": "F",
    "Val_Plumwood": "F",
    "_Robin_Wall_Kimmerer": "F",
    
    # Gutenberg authors
    "William T. Hornaday": "M",
    "Henry David Thoreau": "M",
    "Frederick Courteney Selous": "M", 
    "Charles Fredrick Holde": "M",
    "Alfred Russel Wallace": "M",
    "Mary Austin": "F"
}

Gutenberg Cleaner:

https://github.com/kiasar/gutenberg_cleaner

In [91]:
pip install gutenberg_cleaner

Note: you may need to restart the kernel to use updated packages.


In [92]:
from gutenberg_cleaner import simple_cleaner, super_cleaner
import re
import requests
from bs4 import BeautifulSoup
import numpy as np

In [93]:
def local_cleaner(text):
# removes excessive whitespace, keeps punctuation and sentences

    # Replace multiple spaces/tabs with single space
    text = ' '.join(text.split())
    
    # remove common file header artifacts if they appear at the start
    lines = text.split('\n')
    filtered_lines = []
    for line in lines:
        # Skip lines that are too short and look like metadata
        if len(line.strip()) < 5 and any(keyword in line.lower() for keyword in ['page', 'chapter', 'copyright']):
            continue
        filtered_lines.append(line.strip())
    
    return ' '.join(filtered_lines)

In [94]:
# Dictionary to store final results
final_dictionary = {}


# Process local files 
for text_file in txt_files:
    with open(text_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Use simple_cleaner local files, might be too harsh for local girlies but we'll see
    filtered_content = local_cleaner(content)
    
    words = filtered_content.split()
    # Skip files with 0 or 1 words after filtering
    if len(words) <= 1:
        print(f"Skipping {text_file}: only {len(words)} words after filtering")
        continue

    author_name = text_file.split(" conv.txt")[0]
    gender = author_genders.get(author_name, "Unknown")

    final_dictionary[author_name] = {
        "title": text_file.replace(" conv.txt", ""),
        "author": author_name,
        "words": text_selection,
        "gender": gender,
        "word_count": len(text_selection)
    }

# Process Gutenberg books
for url, meta in guten_dict.items():
    resp = requests.get(url)
    resp.raise_for_status()
    html = resp.text

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(['script', 'style', 'header', 'footer', 'nav']):
        tag.decompose()

    text = soup.get_text(separator=" ")
    clean_text = " ".join(text.split())

    # Just removes lines that are part of the Project Gutenberg header or footer
    # Doesnt go deeply in the text to remove other things like titles or footnotes or etc
    # return: str of the book without the lines that are part of the Project Gutenberg header and footer
    cleaned_text = simple_cleaner(clean_text)  # Basic cleaning
    # OR for more aggressive cleaning:
    # cleaned_text = super_cleaner(clean_text, min_token=10, max_token=500)
    
    words = cleaned_text.split()

    # Skip files with 0 or 1 words after filtering
    if len(words) <= 1:
        print(f"Skipping {meta['title']}: only {len(words)} words after filtering")
        continue

    author_name = meta["author"]
    gender = author_genders.get(author_name, "Unknown")

    final_dictionary[author_name] = {
        "title": meta["title"],
        "author": author_name,
        "words": text_selection,
        "gender": gender,
        "word_count": len(text_selection)
    }

Skipping The Extermination of the American Bison: only 0 words after filtering
Skipping The Ivory King: A Popular History of the Elephant and Its Allies: only 0 words after filtering
Skipping California: The Land of the Sun: only 0 words after filtering


In [96]:
print(f"Total authors after processing: {len(final_dictionary)}")

# Balance genders - only include authors with word_count > 1
valid_authors = {author: data for author, data in final_dictionary.items() if data['word_count'] > 1}

male_authors = [author for author, data in valid_authors.items() if data['gender'] == 'M']
female_authors = [author for author, data in valid_authors.items() if data['gender'] == 'F']

print(f"Valid male authors: {len(male_authors)}, Valid female authors: {len(female_authors)}")

if len(male_authors) == 0 or len(female_authors) == 0:
    balanced_data = valid_authors
else:
    min_count = min(len(male_authors), len(female_authors))

    balanced_male = np.random.choice(male_authors, min_count, replace=False)
    balanced_female = np.random.choice(female_authors, min_count, replace=False)

    balanced_authors = list(balanced_male) + list(balanced_female)
    balanced_data = {author: valid_authors[author] for author in balanced_authors}

    print(f"Total authors in balanced dataset: {len(balanced_data)}")
    print(f"Male authors: {len(balanced_male)}, Female authors: {len(balanced_female)}")

# Final check - remove any authors with 0 or 1 words from balanced_data (just to be safe)
final_balanced_data = {author: data for author, data in balanced_data.items() if data['word_count'] > 1}
print(f"Final balanced dataset size: {len(final_balanced_data)}")

Total authors after processing: 15
Valid male authors: 5, Valid female authors: 9
Total authors in balanced dataset: 10
Male authors: 5, Female authors: 5
Final balanced dataset size: 10


In [42]:
for author_name, author_data in final_balanced_data.items():
    words = author_data['words']
    title = author_data['title']
    
    # Print first 10 words to see what's getting through
    print(f"\n{title} - First 10 words:")
    print(' '.join(words[:10]))


African Nature Notes and Reminiscences - First 10 words:
that would not quietly withdraw, satisfied with the mercurial dose

David_Goulson - First 10 words:
‘convergent evolution’, whereby two unrelated creatures evolve to resemble one

Alan_Weisman - First 10 words:
ST. MARTIN'S PRESS NEW YORK THOMAS DUNNE BOOKS. An imprint

Walden - First 10 words:
was likely to hold together much longer, I was let

Tropical Nature, and Other Essays - First 10 words:
Tropical nature, and other essays by Alfred Russel Wallace |

Val_Plumwood - First 10 words:
subject’s universe is like the person-as-the-walled-moated-castle-town. It is under constant

Anne_Therese_Gennari - First 10 words:
influence gives us the power to help others and shift

Rachel_Carson - First 10 words:
the time to flare again into activity when spring awakens

Barbara_Kingsolver - First 10 words:
who winced, both at her foul language and at her

Dina_Gilio_Whitaker - First 10 words:
country, especially when accompanied by a lac

In [43]:
def print_top_ngrams(ngram_counts, ngram_label, book_title, top_ngrams=10):
    # Sliding window through the text based on the size and to extract the consecutive tokens
    top_items = sorted(ngram_counts.items(), key=lambda x: x[1], reverse=True)[:top_ngrams]
    
    print(f"\nTop {top_ngrams} {ngram_label} in {book_title}:")
    for ngram, frequency in top_items:
        print(f"  {ngram}: {frequency}")

In [44]:
for author_name, author_data in final_balanced_data.items():
    words = author_data['words']
    title = author_data['title']
    
    # Calculate n-grams, trigrams
    from collections import Counter
    trigrams = []
    for i in range(len(words) - 2):
        trigrams.append((words[i], words[i+1], words[i+2]))
    
    trigram_counts = Counter(trigrams)
    
    print_top_ngrams(trigram_counts, "trigrams", title)


Top 10 trigrams in African Nature Notes and Reminiscences:
  ('a', 'black', 'rhinoceros'): 11
  ('the', 'black', 'rhinoceros'): 11
  ('of', 'the', 'black'): 9
  ('black', 'rhinoceros', 'in'): 7
  ('by', 'a', 'black'): 5
  ('of', 'both', 'the'): 4
  ('both', 'the', 'black'): 4
  ('the', 'black', 'and'): 4
  ('black', 'and', 'the'): 4
  ('and', 'the', 'white'): 4

Top 10 trigrams in David_Goulson:
  ('to', 'ensure', 'that'): 5
  ('for', 'national', 'government'): 4
  ('Actions', 'for', 'all'): 4
  ('In', 'the', 'UK,'): 4
  ('of', 'pesticides', 'in'): 4
  ('fruit', 'and', 'veg.'): 3
  ('be', 'used', 'to'): 3
  ('Actions', 'for', 'national'): 3
  ('use', 'of', 'pesticides'): 3
  ('per', 'cent', 'of'): 3

Top 10 trigrams in Alan_Weisman:
  ('THE', 'WORLD', 'WITHOUT'): 2
  ('of', 'this', 'book'): 2
  ('shore', 'of', 'the'): 2
  ('the', 'Green', 'Line'): 2
  ('Greeks', 'to', 'the'): 2
  ('When', 'the', 'war'): 2
  ('by', 'Greek', 'Cypriots'): 2
  ('side', 'of', 'the'): 2
  ('the', 'size', 'o