In [1]:
import os
import numpy as np
import requests
from bs4 import BeautifulSoup
import re

txt_files = [
    "Alan_Weisman conv.txt",
    "Aldo Leopold conv.txt",
    "Amy Godine conv.txt",
    "Anne_Therese_Gennari conv.txt",
    "Barbara_Kingsolver conv.txt",
    "Bill_Mckibben conv.txt",
    "Camille_T_Dungy conv.txt",
    "Carolyn Merchant conv.txt",
    "David_Goulson conv.txt",
    "Dina_Gilio_Whitaker conv.txt",
    "Dr_Seuss conv.txt",
    "Greta_Thunberg conv.txt",
    "Jane_Goodall conv.txt",
    "Jon_Krakauer conv.txt",
    "Peter_Wohlleben conv.txt",
    "Rachel_Carson conv.txt",
    "Richard_Powers conv.txt",
    "Val_Plumwood conv.txt",
    "_OceanofPDF.com_Braiding_Sweetgrass_-_Robin_Wall_Kimmerer conv.txt"
]

guten_dict = {
    "https://www.gutenberg.org/files/17748/17748-h/17748-h.htm": {
        "title": "The Extermination of the American Bison",
        "author": "William T. Hornaday"
    },
    "https://www.gutenberg.org/files/205/205-h/205-h.htm": {
        "title": "Walden", 
        "author": "Henry David Thoreau"
    },
    "https://www.gutenberg.org/cache/epub/49270/pg49270-images.html": {
        "title": "African Nature Notes and Reminiscences",
        "author": "Frederick Courteney Selous"
    },
    "https://www.gutenberg.org/cache/epub/66546/pg66546-images.html": {
        "title": "The Ivory King: A Popular History of the Elephant and Its Allies",
        "author": "Charles Fredrick Holde"
    },
    "https://www.gutenberg.org/ebooks/73563": {
        "title": "Tropical Nature, and Other Essays", 
        "author": "Alfred Russel Wallace"
    },
    "https://www.gutenberg.org/cache/epub/44764/pg44764-images.html": {
        "title": "California: The Land of the Sun",
        "author": "Mary Austin"
    }
}


author_genders = {
    # Local file authors
    "Alan_Weisman": "M",
    "Aldo Leopold": "M", 
    "Amy Godine": "F",
    "Anne_Therese_Gennari": "F",
    "Barbara_Kingsolver": "F",
    "Bill_Mckibben": "M",
    "Camille_T_Dungy": "F", 
    "Carolyn Merchant": "F",
    "David_Goulson": "M",
    "Dina_Gilio_Whitaker": "F",
    "Dr_Seuss": "M",
    "Greta_Thunberg": "F",
    "Jane_Goodall": "F",
    "Jon_Krakauer": "M",
    "Peter_Wohlleben": "M",
    "Rachel_Carson": "F",
    "Richard_Powers": "M",
    "Val_Plumwood": "F",
    "_Robin_Wall_Kimmerer": "F",
    
    # Gutenberg authors
    "William T. Hornaday": "M",
    "Henry David Thoreau": "M",
    "Frederick Courteney Selous": "M", 
    "Charles Fredrick Holde": "M",
    "Alfred Russel Wallace": "M",
    "Mary Austin": "F"
}

Gutenberg Cleaner:

https://github.com/kiasar/gutenberg_cleaner

In [4]:
pip install gutenberg_cleaner

Note: you may need to restart the kernel to use updated packages.


In [5]:
from gutenberg_cleaner import simple_cleaner, super_cleaner
import re
import requests
from bs4 import BeautifulSoup
import numpy as np

In [6]:
# Dictionary to store final results
final_dictionary = {}

# Process local files 
for text_file in txt_files:
    with open(text_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Use simple_cleaner local files, might be too harsh for local girlies but we'll see
    filtered_content = simple_cleaner(content)
    
    words = filtered_content.split()
    # Skip files with 0 or 1 words after filtering
    if len(words) <= 1:
        print(f"Skipping {text_file}: only {len(words)} words after filtering")
        continue

    if len(words) > 5000:
        start_index = np.random.randint(0, len(words) - 5000)
        text_selection = words[start_index:start_index + 5000]
    else:
        text_selection = words

    author_name = text_file.split(" conv.txt")[0]
    gender = author_genders.get(author_name, "Unknown")

    final_dictionary[author_name] = {
        "title": text_file.replace(" conv.txt", ""),
        "author": author_name,
        "words": text_selection,
        "gender": gender,
        "word_count": len(text_selection)
    }

# Process Gutenberg books
for url, meta in guten_dict.items():
    resp = requests.get(url)
    resp.raise_for_status()
    html = resp.text

    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(['script', 'style', 'header', 'footer', 'nav']):
        tag.decompose()

    text = soup.get_text(separator=" ")
    clean_text = " ".join(text.split())

    # Just removes lines that are part of the Project Gutenberg header or footer
    # Doesnt go deeply in the text to remove other things like titles or footnotes or etc
    # return: str of the book without the lines that are part of the Project Gutenberg header and footer
    cleaned_text = simple_cleaner(clean_text)  # Basic cleaning
    # OR for more aggressive cleaning:
    # cleaned_text = super_cleaner(clean_text, min_token=10, max_token=500)
    
    words = cleaned_text.split()

    # Skip files with 0 or 1 words after filtering
    if len(words) <= 1:
        print(f"Skipping {meta['title']}: only {len(words)} words after filtering")
        continue

    if len(words) >= 5000:
        start_index = np.random.randint(0, len(words) - 5000)
        text_selection = words[start_index:start_index + 5000]
    else:
        text_selection = words

    author_name = meta["author"]
    gender = author_genders.get(author_name, "Unknown")

    final_dictionary[author_name] = {
        "title": meta["title"],
        "author": author_name,
        "words": text_selection,
        "gender": gender,
        "word_count": len(text_selection)
    }

Skipping Aldo Leopold conv.txt: only 1 words after filtering
Skipping Bill_Mckibben conv.txt: only 1 words after filtering
Skipping Carolyn Merchant conv.txt: only 0 words after filtering
Skipping Dr_Seuss conv.txt: only 1 words after filtering
Skipping Jon_Krakauer conv.txt: only 1 words after filtering
Skipping Peter_Wohlleben conv.txt: only 1 words after filtering
Skipping Richard_Powers conv.txt: only 1 words after filtering
Skipping The Extermination of the American Bison: only 0 words after filtering
Skipping The Ivory King: A Popular History of the Elephant and Its Allies: only 0 words after filtering
Skipping California: The Land of the Sun: only 0 words after filtering


In [8]:
print(f"Total authors after processing: {len(final_dictionary)}")

# Balance genders - only include authors with word_count > 1
valid_authors = {author: data for author, data in final_dictionary.items() if data['word_count'] > 1}

male_authors = [author for author, data in valid_authors.items() if data['gender'] == 'M']
female_authors = [author for author, data in valid_authors.items() if data['gender'] == 'F']

print(f"Valid male authors: {len(male_authors)}, Valid female authors: {len(female_authors)}")

if len(male_authors) == 0 or len(female_authors) == 0:
    print("Warning: Not enough authors of one gender to create balanced dataset")
    balanced_data = valid_authors
else:
    min_count = min(len(male_authors), len(female_authors))

    balanced_male = np.random.choice(male_authors, min_count, replace=False)
    balanced_female = np.random.choice(female_authors, min_count, replace=False)

    balanced_authors = list(balanced_male) + list(balanced_female)
    balanced_data = {author: valid_authors[author] for author in balanced_authors}

    print(f"Total authors in balanced dataset: {len(balanced_data)}")
    print(f"Male authors: {len(balanced_male)}, Female authors: {len(balanced_female)}")

# Final check - remove any authors with 0 or 1 words from balanced_data (just to be safe)
final_balanced_data = {author: data for author, data in balanced_data.items() if data['word_count'] > 1}
print(f"Final balanced dataset size: {len(final_balanced_data)}")

Total authors after processing: 15
Valid male authors: 5, Valid female authors: 9
Total authors in balanced dataset: 10
Male authors: 5, Female authors: 5
Final balanced dataset size: 10


In [9]:
for author_name, author_data in final_balanced_data.items():
    words = author_data['words']
    title = author_data['title']
    
    # Print first 10 words to see what's getting through
    print(f"\n{title} - First 10 words:")
    print(' '.join(words[:10]))


David_Goulson - First 10 words:
per cent. They are specialist predators of large, hairy caterpillars

Walden - First 10 words:
taken up their march, with faint wiry peep, single file

Tropical Nature, and Other Essays - First 10 words:
Tropical nature, and other essays by Alfred Russel Wallace |

Alan_Weisman - First 10 words:
decades and erupted viciously several times during the 1950s. A

African Nature Notes and Reminiscences - First 10 words:
had struck the little Creature on the Loins and broken

Dina_Gilio_Whitaker - First 10 words:
https://newsmaven.io/indiancountrytoday/archive/northern-cheyenne-sue-to-block-coal-mining-on-public-lands-hPtxUVh1_0GncLXXPSQ9MA. 32. Michelle Tolson, “Yakama Nation Fights for Nuclear Waste

Val_Plumwood - First 10 words:
beyond this and can never be food. Domination emerges in

Jane_Goodall - First 10 words:
get to park headquarters on the other side. We could

Amy Godine - First 10 words:
light, much too focused on the cold to marvel at

Greta_Th

In [11]:
def print_top_ngrams(ngram_counts, ngram_label, book_title, top_ngrams=10):
    # Sliding window through the text based on the size and to extract the consecutive tokens
    top_items = sorted(ngram_counts.items(), key=lambda x: x[1], reverse=True)[:top_ngrams]
    
    print(f"\nTop {top_ngrams} {ngram_label} in {book_title}:")
    for ngram, frequency in top_items:
        print(f"  {ngram}: {frequency}")

In [12]:
for author_name, author_data in final_balanced_data.items():
    words = author_data['words']
    title = author_data['title']
    
    # Calculate n-grams, trigrams
    from collections import Counter
    trigrams = []
    for i in range(len(words) - 2):
        trigrams.append((words[i], words[i+1], words[i+2]))
    
    trigram_counts = Counter(trigrams)
    
    print_top_ngrams(trigram_counts, "trigrams", title)


Top 10 trigrams in David_Goulson:
  ('one', 'of', 'the'): 5
  ('that', 'glyphosate', 'was'): 4
  ('the', 'effects', 'of'): 3
  ('the', 'number', 'of'): 3
  ('that', 'we', 'are'): 3
  ('there', 'is', 'a'): 3
  ('while', 'the', 'EPA'): 3
  ('the', 'safety', 'of'): 3
  ('in', 'favour', 'of'): 3
  ('non-Hodgkin’s', 'lymphoma', 'after'): 3

Top 10 trigrams in Walden:
  ('I', 'did', 'not'): 7
  ('of', 'the', 'pond,'): 5
  ('when', 'he', 'came'): 5
  ('that', 'it', 'was'): 4
  ('in', 'the', 'woods,'): 4
  ('that', 'I', 'could'): 3
  ('to', 'see', 'the'): 3
  ('I', 'do', 'not'): 3
  ('if', 'I', 'had'): 3
  ('part', 'of', 'the'): 3

Top 10 trigrams in Tropical Nature, and Other Essays:
  ('by', 'Alfred', 'Russel'): 4
  ('Alfred', 'Russel', 'Wallace'): 4
  ('Tropical', 'nature,', 'and'): 3
  ('nature,', 'and', 'other'): 3
  ('and', 'other', 'essays'): 3
  ('other', 'essays', 'by'): 2
  ('essays', 'by', 'Alfred'): 2
  ('1.2', 'MB', 'EPUB'): 2
  ('--', 'The', 'colours'): 2
  ('The', 'colours', 'o