# emma_ch1_metrics
Learning exercise inspired by *Natural Language Processing with Python* (Bird, Klein, Loper).  
Uses NLTK's built-in Gutenberg corpus (`austen-emma.txt`) and computes simple per-sentence metrics for **Chapter I**.

© 2025 Johennie Helton. Licensed under the [Apache License 2.0](http://www.apache.org/licenses/LICENSE-2.0).

In [1]:

import re
import csv
from statistics import mean
from collections import Counter
import pandas as pd

import nltk
from nltk.corpus import gutenberg
from nltk import word_tokenize, sent_tokenize, pos_tag

# --- One-time downloads (uncomment on first run) ---
#nltk.download('gutenberg')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')



In [2]:

def extract_chapter_1(raw_text: str) -> str:
    """
    Very simple extractor for the first chapter in Austen's Emma as distributed in NLTK.
    Looks for 'CHAPTER I' and 'CHAPTER II' boundaries.
    Falls back to a prefix if patterns aren't found (shouldn't happen for this file).
    """
    # Try classic Roman numerals or digits just in case
    pattern = r'CHAPTER\s+[IVXLC\d]+'
    matches = list(re.finditer(pattern, raw_text))
    if len(matches) >= 2:
        start = matches[0].end()
        end = matches[1].start()
        return raw_text[start:end].strip()
    # Fallback: take first ~6k characters after first CHAPTER occurrence
    first = re.search(pattern, raw_text)
    if first:
        start = first.end()
        return raw_text[start:start+6000].strip()
    # Last resort: just take the beginning chunk
    return raw_text[:6000].strip()

def is_alpha_token(t: str) -> bool:
    return re.fullmatch(r"[A-Za-z’-]+", t) is not None

def syllables_en(word: str) -> int:
    """
    Heuristic syllable counter (English). Good enough for an educational demo.
    """
    w = re.sub(r"[^a-z]", "", word.lower())
    if not w:
        return 0
    vowels = "aeiouy"
    count = 0
    prev_v = False
    for ch in w:
        is_v = ch in vowels
        if is_v and not prev_v:
            count += 1
        prev_v = is_v
    if w.endswith("e") and count > 1:
        count -= 1
    return max(count, 1)

def safe_div(n, d):
    return n / d if d else 0.0

def analyze_sentence(sent_text: str) -> dict:
    tokens = word_tokenize(sent_text)
    alpha = [t for t in tokens if is_alpha_token(t)]
    lower = [t.lower() for t in alpha]

    n_tokens = len(lower)
    n_types = len(set(lower))
    ttr = safe_div(n_types, n_tokens)
    avg_word_len = safe_div(sum(len(w) for w in lower), n_tokens)

    # POS tagging
    tags = pos_tag(lower)

    is_verb = lambda tag: tag.startswith("VB")
    is_adv  = lambda tag: tag.startswith("RB")
    is_adj  = lambda tag: tag.startswith("JJ")
    is_noun = lambda tag: tag.startswith("NN")

    n_verbs = sum(1 for _, t in tags if is_verb(t))
    n_advs  = sum(1 for _, t in tags if is_adv(t))
    n_adjs  = sum(1 for _, t in tags if is_adj(t))
    n_nouns = sum(1 for _, t in tags if is_noun(t))

    content_words = n_verbs + n_advs + n_adjs + n_nouns
    lexical_density = safe_div(content_words, n_tokens)

    # Syllables & a per-sentence Flesch-ish score (rough)
    syll_count = sum(syllables_en(w) for w in lower)
    flesch = 206.835 - 1.015 * n_tokens - 84.6 * safe_div(syll_count, n_tokens) if n_tokens else 0.0

    return {
        "sentence_text": sent_text.strip(),
        "n_tokens": n_tokens,
        "n_types": n_types,
        "ttr": round(ttr, 3),
        "n_nouns": n_nouns,
        "n_verbs": n_verbs,
        "n_adjs": n_adjs,
        "n_advs": n_advs,
        "lexical_density": round(lexical_density, 3),
        "avg_word_len": round(avg_word_len, 2),
        "syllables": syll_count,
        "flesch": round(flesch, 1),
        "chars": len(sent_text),
    }


In [3]:
raw = gutenberg.raw('austen-emma.txt')
ch1 = extract_chapter_1(raw)

# Sentences (punkt)
sentences = sent_tokenize(ch1)

rows = []
for idx, s in enumerate(sentences, 1):
    m = analyze_sentence(s)
    m["sent_index"] = idx
    rows.append(m)




In [4]:
df_metrics = pd.DataFrame(rows, columns=[
    "sent_index",
    "n_tokens", "n_types", "ttr",
    "n_nouns", "n_verbs", "n_adjs", "n_advs", "lexical_density",
    "avg_word_len", "syllables", "flesch", "chars",
    "sentence_text"
])

# Show the first few rows interactively
df_metrics.head()

Unnamed: 0,sent_index,n_tokens,n_types,ttr,n_nouns,n_verbs,n_adjs,n_advs,lexical_density,avg_word_len,syllables,flesch,chars,sentence_text
0,1,40,34,0.85,10,6,5,2,0.575,4.83,60,39.3,239,"Emma Woodhouse, handsome, clever, and rich, wi..."
1,2,32,27,0.844,8,3,4,2,0.531,4.62,48,47.5,186,She was the youngest of the two daughters of a...
2,3,41,33,0.805,8,8,5,3,0.585,4.39,59,43.5,223,Her mother\nhad died too long ago for her to h...
3,4,25,23,0.92,7,2,4,2,0.6,4.68,39,49.5,151,Sixteen years had Miss Taylor been in Mr. Wood...
4,5,8,8,1.0,2,1,0,1,0.5,4.5,13,61.2,51,Between _them_ it was more the intimacy\nof si...


In [5]:
    # Print a tiny summary
    print("=== Emma, Chapter I per-sentence metrics ===")
    print(f"Sentences: {len(rows)}")
    print(f"Avg tokens per sentence: {mean(r['n_tokens'] for r in rows):.2f}")
    print(f"Avg lexical density:      {mean(r['lexical_density'] for r in rows):.3f}")
    print(f"Avg Flesch score:         {mean(r['flesch'] for r in rows):.1f}")


=== Emma, Chapter I per-sentence metrics ===
Sentences: 162
Avg tokens per sentence: 20.36
Avg lexical density:      0.570
Avg Flesch score:         71.0
