# TCR Feature Extraction: Readability Formulas

This module provides functions to compute standard readability metrics for a given text using the `textstat` library. These metrics estimate how easy or difficult a text is to read, often expressed as a U.S. grade level or a score.

**Features computed:**
- SMOG Index
- Automated Readability Index (ARI)
- Dale-Chall Readability Score
- Linsear Write Formula
- Gunning-Fog Index
- Coleman-Liau Index
- Flesch Reading Ease
- Flesch Kincaid Grade Level

**Usage:**

In [4]:
import nltk
import textstat
import math
import statistics
from collections import Counter

# Ensure necessary NLTK data is available
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def get_readability_scores(text):
    """
    Compute core readability metrics.
    """
    return {
        'SMOG': textstat.smog_index(text),
        'ARI': textstat.automated_readability_index(text),
        'Dale-Chall': textstat.dale_chall_readability_score(text),
        'Linsear Write': textstat.linsear_write_formula(text),
        'Gunning-Fog': textstat.gunning_fog(text),
        'Coleman-Liau': textstat.coleman_liau_index(text),
        'Flesch Reading Ease': textstat.flesch_reading_ease(text),
        'Flesch-Kincaid Grade': textstat.flesch_kincaid_grade(text)
    }

def get_length_stats(text):
    """
    Average word length, sentence length, and their standard deviations.
    """
    words = nltk.word_tokenize(text)
    sentences = nltk.sent_tokenize(text)
    chars_per_word = [len(w) for w in words]
    words_per_sent = [len(nltk.word_tokenize(s)) for s in sentences]
    
    return {
        'AvgChars/Word': statistics.mean(chars_per_word),
        'StdChars/Word': statistics.pstdev(chars_per_word),
        'AvgWords/Sentence': statistics.mean(words_per_sent),
        'StdWords/Sentence': statistics.pstdev(words_per_sent)
    }

def get_hapax_dislegomena(text):
    """
    Count words occurring exactly once (hapax) or twice (dis legomena).
    """
    tokens = nltk.word_tokenize(text.lower())
    freqs = Counter(tokens)
    return {
        'HapaxLegomena': sum(1 for c in freqs.values() if c == 1),
        'DisLegomena': sum(1 for c in freqs.values() if c == 2)
    }

def get_entropy_perplexity(text):
    """
    Shannon entropy and derived perplexity of the token distribution.
    """
    tokens = nltk.word_tokenize(text.lower())
    freqs = Counter(tokens)
    total = len(tokens)
    entropy = -sum((c/total) * math.log2(c/total) for c in freqs.values())
    perplexity = 2**entropy
    return {
        'Entropy': entropy,
        'Perplexity': perplexity
    }

def get_lexical_diversity(text, window_size=100):
    """
    Compute TTR variations (MATTR) and MTLD.
    """
    tokens = nltk.word_tokenize(text.lower())
    types = set(tokens)
    ttr = len(types)/len(tokens)

    # Moving-Average TTR (MATTR)
    mattr_values = []
    for i in range(len(tokens) - window_size + 1):
        window = tokens[i:i+window_size]
        mattr_values.append(len(set(window))/window_size)
    mattr = statistics.mean(mattr_values) if mattr_values else 0

    # MTLD (approximate)
    def mtld_calc(tokens, threshold=0.72):
        factors = 0
        types_set = set()
        token_count = 0
        for t in tokens:
            types_set.add(t)
            token_count += 1
            if len(types_set)/token_count <= threshold:
                factors += 1
                types_set.clear()
                token_count = 0
        if token_count > 0:
            factors += (1 - (len(types_set)/token_count - threshold)) / (1 - threshold)
        return len(tokens)/factors if factors else 0

    mtld = mtld_calc(tokens)
    
    return {
        'TTR': ttr,
        'MATTR': mattr,
        'MTLD': mtld
    }

def get_functional_diversity(text):
    """
    Ratio of content words to function words via POS tags.
    """
    tokens = nltk.word_tokenize(text)
    tags = nltk.pos_tag(tokens)
    function_tags = {'DT','IN','CC','TO','PRP','PRP$','MD','RB','WRB','WP','WP$'}
    func = sum(1 for _, t in tags if t in function_tags)
    content = sum(1 for _, t in tags if t not in function_tags)
    return {'FunctionalDiversity': content/(func if func else 1)}

def extract_tcr_features(text):
    """
    Gather all TCR features into a single dictionary.
    """
    features = {}
    features.update(get_readability_scores(text))
    features.update(get_length_stats(text))
    features.update(get_hapax_dislegomena(text))
    features.update(get_entropy_perplexity(text))
    features.update(get_lexical_diversity(text))
    features.update(get_functional_diversity(text))
    return features

# Example usage
sample_text = "The quick brown fox jumps over the lazy dog. It served as a pangram widely used for testing fonts."
features = extract_tcr_features(sample_text)
print(features)


{'SMOG': 3.1291, 'ARI': 3.151578947368421, 'Dale-Chall': 9.094015789473683, 'Linsear Write': 3.75, 'Gunning-Fog': 3.8000000000000003, 'Coleman-Liau': 4.894736842105264, 'Flesch Reading Ease': 90.32934210526317, 'Flesch-Kincaid Grade': 3.0202631578947354, 'AvgChars/Word': 3.8095238095238093, 'StdChars/Word': 1.8157786633273922, 'AvgWords/Sentence': 10.5, 'StdWords/Sentence': 0.5, 'HapaxLegomena': 17, 'DisLegomena': 2, 'Entropy': 4.20184123230257, 'Perplexity': 18.402644982465112, 'TTR': 0.9047619047619048, 'MATTR': 0, 'MTLD': 7.212616822429907, 'FunctionalDiversity': 1.625}


[nltk_data] Downloading package punkt to /Users/sali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sali/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [1]:
# Install these first if you don’t have them:
# pip install gensim transformers openai torch
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import openai
from gensim.models import KeyedVectors
import os

from dotenv import load_dotenv
load_dotenv()  # Load environment variables from .env file


# # 1) Word2Vec (via gensim)
def get_word2vec_embedding(text, w2v_path="../embedding-models/GoogleNews-vectors-negative300.bin", size=300):
    """
    - text: string
    - w2v_path: path to a .bin or .kv model file (e.g. GoogleNews-vectors-negative300.bin)
    Returns the mean of the token embeddings.
    """
    # load once (outside function in real code)
    w2v = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
    tokens = text.lower().split()
    vecs = [w2v[word] for word in tokens if word in w2v]
    if not vecs:
        return np.zeros(size)
    return np.mean(vecs, axis=0)


# 3) BERT (via HuggingFace Transformers)

def get_bert_embedding(text, model_name='bert-base-uncased', layer=-2):
    """
    Returns the mean of the last hidden states from one BERT layer.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    inputs = tok(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        # hidden_states is tuple: one tensor per layer
        hidden = outputs.hidden_states[layer]  # e.g. second-to-last
        # [batch, seq_len, hidden_dim]
        return hidden.mean(dim=1).squeeze().numpy()


# 4) OpenAI embeddings

def get_openai_embedding(text, model="text-embedding-3-small"):
    """
    Returns the OpenAI embedding for the whole text.
    """
    
    client = openai.Client(api_key=os.getenv('OPENAI_API_KEY'))
    resp = client.embeddings.create(
        input=text,
        model=model
    )
    return np.array(resp.data[0].embedding)  


# # ===== Example usage =====


sample = "The quick brown fox jumps over the lazy dog."

# Word2Vec (download Google News binary, set path)
# w2v_vec = get_word2vec_embedding(sample, '/path/to/GoogleNews-vectors-negative300.bin')

# GloVe (convert glove txt to word2vec format or use no_header=True)
# glove_vec = get_word2vec_embedding(sample)
# print("GloVe embedding shape:", glove_vec.shape)

# # BERT
# bert_vec = get_bert_embedding(sample)
# print("BERT embedding shape:", bert_vec.shape)

# OpenAI (set your OPENAI_API_KEY env var or pass directly)
openai_vec = get_openai_embedding(sample)
print("OpenAI embedding length:", len(openai_vec))


OpenAI embedding length: 1536
