# Ngrams Vectorized

This notebook is still looking for common n-grams, but aims to avoid the 40-minute wait of the NLTK approach. We will need something faster if we are eventually going to validate our results against the Common Crawl dataset.

In [None]:
# IMPORTS
# Requires Brown and Punkt corpora from NLTK: nltk.download('pkg_name')

import string
import json
from nltk.corpus import brown
from nltk.util import ngrams
from nltk.corpus import brown
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer, util
import torch
# from nltk.tokenize import word_tokenize
# from collections import Counter

In [None]:

# Embedding model
# 'all-mpnet-base-v2' is the checkpoint picked here as a compromise between
# accuracy and speed.
EMBEDDING_MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [2]:

# Reference corpus (the "known" world)
# Every Brown sentence arrives as a list of tokens; glue each one back into a
# single space-separated string. Brown is a toy corpus, but it is enough to
# illustrate the concept.
sentences = list(map(" ".join, brown.sents()))

print(f"Encoding {len(sentences)} reference sentences... (takes ~30s on CPU)")
# One embedding per reference sentence, returned as a tensor.
corpus_embeddings = model.encode(sentences, convert_to_tensor=True)

Encoding 57340 reference sentences... (takes ~30s on CPU)


In [None]:
# Derive Possible LLM Candidates
# These are the phrases you extracted using your "Anchor Probing" script.
llm_candidates = [
    "A bird in the hand is worth two in the bush", # Traditional
    "The data is the new oil of the digital age",    # Modern Maxim (Novel)
    "A bug in production is a thief of sleep",     # Modern Maxim (Novel)
    "Don't count your chickens before they hatch"   # Traditional
]
candidate_embeddings = model.encode(llm_candidates, convert_to_tensor=True)

# Automate the Comparison
# cosine_scores holds one row per LLM candidate and one column per reference
# sentence.
cosine_scores = util.cos_sim(candidate_embeddings, corpus_embeddings)

# Classification cut-offs applied to the best similarity of each candidate:
#   score >  0.90          -> "OLD TRUTH"   (exact / near-exact match: extant & traditional)
#   0.75 < score <= 0.90   -> "VARIANT"     (variation / anti-proverb)
#   score <= 0.75          -> "NOVEL MAXIM" (semantic novelty, potentially a "new" proverb)
EXACT_THRESHOLD = 0.90
VARIANT_THRESHOLD = 0.75

# Nearest neighbour in the human corpus for every candidate: the row-wise
# maximum of the similarity matrix, taken in a single call.
best_scores = cosine_scores.max(dim=1).values

print("\n" + "="*70)
print(f"{'LLM CANDIDATE':<45} | {'SIMILARITY':<10} | {'STATUS'}")
print("="*70)

for candidate, score in zip(llm_candidates, best_scores.tolist()):
    if score > EXACT_THRESHOLD:
        status = "OLD TRUTH"
    elif score > VARIANT_THRESHOLD:
        status = "VARIANT"
    else:
        status = "NOVEL MAXIM"

    # The phrase is cut to 44 characters so the table columns stay aligned.
    print(f"{candidate[:44]:<45} | {score:.4f}     | {status}")

In [None]:
# LOAD DATA
# The encoding is named explicitly so that decoding the JSON file does not
# depend on the platform's default locale encoding.
with open('responses/r-di-5000-4.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# FUNCTIONS

# Translation table that deletes every ASCII punctuation character. It is
# built once here rather than on every call, because preprocess() runs once
# per token when the reference set is built.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def preprocess(text):
    """Normalize text by lowercasing it and deleting ASCII punctuation.

    Args:
        text (str): Raw phrase or token.

    Returns:
        str: Lowercased copy of ``text`` with every character listed in
        ``string.punctuation`` removed. Whitespace is left untouched, so
        removing a free-standing punctuation mark can leave two adjacent
        spaces; a string made up solely of punctuation becomes ``""``.
    """
    # NOTE: string.punctuation is ASCII-only; typographic quotes and dashes
    # pass through unchanged.
    return text.lower().translate(_PUNCTUATION_TABLE)

def get_similarity(a, b):
    """Return the difflib similarity ratio of two strings.

    The score is ``SequenceMatcher.ratio()``: a float from 0.0 (nothing in
    common) to 1.0 (identical sequences).
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def build_reference_set(n_min=8, n_max=20):
    """Collect every word n-gram of the Brown corpus for n in [n_min, n_max].

    Each token is normalized with ``preprocess``. Tokens that normalize to
    the empty string (tokens consisting only of punctuation) are dropped
    before the n-grams are formed: keeping them would leave empty slots
    inside the n-grams, hence doubled spaces in the joined strings, which a
    normally spaced candidate phrase cannot match exactly.

    Args:
        n_min (int): Smallest n-gram length, in words (inclusive).
        n_max (int): Largest n-gram length, in words (inclusive).

    Returns:
        set[str]: The n-grams, each joined into one single-space-separated
        string. Empty when ``n_min > n_max``.
    """
    print("Extracting reference n-grams from Brown Corpus (this may take a minute)...")
    normalized = (preprocess(token) for token in brown.words())
    words = [word for word in normalized if word]
    reference_ngrams = set()

    # We focus on a specific range to save memory
    for n in range(n_min, n_max + 1):
        reference_ngrams.update(" ".join(gram) for gram in ngrams(words, n))
    return reference_ngrams

def analyze_candidates(candidates, reference_set, threshold=0.85, max_len_diff=15):
    """Classify each candidate phrase as extant, a variant, or novel.

    A candidate is normalized with ``preprocess`` and looked up in
    ``reference_set``. On a miss it is compared against every reference
    string of similar length with ``SequenceMatcher.ratio()``, the same
    metric ``get_similarity`` returns.

    Args:
        candidates: Iterable of raw candidate phrases.
        reference_set: Collection of normalized reference strings; it is
            used for membership tests and iterated over.
        threshold (float): Minimum ratio for a candidate to count as
            "EXTANT (VARIANT)". The scan stops at the first reference that
            reaches it, so the score reported for a variant is the first
            sufficient score found, not necessarily the highest available.
        max_len_diff (int): References whose character length differs from
            the normalized candidate's by this much or more are skipped.

    Returns:
        list[dict]: One dict per candidate, in input order, with the keys
        ``"phrase"`` (the original candidate), ``"status"`` (``"EXTANT"``,
        ``"EXTANT (VARIANT)"`` or ``"NOVEL"``) and ``"score"`` (1.0 for an
        exact match, otherwise the best ratio seen).
    """
    results = []
    for candidate in candidates:
        prep_cand = preprocess(candidate)

        # Check for Exact Match (constant-time when reference_set is a set)
        if prep_cand in reference_set:
            results.append({"phrase": candidate, "status": "EXTANT", "score": 1.0})
            continue

        # Check for Fuzzy Match (more computationally expensive)
        # Note: In a large-scale test, use a Vector DB or Keyword filter here.
        max_sim = 0.0
        for ref in reference_set:
            # Rough length filter: only look at references of similar length.
            if abs(len(ref) - len(prep_cand)) >= max_len_diff:
                continue

            matcher = SequenceMatcher(None, prep_cand, ref)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(). When either bound cannot beat the best score so far,
            # the expensive ratio() could not change max_sim, so it is skipped.
            if matcher.real_quick_ratio() > max_sim and matcher.quick_ratio() > max_sim:
                sim = matcher.ratio()
                if sim > max_sim:
                    max_sim = sim

            # Good enough: stop scanning the remaining references.
            if max_sim >= threshold:
                break

        status = "EXTANT (VARIANT)" if max_sim >= threshold else "NOVEL"
        results.append({"phrase": candidate, "status": status, "score": max_sim})

    return results

In [5]:
# RUN
# Stand-in for the n-grams that the LLM probing step would produce
extracted_ngrams = [
    "A bird in the hand is worth two in the bush", # Traditional
    "The data is the new oil of the digital economy", # Modern / Novel
    "A meeting that could have been an email is a thief of time", # Modern / Novel
    "All that glitters is not gold" # Traditional
]

# Build the Brown reference set for 8- to 20-word n-grams, then classify
# every candidate against it.
ref_set = build_reference_set(n_min=8, n_max=20)
analysis = analyze_candidates(extracted_ngrams, ref_set)

# Two-column report: the phrase (cut to 44 characters) and its status.
rule = "=" * 60
print("\n" + rule)
print(f"{'CANDIDATE PHRASE':<45} | {'STATUS':<10}")
print(rule)
for entry in analysis:
    shown_phrase = entry['phrase'][:44]
    print(f"{shown_phrase:<45} | {entry['status']}")

Extracting reference n-grams from Brown Corpus (this may take a minute)...

CANDIDATE PHRASE                              | STATUS    
A bird in the hand is worth two in the bush   | NOVEL
The data is the new oil of the digital econo  | NOVEL
A meeting that could have been an email is a  | NOVEL
All that glitters is not gold                 | NOVEL


In [None]:
# Let's save the Brown reference set
# as a JSON for easier readability.
# The iteration order of a set of strings can differ from one interpreter run
# to the next, so the n-grams are sorted before writing: the file then comes
# out identical on every run and is easier to read and diff.
with open('ngrams-brown-nltk.json', 'w', encoding='utf-8') as f:
    json.dump(sorted(ref_set), f)