# Ngrams

In the generating and inspection notebooks, we simply count the number of times a given proverb is repeated. One of the things we notice is that there is a lot of repetition of certain proverbs, usually with fairly small variations that do not change the meaning of the proverb.

One step in looking for the commonalities between proverbs is to look for n-grams. An n-gram is a contiguous sequence of *n* items (here, words) from a given sample of text or speech. N-grams are widely used in natural language processing for tasks like text prediction, language modeling, and machine translation.

Here we are looking to see if we can map the underlying structure of the proverbs by looking for common ngrams.

In [3]:
# IMPORTS
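# Requires the NLTK Brown corpus and Punkt tokenizer data:
#   nltk.download('brown'); nltk.download('punkt')
# (recent NLTK releases may ask for 'punkt_tab' instead of 'punkt')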
import string
import json
import nltk
from nltk.util import ngrams
from nltk.corpus import brown
from difflib import SequenceMatcher
from nltk.tokenize import word_tokenize
from collections import Counter

# Load the JSON responses file. The encoding is pinned to UTF-8 so that
# decoding does not depend on the platform's default locale encoding.
# NOTE(review): this cell only binds `data`; the single text string that a
# later cell reads as `one_big_string` is not built here -- confirm where it
# is assembled.
with open('responses/r-di-5000-4.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# FUNCTIONS

# Translation table that deletes every ASCII punctuation character. Built once
# instead of on each call, because preprocess() is invoked once per corpus
# token when the reference set is built.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalize text by lowercasing it and deleting ASCII punctuation.

    Args:
        text: The string to normalize.

    Returns:
        ``text`` lowercased, with every character listed in
        ``string.punctuation`` removed. Whitespace is left untouched, so a
        punctuation-only input becomes the empty string.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

def get_similarity(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b`` (0.0 to 1.0).

    1.0 means the two sequences are identical; 0.0 means they share nothing.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def build_reference_set(n_min=8, n_max=20):
    """Build the set of word n-grams found in the Brown corpus.

    Every Brown token is normalized with ``preprocess``. Tokens that normalize
    to the empty string (punctuation-only tokens) are dropped: if they were
    kept, each one would leave an empty slot in the word list and the joined
    n-grams would contain doubled spaces, which a normalized candidate phrase
    such as "a bird in the hand" would not match exactly.

    Args:
        n_min: Smallest n-gram length, in words (inclusive).
        n_max: Largest n-gram length, in words (inclusive).

    Returns:
        A set of strings, each one the words of an n-gram joined by single
        spaces.
    """
    print("Extracting reference n-grams from Brown Corpus (this may take a minute)...")
    normalized = (preprocess(token) for token in brown.words())
    # Keep only tokens that still have content after punctuation removal
    words = [word for word in normalized if word]

    reference_ngrams = set()
    # Only the requested length range is materialized, to limit memory use
    for n in range(n_min, n_max + 1):
        reference_ngrams.update(" ".join(gram) for gram in ngrams(words, n))
    return reference_ngrams

def analyze_candidates(candidates, reference_set, threshold=0.85):
    """Classify each candidate phrase against a set of reference n-grams.

    Each candidate is normalized with ``preprocess`` and then labelled:

    * ``"EXTANT"`` (score 1.0) if the normalized phrase is in ``reference_set``;
    * ``"EXTANT (VARIANT)"`` if some reference string of similar length reaches
      a similarity of at least ``threshold``;
    * ``"NOVEL"`` otherwise, with the best similarity seen as its score.

    Args:
        candidates: Iterable of phrases (strings) to classify.
        reference_set: Set of normalized reference strings.
        threshold: Minimum similarity ratio for a fuzzy match.

    Returns:
        A list with one dict per candidate, in input order, holding the keys
        ``"phrase"`` (the original text), ``"status"`` and ``"score"``.
    """
    report = []
    for phrase in candidates:
        normalized = preprocess(phrase)

        if normalized in reference_set:
            # Exact hit: constant-time set lookup, no fuzzy scan needed
            status, score = "EXTANT", 1.0
        else:
            # Fuzzy scan over the whole reference set (expensive).
            # Note: In a large-scale test, use a Vector DB or Keyword filter here.
            score = 0.0
            target_len = len(normalized)
            for known in reference_set:
                # Rough length filter: skip strings 15+ characters longer/shorter
                if abs(len(known) - target_len) >= 15:
                    continue
                score = max(score, get_similarity(normalized, known))
                if score >= threshold:
                    # Good enough; stop at the first match that clears the bar
                    break
            status = "EXTANT (VARIANT)" if score >= threshold else "NOVEL"

        report.append({"phrase": phrase, "status": status, "score": score})

    return report

# --- EXECUTION ---
# Hand-written sample phrases standing in for n-grams extracted from LLM
# output; the trailing comment on each records the kind of phrase it is.
extracted_ngrams = [
    "A bird in the hand is worth two in the bush",  # Traditional
    "The data is the new oil of the digital economy",  # Modern / Novel
    "A meeting that could have been an email is a thief of time",  # Modern / Novel
    "All that glitters is not gold",  # Traditional
]

# Build the Brown reference n-grams (8 to 20 words), then classify each sample
ref_set = build_reference_set(n_min=8, n_max=20)
analysis = analyze_candidates(extracted_ngrams, ref_set)

# Two-column report: the phrase (cut to 44 characters) and its status
print("\n" + "=" * 60)
print(f"{'CANDIDATE PHRASE':<45} | {'STATUS':<10}")
print("=" * 60)
for res in analysis:
    print(f"{res['phrase'][:44]:<45} | {res['status']}")

In [11]:
def preprocess_text(text):
    """Lowercase and tokenize ``text``, keeping only purely alphanumeric tokens.

    NOTE: ``str.isalnum`` rejects any token that contains a non-alphanumeric
    character, so besides punctuation marks this also discards tokens that
    carry an apostrophe or a hyphen.

    Args:
        text: The raw text to tokenize.

    Returns:
        A list of lowercase tokens, in their original order.
    """
    lowered = text.lower()
    # A token made only of alphanumeric characters can never be a run of
    # punctuation, so this single check also covers punctuation removal.
    return [token for token in word_tokenize(lowered) if token.isalnum()]

# Tokenize the whole corpus text once; the n-gram cells read this list.
# NOTE(review): `one_big_string` is not assigned in any cell shown here --
# confirm which cell builds it before running this one.
word_tokens = preprocess_text(one_big_string)
print(f"Total number of tokens after preprocessing: {len(word_tokens)}")

Total number of tokens after preprocessing: 4986


In [22]:
def get_all_ngrams_in_range(tokens, min_n, max_n):
    """Collect every n-gram of ``tokens`` for each n from ``min_n`` to ``max_n``.

    Args:
        tokens: Sequence of tokens to slide the n-gram window over.
        min_n: Smallest n-gram length (inclusive).
        max_n: Largest n-gram length (inclusive).

    Returns:
        A list of tuples, grouped by length: all ``min_n``-grams first, then
        the next length, and so on up to ``max_n``.
    """
    return [
        gram
        for size in range(min_n, max_n + 1)
        for gram in ngrams(tokens, size)  # yields tuples of `size` tokens
    ]

In [23]:
# N-gram length bounds, in tokens. Provisional values, to be tuned later.
MIN_N, MAX_N = 8, 20

# Every n-gram of the corpus whose length lies within the bounds
long_ngrams = get_all_ngrams_in_range(word_tokens, MIN_N, MAX_N)

# Each n-gram is a tuple of tokens; printing its items separated by spaces
# shows it as a phrase. The first entry is a MIN_N-gram because the list is
# ordered by length.
print(f"\nExample of a generated {MIN_N}-gram (first one):")
print(*long_ngrams[0])


Example of a generated 8-gram (first one):
do compare your to someone else highlight reel


In [24]:
# Tally how often each distinct n-gram (a tuple of tokens) occurs.
# All lengths share one counter, so overlapping windows of the same repeated
# passage are counted as separate entries.
ngram_counts = Counter()
ngram_counts.update(long_ngrams)

# How many of the highest-ranked n-grams to report
TOP_K = 10

# (n-gram, count) pairs, most frequent first
most_common_ngrams = ngram_counts.most_common(TOP_K)

print(f"\n--- Top {TOP_K} Most Frequent N-grams ({MIN_N} to {MAX_N} tokens) ---")
for n_gram_tuple, count in most_common_ngrams:
    n_length = len(n_gram_tuple)
    # Show the token tuple as a readable phrase
    phrase = ' '.join(n_gram_tuple)
    print(f"[{n_length}-gram, Count: {count}]: \"{phrase}\"")


--- Top 10 Most Frequent N-grams (8 to 20 tokens) ---
[8-gram, Count: 15]: "always believe that something wonderful is about to"
[8-gram, Count: 15]: "believe that something wonderful is about to happen"
[9-gram, Count: 15]: "always believe that something wonderful is about to happen"
[8-gram, Count: 14]: "keep your face always toward the sunshine and"
[8-gram, Count: 14]: "your face always toward the sunshine and shadows"
[8-gram, Count: 14]: "face always toward the sunshine and shadows will"
[8-gram, Count: 14]: "always toward the sunshine and shadows will fall"
[8-gram, Count: 14]: "toward the sunshine and shadows will fall behind"
[8-gram, Count: 14]: "the sunshine and shadows will fall behind you"
[8-gram, Count: 14]: "happiness is not the absence of problems it"
