# 5 - Ground Truth Verification

In [1]:
# IMPORTS, KEYS, MODELS
from serpapi import GoogleSearch
import json
from collections import Counter
from sentence_transformers import SentenceTransformer, util

# Load API key
# The context manager closes the key file as soon as it has been parsed,
# instead of leaving the handle open for the life of the kernel.
with open("../apikeys.json", encoding="utf-8") as key_file:
    keys = json.load(key_file)
SERP_API_KEY = keys["SerpApi"]["key"]

# Load the sentence-embedding model from the local copy under ../models
# (all-mpnet-base-v2, going by the directory name).
# Alternative the author notes as fast and accurate:
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('../models/BERT-all-mpnet-base-v2')

# SAVE MODEL (for offline use)
# model.save('../models/BERT-all-mpnet-base-v2')
# THEN TO LOAD IT AGAIN:
# model = SentenceTransformer(modelPath)

## Loading Results & Getting Rid of Duplicates

In [6]:
# Load file with texts & extract just the text field.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale encoding.
with open('outputs/di-hermes-5000-1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Each entry is indexed by its 'text' key; every other field is ignored here.
texts = [entry['text'] for entry in data]

# With the texts loaded, the first step is to check
# for exact duplicates and to reduce the number of texts
# before using more computationally expensive similarity checks.

# COUNT OCCURRENCES
counts = Counter(texts)

# FIND PROVERBS THAT APPEARED MORE THAN ONCE
duplicates = {text: count for text, count in counts.items() if count > 1}
print(f"Unique Proverbs: {len(counts)}")
print(f"Total Repetitions: {sum(duplicates.values()) - len(duplicates)}")

# REMOVE EXACT DUPLICATES & SORT BY LENGTH
# sorting = shortest version is "anchor" for consolidation
# list(counts) yields each distinct text once, in first-seen order, and
# sorted() is stable, so texts of equal length keep a reproducible order.
# set(texts) gave an order that can change between kernel sessions (string
# hashing is randomized per process), which changed the anchors picked later.
deduped = list(counts)
uniques = sorted(deduped, key=len)
print(f"Deduped Unique Proverbs: {len(uniques)}")
print(f"Number of duplicated proverbs: {len(duplicates)}")
# Guard the average: with no repeated text the divisor would be zero.
if duplicates:
    print(f"Average number of repetitions proverb: {sum(duplicates.values()) / len(duplicates):.2f}")
else:
    print("Average number of repetitions proverb: n/a (no duplicates)")

# TO SEE REPETITIONS
print("\nMost Frequent Repetitions:")
for text, count in sorted(duplicates.items(), key=lambda x: x[1], reverse=True)[0:10]:
    print(f"[{count}x] {text}")

Unique Proverbs: 2453
Total Repetitions: 2547
Deduped Unique Proverbs: 2453
Number of duplicated proverbs: 141
Average number of repetitions proverb: 19.06

Most Frequent Repetitions:
[714x] Comparison is the thief of joy.
[617x] Don't believe everything you read on the internet just because there's a picture with a quote next to it.
[340x] Don't compare your behind-the-scenes to everyone else's highlight reel.
[209x] Don't believe everything you read on the internet.
[78x] Don't compare your behind-the-scenes to someone else's highlight reel.
[32x] In the world of the internet, you can be anything you want - it's up to you to decide what that is.
[28x] Don't compare your behind-the-scenes footage with everyone else's highlight reel.
[22x] Comparison is the thief of joy in the digital age.
[21x] comparison is the thief of joy
[20x] Comparison is the thief of joy; don't compare your behind-the-scenes to everyone else's highlight reel.


In [7]:
# NOTA BENE: the original version of this cell was reported to take
# anywhere from 8 to 30 minutes, because it re-encoded the whole accepted
# list on every iteration. Every text is now encoded exactly once, up front.

# EMPTY LIST TO HOLD LOOP OUTPUT
unique_proverbs = []

# THRESHOLD FOR SIMILARITY (lower = more aggressive)
threshold = 0.50  # Aggressive grouping

if uniques:
    # One batched call; row i of the result is the embedding of uniques[i].
    all_embs = model.encode(uniques, convert_to_tensor=True)

    # Row indices (into all_embs) of the proverbs accepted so far.
    kept_indices = []

    for idx, current in enumerate(uniques):
        if kept_indices:
            # Compare current sentence against our accepted unique list
            scores = util.cos_sim(all_embs[idx], all_embs[kept_indices])[0]

            # Too close to something we already have: treat it as a variant.
            # (Same greedy rule as before; scores sitting right at the
            # threshold may differ by floating-point noise from one-at-a-time
            # encoding.)
            if scores.max().item() >= threshold:
                continue

        # If it's not similar to anything we already have, add it
        kept_indices.append(idx)
        unique_proverbs.append(current)

# Display results
print(f"--- Original Count: {len(uniques)} | Final Count: {len(unique_proverbs)} ---")
for p in unique_proverbs:
    # The check mark was mis-encoded in the source; restored to U+2713.
    print(f"✓ {p}")

KeyboardInterrupt: 

In [None]:
# Save the unique proverbs to a new file, one proverb per line.
# UTF-8 is requested explicitly so non-ASCII characters in a proverb are
# written the same way regardless of the platform's default encoding.
with open('outputs/uniques-hermes-1.txt', 'w', encoding='utf-8') as f:
    f.writelines(p + '\n' for p in unique_proverbs)

In [7]:
# Peek at entries 10 through 19 of the consolidated list
for position in range(10, min(20, len(unique_proverbs))):
    proverb = unique_proverbs[position]
    print(proverb)

This Is The Way.
Live laugh love.
It is what it is
Die mad about it.
Bros before hoes.
The cake is a lie.
Slay do not splay.
Trust, but verify.
Go big or go home.
Birds aren’t real.


## Search / Validate

Having whittled down the responses from the LLM to a manageable number of unique proverbs, we can now use SerpApi to search for each proverb and see if there are any results. If there are results, we can assume that the proverb is extant, that is, already in circulation on the web.

In [None]:
def _search_hit_count(params):
    """
    Run one SerpApi Google search and return its reported total result count.

    Returns 0 when the response has no total_results field.
    Raises RuntimeError when the response carries an "error" and no
    "search_information" section.
    """
    results = GoogleSearch(params).get_dict()

    # NOTE(review): assumes a failed request (bad key, exhausted credits)
    # comes back with "error" and no "search_information", while an empty
    # result page still includes "search_information" -- confirm against the
    # SerpApi docs. Without this check a failed request is counted as zero
    # hits and the phrase is reported as a novel maxim.
    if "error" in results and "search_information" not in results:
        raise RuntimeError(
            f"SerpApi search failed for {params.get('q')!r}: {results['error']}"
        )

    # Extract total results (Google hit count)
    return results.get("search_information", {}).get("total_results", 0)


def verify_extant(phrase, authority_sites=None, max_novel_hits=1000):
    """
    Checks the web for the phrase and looks for 'canonical' markers.

    Two searches are issued through SerpApi: an exact-phrase search over the
    whole web, and the same phrase restricted to the authority sites.

    Parameters:
        phrase: the text to look up; it is wrapped in double quotes for an
            exact-phrase match.
        authority_sites: domains treated as reference sources. Defaults to
            oxfordreference.com, phrases.org.uk and theidioms.com. An empty
            sequence skips the site-restricted search.
        max_novel_hits: a phrase with fewer total hits than this, and no
            authority hits, is labelled novel. Defaults to 1000.

    Returns:
        dict with keys "verdict", "hits" (total hit count of the open-web
        search) and "details".

    Raises:
        RuntimeError: if either search comes back as an error payload.
    """
    if authority_sites is None:
        authority_sites = ["oxfordreference.com", "phrases.org.uk", "theidioms.com"]

    # TOTAL HIT COUNT CHECK
    # (uses exact phrase match)
    params = {
        "q": f'"{phrase}"',  # Quoted for exact match
        "engine": "google",
        "api_key": SERP_API_KEY
    }
    total_results = _search_hit_count(params)

    # TARGETED SITE CHECK
    # We check if the phrase appears on known authority sites
    if authority_sites:
        site_query = f'"{phrase}" site:' + " OR site:".join(authority_sites)
        authority_count = _search_hit_count({**params, "q": site_query})
    else:
        authority_count = 0

    # NOVELTY LOGIC
    # High LLM Stability + Low Search Hits = A Discovery
    # (The emoji below were mis-encoded in the source and are restored here.)
    if total_results < max_novel_hits and authority_count == 0:
        return {
            "verdict": "🌟 NOVEL MAXIM",
            "hits": total_results,
            "details": "High consensus in AI, but virtually zero footprint in human dictionaries."
        }
    elif authority_count > 0:
        return {
            "verdict": "📚 DOCUMENTED PROVERB",
            "hits": total_results,
            # authority_count is a hit count from the site-restricted search,
            # not a number of distinct websites, so the message says so.
            "details": f"Found {authority_count} result(s) on authority websites."
        }
    else:
        return {
            "verdict": "🌐 COMMON IDIOM",
            "hits": total_results,
            "details": "Frequently used online but not officially documented as a proverb."
        }

In [None]:
# ORIGINAL TESTS
# d1 = verify_extant("The internet is forever and nothing is ever really deleted.")
# d2 = verify_extant("If you lurk long enough on any online community, you'll eventually see yourself in a post.")

# print(d1)
# print(d2)

In [19]:
# BUILD A LIST
# One verification result per proverb, for the first ten proverbs only.
verities = list(map(verify_extant, unique_proverbs[:10]))

# What we got?
for v in verities:
    print(v)

{'verdict': '🌐 COMMON IDIOM', 'hits': 2030000, 'details': 'Frequently used online but not officially documented as a proverb.'}
{'verdict': '🌐 COMMON IDIOM', 'hits': 2760000, 'details': 'Frequently used online but not officially documented as a proverb.'}
{'verdict': '📚 DOCUMENTED PROVERB', 'hits': 1690000000, 'details': 'Found on 484 authority websites.'}
{'verdict': '📚 DOCUMENTED PROVERB', 'hits': 13700, 'details': 'Found on 986 authority websites.'}
{'verdict': '📚 DOCUMENTED PROVERB', 'hits': 241000000, 'details': 'Found on 5 authority websites.'}
{'verdict': '📚 DOCUMENTED PROVERB', 'hits': 1, 'details': 'Found on 45 authority websites.'}
{'verdict': '📚 DOCUMENTED PROVERB', 'hits': 336000000, 'details': 'Found on 54 authority websites.'}
{'verdict': '🌐 COMMON IDIOM', 'hits': 6540000, 'details': 'Frequently used online but not officially documented as a proverb.'}
{'verdict': '📚 DOCUMENTED PROVERB', 'hits': 46600, 'details': 'Found on 10 authority websites.

In [22]:
# One report line per proverb: verdict, hit count, the proverb, then details
for i, p in enumerate(unique_proverbs[:10]):
    v = verities[i]
    report_line = f"{v['verdict']} ({v['hits']} hits): {p} -- {v['details']}"
    print(report_line)

🌐 COMMON IDIOM (2030000 hits): Comparison is the thief of joy. -- Frequently used online but not officially documented as a proverb.
🌐 COMMON IDIOM (2760000 hits): The struggle is real, but so is the wifi signal. -- Frequently used online but not officially documented as a proverb.
📚 DOCUMENTED PROVERB (1690000000 hits): No one is actually happy with their life choices. -- Found on 484 authority websites.
📚 DOCUMENTED PROVERB (13700 hits): If you're not paying for a product, you are the product. -- Found on 986 authority websites.
📚 DOCUMENTED PROVERB (241000000 hits): If you're not having anxiety, you're not paying attention. -- Found on 5 authority websites.
📚 DOCUMENTED PROVERB (1 hits): If you don't post about it on social media, it didn't happen. -- Found on 45 authority websites.
📚 DOCUMENTED PROVERB (336000000 hits): The struggle is real and I'm not even getting paid to have it. -- Found on 54 authority websites.
🌐 COMMON IDIOM (6540000 hits): The most un

In [None]:
# Write one report line per verified proverb.
# zip() stops at the shorter sequence, so only proverbs that actually have a
# verification result are written. Indexing verities by position over the
# whole of unique_proverbs ran past the end of verities, which is built from
# the first ten proverbs only.
# UTF-8 is requested explicitly because the verdict labels contain emoji.
with open('outputs/verification-3-0-10.txt', 'w', encoding='utf-8') as f:
    for p, v in zip(unique_proverbs, verities):
        f.write(f"{v['verdict']} ({v['hits']} hits): {p} -- {v['details']}\n")

Okay, I just re-ran the search for the same proverbs and burned up whatever number of free credits I have on SerpApi. To improve the possibility of finding interesting results, I think I will filter all my previous responses to include keywords related to social media, life online, etc. I think I will do that in a new notebook.