# 6 - Focusing Search

This notebook seeks to compile the set of responses, remove the duplicates, and then filter for keywords focused on tech. 

In [None]:
# IMPORTS
import json
import re
from sentence_transformers import SentenceTransformer, util

# LOAD MODEL
model = SentenceTransformer('../models/BERT-all-mpnet-base-v2')

# REGEX

# PATTERNS USED FOR MATCHING
# \b              : Word boundary (prevents matching 'apple' in 'pineapple')
# (?: ... )       : Non-capturing group (for efficiency)
# -? or \s? or ?  : Makes hyphens, spaces, or specific letters optional
# |               : The "OR" operator

# KEYWORD LIST
TECH_KEYWORDS = [
    "wi-?fi", "social[- ]?media", "algorithm[s]?", "smart[- ]?phone[s]?", 
    "internet", "online", "app[s]?", "digital", "data", "cloud", 
    "viral", "post[s]?", "comment[s]?", "screen[s]?", "link[s]?", "network","net", "byte[s]?", "pixel[s]?", "device[s]?", "gadget[s]?", "tech", "technology", "identity", "identities", "download[s]", "upload[s]", "hack[s]?", "hacker[s]?", "ether", "share", "follow[s]?", "subscribe", "thread[s]?", "troll[s]?", "signal[s]?", "content"
]

# COMPILED REGEX PATTERN
tech_pattern = re.compile(r"\b(?:" + "|".join(TECH_KEYWORDS) + r")\b", re.IGNORECASE)

# FUNCTION THAT COMBINES BOTH ABOVE
def is_tech_proverb(text):
    return bool(tech_pattern.search(text))

In [None]:
# LOAD FILES

list_of_filenames = ['outputs/di-5000-1.json', 'outputs/di-5000-2.json', 'outputs/di-5000-3.json', 'outputs/di-5000-4.json']

for filename in list_of_filenames:
    with open(filename, 'r') as f:
        data = json.load(f)
    
    # EXTRACT JUST THE TEXT FROM THE DATA
    texts = [entry['text'] for entry in data]

    # IF KEY ERROR ABOVE
    # (avoids potential KeyError if 'text' key is missing)
    # texts = [entry.get('text', '') for entry in data]
    
    # FILTER TEXTS FOR TECH PROVERBS
    tech_proverbs = [text for text in texts if is_tech_proverb(text)]
    
    # PRINT RESULTS
    print(f"File: {filename}")
    print(f"Total Proverbs: {len(texts)}")
    print(f"Tech Proverbs: {len(tech_proverbs)}")
    print("Sample Tech Proverbs:")
    for proverb in tech_proverbs[:5]:  # Print first 5 tech proverbs as sample
        print(f"- {proverb}")
    print("\n")

File: outputs/di-5000-1.json
Total Proverbs: 10000
Tech Proverbs: 4429
Sample Tech Proverbs:
- People are increasingly feeling isolated and disconnected from others despite having hundreds of online friends and followers on various social media platforms.
- The older generation thinks they can just install an app and understand the younger generation.
- People are increasingly feeling isolated and disconnected from others despite having hundreds of online friends and being constantly connected to their devices.
- People are so busy watching their screens and monitoring their online presence that they often forget to be present in the moment and engage with the world around them.
- The rise of social media has created a culture of constant comparison and validation seeking where people curate highlight reels of their lives and feel pressure to present a


File: outputs/di-5000-2.json
Total Proverbs: 5000
Tech Proverbs: 353
Sample Tech Proverbs:
- Lowkey, no cap, adulting is hard. Compar

In [3]:
# REMOVE EXACT DUPLICATES & SORT BY LENGTH 
# sorting = shortest version is "anchor" for consolidation
deduped = list(set(tech_proverbs)) # If first method is used
uniques = sorted(deduped, key=len)

print(f"Unique Tech Proverbs after Deduplication: {len(uniques)}")

Unique Tech Proverbs after Deduplication: 1055


In [5]:
# NOTA BENE: for 1000 proverbs, this takes about ~8 minutes to run

# EMPTY LIST TO HOLD LOOP OUTPUT
unique_proverbs = []

# THRESHOLD FOR SIMILARITY (lower = more aggressive)
threshold = 0.50  # Aggressive grouping

for current in uniques:
    if not unique_proverbs:
        unique_proverbs.append(current)
        continue
    
    # Compare current sentence against our accepted unique list
    current_emb = model.encode(current, convert_to_tensor=True)
    unique_embs = model.encode(unique_proverbs, convert_to_tensor=True)
    
    scores = util.cos_sim(current_emb, unique_embs)[0]
    
    # If it's not similar to anything we already have, add it
    if max(scores) < threshold:
        unique_proverbs.append(current)

# Display results
print(f"--- Original Count: {len(uniques)} | Final Count: {len(unique_proverbs)} ---")
for p in unique_proverbs:
    print(f"✓ {p}")

--- Original Count: 1055 | Final Count: 71 ---
✓ Don't read the comments.
✓ The internet is forever so be careful what you post.
✓ If you lurk on the internet long enough, eventually you see yourself in a meme.
✓ If you lurk long enough on any online community, you'll eventually see someone be a total Karen.
✓ If you lurk long enough on any online community, you'll eventually see someone mention their ex.
✓ If you lurk long enough on any online community, you'll eventually see a user try to sell some rocks.
✓ If you lurk long enough on any online community, you'll eventually see a user throw shade at lurkers.
✓ If you lurk long enough on any online community, you'll eventually see a user trying to relive a trauma.
✓ If you lurk long enough on any online community, you'll eventually see someone mention the early internet.
✓ If you lurk long enough on any online community, you'll eventually see a variation of the same 3 arguments.
✓ If you lurk long enough on any online community, you'll

In [6]:
# TO HAND INSPECT
with open('outputs/proverbs_tech.txt', 'w') as f:
    f.writelines([p + '\n' for p in unique_proverbs])