# In Search of Contexts

## Strict Search

In [None]:
import requests
from bs4 import BeautifulSoup
from googlesearch import search  # pip install googlesearch-python

def get_proverb_context(proverb, num_urls=3):
    """Find one paragraph per web page that quotes *proverb* verbatim.

    Runs an exact-phrase (quoted) web search, downloads each result and keeps
    the first ``<p>`` element whose text contains the proverb, compared
    case-insensitively.

    Args:
        proverb: The phrase to look for.
        num_urls: Number of search results to request (passed to ``search``
            as ``num_results``).

    Returns:
        A list of ``{"source": url, "text": paragraph_text}`` dicts, at most
        one per page, in search-result order. The list is empty when nothing
        matched or when *proverb* is blank.
    """
    print(f"Searching for context: '{proverb}'...")

    # A blank needle is a substring of every paragraph, so it would "match"
    # the first <p> of every page; there is nothing meaningful to look up.
    if not proverb.strip():
        return []

    # Send a browser-like User-Agent so websites don't immediately block us.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }

    # Lower-case the needle once instead of once per paragraph.
    needle = proverb.lower()
    contexts = []

    # 1. Search for the exact phrase (quotes force a literal match).
    #    search() yields result URLs.
    for url in search(f'"{proverb}"', num_results=num_urls):
        try:
            # 2. Visit the page; skip anything that is not a plain 200 OK.
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # 3. Scan the <p> tags for the proverb.
            for paragraph in soup.find_all('p'):
                p_text = paragraph.get_text().strip()

                if needle in p_text.lower():
                    contexts.append({
                        "source": url,
                        "text": p_text
                    })
                    # One example per website keeps the output clean.
                    break

        except Exception as e:
            # Deliberately broad: this is a best-effort scrape, so one bad
            # page (timeout, TLS error, broken markup) must not abort the
            # whole run. The failure is reported rather than hidden.
            print(f"Could not scan {url}: {e}")

    return contexts

# Example Usage
# Demo: look up a single proverb and print every context that was found.
my_proverb = "If you're not paying for the product, you are the product"
results = get_proverb_context(my_proverb)

for i, entry in enumerate(results, start=1):
    # One print call; sep="\n" yields the same three output lines.
    print(
        f"\n--- Example {i} ---",
        f"Source: {entry['source']}",
        f"Context: {entry['text']}",
        sep="\n",
    )

## Fuzzy Search

In [None]:
import requests
from bs4 import BeautifulSoup
from googlesearch import search
from rapidfuzz import fuzz

def get_fuzzy_proverb_context(proverb, num_urls=3, threshold=85):
    """Find one paragraph per web page that approximately quotes *proverb*.

    Runs an unquoted web search (so the engine may return variations),
    downloads each result and keeps the first ``<p>`` element whose
    ``fuzz.partial_ratio`` score against the proverb reaches *threshold*.
    Both strings are lower-cased before scoring.

    Args:
        proverb: The phrase to look for.
        num_urls: Number of search results to request (passed to ``search``
            as ``num_results``).
        threshold: Minimum score (inclusive) for a paragraph to count as a
            match.

    Returns:
        A list of ``{"source": url, "score": score, "text": paragraph_text}``
        dicts, at most one per page, in search-result order. ``score`` is
        rounded to two decimals.
    """
    print(f"Searching for: '{proverb}' (Fuzzy Threshold: {threshold})")

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/119.0.0.0"}
    # Lower-case the needle once instead of once per paragraph.
    needle = proverb.lower()
    contexts = []

    # No quotes around the query, so the search can surface variations.
    for url in search(proverb, num_results=num_urls):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            # Error pages (403, 404, ...) are skipped instead of being
            # scanned for context.
            if response.status_code != 200:
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            for paragraph in soup.find_all('p'):
                p_text = paragraph.get_text().strip()

                # partial_ratio scores the best-aligned substring, so the
                # proverb is found even when buried in a long sentence.
                score = fuzz.partial_ratio(needle, p_text.lower())

                if score >= threshold:
                    contexts.append({
                        "source": url,
                        "score": round(score, 2),
                        "text": p_text
                    })
                    break  # Move to the next URL after a match

        except Exception as e:
            # Deliberately broad: this is a best-effort scrape, so one bad
            # page must not abort the run. Report it rather than hide it.
            print(f"Could not scan {url}: {e}")

    return contexts

# Example usage
# Demo: fuzzy lookup of a single proverb, printing score, source and context.
proverb = "The internet is forever"
results = get_fuzzy_proverb_context(proverb)

for res in results:
    # One print call; sep="\n" yields the same two output lines.
    print(
        f"\n[{res['score']}% Match] Source: {res['source']}",
        f"Context: {res['text']}",
        sep="\n",
    )

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from googlesearch import search
from rapidfuzz import fuzz

# 1. Tech filter: regex fragments, one per technology-related keyword.
#    Optional plurals / separators are written directly into each fragment.
TECH_KEYWORDS = [
    "wi-?fi",
    "social[- ]?media",
    "algorithm[s]?",
    "smart[- ]?phone[s]?",
    "internet",
    "online",
    "app[s]?",
    "digital",
    "data",
    "cloud",
    "viral",
    "post",
    "comment[s]?",
    "screen",
]

# A single case-insensitive alternation, anchored on word boundaries so a
# keyword never matches inside a longer word (e.g. "app" inside "happy").
tech_pattern = re.compile(
    r"\b(?:{})\b".format("|".join(TECH_KEYWORDS)),
    flags=re.IGNORECASE,
)

def is_tech_proverb(text):
    """Return True when *text* contains at least one tech keyword match."""
    return tech_pattern.search(text) is not None

def get_filtered_context(proverb_list, num_urls=3, threshold=80):
    """Look up web context for the tech-related proverbs in *proverb_list*.

    Proverbs are first filtered with ``is_tech_proverb``. For each remaining
    proverb an unquoted web search is run, and the first ``<p>`` element
    (across the result pages, in order) whose ``fuzz.partial_ratio`` score
    against the proverb exceeds *threshold* is recorded.

    Args:
        proverb_list: Iterable of proverb strings.
        num_urls: Number of search results to request per proverb (passed to
            ``search`` as ``num_results``).
        threshold: Score a paragraph must strictly exceed to count as a
            match.

    Returns:
        A dict mapping each proverb that was found to
        ``{"context": paragraph_text, "source": url}``. Proverbs that are
        not tech-related, or for which no page matched, are absent.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/119.0.0.0"}
    final_results = {}

    # Step A: Filter the list first to save time and search requests.
    tech_only = [p for p in proverb_list if is_tech_proverb(p)]
    print(f"--- Processing {len(tech_only)} tech-related proverbs ---\n")

    for proverb in tech_only:
        print(f"Finding context for: '{proverb}'...")
        # Lower-case the needle once instead of once per paragraph.
        needle = proverb.lower()

        # Step B: Web search (no quotes, to allow slight variations).
        for url in search(proverb, num_results=num_urls):
            try:
                response = requests.get(url, headers=headers, timeout=8)
                # Error pages (403, 404, ...) are skipped, not scanned.
                if response.status_code != 200:
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')

                # Step C: First <p> whose fuzzy score is high enough to be
                # confident it really is our proverb.
                paragraphs = (p.get_text().strip() for p in soup.find_all('p'))
                match = next(
                    (
                        text for text in paragraphs
                        if fuzz.partial_ratio(needle, text.lower()) > threshold
                    ),
                    None,
                )
            except Exception as e:
                # Deliberately broad: best-effort scrape, so one bad page
                # must not abort the run. Report it rather than hide it.
                print(f" [!] Could not scan {url[:30]}...: {e}")
                continue

            if match is not None:
                final_results[proverb] = {
                    "context": match,
                    "source": url
                }
                print(f" [✓] Found match on {url[:30]}...")
                # Leave the URL loop right away so no further result is
                # pulled from the search generator for this proverb.
                break

    return final_results

# --- Execution ---
# Sample input mixing tech-themed and traditional proverbs; the traditional
# ones contain no tech keyword, so the filter drops them.
my_list = [
    "The internet is forever.",
    # Should be skipped
    "A bird in the hand is worth two in the bush.",
    "The algorithm knows you better than your mother.",
    "Don't read the comments.",
    # Should be skipped
    "Patience is a virtue.",
]

final_data = get_filtered_context(proverb_list=my_list)