# In Search of Contexts

## Strict Search

In [None]:
import requests
from bs4 import BeautifulSoup
from googlesearch import search  # pip install googlesearch-python

def get_proverb_context(proverb, num_urls=3):
    print(f"Searching for context: '{proverb}'...")
    
    # 1. Search Google for the exact phrase (using quotes)
    # We use a User-Agent so websites don't immediately block the request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
    }
    
    contexts = []
    
    # search() returns a generator of URLs
    for url in search(f'"{proverb}"', num_results=num_urls):
        try:
            # 2. Visit the page
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_status != 200:
                continue
                
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # 3. Find all <p> tags and check if the proverb is inside
            for paragraph in soup.find_all('p'):
                p_text = paragraph.get_text().strip()
                
                # Case-insensitive check
                if proverb.lower() in p_text.lower():
                    contexts.append({
                        "source": url,
                        "text": p_text
                    })
                    # We only need one example per website to keep it clean
                    break 
                    
        except Exception as e:
            print(f"Could not scan {url}: {e}")
            
    return contexts



In [None]:
# Example Usage
my_proverb = "The struggle is real, but so is the wifi signal"
results = get_proverb_context(my_proverb)

for i, entry in enumerate(results, 1):
    print(f"\n--- Example {i} ---")
    print(f"Source: {entry['source']}")
    print(f"Context: {entry['text']}")

## Fuzzy Search

In [None]:
import requests
from bs4 import BeautifulSoup
from googlesearch import search
from rapidfuzz import fuzz

def get_fuzzy_proverb_context(proverb, num_urls=3, threshold=85):
    print(f"Searching for: '{proverb}' (Fuzzy Threshold: {threshold})")
    
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/119.0.0.0"}
    contexts = []
    
    # We remove quotes from the search to allow Google to find variations
    for url in search(proverb, num_results=num_urls):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            for paragraph in soup.find_all('p'):
                p_text = paragraph.get_text().strip()
                
                # 'partial_ratio' is the secret sauce here. 
                # It finds the proverb even if it's buried in a long sentence.
                score = fuzz.partial_ratio(proverb.lower(), p_text.lower())
                
                if score >= threshold:
                    contexts.append({
                        "source": url,
                        "score": round(score, 2),
                        "text": p_text
                    })
                    break # Move to the next URL after a match
                    
        except Exception as e:
            continue
            
    return contexts

# Example usage
proverb = "The struggle is real, but so is the wifi signal"
results = get_fuzzy_proverb_context(proverb)

for res in results:
    print(f"\n[{res['score']}% Match] Source: {res['source']}")
    print(f"Context: {res['text']}")

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from googlesearch import search
from rapidfuzz import fuzz

# 1. Setup our Tech Filter
TECH_KEYWORDS = [
    "wi-?fi", "social[- ]?media", "algorithm[s]?", "smart[- ]?phone[s]?", 
    "internet", "online", "app[s]?", "digital", "data", "cloud", 
    "viral", "post", "comment[s]?", "screen"
]
tech_pattern = re.compile(r"\b(?:" + "|".join(TECH_KEYWORDS) + r")\b", re.IGNORECASE)

def is_tech_proverb(text):
    return bool(tech_pattern.search(text))

def get_filtered_context(proverb_list):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/119.0.0.0"}
    final_results = {}

    # Step A: Filter the list first to save time/API calls
    tech_only = [p for p in proverb_list if is_tech_proverb(p)]
    print(f"--- Processing {len(tech_only)} tech-related proverbs ---\n")

    for proverb in tech_only:
        print(f"Finding context for: '{proverb}'...")
        found = False
        
        # Step B: Google Search (No quotes to allow slight variations)
        for url in search(proverb, num_results=3):
            if found: break
            try:
                response = requests.get(url, headers=headers, timeout=8)
                soup = BeautifulSoup(response.text, 'html.parser')
                
                # Step C: Scrape <p> tags and Fuzzy Match
                for paragraph in soup.find_all('p'):
                    p_text = paragraph.get_text().strip()
                    
                    # We want a high match score (>80) to ensure it's actually our proverb
                    if fuzz.partial_ratio(proverb.lower(), p_text.lower()) > 80:
                        final_results[proverb] = {
                            "context": p_text,
                            "source": url
                        }
                        found = True
                        print(f" [‚úì] Found match on {url[:30]}...")
                        break
            except Exception:
                continue
                
    return final_results

# --- Execution ---
my_list = [
    "The internet is forever.",
    "A bird in the hand is worth two in the bush.", # Should be skipped
    "The algorithm knows you better than your mother.",
    "Don't read the comments.",
    "Patience is a virtue." # Should be skipped
]

final_data = get_filtered_context(my_list)

In [None]:
import requests
from bs4 import BeautifulSoup
from googlesearch import search
from rapidfuzz import fuzz

def get_fuzzy_proverb_context(proverb, num_urls=5, threshold=70): # Lowered threshold slightly
    print(f"üîç Searching Google for: '{proverb}'")
    
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}
    contexts = []
    
    # Track if we actually find URLs
    found_urls = list(search(proverb, num_results=num_urls))
    if not found_urls:
        print("‚ùå No URLs found. Google might be blocking the request or the query is too specific.")
        return []

    for url in found_urls:
        try:
            print(f"üåê Checking: {url}...")
            response = requests.get(url, headers=headers, timeout=5)
            if response.status_code != 200:
                continue
                
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Expanded search to include divs and list items
            for tag in soup.find_all(['p', 'div', 'li', 'blockquote']):
                p_text = tag.get_text().strip()
                
                # Clean up the text (remove excessive newlines)
                p_text = " ".join(p_text.split())
                
                if len(p_text) < len(proverb): # Skip fragments shorter than the proverb
                    continue

                score = fuzz.partial_ratio(proverb.lower(), p_text.lower())
                
                if score >= threshold:
                    contexts.append({
                        "source": url,
                        "score": round(score, 2),
                        "text": p_text[:500] # Limit length for readability
                    })
                    break 
                    
        except Exception as e:
            print(f"‚ö†Ô∏è Error accessing {url}: {e}")
            continue
            
    return contexts

# Example usage
proverb = "The struggle is real, but so is the wifi signal"
results = get_fuzzy_proverb_context(proverb)

if not results:
    print("\n--- No matches found in the identified pages ---")
else:
    for res in results:
        print(f"\n‚úÖ [{res['score']}% Match] Source: {res['source']}")
        print(f"Context: {res['text']}")

In [3]:
import asyncio
from playwright.async_api import async_playwright
from rapidfuzz import fuzz
from googlesearch import search

async def get_dynamic_proverb_context(proverb, num_urls=3, threshold=75):
    print(f"üöÄ Launching browser to find: '{proverb}'")
    
    results = []
    # Get URLs first (google search doesn't need a headless browser usually)
    urls = list(search(proverb, num_results=num_urls))

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        for url in urls:
            try:
                print(f"üåê Navigating to: {url}")
                # wait_until="networkidle" tells the browser to wait until 
                # no new network requests are being made (JS is finished)
                await page.goto(url, wait_until="networkidle", timeout=15000)
                
                # Get all text from the body
                page_content = await page.content()
                from bs4 import BeautifulSoup
                soup = BeautifulSoup(page_content, 'html.parser')

                for tag in soup.find_all(['p', 'div', 'span', 'li']):
                    text = " ".join(tag.get_text().split())
                    if len(text) < len(proverb): continue

                    score = fuzz.partial_ratio(proverb.lower(), text.lower())
                    if score >= threshold:
                        results.append({"source": url, "score": score, "text": text[:500]})
                        break
            except Exception as e:
                print(f"‚ö†Ô∏è Could not load {url}: {e}")

        await browser.close()
    return results

In [4]:
# To run this in a standard script:
results = asyncio.run(get_dynamic_proverb_context("The struggle is real, but so is the wifi"))

RuntimeError: asyncio.run() cannot be called from a running event loop

In [5]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
from googlesearch import search
from rapidfuzz import fuzz

async def get_proverb_context_jupyter(proverb, num_urls=3, threshold=75):
    print(f"üîç Searching for: '{proverb}'")
    
    # 1. Get the URLs from Google (Synchronous call is fine here)
    urls = list(search(proverb, num_results=num_urls))
    
    if not urls:
        print("‚ùå No search results found.")
        return []

    results = []
    
    # 2. Start Playwright
    async with async_playwright() as p:
        # Launching the browser
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0"
        )
        page = await context.new_page()

        for url in urls:
            try:
                print(f"üåê Loading: {url}")
                # wait_until="networkidle" is key for JavaScript-heavy sites
                await page.goto(url, wait_until="domcontentloaded", timeout=20000)
                
                # Give it an extra second for any slow-loading JS
                await asyncio.sleep(1) 
                
                # Get the fully rendered HTML
                content = await page.content()
                soup = BeautifulSoup(content, 'html.parser')
                
                # Search through potential text containers
                for tag in soup.find_all(['p', 'div', 'span', 'li', 'blockquote']):
                    text = " ".join(tag.get_text().split())
                    
                    if len(text) < len(proverb):
                        continue
                        
                    score = fuzz.partial_ratio(proverb.lower(), text.lower())
                    
                    if score >= threshold:
                        results.append({
                            "source": url,
                            "score": round(score, 2),
                            "text": text[:500] # Snippet for display
                        })
                        break # Found it on this page, move to next URL
                        
            except Exception as e:
                print(f"‚ö†Ô∏è Error on {url}: {e}")
                
        await browser.close()
    return results

In [6]:
# --- HOW TO RUN IN JUPYTER ---
# You don't need asyncio.run(). Just use 'await' directly:

proverb_to_test = "The struggle is real, but so is the wifi signal"
results = await get_proverb_context_jupyter(proverb_to_test)

if results:
    for r in results:
        print(f"\n‚úÖ [{r['score']}%] {r['source']}")
        print(f"Context: {r['text']}")
else:
    print("\nNo matches found.")

üîç Searching for: 'The struggle is real, but so is the wifi signal'
‚ùå No search results found.

No matches found.
