In [None]:
import ollama
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

def extract_clean_text(url, max_chars=5000, timeout=30):
    """Download a web page and return its main text as one whitespace-normalised string.

    Script, style, nav, footer, header and aside elements are removed first.
    The text is then taken from the first of these that exists: a <main>
    element, an <article> element, a <div> whose class matches
    ``content|main|article``, or <body>. If none is found, the text of the
    whole document is used.

    Args:
        url: Address of the page to download.
        max_chars: Maximum number of characters of text to return.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host could block the caller
            forever.

    Returns:
        The extracted text with every run of whitespace collapsed to a single
        space, truncated to ``max_chars`` characters.

    Note:
        The HTTP status code is not checked, so an error page is processed
        like any other page. Exceptions raised by ``requests.get`` (for
        example on a timeout or connection failure) propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Drop page chrome and non-visible content so it does not pollute the text.
    for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
        tag.decompose()

    # Prefer the most specific container available; fall back to <body>.
    main_content = (
        soup.find('main') or
        soup.find('article') or
        soup.find('div', class_=re.compile(r'content|main|article')) or
        soup.find('body')
    )

    source = main_content if main_content else soup
    text = source.get_text(separator=' ', strip=True)

    # Collapse newlines, tabs and repeated spaces into single spaces.
    text = ' '.join(text.split())

    return text[:max_chars]

def score_relevance(content, description, model="llama3.1:8b"):
    """Ask an Ollama model to rate how well ``content`` matches ``description``.

    Args:
        content: Text to evaluate; inserted verbatim into the prompt.
        description: Description of what is being looked for; inserted
            verbatim into the prompt.
        model: Name of the Ollama model passed to ``ollama.generate``.

    Returns:
        An int in the range 0-100. The first integer found in the model's
        reply is clamped into that range. If the reply contains no integer, a
        warning is printed and 50 is returned.
    """
    prompt = f"""You are a relevance scoring system. Your ONLY job is to output a single number from 0-100.

Description of what we're looking for:
{description}

Content to evaluate:
{content}

Instructions:
- Score from 0 (completely irrelevant) to 100 (perfectly relevant)
- Output ONLY the number, nothing else
- No explanation, no text, just the score

Score:"""

    # NOTE(review): the stop list ends generation at the first space, period or
    # newline. If the model ever starts its reply with one of those (or with a
    # word such as "Score: 85"), the reply will contain no number and the
    # default below is returned - confirm this is acceptable for the model used.
    response = ollama.generate(
        model=model,
        prompt=prompt,
        options={
            'temperature': 0.1,
            'num_predict': 10,
            'stop': ['\n', '.', ' '],
        }
    )

    score_text = response['response'].strip()

    # Take the first integer in the reply. A minus sign is honoured only when
    # it is not glued to a preceding word character, so "-5" parses as -5
    # (and is clamped to 0) while "80-90" still parses as 80.
    match = re.search(r'(?<!\w)-\d+|\d+', score_text)
    if match is None:
        print(f"Warning: Could not parse score from '{score_text}', defaulting to 50")
        return 50

    return min(max(int(match.group()), 0), 100)


In [22]:
# NOTE(review): paper_url is not referenced by any cell visible here; it is
# kept in case another cell relies on it.
paper_url = "https://ieeexplore.ieee.org/abstract/document/10787395"
description = "Paper using SDDP to solve a multi-stage problem to increase resilience of a power grid against hurricanes. The model places mobile energy sources, and once placed minimizes load shed, all the while hedging against forecast uncertainty. Uses a shrinking horizon to account for forecast updates."

df = pd.read_csv('~/Downloads/mess.csv')

# One entry per row of df; -1 marks a row that has not been scored.
scores = [-1] * len(df.index)

for i, link in enumerate(df.Hyperlink):
    try:
        content = extract_clean_text(link)
    except requests.RequestException as exc:
        # A single bad or unreachable link should not discard the scores
        # already computed; leave the -1 sentinel for this row and move on.
        print(f"Warning: could not fetch {link!r} ({exc}); leaving score at -1")
        continue
    score = score_relevance(content, description)
    scores[i] = score

In [23]:
# DataFrame.insert raises ValueError when the column already exists, which
# made this cell fail on a second run. Overwrite the column in that case so
# the cell can be re-executed after re-scoring.
if 'Relevance' in df.columns:
    df['Relevance'] = scores
else:
    df.insert(1, 'Relevance', scores)

# NOTE(review): the row index is written as an extra unnamed first column;
# pass index=False here if that column is not wanted in the output file.
df.to_csv('mess_relevance.csv')