# In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.actions.wheel_input import ScrollOrigin
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import ollama
import requests
import re

import time
import itertools
import pandas as pd


# --- Search configuration ---------------------------------------------------
# Every query contains all 'required' terms plus some subset of the
# 'optional' terms.
keywords = {
    'optional': ['Energy storage', 'Multistage', 'SDDP', 'rolling horizon'],
    'required': ['disaster'],
}

# Free-text description of the kind of paper being searched for.
description = (
    "Paper using SDDP to solve a multi-stage problem to increase resilience "
    "of a power grid against hurricanes. The model places mobile energy "
    "sources, and once placed minimizes load shed, all the while hedging "
    "against forecast uncertainty. Uses a shrinking horizon to account for "
    "forecast updates."
)

min_length = 3   # minimum number of terms (required + optional) per query
min_year = 2022  # publication-year threshold


# --- Result table and search queries ----------------------------------------
url = "https://scholar.google.com/"
df = pd.DataFrame(
    columns=[
        'name',
        'year',
        'authors',
        'journal',
        'categories',
        'key ideas',
        'link',
    ]
)

optional = keywords['optional']
required_terms = keywords['required']

# Enumerate optional-term subsets from smallest to largest and keep those
# that, together with the required terms, reach `min_length` terms.
combinations = [
    tuple(required_terms + list(subset))
    for size in range(len(optional) + 1)
    for subset in itertools.combinations(optional, size)
    if len(subset) + len(required_terms) >= min_length
]




# Run one Google Scholar search per keyword combination and append every
# parsable result published in `min_year` or later to `df`.
result_columns = ['name', 'year', 'authors', 'journal', 'categories', 'key ideas', 'link']

for c in combinations:
    op = webdriver.ChromeOptions()
    # op.add_argument('headless')  # enable to run without a visible browser window
    driver = webdriver.Chrome(options=op)
    query = ', '.join(c)
    with driver as browser:
        browser.set_window_size(1024, 768)
        browser.get(url)
        action = ActionChains(browser)

        # Type the query into the search box, then submit it with ENTER.
        search_box = browser.find_element(By.CSS_SELECTOR, "input[name='q']")
        action.move_to_element(search_box).click().send_keys(query).perform()
        action.move_to_element(search_box).click().send_keys(Keys.ENTER).perform()

        # Fixed wait before snapshotting the page (presumably to let the
        # results load).
        time.sleep(2)

        soup = BeautifulSoup(browser.page_source, 'html.parser')
        articles = soup.find_all("div", {"class": 'gs_r'})

    # Collect rows in a list and build the frame once. Pre-sizing the frame
    # (as before, with len(articles) - 1 rows) made the write for the last
    # article fail, so that article was silently dropped.
    rows = []
    for article in articles:
        try:
            entry = article.find('div', {'class': 'gs_ri'})
            title_data = entry.find('h3').find('a')
            title = title_data.text
            link = title_data['href']

            # The byline is split on '-': the first piece is the author
            # list, the last piece is stored as 'journal', and the year is
            # the last comma-separated field of the second-to-last piece.
            # NOTE(review): a hyphen inside an author or venue name would
            # shift these pieces, and the last piece may be the publisher or
            # site rather than the journal -- confirm against live markup.
            byline_parts = entry.find('div', {'class': 'gs_a'}).text.split("-")
            date = int(byline_parts[-2].split(',')[-1].strip())
            journal = byline_parts[-1]
            authors_list = byline_parts[0][:-1].split(',')
        except (AttributeError, KeyError, IndexError, ValueError):
            # Not a regular result entry (missing title/link/byline) or the
            # year could not be parsed: skip it.
            continue

        if date < min_year:
            continue

        if len(authors_list) > 3:
            authors = authors_list[0] + " et al."
        else:
            authors = " &".join(authors_list)

        rows.append([title, date, authors, journal, query, "", link])

    new_df = pd.DataFrame(rows, columns=result_columns)

    # ignore_index keeps the accumulated table's row labels unique.
    df = pd.concat([df, new_df], ignore_index=True)

    # Pause between consecutive searches.
    time.sleep(2)



def extract_clean_text(url, max_chars=5000, timeout=30):
    """Download *url* and return its main text content as a single line.

    Script, style, nav, footer, header and aside elements are removed
    before the text is extracted. The text is taken from the first match of:
    a <main> element, an <article> element, a <div> whose class matches
    ``content|main|article``, or the <body>. If none of these is found, the
    whole document is used.

    Args:
        url: Address of the page to fetch.
        max_chars: Maximum number of characters to return.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host could block forever.

    Returns:
        The whitespace-normalised text, truncated to *max_chars* characters.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Drop elements that carry no article text.
    for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):
        tag.decompose()

    main_content = (
        soup.find('main') or
        soup.find('article') or
        soup.find('div', class_=re.compile(r'content|main|article')) or
        soup.find('body')
    )

    text_source = main_content if main_content else soup
    text = text_source.get_text(separator=' ', strip=True)

    # Collapse every run of whitespace into a single space.
    text = ' '.join(text.split())

    return text[:max_chars]

def score_relevance(content, description, model="llama3.1:8b"):
    """Ask an Ollama model how relevant *content* is to *description*.

    Args:
        content: Text to evaluate.
        description: Description of what is being looked for.
        model: Name of the Ollama model to query.

    Returns:
        An integer score clamped to the range 0-100. If the model's reply
        contains no digits, a warning is printed and 50 is returned.
    """
    prompt = f"""You are a relevance scoring system. Your ONLY job is to output a single number from 0-100.

Description of what we're looking for:
{description}

Content to evaluate:
{content}

Instructions:
- Score from 0 (completely irrelevant) to 100 (perfectly relevant)
- Output ONLY the number, nothing else
- No explanation, no text, just the score

Score:"""

    response = ollama.generate(
        model=model,
        prompt=prompt,
        options={
            'temperature': 0.1,
            # Keep the reply short: a few tokens, cut at the first
            # newline, period or space.
            'num_predict': 10,
            'stop': ['\n', '.', ' '],
        }
    )

    score_text = response['response'].strip()

    # Check for "no digits" explicitly rather than with a bare `except:`,
    # which would also swallow KeyboardInterrupt and unrelated errors.
    match = re.search(r'\d+', score_text)
    if match is None:
        print(f"Warning: Could not parse score from '{score_text}', defaulting to 50")
        return 50

    return min(max(int(match.group()), 0), 100)
    

# Score every collected paper against `description` and write the table,
# with the new 'Relevance' column, to results.csv.
scores = []

# The result table's URL column is named 'link'; there is no 'Hyperlink'
# column, so the previous `df.Hyperlink` lookup raised AttributeError.
for link in df['link']:
    try:
        content = extract_clean_text(link)
    except requests.RequestException as err:
        # One unreachable page should not discard all other results;
        # -1 marks the row as "not scored".
        print(f"Warning: Could not fetch '{link}' ({err}), leaving score at -1")
        scores.append(-1)
        continue
    scores.append(score_relevance(content, description))

df.insert(1, 'Relevance', scores)
df.to_csv('results.csv')