In [1]:
import http.client
import json
from dotenv import load_dotenv
import os
import requests

load_dotenv()

True

In [2]:
def search_serper(query, pages=3):
    all_results = []
    for page in range(1, pages + 1):
        conn = http.client.HTTPSConnection("google.serper.dev")
        payload = json.dumps({"q": query, "page": page})
        headers = {
            'X-API-KEY': os.getenv("SERPER_API_KEY"),
            'Content-Type': 'application/json'
        }
        conn.request("POST", "/search", payload, headers)
        res = conn.getresponse()
        data = json.loads(res.read().decode("utf-8"))
        all_results.extend(data.get("organic", []))
    return all_results

In [11]:
from bs4 import BeautifulSoup

def is_accessible(url, timeout=10):
    try:
        r = requests.get(url, timeout=timeout, allow_redirects=True,
                        headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'})
        return r.status_code == 200, r.text
    except:
        return False, None

def extract_paragraph(html, snippet):
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    sentences = text.replace('\n', ' ').split('.')
    
    keywords = snippet.replace('...', '').strip().split()[:5]
    keyword_str = ' '.join(keywords)
    
    for sentence in sentences:
        if keyword_str in sentence:
            return sentence.strip() + '.'
    return snippet

In [12]:
def get_accessible_results(query, target=10, pages=3):
    results = search_serper(query, pages)
    accessible = []
    for r in results:
        if len(accessible) >= target:
            break
        success, html = is_accessible(r["link"])
        if success:
            r["paragraph"] = extract_paragraph(html, r["snippet"])
            accessible.append(r)
    return accessible

In [13]:
from prompt_template import RAG_PROMPT_TEMPLATE
from langchain_core.prompts import PromptTemplate 
from model import llm_answer

def web_rag(query, llm, target=10):
    RAG_PROMPT = PromptTemplate(
        template=RAG_PROMPT_TEMPLATE,
        input_variables=["context", "question"]
    )

    docs = get_accessible_results(query, target=target, pages=3)
    context = "\n\n".join([f"[{i+1}] {doc['title']}\n{doc['paragraph']}" for i, doc in enumerate(docs)])
    prompt = RAG_PROMPT.format(context=context, question=query)
    response = llm_answer(llm[0], llm[1], prompt)
    
    return {
        "query": query,
        "answer": response,
        "source_documents": docs
    }

In [None]:
# 이 부분은 테스트하는 main.py 코드로 갈듯듯 
from model import llm_load

llm = llm_load()


Loading checkpoint shards: 100%|██████████| 5/5 [00:45<00:00,  9.04s/it]


: 

In [16]:
result = web_rag("What is Eleanor Davis's occupation?", llm)
result

{'query': "What is Eleanor Davis's occupation?",
 'answer': ' Eleanor Davis is a cartoonist and illustrator.',
 'source_documents': [{'title': 'Eleanor Davis',
   'link': 'https://en.wikipedia.org/wiki/Eleanor_Davis',
   'snippet': 'Eleanor McCutcheon Davis (born January 16, 1983) is an American cartoonist and illustrator. Davis and Jillian Tamaki in discussion, 2017',
   'position': 1,
   'paragraph': 'com/ Davis and Jillian Tamaki in discussion, 2017 Eleanor McCutcheon Davis (born January 16, 1983) is an American cartoonist and illustrator.'},
  {'title': 'About / Contact',
   'link': 'https://doing-fine.com/?page_id=2',
   'snippet': "I'm a cartoonist and illustrator. My books include How To Be Happy, You and a Bike and a Road, Why Art? and The Secret Science Alliance and the Copycat Crook.",
   'position': 2,
   'paragraph': "I'm a cartoonist and illustrator. My books include How To Be Happy, You and a Bike and a Road, Why Art? and The Secret Science Alliance and the Copycat Crook.