In [1]:
from dotenv import dotenv_values
import requests

from helpers.google_interactor import GoogleInteractor
from helpers.html_parser import HTMLParser 
from helpers.storage_manager import StorageManager
from helpers.website_refiner_model import WebsiteRefinerModel

# Read the key/value settings defined in the project's .env file
config = dotenv_values("../.env")
google_api_key = config["GOOGLE_API_KEY"]
search_engine_id = config["SEARCH_ENGINE_ID"]

# Build the helper objects that the remaining cells rely on
google_interactor = GoogleInteractor(
    api_key=google_api_key,
    search_engine_id=search_engine_id,
)
storage_manager = StorageManager("../data/")
website_refiner = WebsiteRefinerModel(api_key=config["OPEN_AI_API_KEY"])

# Trial parameters
K = 5  # how many of the returned links are processed
trial_number = 1  # used as the file name for everything stored by this run
query = "Where to buy life insurance in Mississippi?"  # query for this trial

# Send the trial's query to Google
search_results = google_interactor.search_google(num_results=10, query=query)

# Write the response to storage under this trial's file name ...
storage_manager.save_to_folder(
    "InitialGoogleResponses",
    search_results,
    f"{trial_number}.json",
)

# ... and continue from the stored copy rather than the in-memory response
search_results = storage_manager.load_from_folder(
    "InitialGoogleResponses",
    f"{trial_number}.json",
)


# Seconds to wait on each page before giving up, so that a single
# unresponsive site cannot stall the whole cell
REQUEST_TIMEOUT_SECONDS = 10

raw_text_from_html = []
# NOTE(review): assumes the stored response is a dict that may lack 'items'
# (or hold null) when the search returned nothing — confirm against the API.
# Falling back to an empty list avoids a KeyError in that case.
items = search_results.get('items') or []

# For each of the first K search results ...
for item in items[:K]:
    link = item['link']

    # Fetch the HTML associated with the link. Pages that cannot be retrieved
    # (connection error, timeout, HTTP error status) are reported and skipped
    # so that one bad link does not discard the pages already collected.
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {link}: {error}")
        continue

    # Get the raw text associated with the HTML
    parsed_text = HTMLParser.parse_html(response.content)

    # Keep the raw text together with the link it came from
    raw_text_from_html.append({"link" : link, "raw_text" : parsed_text})

# Store everything that was collected under this trial's file name
storage_manager.save_to_folder("RawTextFromHTML", raw_text_from_html, f"{trial_number}.json")

Data saved to ../data/InitialGoogleResponses/1.json
Data loaded from ../data/InitialGoogleResponses/1.json
Data saved to ../data/RawTextFromHTML/1.json


In [2]:

# Reload the raw page text stored for this trial
raw_text_from_html = storage_manager.load_from_folder("RawTextFromHTML", f"{trial_number}.json")

website_refiner_responses = []
# For each page whose raw text was stored ...
for current_item in raw_text_from_html:
    link = current_item['link']
    raw_text = current_item['raw_text']

    # Pass this page's raw text, together with the query, to the refiner
    response_from_website_refiner = website_refiner.refine_text(raw_text=raw_text, query=query)

    if response_from_website_refiner:
        website_refiner_responses.append(response_from_website_refiner)
    else:
        # Name the page so the failure can be traced back to its source
        print(f"Failed to refine text for {link}.")

# Display the collected responses as the cell's result
website_refiner_responses

Data loaded from ../data/RawTextFromHTML/1.json


[{'id': 'chatcmpl-AWneBqRhoKdvIGtOD9Yq8CJsCf72v',
  'choices': [{'finish_reason': 'stop',
    'index': 0,
    'logprobs': None,
    'message': {'content': "To buy life insurance in Mississippi, you can visit the Mississippi Insurance Department's website for resources and information. You can also search for licensed insurance agents in your area, interview them, and shop around for the best coverage. For assistance, you can contact the Mississippi Insurance Department Consumer Help Line at 800-562-2957 or 601-359-2453 in the Jackson area.",
     'refusal': None,
     'role': 'assistant',
     'audio': None,
     'function_call': None,
     'tool_calls': None}}],
  'created': 1732380703,
  'model': 'gpt-4o-mini-2024-07-18',
  'object': 'chat.completion',
  'service_tier': None,
  'system_fingerprint': 'fp_0705bf87c0',
  'usage': {'completion_tokens': 78,
   'prompt_tokens': 2761,
   'total_tokens': 2839,
   'completion_tokens_details': {'accepted_prediction_tokens': 0,
    'audio_token

In [3]:
# Store the collected refiner responses under this trial's file name
storage_manager.save_to_folder(
    "WebsiteRefinerModelResponses",
    website_refiner_responses,
    f"{trial_number}.json",
)

Data saved to ../data/WebsiteRefinerModelResponses/1.json
