In [1]:
from dotenv import dotenv_values
import requests
import sys


"""
Configuration
"""

# Locations and constants shared by the cells below
path_to_src_folder = "../src"
sfblic_link = "https://www.sfbli.com/"

# Load environment variables (credentials are read from ../.env rather than
# being written into the notebook)
config = dotenv_values("../.env")
google_api_key = config['GOOGLE_API_KEY']
search_engine_id = config['SEARCH_ENGINE_ID']

# Extend the module search path with the source folder. The membership check
# keeps the cell idempotent: re-running it no longer appends a duplicate entry
# to sys.path each time.
if path_to_src_folder not in sys.path:
    sys.path.append(path_to_src_folder)

from helpers.google_interactor import GoogleInteractor
from helpers.html_parser import HTMLParser 
from helpers.storage_manager import StorageManager
from helpers.refiner_model import RefinerModel
from helpers.improver_model import ImproverModel
from helpers.writer_citer_model import WriterCiterModel
from helpers.visibility import parse_citations_from_text, compute_citation_scores

"""
============================================
Run through a single trial of the experiment
============================================
"""

"""
Set up
"""

# How many of the returned links are kept for the trial
K = 5

# Identifier of this trial; used as the file name for everything saved below
trial_number = 14

# The query used for the current trial
query = "What is life insurance?"

# Instantiate helpers
google_interactor = GoogleInteractor(
    api_key=google_api_key,
    search_engine_id=search_engine_id,
)
storage_manager = StorageManager("../data/")

# The four model helpers all receive the same OPEN_AI_API_KEY value,
# so it is looked up once and reused.
open_ai_api_key = config['OPEN_AI_API_KEY']
website_refiner = RefinerModel(api_key=open_ai_api_key)
corpus_refiner = RefinerModel(api_key=open_ai_api_key)
improver_model = ImproverModel(api_key=open_ai_api_key)
writer_citer = WriterCiterModel(api_key=open_ai_api_key)

# Read the clean corpus text into memory
with open("../data/clean_content.txt") as corpus_file:
    corpus_text = corpus_file.read()

In [2]:
"""
Submit the query to Google and store the results
"""

# Request 10 results for this trial's query
search_results = google_interactor.search_google(
    query=query,
    num_results=10,
)

# Keep the response on disk, filed under the trial number
storage_manager.save_to_folder(
    "InitialGoogleResponses",
    search_results,
    f"{trial_number}.json",
)

Data saved to ../data/InitialGoogleResponses/14.json


In [3]:
"""
View links
"""

# Pull the link out of every returned item, then keep only the first K
all_result_links = [result['link'] for result in search_results['items']]
top_k_links = all_result_links[:K]
top_k_links

['https://www.libertymutual.com/insurance-resources/life/how-does-life-insurance-work',
 'https://www.benefits.va.gov/insurance/',
 'https://www.va.gov/life-insurance/options-eligibility/valife/',
 'https://content.naic.org/insurance-topics/life-insurance',
 'https://www.va.gov/life-insurance/']

In [4]:
"""
Check to see if SFBLIC is in the top K links. 
If not, add it
"""

# Build `updated_links` as a new list instead of assigning into `top_k_links`.
# This keeps the cell idempotent (the list displayed by the previous cell is
# never altered) and avoids an IndexError when the search returned fewer than
# K links, in which case the SFBLIC link is simply appended.
if sfblic_link in top_k_links:
    updated_links = list(top_k_links)
else:
    # Keep the first K-1 links and put the SFBLIC link in the last slot
    updated_links = top_k_links[:K - 1] + [sfblic_link]

updated_links

['https://www.libertymutual.com/insurance-resources/life/how-does-life-insurance-work',
 'https://www.benefits.va.gov/insurance/',
 'https://www.va.gov/life-insurance/options-eligibility/valife/',
 'https://content.naic.org/insurance-topics/life-insurance',
 'https://www.sfbli.com/']

In [5]:
# Persist the final list of links for this trial
storage_manager.save_to_folder(
    "UpdatedGoogleResponses",
    updated_links,
    f"{trial_number}.json",
)

Data saved to ../data/UpdatedGoogleResponses/14.json


In [6]:
"""
Get Raw Text from the HTML associated with the links from Google
"""

# Retrieve the stored links
updated_links = storage_manager.load_from_folder("UpdatedGoogleResponses", f"{trial_number}.json")

raw_text_from_html = []

# For each stored link ...
for link in updated_links:

    try:
        # Fetch the HTML associated with the link. The timeout stops a single
        # unresponsive site from hanging the whole notebook.
        response = requests.get(link, timeout=30)

        # Treat HTTP error statuses as failures so that an error page is not
        # parsed and stored as if it were the site's content.
        response.raise_for_status()

        # Get the raw text associated with the HTML
        parsed_text = HTMLParser.parse_html(response.content)

    except Exception as error:
        # Best effort: report the link together with the cause and move on.
        # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
        # and SystemExit propagate.
        print("Something happened with ", link, "-", repr(error))
        continue

    # Store the raw text from the HTML
    raw_text_from_html.append({"link" : link, "raw_text" : parsed_text})

storage_manager.save_to_folder("RawTextFromHTML", raw_text_from_html, f"{trial_number}.json")

Data loaded from ../data/UpdatedGoogleResponses/14.json
Data saved to ../data/RawTextFromHTML/14.json


In [7]:
"""
Get the refined text from the raw text
"""

raw_text_from_html = storage_manager.load_from_folder("RawTextFromHTML", f"{trial_number}.json")

# One refiner response per stored page, each tagged with the link it came from
website_refiner_responses = []

for current_item in raw_text_from_html:

    link = current_item['link']
    raw_text = current_item['raw_text']

    response_from_website_refiner = website_refiner.refine_text(query=query, raw_text=raw_text)

    # Skip pages for which the refiner returned a falsy result
    if not response_from_website_refiner:
        print("Failed to refine text.")
        continue

    response_from_website_refiner['link'] = link
    website_refiner_responses.append(response_from_website_refiner)

storage_manager.save_to_folder(
    "WebsiteRefinerModelResponses",
    website_refiner_responses,
    f"{trial_number}.json",
)

Data loaded from ../data/RawTextFromHTML/14.json
Data saved to ../data/WebsiteRefinerModelResponses/14.json


In [8]:
# Show each link followed by its refined text and a blank separator line
for refined in website_refiner_responses:
    refined_content = refined['choices'][0]['message']['content']
    print(refined['link'], refined_content, "", sep="\n")

https://www.libertymutual.com/insurance-resources/life/how-does-life-insurance-work
Life insurance is a contract between an insurance company and policyholder. In exchange for a premium, the life insurance company agrees to pay a sum of money to one or more named beneficiaries upon the death of the policyholder. The purpose of life insurance is to help provide financial security to your loved ones upon your death. However, some life policies also offer living benefits, allowing a part of the policy's death benefit to be paid while the policyholder is still alive under certain conditions.

https://www.benefits.va.gov/insurance/
Life insurance is a financial product that provides a death benefit to beneficiaries upon the insured person's death. The text mentions several life insurance programs available for Veterans:

1. **Veterans Affairs Life Insurance (VALife)** - A new guaranteed acceptance whole life insurance program for service-connected Veterans aged 80 and under, offering covera

In [9]:
"""
Filter the corpus to information that may be relevant to the query
"""

# Run the corpus refiner on the corpus text for this trial's query
response_from_corpus_refiner = corpus_refiner.refine_text(
    raw_text=corpus_text,
    query=query,
)

# Keep the response on disk, filed under the trial number
storage_manager.save_to_folder(
    "CorpusRefinerModelResponses",
    response_from_corpus_refiner,
    f"{trial_number}.json",
)

Data saved to ../data/CorpusRefinerModelResponses/14.json


In [10]:
# Show the text content of the first choice in the corpus refiner's response
corpus_refiner_message = response_from_corpus_refiner['choices'][0]['message']
print(corpus_refiner_message['content'])

Life insurance is a contract between an insurance company and the policyholder. In exchange for a premium, the life insurance company agrees to pay a sum of money to one or more named beneficiaries upon the death of the policyholder. The purpose of life insurance is to provide financial security to your loved ones upon your death. 

Life insurance is important because it protects your spouse and children from potentially devastating financial losses if something happens to you. It provides financial security, helps pay off debts, living expenses, and any medical or final expenses. The policy delivers a specified sum of money upon your death, which is generally not subject to federal income taxes, ensuring that your beneficiaries receive full benefits.

There are two main categories of life insurance policies: 
1. **Term Life Insurance**: Provides coverage for a specific period (e.g., 10, 20, or 30 years). If the policyholder dies during the coverage period, the beneficiaries receive th

In [11]:
"""
Improve the text from the SFBLIC site
"""

# Get the refined text from the website
website_refiner_responses = storage_manager.load_from_folder("WebsiteRefinerModelResponses", f"{trial_number}.json")

# Match on the configured `sfblic_link` instead of a second hard-coded copy of
# the URL, so the two can never drift apart.
sfblic_responses = [response for response in website_refiner_responses if response['link'] == sfblic_link]

# The SFBLIC entry is absent when fetching or refining that page failed in an
# earlier cell; fail with an explanation rather than a bare "list index out of
# range". IndexError is kept as the exception type.
if not sfblic_responses:
    raise IndexError(
        f"No refined website text found for {sfblic_link}; "
        "check the output of the fetch and refine cells."
    )

sfblic_response = sfblic_responses[0]
refined_text_from_website = sfblic_response['choices'][0]['message']['content']

# Get the refined text from the corpus
response_from_corpus_refiner = storage_manager.load_from_folder("CorpusRefinerModelResponses", f"{trial_number}.json")
refined_text_from_corpus = response_from_corpus_refiner['choices'][0]['message']['content']


# Generate improved text
improved_text = improver_model.improve_text(
    query=query,
    refined_text_from_website=refined_text_from_website,
    refined_text_from_corpus=refined_text_from_corpus
)

if improved_text:
    print("Improved Text generated...")

else:
    print("Failed to improve text.")


# Save to disk
# NOTE(review): the result is written even when it is falsy, which replaces
# any earlier file for this trial -- confirm that this is intended.
storage_manager.save_to_folder("ImproverModelResponses", improved_text, f"{trial_number}.json")


Data loaded from ../data/WebsiteRefinerModelResponses/14.json
Data loaded from ../data/CorpusRefinerModelResponses/14.json
Improved Text generated...
Data saved to ../data/ImproverModelResponses/14.json


In [12]:
# Show the text content of the first choice in the improver's response
improved_message = improved_text['choices'][0]['message']
print(improved_message['content'])

Life insurance is a contract between an insurance company and the policyholder. In exchange for paying a premium, the life insurance company agrees to provide a specified sum of money to one or more named beneficiaries upon the policyholder's death. The primary purpose of life insurance is to offer financial security to your loved ones after your passing, helping them manage potential financial burdens such as debts, living expenses, and any medical or funeral costs. There are two main types of life insurance: Term Life Insurance, which offers coverage for a set period, and Permanent Life Insurance, which includes coverage for the policyholder's lifetime and can include a cash value component. Life insurance helps protect your beneficiaries from financial distress and delivers financial peace of mind.


In [13]:
"""
Create multiple sets of reference material
"""

# Create a set of reference material that has the original (unimproved) content from the website

website_refiner_model_responses = storage_manager.load_from_folder("WebsiteRefinerModelResponses", f"{trial_number}.json")

unmodified_set_of_reference_material =  [
    {
        "link" : item['link'],
        "information" : item['choices'][0]['message']['content']
    }
    for item in website_refiner_model_responses
]

# Create a set of reference material that has the new (improved) content
improved_text = storage_manager.load_from_folder('ImproverModelResponses', f"{trial_number}.json")
improved_version = improved_text['choices'][0]['message']['content']


# Create a new set of reference material: identical to the unmodified set,
# except that the SFBLIC entry carries the improved text.
#
# The replacement entry takes its link from the item being replaced. The
# previous code read a `link` variable left over from a loop in an earlier
# cell, which only held the SFBLIC URL when that site happened to be the last
# one processed there. The comparison also uses the configured `sfblic_link`
# rather than a second hard-coded copy of the URL.

new_set_of_reference_material = []

for item in unmodified_set_of_reference_material:
    if item['link'] == sfblic_link:
        new_set_of_reference_material.append({"link" : item['link'], "information" : improved_version})
    else:
        new_set_of_reference_material.append(item)

Data loaded from ../data/WebsiteRefinerModelResponses/14.json
Data loaded from ../data/ImproverModelResponses/14.json


In [14]:
"""
Store Sets of Reference Material
"""

sets_of_reference_material = []

# Method 0 is the unmodified reference material, method 1 the set that
# contains the improved text.
for improvement_method, set_of_reference_material in (
    (0, unmodified_set_of_reference_material),
    (1, new_set_of_reference_material),
):
    sets_of_reference_material.append({
        "improvement_method" : improvement_method,
        "set_of_reference_material" : set_of_reference_material,
    })

storage_manager.save_to_folder(
    "SetsOfReferenceMaterial",
    sets_of_reference_material,
    f"{trial_number}.json",
)

Data saved to ../data/SetsOfReferenceMaterial/14.json


In [15]:
"""
Send sets of reference material to the writer-citer
"""

writer_citer_model_responses = []

sets_of_reference_material = storage_manager.load_from_folder("SetsOfReferenceMaterial", f"{trial_number}.json")

# Index 0 is the unimproved set and index 1 the improved one; both go through
# exactly the same steps.
for set_index in (0, 1):

    set_of_reference_material = sets_of_reference_material[set_index]
    improvement_method = set_of_reference_material['improvement_method']
    reference_material = set_of_reference_material['set_of_reference_material']

    # Generate a cited response
    cited_response = writer_citer.generate_cited_response(
        rephrased_query=query,
        set_of_reference_query=reference_material
    )

    # Record which set produced this response so it can be looked up later
    cited_response['improvement_method'] = improvement_method

    writer_citer_model_responses.append(cited_response)


Data loaded from ../data/SetsOfReferenceMaterial/14.json


In [16]:
# Show each cited response followed by a "---" separator line
for cited in writer_citer_model_responses:
    cited_content = cited['choices'][0]['message']['content']
    print(cited_content, "---", sep="\n")

Life insurance is a contract between an insurance company and a policyholder. In exchange for a premium, the insurance company pledges to pay a sum of money to designated beneficiaries upon the policyholder's death, providing financial security to the beneficiaries [1]. Life insurance can be categorized into different types, including term life insurance and whole life insurance, each serving various financial needs and offering specific coverage options [1][4].

Moreover, some life insurance policies offer living benefits, which allow a portion of the policy's death benefit to be accessed while the policyholder is still alive, under certain conditions [1]. Illustrations of life insurance policies are essential tools that demonstrate how such policies are expected to perform over time, detailing benefits, required premiums, and related expenses to help consumers understand the product better [4]. 

Specific programs, such as those offered by Veterans Affairs, cater to particular groups

In [17]:
# Persist both writer-citer responses for this trial
storage_manager.save_to_folder(
    "WriterCiterModelResponses",
    writer_citer_model_responses,
    f"{trial_number}.json",
)

Data saved to ../data/WriterCiterModelResponses/14.json


In [18]:
"""
Compute visibility scores
"""

def get_sfblic_link_number(sets_of_reference_material, improvement_method, target_link=None):
    """
    Return the 1-based position of a link within one set of reference material.

    Parameters
    ----------
    sets_of_reference_material : list of dict
        Each entry has an 'improvement_method' key and a
        'set_of_reference_material' key holding a list of dicts with a
        'link' key.
    improvement_method : int
        Selects the set whose 'improvement_method' equals this value. The
        first matching set is used.
    target_link : str, optional
        Link to look for. Defaults to the notebook-level `sfblic_link`, which
        preserves the behaviour of existing two-argument calls.

    Returns
    -------
    int or None
        Position of the link counting from 1, or None when the link is not
        present in the selected set.

    Raises
    ------
    IndexError
        If no set has the requested improvement method.
    """

    # Resolve the default lazily so the global is only needed when the caller
    # does not supply a link.
    if target_link is None:
        target_link = sfblic_link

    set_of_reference_material = [
        s for s in sets_of_reference_material if s['improvement_method'] == improvement_method
    ][0]['set_of_reference_material']

    # Positions are counted from 1, hence start=1
    for link_number, piece_of_reference_material in enumerate(set_of_reference_material, start=1):
        if piece_of_reference_material['link'] == target_link:
            return link_number

    # Link not found in the selected set
    return None
            

def get_writer_citer_response(improvement_method, responses=None):
    """
    Return the text of the writer-citer response for one improvement method.

    Parameters
    ----------
    improvement_method : int
        Selects the response whose 'improvement_method' equals this value.
        The first matching response is used.
    responses : list of dict, optional
        Responses to search. Defaults to the notebook-level
        `writer_citer_model_responses`, which preserves the behaviour of
        existing one-argument calls.

    Returns
    -------
    The value stored at ['choices'][0]['message']['content'] of the matching
    response.

    Raises
    ------
    IndexError
        If no response has the requested improvement method.
    """

    # Resolve the default lazily so the global is only needed when the caller
    # does not supply the responses.
    if responses is None:
        responses = writer_citer_model_responses

    matching_responses = [
        r for r in responses if r['improvement_method'] == improvement_method
    ]

    return matching_responses[0]['choices'][0]['message']['content']

    

In [19]:
# Reload the stored reference sets and writer-citer responses for this trial
trial_file_name = f"{trial_number}.json"
sets_of_reference_material = storage_manager.load_from_folder("SetsOfReferenceMaterial", trial_file_name)
writer_citer_model_responses = storage_manager.load_from_folder("WriterCiterModelResponses", trial_file_name)

Data loaded from ../data/SetsOfReferenceMaterial/14.json
Data loaded from ../data/WriterCiterModelResponses/14.json


In [21]:
# Look up the SFBLIC position in the unmodified set (improvement method 0).
# The method is given explicitly: the previous call passed whatever value
# `improvement_method` happened to hold after an earlier cell ran.
sfblic_link_number = get_sfblic_link_number(sets_of_reference_material, 0)

# Stop here with a clear message instead of letting the scoring cell fail on
# `None - 1`.
if sfblic_link_number is None:
    raise ValueError(f"{sfblic_link} is not present in the reference material.")

sfblic_link_number

5

In [28]:
# One record of citation scores per improvement method
records = []

for improvement_method in [0, 1]:

    text = get_writer_citer_response(improvement_method)
    parsed_data = parse_citations_from_text(text)

    # Use the configured K rather than repeating the literal 5, so that the
    # scoring follows the link count when K is changed.
    # NOTE(review): assumes `num_citations` is the number of reference links
    # -- confirm against compute_citation_scores.
    scores = compute_citation_scores(parsed_data, num_citations=K, normalize=False)
    normalized_scores = compute_citation_scores(parsed_data, num_citations=K, normalize=True)

    # `sfblic_link_number` counts from 1; the score lists are indexed from 0
    sfblic_index = sfblic_link_number - 1
    sfblic_score = scores[sfblic_index]
    normalized_sfblic_score = normalized_scores[sfblic_index]

    record = {
        "improvement_method" : improvement_method,
        "scores" : scores,
        "normalized_scores" : normalized_scores,
        "sfblic_score" : sfblic_score,
        "normalized_sfblic_score" : normalized_sfblic_score,
    }

    records.append(record)

records

[{'improvement_method': 0,
  'scores': [38.74382565633974,
   5.702131338157356,
   5.702131338157356,
   21.85886949896214,
   0],
  'normalized_scores': [0.5380566937286749,
   0.07918861607084422,
   0.07918861607084422,
   0.3035660741296366,
   0.0],
  'sfblic_score': 0,
  'normalized_sfblic_score': 0.0},
 {'improvement_method': 1,
  'scores': [22.957612653305553,
   5.874499914683438,
   5.874499914683438,
   12.42114395741385,
   50.553631779166764],
  'normalized_scores': [0.23502545440668296,
   0.06013939832117959,
   0.06013939832117959,
   0.12715978124239677,
   0.5175359677085611],
  'sfblic_score': 50.553631779166764,
  'normalized_sfblic_score': 0.5175359677085611}]

In [29]:
# Persist the visibility score records for this trial
storage_manager.save_to_folder(
    "VisibilityScores",
    records,
    f"{trial_number}.json",
)

Data saved to ../data/VisibilityScores/14.json
