In [1]:
from dotenv import dotenv_values
import requests
import sys


"""
Configuration
"""

# Project locations and the SFBLIC homepage URL that later cells look for
# in the search results and reference material.
path_to_src_folder = "../src"
sfblic_link = "https://www.sfbli.com/"
path_to_clean_corpus = "../data/clean_content.txt"

# Load environment variables
config = dotenv_values("../.env")
google_api_key = config['GOOGLE_API_KEY']
search_engine_id = config['SEARCH_ENGINE_ID']

# Make the helpers package importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if path_to_src_folder not in sys.path:
    sys.path.append(path_to_src_folder)

from helpers.google_interactor import GoogleInteractor
from helpers.html_parser import HTMLParser 
from helpers.storage_manager import StorageManager
from helpers.refiner_model import RefinerModel
from helpers.improver_model import ImproverModel
from helpers.writer_citer_model import WriterCiterModel
from helpers.visibility import parse_citations_from_text, compute_citation_scores

"""
============================================
Run through a single trial of the experiment
============================================
"""

"""
Set up
"""

# Number of links to consider
K = 5

# Declare trial number
trial_number = 1

# Select a query to use for the current trial
query = "Where is Southern Farm Bureau Life Insurance Company located?"

# The four model helpers below are all built with the same key from .env,
# so look it up once instead of repeating the dictionary access.
open_ai_api_key = config['OPEN_AI_API_KEY']

# Instantiate helpers
google_interactor = GoogleInteractor(api_key=google_api_key, search_engine_id=search_engine_id)
storage_manager = StorageManager("../data/")
website_refiner = RefinerModel(api_key=open_ai_api_key)
corpus_refiner = RefinerModel(api_key=open_ai_api_key)
improver_model = ImproverModel(api_key=open_ai_api_key)
writer_citer = WriterCiterModel(api_key=open_ai_api_key)

# Load the clean corpus text.
# The encoding is explicit so the file decodes the same way on every platform
# instead of depending on the locale default.
# NOTE(review): assumes the corpus file was written as UTF-8 - confirm.
with open(path_to_clean_corpus, encoding="utf-8") as f:
    corpus_text = f.read()

In [2]:
"""
Submit the query to Google and store the results
"""

# Ask Google for results matching the trial query
search_results = google_interactor.search_google(
    num_results=10,
    query=query,
)

# Record the query that was submitted for this trial
storage_manager.save_to_folder(
    "SubmittedQueries",
    {"query": query},
    f"{trial_number}.json",
)

# Record the search response exactly as it was returned
storage_manager.save_to_folder(
    "InitialGoogleResponses",
    search_results,
    f"{trial_number}.json",
)

Data saved to ../data/SubmittedQueries/1.json
Data saved to ../data/InitialGoogleResponses/1.json


In [3]:
"""
View links
"""

# Pull the URL out of every returned search item, then keep the first K
all_result_links = [item['link'] for item in search_results['items']]
top_k_links = all_result_links[:K]
top_k_links

['https://www.sfbli.com/',
 'https://w3.sfbcic.com/',
 'https://www.sfbli.com/loginregister',
 'https://www.linkedin.com/company/southern-farm-bureau',
 'https://www.sfbli.com/contactus']

In [4]:
"""
Check to see if SFBLIC is in the top K links.
If not, put it in the last of the K slots
"""

if sfblic_link in top_k_links:
    # Copy so the list displayed by the previous cell is never aliased
    updated_links = list(top_k_links)
else:
    # Keep at most the first K-1 results and give SFBLIC the final slot.
    # Building a new list (instead of assigning to index K-1) also works when
    # the search returned fewer than K links, and leaves top_k_links untouched.
    updated_links = top_k_links[:K - 1] + [sfblic_link]

updated_links

['https://www.sfbli.com/',
 'https://w3.sfbcic.com/',
 'https://www.sfbli.com/loginregister',
 'https://www.linkedin.com/company/southern-farm-bureau',
 'https://www.sfbli.com/contactus']

In [5]:
# Persist the final list of links under this trial's file name
storage_manager.save_to_folder("UpdatedGoogleResponses", updated_links, f"{trial_number}.json")

Data saved to ../data/UpdatedGoogleResponses/1.json


In [6]:
"""
Get Raw Text from the HTML associated with the links from Google
"""

# Retrieve the stored response
updated_links = storage_manager.load_from_folder("UpdatedGoogleResponses", f"{trial_number}.json")

# Upper bound (seconds) on each page fetch. requests applies no timeout by
# default, so a single unresponsive site could otherwise stall the cell.
request_timeout_seconds = 30

raw_text_from_html = []

# For each link that was stored ...
for link in updated_links:

    try:

        # Get the HTML associated with the link
        response = requests.get(link, timeout=request_timeout_seconds)

        # Get the raw text associated with the HTML
        parsed_text = HTMLParser.parse_html(response.content)

        # Store the raw text from the HTML
        raw_text_from_html.append({"link" : link, "raw_text" : parsed_text})

    except Exception as error:
        # Best effort: a link that cannot be fetched or parsed is reported and
        # skipped. Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit still stop the cell, and the error is shown so the
        # failure can be diagnosed.
        print("Something happened with ", link, "-", repr(error))

storage_manager.save_to_folder("RawTextFromHTML", raw_text_from_html, f"{trial_number}.json")

Data loaded from ../data/UpdatedGoogleResponses/1.json
Data saved to ../data/RawTextFromHTML/1.json


In [7]:
"""
Get the refined text from the raw text
"""

# Reload the raw page text that was saved for this trial
raw_text_from_html = storage_manager.load_from_folder("RawTextFromHTML", f"{trial_number}.json")

website_refiner_responses = []

# Refine each page's raw text against the query
for current_item in raw_text_from_html:

    link, raw_text = current_item['link'], current_item['raw_text']

    response_from_website_refiner = website_refiner.refine_text(raw_text=raw_text, query=query)

    # A falsy response is reported and left out of the saved list
    if not response_from_website_refiner:
        print("Failed to refine text.")
        continue

    # Tag the response with the page it came from before keeping it
    response_from_website_refiner['link'] = link
    website_refiner_responses.append(response_from_website_refiner)

storage_manager.save_to_folder("WebsiteRefinerModelResponses", website_refiner_responses, f"{trial_number}.json")

Data loaded from ../data/RawTextFromHTML/1.json
Data saved to ../data/WebsiteRefinerModelResponses/1.json


In [8]:
# Show, link by link, the text the website refiner returned (first choice only)
for item in website_refiner_responses:
    refined_content = item['choices'][0]['message']['content']
    # One call emits: the link, the content, then an empty separator line
    print(item['link'], refined_content, "", sep="\n")

https://www.sfbli.com/
Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, MS 39213.

https://w3.sfbcic.com/
THERE IS NO RELEVANT INFORMATION IN THE RAW TEXT

https://www.sfbli.com/loginregister
Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, MS 39213.

https://www.linkedin.com/company/southern-farm-bureau
THERE IS NO RELEVANT INFORMATION IN THE RAW TEXT

https://www.sfbli.com/contactus
Southern Farm Bureau Life Insurance Company is located at 1401 Livingstone Lane, Jackson, MS 39213.



In [9]:
"""
Filter the corpus to information that may be relevant to the query
"""

# Run the corpus refiner over the clean corpus text for the trial query
response_from_corpus_refiner = corpus_refiner.refine_text(
    query=query,
    raw_text=corpus_text,
)

# Persist the refiner's response under this trial's file name
storage_manager.save_to_folder(
    "CorpusRefinerModelResponses",
    response_from_corpus_refiner,
    f"{trial_number}.json",
)

Data saved to ../data/CorpusRefinerModelResponses/1.json


In [10]:
# Show the message content of the corpus refiner's first choice
print(response_from_corpus_refiner['choices'][0]['message']['content'])

Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, MS 39213.


In [11]:
"""
Improve the text from the SFBLIC site
"""

# Get the refined text from the website
website_refiner_responses = storage_manager.load_from_folder("WebsiteRefinerModelResponses", f"{trial_number}.json")

# Match on the configured sfblic_link rather than repeating the URL literal,
# so this lookup cannot drift from the rest of the notebook.
sfblic_response = next(
    (response for response in website_refiner_responses if response['link'] == sfblic_link),
    None,
)

# Without this check a missing entry would surface as a bare IndexError.
if sfblic_response is None:
    raise ValueError(
        f"No refined website response found for {sfblic_link}; "
        "the page may have failed to download or to be refined."
    )

refined_text_from_website = sfblic_response['choices'][0]['message']['content']
refined_text_from_website

Data loaded from ../data/WebsiteRefinerModelResponses/1.json


'Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, MS 39213.'

In [12]:
# Reload the corpus refiner's saved response for this trial
response_from_corpus_refiner = storage_manager.load_from_folder(
    "CorpusRefinerModelResponses",
    f"{trial_number}.json",
)

# The refined text is the message content of the first choice
first_corpus_choice = response_from_corpus_refiner['choices'][0]
refined_text_from_corpus = first_corpus_choice['message']['content']
print(refined_text_from_corpus)

Data loaded from ../data/CorpusRefinerModelResponses/1.json
Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, MS 39213.


In [13]:
# Generate improved text
improved_text = improver_model.improve_text(
    query=query,
    refined_text_from_website=refined_text_from_website,
    refined_text_from_corpus=refined_text_from_corpus
)

if improved_text:
    print("Improved Text generated...")

    # Save to disk
    storage_manager.save_to_folder("ImproverModelResponses", improved_text, f"{trial_number}.json")

else:
    # Do not persist a failed (falsy) response: writing it would store
    # unusable data under this trial's file name.
    print("Failed to improve text.")

Improved Text generated...
Data saved to ../data/ImproverModelResponses/1.json


In [14]:
# Show the message content of the improver model's first choice
print(improved_text['choices'][0]['message']['content'])

Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, Mississippi, 39213.


In [15]:
"""
Create multiple sets of reference material
"""

# Baseline reference material: every link paired with the website refiner's
# original (unimproved) text for it

website_refiner_model_responses = storage_manager.load_from_folder(
    "WebsiteRefinerModelResponses",
    f"{trial_number}.json",
)

unmodified_set_of_reference_material = [
    dict(
        link=item['link'],
        information=item['choices'][0]['message']['content'],
    )
    for item in website_refiner_model_responses
]

unmodified_set_of_reference_material

Data loaded from ../data/WebsiteRefinerModelResponses/1.json


[{'link': 'https://www.sfbli.com/',
  'information': 'Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, MS 39213.'},
 {'link': 'https://w3.sfbcic.com/',
  'information': 'THERE IS NO RELEVANT INFORMATION IN THE RAW TEXT'},
 {'link': 'https://www.sfbli.com/loginregister',
  'information': 'Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, MS 39213.'},
 {'link': 'https://www.linkedin.com/company/southern-farm-bureau',
  'information': 'THERE IS NO RELEVANT INFORMATION IN THE RAW TEXT'},
 {'link': 'https://www.sfbli.com/contactus',
  'information': 'Southern Farm Bureau Life Insurance Company is located at 1401 Livingstone Lane, Jackson, MS 39213.'}]

In [16]:

# Reload the improver model's response and take its first choice as the improved text
improved_text = storage_manager.load_from_folder('ImproverModelResponses', f"{trial_number}.json")
improved_version = improved_text['choices'][0]['message']['content']


# Build the second set of reference material: identical to the unmodified set,
# except that the SFBLIC entry carries the improved text

new_set_of_reference_material = [
    {"link" : item['link'], "information" : improved_version}
    if item['link'] == sfblic_link
    else item
    for item in unmodified_set_of_reference_material
]

new_set_of_reference_material

Data loaded from ../data/ImproverModelResponses/1.json


[{'link': 'https://www.sfbli.com/',
  'information': 'Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, Mississippi, 39213.'},
 {'link': 'https://w3.sfbcic.com/',
  'information': 'THERE IS NO RELEVANT INFORMATION IN THE RAW TEXT'},
 {'link': 'https://www.sfbli.com/loginregister',
  'information': 'Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, MS 39213.'},
 {'link': 'https://www.linkedin.com/company/southern-farm-bureau',
  'information': 'THERE IS NO RELEVANT INFORMATION IN THE RAW TEXT'},
 {'link': 'https://www.sfbli.com/contactus',
  'information': 'Southern Farm Bureau Life Insurance Company is located at 1401 Livingstone Lane, Jackson, MS 39213.'}]

In [17]:
"""
Store Sets of Reference Material
"""

sets_of_reference_material = []

# Method 0 is the unmodified material, method 1 the material with the improved
# SFBLIC text; each is stored together with its method number.
for improvement_method, set_of_reference_material in (
    (0, unmodified_set_of_reference_material),
    (1, new_set_of_reference_material),
):
    sets_of_reference_material.append({
        "improvement_method" : improvement_method,
        "set_of_reference_material" : set_of_reference_material,
    })


storage_manager.save_to_folder("SetsOfReferenceMaterial", sets_of_reference_material, f"{trial_number}.json")

Data saved to ../data/SetsOfReferenceMaterial/1.json


In [18]:
"""
Send sets of reference material to the writer-citer
"""

writer_citer_model_responses = []

sets_of_reference_material = storage_manager.load_from_folder("SetsOfReferenceMaterial", f"{trial_number}.json")

# Send every stored set (unimproved first, then improved) through the same
# steps, instead of repeating the block once per improvement method.
for set_of_reference_material in sets_of_reference_material:

    improvement_method = set_of_reference_material['improvement_method']
    reference_material = set_of_reference_material['set_of_reference_material']

    # Generate a cited response
    cited_response = writer_citer.generate_cited_response(
        rephrased_query=query,
        set_of_reference_query=reference_material
    )

    # NOTE(review): the other model helpers in this notebook are checked for a
    # falsy result; presumably this one can fail the same way - confirm.
    # Failing here gives a clear message instead of a TypeError on the tagging line.
    if not cited_response:
        raise RuntimeError(
            f"Writer-citer returned no response for improvement method {improvement_method}."
        )

    # Tag the response so it can be matched back to its reference set later
    cited_response['improvement_method'] = improvement_method

    writer_citer_model_responses.append(cited_response)


Data loaded from ../data/SetsOfReferenceMaterial/1.json


In [19]:
# Show each cited answer (first choice only), followed by a separator line
for item in writer_citer_model_responses:
    cited_text = item['choices'][0]['message']['content']
    print(cited_text, "---", sep="\n")

Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, MS 39213 [1][3][5].
---
Southern Farm Bureau Life Insurance Company is located at 1401 Livingston Lane, Jackson, Mississippi, 39213 [1][3][5].
---


In [20]:
# Save the tagged writer-citer responses to disk under this trial's file name
storage_manager.save_to_folder("WriterCiterModelResponses", writer_citer_model_responses, f"{trial_number}.json")

Data saved to ../data/WriterCiterModelResponses/1.json


In [21]:
"""
Compute visibility scores
"""


# Helpers for computing visibility scores

def get_sfblic_link_number(sets_of_reference_material, improvement_method):
    """
    Return the 1-based position of the SFBLIC link in one set of reference material.

    The set is the first entry of `sets_of_reference_material` whose
    'improvement_method' equals `improvement_method`; an IndexError is raised
    when no entry matches. Returns None when no item in that set has a 'link'
    equal to the module-level `sfblic_link`.
    """

    matching_sets = [
        candidate for candidate in sets_of_reference_material
        if candidate['improvement_method'] == improvement_method
    ]
    reference_entries = matching_sets[0]['set_of_reference_material']

    # Positions are counted from 1
    for position, entry in enumerate(reference_entries, start=1):
        if entry['link'] == sfblic_link:
            return position

    return None
            

def get_writer_citer_response(improvement_method):
    """
    Return the message content of the first choice of the writer-citer response
    tagged with `improvement_method`.

    Reads the module-level `writer_citer_model_responses`; an IndexError is
    raised when no response carries that method.
    """
    matching_responses = [
        response for response in writer_citer_model_responses
        if response['improvement_method'] == improvement_method
    ]
    first_choice = matching_responses[0]['choices'][0]
    return first_choice['message']['content']

# Load data to compute visibility scores

sets_of_reference_material = storage_manager.load_from_folder("SetsOfReferenceMaterial", f"{trial_number}.json")
writer_citer_model_responses = storage_manager.load_from_folder("WriterCiterModelResponses", f"{trial_number}.json")

# Compute visibility scores
# -------------------------

visibility_score_records = []

for improvement_method in [0, 1]:

    # Look up the SFBLIC position for THIS method's reference set. Doing it
    # inside the loop removes the dependence on whatever value
    # `improvement_method` happened to hold from an earlier cell.
    sfblic_link_number = get_sfblic_link_number(sets_of_reference_material, improvement_method)

    # The helper returns None when the link is absent; fail with a clear
    # message rather than a TypeError on `None - 1` below.
    if sfblic_link_number is None:
        raise ValueError(
            f"{sfblic_link} is not in the reference material for improvement method {improvement_method}."
        )

    # Link numbers are 1-based; the score lists are indexed from 0
    sfblic_index = sfblic_link_number - 1

    text = get_writer_citer_response(improvement_method)
    parsed_data = parse_citations_from_text(text)

    # K (the number of links considered) replaces the previously hard-coded 5.
    # NOTE(review): assumes num_citations is the number of reference links - confirm.
    scores = compute_citation_scores(parsed_data, num_citations=K, normalize=False)
    normalized_scores = compute_citation_scores(parsed_data, num_citations=K, normalize=True)
    sfblic_score = scores[sfblic_index]
    normalized_sfblic_score = normalized_scores[sfblic_index]

    record = {
        "improvement_method" : improvement_method,
        "scores" : scores,
        "normalized_scores" : normalized_scores,
        "sfblic_score" : sfblic_score,
        "normalized_sfblic_score" : normalized_sfblic_score,
    }

    visibility_score_records.append(record)

# Save to disk
storage_manager.save_to_folder("VisibilityScores", visibility_score_records, f"{trial_number}.json")

visibility_score_records

Data loaded from ../data/SetsOfReferenceMaterial/1.json
Data loaded from ../data/WriterCiterModelResponses/1.json
Data saved to ../data/VisibilityScores/1.json


[{'improvement_method': 0,
  'scores': [4.0, 0, 4.0, 0, 4.0],
  'normalized_scores': [0.3333333333333333,
   0.0,
   0.3333333333333333,
   0.0,
   0.3333333333333333],
  'sfblic_score': 4.0,
  'normalized_sfblic_score': 0.3333333333333333},
 {'improvement_method': 1,
  'scores': [4.333333333333333, 0, 4.333333333333333, 0, 4.333333333333333],
  'normalized_scores': [0.3333333333333333,
   0.0,
   0.3333333333333333,
   0.0,
   0.3333333333333333],
  'sfblic_score': 4.333333333333333,
  'normalized_sfblic_score': 0.3333333333333333}]