In [None]:
"""
Suggest corrections for addresses that are misspelled or mistyped. Supply the
CFS address and, optionally, the victim's address; the nearest known addresses
are retrieved and an LLM picks the most likely correct one.
"""
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

import faiss
import json

from openai import OpenAI
import gradio as gr
import os
from dotenv import load_dotenv
from typing import List, Tuple
import nltk
from transformers import AutoTokenizer, AutoModel

# Additional imports for conversational memory.
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
import warnings
warnings.filterwarnings("ignore")


# Make sure NLTK's sentence tokenizer is available
nltk.download('punkt')

# Read the OpenAI key from the local env file so it is never hard-coded here.
load_dotenv("inc/api_keys.env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)
OPENAI_MODEL = "gpt-3.5-turbo"

# Initialize the chat model that is asked to pick the best candidate address.
llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=OPENAI_MODEL, temperature=0.3)

# Load a pre-trained model for text embeddings
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

try:
    print("Loading FAISS files...")
    # Load index from disk
    index = faiss.read_index("resources/canton_addresses.faiss")

    # Load the original texts from the JSON file. JSON object keys are always
    # strings, so convert them back to the integer row ids used by the index.
    with open("resources/text_mapping.json", "r") as f:
        text_data = {int(k): v for k, v in json.load(f).items()}
except Exception as exc:
    # Best-effort cache: any failure to load (missing or unreadable files)
    # falls back to rebuilding. `Exception` is caught instead of a bare
    # `except:` so that Ctrl-C / kernel interrupts still propagate, and the
    # reason is printed so real problems are not silently hidden.
    print("Files not located.\n Creating FAISS files...")
    print(f"(cache load failed with {type(exc).__name__}: {exc})")

    # Prep and train model for only Canton addresses
    df_addresses = pd.read_excel("resources/lib_address.xlsx")
    address = df_addresses.loc[df_addresses['city'] == "CANTON"]['full_address'].tolist()

    # `show_progress_bar` is the documented switch for silencing encode();
    # `verbose` is not an encode() parameter.
    embeddings = model.encode(address, show_progress_bar=False)

    # Set the dimension to the embedding size
    dimension = embeddings.shape[1]

    # Create a FAISS index (Flat L2 distance)
    index = faiss.IndexFlatL2(dimension)

    # Add the embeddings to the FAISS index
    index.add(np.array(embeddings).astype('float32'))

    # Store the original text, keyed by its row position in the index
    text_data = {i: text for i, text in enumerate(address)}

    # Save index to disk
    faiss.write_index(index, "resources/canton_addresses.faiss")

    # Save the original texts in a JSON file
    with open("resources/text_mapping.json", "w") as f:
        json.dump(text_data, f)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\James\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Loading FAISS files...


In [2]:
def address_suggestion(address, model, llm, k=3, vic_address=None):
    """Suggest the most likely correct address for a possibly mistyped one.

    The address is embedded with ``model``, the ``k`` nearest stored addresses
    are looked up in the module-level FAISS ``index`` (resolved to text through
    the module-level ``text_data`` mapping), and ``llm`` is asked to choose the
    best match among them.

    Args:
        address: The address as it was entered.
        model: Object with an ``encode(list_of_str)`` method returning the
            embedding(s) for the given texts.
        llm: Object with an ``invoke(prompt)`` method whose result exposes the
            reply text as ``.content``.
        k: Number of nearest candidates to retrieve.
        vic_address: Optional address on file for the person involved; when
            given, it is included in the prompt as an additional hint.

    Returns:
        The text of the LLM reply, or ``'Needs to be added'`` when the index
        returns no candidates at all.
    """
    # Vectorize the query text
    query_vector = model.encode([address])

    # Retrieve the k nearest stored addresses.
    _, indices = index.search(np.array(query_vector).astype('float32'), k)

    # FAISS pads the result with -1 when fewer than k neighbours exist; skip
    # those slots instead of failing on a missing text_data key.
    candidates = [text_data[int(i)] for i in indices[0] if int(i) >= 0]
    if not candidates:
        # Nothing to compare against, so there is no point calling the LLM.
        return 'Needs to be added'
    candidate_list = ", ".join(candidates)

    if vic_address is not None:
        prompt = f"""Given an address'{address}' with the person's address 
        listed as '{vic_address}' which may or may not be incorrect, compare it 
        to the list of addresses '{candidate_list}' and identify the closest match 
        based on the street name, directional quadrant, house number, and the 
        person's listed address. Prioritize matches where the house number is 
        closest, and consider slight data entry errors such as extra digits or 
        misnumbering. Then return what you believe to be the correct address 
        without explaination.If there are no matches to the list provided, reply with 
        'Needs to be added'."""
    else:
        prompt = f"""Given an address'{address}', compare it 
        to the list of addresses '{candidate_list}' and identify the closest match 
        based on the street name, directional quadrant, and house number. 
        Prioritize matches where the house number is closest, and consider 
        slight data entry errors such as extra digits or incorrect directional 
        quadrant. Then return what you believe to be the correct address 
        without explaination. If there are no matches to the list provided, reply with 
        'Needs to be added'."""

    result = llm.invoke(prompt)
    return result.content
    
    

In [None]:
# Demo: the officer-entered address has an extra digit and the wrong quadrant;
# the victim's address on file is passed along as a hint for the LLM.
address = "60189 Arlington AVE NE"
vic_address = "609 Arlington Ave NW"
result = address_suggestion(
    address=address,
    model=model,
    llm=llm,
    k=3,
    vic_address=vic_address,
)
print(result)

609 ARLINGTON AVE NW


In [None]:
# Demo: a misspelled street with no victim address available as a hint.
address = "1401 31STT RD NE"
vic_address = None
result = address_suggestion(
    address=address,
    model=model,
    llm=llm,
    k=3,
    vic_address=vic_address,
)
print(result)

1401 31ST ST NE


In [None]:
# Step-by-step version of the lookup: find the stored address nearest to one
# that was entered incorrectly.
query_text = "6018 Arlington Ave NE"

# Embed the query text.
query_vector = model.encode([query_text])

# Number of nearest neighbours to retrieve (raise it to get more matches).
k = 3
distances, indices = index.search(np.asarray(query_vector).astype("float32"), k)

# First entry of the single result row is the closest match.
closest_index = indices[0, 0]

# Map the index position back to the stored address text.
retrieved_text = text_data[closest_index]
print(f"Closest text: {retrieved_text}")

Closest text: 609 ARLINGTON AVE NW


In [None]:
# Collect all k retrieved addresses into one comma-separated string so several
# close matches are available to work from.
string = ", ".join([text_data[indices[0][pos]] for pos in range(k)])
print(string)

609 ARLINGTON AVE NW, 615 ARLINGTON AVE NW, 8322 ARLINGTON AVE NW


In [None]:
# Using a detailed prompt to get the LLM to determine the best match.
# The victim's address is set explicitly here: an earlier cell rebinds
# `vic_address` to None, which on a top-to-bottom run would otherwise put the
# literal text 'None' into the prompt as the person's listed address.
vic_address = '609 Arlington Ave NW'

prompt = f"""Given an address'{query_text}' with the person's address listed as 
'{vic_address}' which may or may not be incorrect, compare it to the list of addresses '{string}' 
and identify the closest match based on the street name, directional quadrant, 
house number, and the person's listed address. Prioritize matches where the 
house number is closest, and consider slight data entry errors such as extra 
digits or misnumbering. Then return what you believe to be the correct address without explaination."""

result = llm.invoke(prompt)
print(result.content)

609 ARLINGTON AVE NW
