In [1]:
import pandas as pd
import os
from openai import OpenAI

In [2]:
# Load the pre-computed statute embeddings table from parquet.
# Later cells read the string-named numeric columns ("0", "1", ...) as the
# vector components and the "id", "title", "section" and "content" columns
# as the statute record.
embeddings = pd.read_parquet("StandUp_Embeddings.parquet")

print("Shape:", embeddings.shape)
print("Columns:", embeddings.columns.tolist())

# `head` is a method: it has to be called. Printing the bare attribute shows
# the bound-method repr rather than a preview of the first rows.
print(embeddings.head())

# Downstream cells access the table through the name `embeddings_df`.
embeddings_df = pd.DataFrame(embeddings)

Shape: (1230, 1540)
Columns: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '1

In [3]:
# Pull the column-wise inputs for the Chroma collection out of embeddings_df.

# Record ids are cast to str before being handed to Chroma.
ids = embeddings_df["id"].astype(str).to_list()

titles = embeddings_df["title"].to_list()

contents = embeddings_df["content"].to_list()

# The vector components live in columns named "0" .. "1535".
vector_cols = list(map(str, range(1536)))

# One list of floats per row, in the same row order as `ids`.
embeddings = embeddings_df.loc[:, vector_cols].to_numpy().tolist()

# One {"title", "section", "content"} dict per row.
metadata_columns = ["title", "section", "content"]
metadatas = embeddings_df.loc[:, metadata_columns].to_dict(orient="records")


In [4]:
# Vector storage: a Chroma client created with default settings, plus the
# "StandUp" collection (fetched if it already exists, created otherwise).

import chromadb

client = chromadb.Client()
collection = client.get_or_create_collection(name="StandUp")


In [5]:
# Write every statute into the collection: id, vector, metadata dict and the
# statute text as the document.
#
# `upsert` is used instead of `add` so this cell can be re-run safely. The
# collection is obtained with get_or_create_collection, so on a re-run the ids
# are already present; `upsert` inserts missing ids and overwrites existing
# ones, which keeps embeddings/metadatas/documents in sync with the lists.
# NOTE(review): if metadata ever appeared to be "missing" after re-adding the
# same ids with `add`, stale records are a likely cause - confirm.
collection.upsert(
    ids=ids,
    embeddings=embeddings,
    metadatas=metadatas,
    documents=contents,
)

In [18]:

# The OpenAI client gets its own name. `embed_query` looks its client up at
# call time, and the Chroma cell also binds the name `client`; re-running that
# cell would otherwise make `embed_query` call the wrong object.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Kept so that code written against the old name still sees the OpenAI client
# after this cell has run.
client = openai_client

# Query vectors must come from the same embedding model as the stored vectors.
# NOTE(review): presumably the parquet embeddings were produced with this
# model - confirm before changing it.
EMBEDDING_MODEL = "text-embedding-ada-002"


def embed_query(text: str, model: str = EMBEDDING_MODEL) -> list[float]:
    """Return the embedding vector for a single query string.

    Args:
        text: The query to embed. Must be a non-empty string.
        model: Name of the OpenAI embedding model to use.

    Returns:
        The embedding of ``text`` as returned by the embeddings endpoint
        (first and only item of the response).

    Raises:
        ValueError: If ``text`` is empty, so no request is sent for an input
            the service cannot embed.
    """
    if not text:
        raise ValueError("text must be a non-empty string")
    resp = openai_client.embeddings.create(
        model=model,
        input=[text],
    )
    return resp.data[0].embedding


test_query = "My landlord was served an improvement notice, what does this mean and when does it start?"

query_vector = embed_query(test_query)



In [19]:
# Preview the first five rows (vector columns followed by id/title/section/content).
embeddings_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1530,1531,1532,1533,1534,1535,id,title,section,content
0,-0.0178,-0.005056,0.023964,-0.033802,0.003952,0.014686,-0.015012,-0.002605,-0.035575,-0.032056,...,-0.013539,0.00905,-0.026492,0.018856,-0.030362,0.005506,274,Equality Act 2010,s.13(1),(1)A person (A) discriminates against another ...
1,-0.020304,-0.015969,0.019388,-0.014705,-0.001068,0.007295,-0.015892,-0.007185,-0.030339,-0.0222,...,-0.012867,-0.001465,-0.033435,0.002338,-0.022793,-0.010397,275,Equality Act 2010,s.15(1),A person (A) discriminates against a disabled ...
2,-0.016071,-0.007583,-0.001029,-0.010051,-0.006182,0.0158,-0.020347,0.001606,-0.018771,-0.006905,...,-0.009476,-0.007448,0.000861,0.021096,-0.033718,0.002706,276,Equality Act 2010,s.20(1),Where this Act imposes a duty to make reasonab...
3,0.004635,0.004765,0.012514,-0.005445,0.01318,0.028719,-0.01701,-0.02054,-0.020313,-0.028958,...,-0.00995,-0.028425,-0.021326,0.011569,-0.032555,0.003145,277,Equality Act 2010,s.20(2),The duty comprises the following three require...
4,-0.003886,0.003766,0.000147,-0.02297,0.01977,0.010328,-0.015002,-0.035973,0.008247,-0.01399,...,-0.016013,-0.039869,-0.017835,0.008329,-0.013977,-0.00678,278,Equality Act 2010,s.20(6),Where the first or third requirement relates t...


In [20]:
# The million dollar question! Will the program return the relevant laws and statutes?

# Ask Chroma for the 10 stored statutes closest to the query vector.
results = collection.query(
    query_embeddings=[query_vector],
    n_results=10,
)

# Title/section are looked up from the source table by statute id instead of
# being read from the Chroma metadata, which did not come back correctly
# (the title and section were always those of the first row of the dataset).
#
# Only the four needed columns are iterated. iterrows() would materialise a
# Series spanning every embedding column for each row just to read these
# fields. Ids are cast with astype(str), the same way the Chroma ids were
# built. For a repeated id the last row wins.
meta_lookup = {
    statute_id: {"title": title, "section": section, "content": content}
    for statute_id, title, section, content in zip(
        embeddings_df["id"].astype(str),
        embeddings_df["title"],
        embeddings_df["section"],
        embeddings_df["content"],
    )
}

# A single query vector was sent, so the hits are the first (only) entry of
# each result list.
matched_ids = results["ids"][0]
matched_docs = results["documents"][0]
distances = results["distances"][0]

for sid, docs, dist in zip(matched_ids, matched_docs, distances):
    data = meta_lookup[sid]
    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is a SyntaxError on Python versions before 3.12.
    print(f"Statute ID: {sid}\n Title and Section: {data['title']} {data['section']} \n Statute Content: {docs} \n Distance: {dist}")



Statute ID: 613
 Title and Section: Housing Act 1988 s.14(3) 
 Statute Content: For the purposes of subsection (2)(b) above, in relation to a notice which is referred by a tenant as mentioned in subsection (1) above, an improvement is a relevant improvement if either it was carried out during the tenancy to which the notice relates or the following conditions are satisfied, namelyâ (a) that it was carried out not more than twenty-one years before the date of service of the notice; and (b) that, at all times during the period beginning when the improvement was carried out and ending on the date of service of the notice, the dwelling-house has been let under an assured tenancy; and (c) that, on the coming to an end of an assured tenancy at any time during that period, the tenant (or, in the case of joint tenants, at least one of them) did not quit. 
 Distance: 0.26659783720970154
Statute ID: 730
 Title and Section: Housing Act 2004 s.15(2) 
 Statute Content: The general rule is that an

In [21]:
# Collect the retrieved statutes into parallel lists (one entry per hit) so
# they can be written to a CSV for evaluation.

# Identifier of the query these results belong to.
query_id = "6"

# (statute id as str, document text, metadata record) for every hit.
hits = [
    (str(hit_id), hit_doc, meta_lookup[str(hit_id)])
    for hit_id, hit_doc in zip(matched_ids, matched_docs)
]

query_id_list = [query_id for _ in hits]
statute_id_list = [hit_id for hit_id, _, _ in hits]
# A single space stands in for a missing title/section.
title_list = [meta.get("title", " ") for _, _, meta in hits]
section_list = [meta.get("section", " ") for _, _, meta in hits]
content_list = [hit_doc for _, hit_doc, _ in hits]
    

In [22]:
# Assemble the per-hit lists into one evaluation table, one row per retrieved
# statute. The lists are filled in lockstep, so they all have the same length.
eval_df_6 = pd.DataFrame(
    {
        "query_id": query_id_list,
        "statute_id": statute_id_list,
        "title_list": title_list,
        "section_list": section_list,
        "content_list": content_list,
    }
)



In [23]:
# Persist the evaluation table for this query as a CSV file (the row index is
# written as well, since to_csv is called with its defaults).
df_6 = pd.DataFrame(data=eval_df_6)

df_6.to_csv(path_or_buf="query_6.csv")