In [1]:
import os

import polars as pl
from neo4j import GraphDatabase
from embedder import Retriever, Gemini_Embeddings

## Database Connections
# Connection settings are read from the environment so credentials do not have
# to live in the notebook. The fallbacks are the previously hard-coded values,
# so behaviour is unchanged when the variables are unset.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "fairusecases"),
)

# Single shared driver used by every later cell in the notebook.
driver = GraphDatabase.driver(URI, auth=AUTH)

In [2]:
# Simple function to verify connection
def verify_connection():
    """Run ``RETURN 1`` against Neo4j and print whether the round trip worked.

    Uses the notebook-level ``driver``. The driver is intentionally left open:
    it is shared with the rest of the notebook, so closing it here would hand
    a closed driver to every later cell.

    Returns:
        None. The outcome is reported with ``print``; any exception is caught
        and printed rather than raised.
    """
    try:
        with driver.session() as session:
            record = session.run("RETURN 1").single()
            # ``single()`` gives None when the query produced no record.
            if record is not None and record[0] == 1:
                print("Connection verified!")
            else:
                print("Connection failed.")
    except Exception as e:
        print(f"Error connecting to Neo4j: {e}")

verify_connection()

Connection verified!


In [3]:
# Build the notebook-wide retriever from a fresh `Gemini_Embeddings` instance
# and the shared Neo4j `driver`.
retriever = Retriever(Gemini_Embeddings(), driver)

In [4]:
# Load the complaints table; its "Complaint" column is the query text used by
# the retrieval cells below.
# NOTE: a later cell rebinds `df` to "./cases.csv" -- re-run this cell before
# re-running the retrieval cells.
df = pl.read_csv("./Data/case_complaints.csv")

In [5]:
def get_retrieved_cases(text, text_sim, court_weight, cit_weight, retriever=retriever, top_k=10):
    """Retrieve similar cases for one text and return them as parallel lists.

    Args:
        text: Query text forwarded to ``retriever.search_similar_cases``.
        text_sim: Value forwarded as the similarity weight.
        court_weight: Value forwarded as the court weight.
        cit_weight: Value forwarded in the last (case-weight) position.
        retriever: Object exposing ``search_similar_cases``. Defaults to the
            notebook-level ``retriever`` as it was bound when this cell ran.
        top_k: Number of cases to request. Defaults to 10, the value that was
            previously hard-coded.

    Returns:
        dict with keys ``"cases"``, ``"text_sim"``, ``"pagerank"`` and
        ``"court"``, each holding the list form of the result columns
        ``"Case"``, ``"TextSimilarity"``, ``"CasePageRank"`` and
        ``"CourtName"`` respectively.
    """
    df_cases = retriever.search_similar_cases(text, top_k, text_sim, court_weight, cit_weight)

    # Output key -> column of the retriever's result frame.
    column_for_key = {
        "cases": "Case",
        "text_sim": "TextSimilarity",
        "pagerank": "CasePageRank",
        "court": "CourtName",
    }
    return {key: df_cases[column].to_list() for key, column in column_for_key.items()}

In [6]:
# Baseline retrieval: weight 1 on text similarity, 0 on the other two weights.
# Each complaint yields a struct of parallel lists, which `unnest` spreads into
# the "cases", "text_sim", "pagerank" and "court" columns.
df_standard_RAG = df.with_columns(
    retrieved=pl.col("Complaint").map_elements(
        lambda complaint: get_retrieved_cases(complaint, 1, 0, 0)
    )
).unnest("retrieved")

In [7]:
# Flatten the parallel list columns to one row per retrieved case, then save.
df_standard_RAG.explode(
    "cases", "text_sim", "pagerank", "court"
).write_csv("./Data/StandardRAGRetrieval.csv")

In [8]:
# Blended retrieval: the three weights are set equally (.33 each).
# Each complaint yields a struct of parallel lists, which `unnest` spreads into
# the "cases", "text_sim", "pagerank" and "court" columns.
df_pagerank = df.with_columns(
    retrieved=pl.col("Complaint").map_elements(
        lambda complaint: get_retrieved_cases(complaint, .33, .33, .33)
    )
).unnest("retrieved")

In [9]:
# Flatten the parallel list columns to one row per retrieved case, then save.
df_pagerank.explode(
    "cases", "text_sim", "pagerank", "court"
).write_csv("./Data/PRRAGRetrieval.csv")

In [10]:
# Calling with no arguments only serves to surface the required parameters.
# The TypeError is caught and printed so that Restart & Run All does not stop
# at this cell.
try:
    retriever.search_similar_cases()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: search_similar_cases() missing 5 required positional arguments: 'text', 'top_k', 'similarity_weight', 'court_weight', and 'case_weight'

In [None]:
# Smoke-test query with placeholder text: ten results, similarity weight only.
retriever.search_similar_cases(
    text="HHHH",
    top_k=10,
    similarity_weight=1,
    court_weight=0,
    case_weight=0,
)

In [None]:
# Weights used to blend the three normalised signals into a final score.
similarity_weight, court_weight, case_weight = 0.3, 0.3, 0.4

# NOTE: this rebinds `df`, replacing the complaints table loaded earlier in
# the notebook with the contents of "./cases.csv".
df = pl.read_csv("./cases.csv")

In [None]:
# Number of top-scoring cases kept by the ranking cell below.
num_docs = 5

In [None]:
# Rank cases by a weighted blend of three min-max normalised signals.
#   1. Rescale "score", "CasePageRank" and "CourtPageRank" in place with
#      (x - min) / (max - min).
#      NOTE(review): a column whose values are all equal makes the denominator
#      zero -- confirm this cannot happen for the input data.
#   2. Combine them into "FinalScore" with native expressions rather than a
#      per-row Python lambda.
#   3. Collapse duplicates per (Case, FiledDate, CourtName) with column-wise max.
#   4. Keep the `num_docs` best rows and sort them afterwards, because `top_k`
#      does not guarantee the order of its output.
(
    df.with_columns(
        [
            (
                (pl.col(name) - pl.col(name).min())
                / (pl.col(name).max() - pl.col(name).min())
            ).alias(name)
            for name in ("score", "CasePageRank", "CourtPageRank")
        ]
    )
    .with_columns(
        (
            similarity_weight * pl.col("score")
            + court_weight * pl.col("CourtPageRank")
            + case_weight * pl.col("CasePageRank")
        ).alias("FinalScore")
    )
    .group_by(["Case", "FiledDate", "CourtName"])
    .max()
    .top_k(num_docs, by="FinalScore")
    .sort("FinalScore", descending=True)
)