In [19]:
import os 
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase
from typing import List
import polars as pl

URI = "bolt://localhost:7687"
AUTH = ("neo4j", "fairusecases")

load_dotenv()

driver = GraphDatabase.driver(URI, auth=AUTH)

In [20]:
example = """
Dr. Jane Doe, a medical educator, created a proprietary study guide titled Advanced Radiology Review, designed to help medical professionals prepare for board certification exams. The guide, compiled over several years, contained unique mnemonics, case studies, and explanatory diagrams. Doe made this study material available through her website and a subscription-based online course.

On March 1, 2024, Doe discovered that Online Medical Academy, Inc. (OMA), a well-known YouTube-based medical education platform, was advertising an upcoming free live-streamed review course on their channel. Upon further investigation, she obtained an early-access copy of the PDF syllabus that OMA planned to provide to viewers. The document contained entire sections copied verbatim from Advanced Radiology Review without her permission.
"""

In [21]:
class Custom_Embeddings:
    def __init__(self):
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', token = os.environ["HUGGING_FACE"])
    
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return [self.model.encode(t)[0] for t in texts]
            
    def embed_query(self, query: str) -> List[float]:
        return self.model.encode([query])[0]
    
mini_embedder = Custom_Embeddings()

In [None]:
def search_similar_texts(tx, query_text, top_k=5, embedding_model = mini_embedder):

    query_embedding = embedding_model.embed_query(query_text)

    query = """
    CALL db.index.vector.queryNodes('MiniEmbeddingIndex', 100, $query_embedding)
    YIELD node, score
    MATCH (node)-[:FROM]-()-[:OF]-(o:Opinion)-[:HAS_OPINION]-(c:Case)
    RETURN DISTINCT c.WestLawCaseName as Case, o.Document AS text, score
    ORDER BY score DESC
    """
    
    return tx.run(query, top_k = top_k, query_embedding = query_embedding).to_df()

with driver.session() as session:
    df = pl.from_pandas(session.execute_read(search_similar_texts, example)).group_by(["Case", "text"]).mean().sort("score", descending = True).top_k(5, by = "score")




In [None]:
df.group_by(["Case", "text"]).mean().sort("score", descending = True).top_k()

TypeError: top_k() missing 1 required keyword-only argument: 'by'

In [36]:
df["Case"].to_list()[:10]

['Weissmann v. Freeman',
 'NXIVM Corp. v. Ross Institute',
 'West Pub. Co. v. Mead Data Cent., Inc.',
 'Cambridge University Press v. Patton',
 'Weissmann v. Freeman',
 'Healthcare Advocates, Inc. v. Harding, Earley, Follmer & Frailey',
 'Peter Letterese And Associates, Inc. v. World Institute Of Scientology Enterprises',
 'Compaq Computer Corp. v. Ergonome Inc.',
 'Compaq Computer Corp. v. Ergonome Inc.',
 'Compaq Computer Corp. v. Ergonome Inc.']