In [1]:
import polars as pl
from neo4j import GraphDatabase
from datetime import datetime
from util import court_mapping
from google import genai
import os
import time
from extract_instructions import facts_of_case, purpose_and_character, nature_of, amount_used, market_eff, weigh_four_factors

# Lookup from a short task label to the corresponding object imported from
# extract_instructions.
instructions = {
    'Facts': facts_of_case,
    'Purpose': purpose_and_character,
    'Nature': nature_of,
    'Amount': amount_used,
    'Market': market_eff,
    'Combined': weigh_four_factors
}

# Gemini client; raises KeyError if the GEMINI_API environment variable is unset.
client = genai.Client(api_key=os.environ["GEMINI_API"])

# Neo4j connection settings. Each value can be overridden from the environment
# (NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD) so real credentials never have to be
# written into the notebook. The fallbacks are the previous hard-coded values
# and should only be relied on for a throwaway local database.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "fairusecases"),
)

In [17]:
# Load each pre-processed CSV into its own polars DataFrame (same order as the
# names on the left-hand side).
westlaw, opinions, cl, appellate_relationships, citation = (
    pl.read_csv(csv_path)
    for csv_path in (
        "./Cases/WestLawMatch.csv",
        "./Cases/CourtListenerOpinions.csv",
        "./Cases/CourtListenerWithCourt.csv",
        "./Cases/AppellateRelationship.csv",
        "./Cases/CitationRelationship.csv",
    )
)

In [18]:
### Build the distinct (Westlaw title, CourtListener case name) pairs that seed
### the Case nodes, so a case can be matched back to its Westlaw entry later.
westlaw = westlaw.select("Title", "CourtListenerCaseName").unique()

def create_westlaw_cases(tx, obs):
    """MERGE a Case node identified by its Westlaw and CourtListener names.

    Args:
        tx: Object exposing ``run(query, **parameters)`` (a Neo4j transaction).
        obs: Mapping providing the keys "Title" and "CourtListenerCaseName".

    Returns:
        Whatever ``tx.run`` returns for the statement.
    """
    query = """
        MERGE (c: Case {WestLawCaseName: $westlaw, CaseName: $courtlistener})
        """
    params = {
        "westlaw": obs["Title"],
        "courtlistener": obs["CourtListenerCaseName"],
    }
    return tx.run(query, **params)

# Create the Case nodes: each row is written through its own execute_write call.
obs = westlaw.iter_rows(named=True)
with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session(database="neo4j") as session:
    for ob in obs:
        session.execute_write(create_westlaw_cases, ob)

In [19]:
westlaw

Title,CourtListenerCaseName
str,str
"""Galvin v. Illi…","""Galvin v. Illi…"
"""Bell v. Eagle …","""Bell v. Eagle …"
"""Universal City…","""Universal City…"
"""A.V. ex rel. V…","""A v. Ex Rel. V…"
"""Sofa Entertain…","""SOFA Entertain…"
"""Fisher v. Dees…","""State ex rel. …"
"""Straus v. DVC …","""Straus v. DVC …"
"""North Jersey M…","""North Jersey M…"
"""BMG Music v. G…","""BMG Music v. G…"
"""Monge v. Maya …","""Noelia Monge v…"


In [4]:
## Cluster metadata per case, following CourtListener's layout: the opinion URL,
## cluster id and docket id that get attached to the existing Case nodes.

cluster = opinions.select("OpinionURL", "CourtListenerCaseName", "ClusterID", "DocketID")

def define_cluster_courtlistener(tx, obs):
    """Set OpinionURL, ClusterID and DocketID on Case nodes matched by CaseName.

    Args:
        tx: Object exposing ``run(query, **parameters)`` (a Neo4j transaction).
        obs: Mapping providing the keys "CourtListenerCaseName", "OpinionURL",
            "ClusterID" and "DocketID".

    Returns:
        Whatever ``tx.run`` returns for the statement.
    """
    query = """
        MATCH (c: Case {CaseName: $courtlistener} )
        SET c.OpinionURL = $OpinionURL, c.ClusterID = $ClusterID, c.DocketID = $DocketID
        """
    # The three properties use the same name as the CSV column they come from.
    params = {key: obs[key] for key in ("OpinionURL", "ClusterID", "DocketID")}
    params["courtlistener"] = obs["CourtListenerCaseName"]
    return tx.run(query, **params)

# Push the cluster metadata onto the Case nodes, one execute_write call per row.
obs = cluster.iter_rows(named=True)
with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session(database="neo4j") as session:
    for ob in obs:
        session.execute_write(define_cluster_courtlistener, ob)

In [5]:
## Opinion rows, each to be linked to the case it belongs to.
## NOTE: this rebinds `opinions` to a narrower frame that no longer carries
## OpinionURL / ClusterID / DocketID, so the cluster cell cannot be re-run after it.

opinions = opinions.select(
    "CourtListenerCaseName",
    'SubOpinions',
    'PrecedentialStatus',
    'OpinionType',
    'Document',
).unique()

def create_opinions(tx, obs):
    """MERGE an Opinion node and link it to its Case via HAS_OPINION.

    The Opinion is merged on all four properties at once (URL,
    PrecendentialStatus, Type, Document); the Case is looked up by CaseName.

    NOTE(review): the property key is spelled "PrecendentialStatus" in the
    query. It is kept as-is because it is the name stored in the graph;
    confirm whether it should be renamed.

    Args:
        tx: Object exposing ``run(query, **parameters)`` (a Neo4j transaction).
        obs: Mapping providing the keys "CourtListenerCaseName", "SubOpinions",
            "PrecedentialStatus", "OpinionType" and "Document".

    Returns:
        Whatever ``tx.run`` returns for the statement.
    """
    query = """
        MERGE (o: Opinion {URL: $SubOpinions, PrecendentialStatus: $PrecedentialStatus, Type: $OpinionType, Document: $Document} )
        WITH o
        MATCH (c: Case {CaseName: $courtlistener})
        MERGE (c)-[:HAS_OPINION]->(o)
        
        """
    params = dict(
        courtlistener=obs["CourtListenerCaseName"],
        SubOpinions=obs["SubOpinions"],
        PrecedentialStatus=obs["PrecedentialStatus"],
        OpinionType=obs["OpinionType"],
        Document=obs["Document"],
    )
    return tx.run(query, **params)

# Create the Opinion nodes and their HAS_OPINION links, one execute_write call per row.
obs = opinions.iter_rows(named=True)
with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session(database="neo4j") as session:
    for ob in obs:
        session.execute_write(create_opinions, ob)

In [6]:
def create_courts(tx, obs):
    """MERGE two Court nodes and an APPEALS_TO relationship from the first to the second.

    Args:
        tx: Object exposing ``run(query, **parameters)`` (a Neo4j transaction).
        obs: Mapping providing the keys "start_name" and "end_name".

    Returns:
        Whatever ``tx.run`` returns for the statement.
    """
    query = """
        MERGE (c1: Court {Name: $start_name})
        MERGE (c2: Court {Name: $end_name})
        WITH c1, c2
        MERGE (c1)-[:APPEALS_TO]->(c2)
        """
    lower_court, higher_court = obs["start_name"], obs["end_name"]
    return tx.run(query, start_name=lower_court, end_name=higher_court)

# Create the Court nodes and the APPEALS_TO edges between them, one execute_write call per row.
obs = appellate_relationships.iter_rows(named=True)
with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session(database="neo4j") as session:
    for ob in obs:
        session.execute_write(create_courts, ob)

In [7]:
# Keep only the case-to-court columns and drop repeated rows.
cl = cl.select("CourtListenerCaseName", "CourtID", "CourtURL", "Court").unique()

In [8]:
def create_case_citation(tx, obs):
    """MERGE a CITED relationship (carrying Citation) between two existing Case nodes.

    Both cases are looked up by CaseName; nothing is written when either one
    is missing, because the MATCH then yields no rows.

    Args:
        tx: Object exposing ``run(query, **parameters)`` (a Neo4j transaction).
        obs: Mapping providing the keys "CourtListenerCaseName" (citing case),
            "CitedCaseName" (cited case) and "Citation".

    Returns:
        Whatever ``tx.run`` returns for the statement.
    """
    query = """
        MATCH (c1: Case {CaseName: $citer}), (c2: Case {CaseName: $citee})
        WITH c1, c2
        MERGE (c1)-[:CITED {Citation: $Citation}]->(c2)
        """
    params = {
        "citer": obs["CourtListenerCaseName"],
        "citee": obs["CitedCaseName"],
        "Citation": obs["Citation"],
    }
    return tx.run(query, **params)

# Create the CITED edges, one execute_write call per row.
obs = citation.iter_rows(named=True)
with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session(database="neo4j") as session:
    for ob in obs:
        if ob["CourtListenerCaseName"] == ob["CitedCaseName"]:
            # A case citing itself would only produce a self-loop; skip it.
            continue
        session.execute_write(create_case_citation, ob)

In [9]:
## Add Filing Date!
# Both Westlaw exports are read with truncate_ragged_lines=True, so rows with
# extra fields are truncated instead of failing the parse.
fact_pattern = pl.read_csv(
    "./Cases/Westlaw/Fair Use Fact Pattern Precision Search.csv",
    truncate_ragged_lines=True,
)
defense = pl.read_csv(
    "./Cases/Westlaw/Fair Use Defense Precision Search.csv",
    truncate_ragged_lines=True,
)

# Stack the two exports and keep the distinct rows of the four columns used below.
westlaw = (
    pl.concat([fact_pattern, defense])
    .select("Title", "Court Line", "Citation", "Filed Date")
    .unique()
)

In [10]:
def to_std_datetime(date_str):
    """Convert a date written like "March 4, 2015" into "2015/03/04".

    Args:
        date_str: Date text in the "%B %d, %Y" format (full month name,
            day, four-digit year).

    Returns:
        The date formatted as "%Y/%m/%d", or None when ``date_str`` is not a
        string or does not match the expected format.
    """
    try:
        # Parse the string into a datetime object using the appropriate format
        date_obj = datetime.strptime(date_str, "%B %d, %Y")
    except (TypeError, ValueError):
        # TypeError: the input was not a string (e.g. None);
        # ValueError: the text does not match the format.
        # Anything else (KeyboardInterrupt, real bugs) is allowed to propagate
        # instead of being silently turned into None.
        return None

    # Format the datetime object into the desired format
    return date_obj.strftime("%Y/%m/%d")

In [11]:
# Normalise two Westlaw columns:
#   * "Filed Date" is passed through to_std_datetime (values it cannot parse
#     become null);
#   * "Court Line" values are substituted using court_mapping.
# return_dtype is given explicitly so the resulting column type does not depend
# on polars inferring it from the returned values.
# NOTE: not idempotent - running this cell again on the already-converted frame
# turns every date into null, because "YYYY/MM/DD" no longer matches the input
# format expected by to_std_datetime.
westlaw = westlaw.with_columns(
    pl.col("Filed Date").map_elements(to_std_datetime, return_dtype=pl.Utf8),
    pl.col("Court Line").replace(court_mapping),
)

In [12]:
# For each Title keep the first row after sorting by "Filed Date" ascending.
# NOTE(review): polars sorts nulls first by default, so a Title that has a null
# date presumably ends up with a null "Filed Date" here - confirm.
top_court = (
    westlaw
    .drop("Citation", "Court Line")
    .sort("Title", "Filed Date")
    .group_by("Title")
    .first()
)

In [13]:
# Bring the remaining westlaw columns back for the selected (Title, Filed Date)
# pairs; the inner join drops pairs that have no match in westlaw.
top_court = top_court.join(
    westlaw,
    how="inner",
    on=["Title", "Filed Date"],
)

In [14]:
def add_case_date_and_court(tx, obs):
    """Set FiledDate and Citation on a Case and link it to its Court via DECIDED_IN.

    The Case is matched by WestLawCaseName and the Court by Name. The properties
    are set before the Court lookup, so they are written even when no matching
    Court exists; only the relationship is skipped in that case.

    Args:
        tx: Object exposing ``run(query, **parameters)`` (a Neo4j transaction).
        obs: Mapping providing the keys "Title", "Filed Date", "Court Line"
            and "Citation".

    Returns:
        Whatever ``tx.run`` returns for the statement.
    """
    query = """
        MATCH (c1: Case {WestLawCaseName: $case})
        SET c1.FiledDate = $date, c1.Citation = $citation
        WITH c1
        MATCH (court:Court {Name: $courtName})
        MERGE (c1)-[:DECIDED_IN]->(court)
        """
    params = {
        "case": obs["Title"],
        "date": obs["Filed Date"],
        "courtName": obs["Court Line"],
        "citation": obs["Citation"],
    }
    return tx.run(query, **params)


In [15]:
# Attach filing date, citation and court to the Case nodes, one execute_write call per row.
# NOTE(review): `top_court` is built in the preceding cells but never used; this
# loop walks the full `westlaw` frame, so a Title with several rows is written
# once per row and the last write wins. Confirm whether `top_court` was meant here.
obs = westlaw.iter_rows(named=True)
with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session(database="neo4j") as session:
    for ob in obs:
        session.execute_write(add_case_date_and_court, ob)

In [21]:
def remove_duplicated_cases(tx):
    """Delete all but one Case node for every CaseName that occurs more than once.

    Within each group of same-named nodes, the nodes are ordered by FiledDate
    descending; the first one is kept and the others are removed with
    DETACH DELETE (which also removes their relationships).

    NOTE(review): Cypher places null first under ORDER BY ... DESC, so a
    duplicate without FiledDate would presumably be the one kept - confirm
    this is intended.

    Args:
        tx: Object exposing ``run(query)`` (a Neo4j transaction).

    Returns:
        Whatever ``tx.run`` returns for the statement.
    """
    dedupe_query = """
        MATCH (n:Case)
        WITH n.CaseName AS name, COLLECT(n) AS nodes
        WHERE SIZE(nodes) > 1
        UNWIND nodes AS n
        WITH name, n ORDER BY n.FiledDate DESC // Correctly ordered in the main query
        WITH name, COLLECT(n) AS sortedNodes
        WITH sortedNodes[0] AS latest, sortedNodes[1..] AS duplicates
        UNWIND duplicates AS duplicate
        DETACH DELETE duplicate"""
    return tx.run(dedupe_query)


# Run the de-duplication query in a single write transaction.
# This previously passed add_case_date_and_court, which requires an `obs`
# argument: the call raised TypeError and remove_duplicated_cases never ran.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        session.execute_write(remove_duplicated_cases)


In [2]:
def opinion_does_not_have_facts(tx):
    """Return the Opinions that have no incoming OF relationship from a Facts node.

    Args:
        tx: Object exposing ``run(query)`` whose result offers ``to_df()``
            (a Neo4j transaction).

    Returns:
        The result of ``to_df()`` on the query result, with the columns
        ``url`` (o.URL) and ``Document`` (o.Document).
    """
    missing_facts_query = """
        MATCH (o:Opinion)
        WHERE NOT EXISTS {
            MATCH (o)<-[:OF]-(:Facts)
        }
        RETURN o.URL as url, o.Document as Document
        """
    # Materialise inside the function, while the transaction is still open.
    return tx.run(missing_facts_query).to_df()


# Collect every Opinion that still has no Facts node attached.
with GraphDatabase.driver(URI, auth=AUTH) as driver, driver.session(database="neo4j") as session:
    no_facts = session.execute_read(opinion_does_not_have_facts)

In [3]:
def extraction(instruction: str, document: str):
    """Send ``instruction`` followed by ``document`` to Gemini and return the reply text.

    The prompt is ``instruction``, a blank line, then ``document``; it is sent
    to the "gemini-2.0-flash" model through the module-level ``client``.
    After a successful call the function sleeps 10 seconds before returning
    (presumably to stay under a rate limit - confirm).

    Args:
        instruction: Prompt text placed before the document.
        document: Text the instruction is applied to.

    Returns:
        The ``text`` of the response, or None when the attempt raised an
        exception. In that case the error is printed and the function sleeps
        60 seconds before returning.
    """
    try:
        response = client.models.generate_content(
            model = "gemini-2.0-flash",
            contents = instruction + "\n\n" + document
        )

        time.sleep(10)

        return response.text

    except Exception as exc:
        # Catch `Exception` rather than using a bare `except`, so that
        # interrupting the kernel (KeyboardInterrupt) during one of the sleeps
        # stops the run instead of being swallowed and followed by another
        # 60-second wait.
        print("Going to Sleep!")
        # Report what went wrong; the failure was previously discarded.
        print(f"extraction failed with {type(exc).__name__}: {exc}")
        time.sleep(60)

        return None