In [1]:
import os

import polars as pl
from neo4j import GraphDatabase

# Connection settings are read from the environment so that credentials do not
# have to live in the notebook. The fallbacks reproduce the previous hardcoded
# values for the localhost instance, so the notebook still runs unconfigured.
# NOTE(review): the fallback password is still committed here — drop the
# default (or prompt with getpass) once NEO4J_PASSWORD is set wherever this runs.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USERNAME", "neo4j"),
    os.environ.get("NEO4J_PASSWORD", "fairusecases"),
)

In [3]:
def get_all_opinions(tx):
    """Read every (Case, Opinion) pair and return it as a data frame.

    Intended to be passed as the unit of work to ``session.execute_read``.

    Args:
        tx: Transaction-like object; only its ``run(query)`` method is used.

    Returns:
        Whatever ``to_df()`` on the query result yields, with the columns
        ``Title`` (from ``c.WestLawCaseName``) and ``Document`` (from
        ``o.Document``). One row is produced per matched Case/Opinion pair;
        the relationship is matched without regard to type or direction.
    """
    cypher = """
        MATCH (c:Case)-[]-(o:Opinion)
        RETURN c.WestLawCaseName as Title, o.Document as Document
        """
    # Materialise inside the transaction function: the result has to be
    # consumed before the transaction is closed.
    return tx.run(cypher).to_df()


# Both the driver and the session are context-managed, so they are closed as
# soon as the read transaction has handed back its frame.
with GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database="neo4j") as session:
    df = session.execute_read(get_all_opinions)

In [5]:
# Convert the frame returned by the Neo4j read into a Polars DataFrame.
# The guard keeps this cell idempotent: `df` is rebound in place, so re-running
# the cell would otherwise pass an already-converted Polars frame to
# pl.from_pandas, which only accepts pandas objects.
if not isinstance(df, pl.DataFrame):
    df = pl.from_pandas(df)

In [None]:
# Show the opinions ordered by document length, shortest first.
# `str.len_chars()` counts characters natively (the same value Python's `len`
# gives for a str) instead of calling back into Python once per row through
# `map_elements`. The cast keeps the Int64 dtype the `len`-based column had.
df.with_columns(
    pl.col("Document").str.len_chars().cast(pl.Int64).alias("DocLength")
).sort(by="DocLength")

Title,Document,DocLength
str,str,i64
"""Meeropol v. Ni…","""<opinion type=…",6729
"""Kienitz v. Sco…",""" --- Page 1 --…",7455
"""Blanch v. Koon…","""<div> <center>…",9210
"""Estate of Smit…",""" --- Page 1 --…",9776
"""Estate of Smit…",""" --- Page 1 --…",9776
"""Homeowner Opti…","""<div> <center>…",9986
"""Worldwide Chur…","""<opinion type=…",14682
"""Richards v. Me…","""<opinion type=…",14880
"""Davidson v. Un…","""<pre class=""in…",15354
"""Iowa State Uni…","""<opinion type=…",17997


In [11]:
# List the titles that occur on more than one row of `df`.
# maintain_order=True returns the groups in first-appearance order, so the
# list is stable from run to run; without it group_by may emit groups in any
# order. A group always has at least one row, so `> 1` selects the same
# titles as the previous `!= 1` test.
# NOTE(review): newer Polars releases reportedly deprecate GroupBy.count() in
# favour of .len() (which names its column "len") — confirm against the
# installed version before switching.
(
    df.group_by("Title", maintain_order=True)
    .count()
    .filter(pl.col("count") > 1)
    .get_column("Title")
    .to_list()
)

['Bouchat v. Baltimore Ravens Ltd. Partnership',
 'Otto v. Hearst Communications, Inc.',
 'Monge v. Maya Magazines, Inc.',
 'Cambridge University Press v. Patton',
 'Abend v. MCA, Inc.',
 'MCA, Inc. v. Wilson',
 'Compaq Computer Corp. v. Ergonome Inc.',
 'Hustler Magazine Inc. v. Moral Majority Inc.',
 'Harper & Row Publishers, Inc. v. Nation Enterprises',
 'Andy Warhol Foundation for Visual Arts, Inc. v. Goldsmith',
 'West Pub. Co. v. Mead Data Cent., Inc.',
 'Estate of Smith v. Graham',
 'American Geophysical Union v. Texaco Inc.',
 'Fox News Network, LLC v. Tveyes, Inc.',
 'Cheffins v. Stewart',
 'Worldwide Church of God v. Philadelphia Church of God, Inc.']