In [12]:
import polars as pl
import requests
import os
import re

# Read the API token from the CL_API_KEY environment variable; indexing
# os.environ raises KeyError right here if the variable is not set.
API_KEY = os.environ["CL_API_KEY"]
# HTTP headers (not query params): this dict is passed as `headers=` to the
# requests.get calls in the helper functions below.
header =  {"Authorization": f"Token {API_KEY}"}

In [None]:
# Load the input table; later cells read its "OpinionURL",
# "CourtListenerCaseName" and "Citation" columns.
df = pl.read_csv("./Cases/WestLawMatch.csv")

In [None]:
def get_cluster_id(absolute_url):
    """Extract the numeric ID that follows ``/opinion/`` in a URL.

    Parameters
    ----------
    absolute_url : str or None
        URL or path expected to contain a ``/opinion/<digits>/`` segment.

    Returns
    -------
    str or None
        The digits as a string (not converted to int), or ``None`` when the
        input is not a string or contains no such segment.
    """
    # Null / non-string cells would make re.search raise TypeError; treat
    # them the same as "no ID found".
    if not isinstance(absolute_url, str):
        return None

    match = re.search(r'/opinion/(\d+)/', absolute_url)
    return match.group(1) if match else None

In [None]:
# Add a "ClusterID" column by applying get_cluster_id to each "OpinionURL" value.
df = df.with_columns(
    pl.col("OpinionURL").map_elements(get_cluster_id).alias("ClusterID")
)

In [None]:
# Preview the first rows to eyeball the new "ClusterID" column.
df.head()

In [None]:
def get_opinions_from_cluster(cluster_id, header = header):
    """Fetch one cluster record from the CourtListener v4 REST API.

    Parameters
    ----------
    cluster_id : str or int
        Identifier interpolated into the ``/clusters/{cluster_id}`` URL.
    header : dict, optional
        HTTP headers sent with the request. Defaults to the module-level
        ``header`` as it was when this function was defined.

    Returns
    -------
    object or None
        The decoded JSON body when the response status is 200 (callers index
        it by key, so it is expected to be a dict); ``None`` for any other
        status.
    """
    url = f"https://www.courtlistener.com/api/rest/v4/clusters/{cluster_id}"

    # Send the GET request with the supplied headers.
    r = requests.get(url=url, headers=header)

    # The failure branch used to return `{ None }`, which is a *set* holding
    # None, not an empty result: it is truthy and cannot be subscripted like
    # the dict that downstream code expects. Return a plain None instead.
    if r.status_code != 200:
        return None

    return r.json()

In [None]:
# Spot-check the helper on a single cluster ID before mapping it over the frame.
get_opinions_from_cluster("7331944")

In [None]:
# Call get_opinions_from_cluster on the "ClusterID" values (one HTTP request
# per call) and keep each response in an "opinions" column.
df = df.with_columns(
    pl.col("ClusterID").map_elements(get_opinions_from_cluster).alias("opinions")
)

In [None]:
# Keep the identifying columns, lift three fields out of each cluster
# response, then discard the raw response column.
cl = (
    df.select(["OpinionURL", "CourtListenerCaseName", "Citation","ClusterID", "opinions"])
    .with_columns(
        pl.col("opinions").map_elements(lambda cluster: cluster["docket_id"]).alias("DocketID"),
        pl.col("opinions").map_elements(lambda cluster: cluster["sub_opinions"]).alias("SubOpinions"),
        pl.col("opinions").map_elements(lambda cluster: cluster["precedential_status"]).alias("PrecedentialStatus"),
    )
    .drop("opinions")
)

In [None]:
# Explode "SubOpinions" so each of its elements gets its own row.
# NOTE(review): presumably each element is an opinion API URL, since the
# column is later fed to get_actual_opinion as the request URL — confirm.
cl = cl.explode("SubOpinions")

In [None]:
# Preview the exploded frame.
cl.head()

In [None]:
def get_actual_opinion(opinion_id):
    """Fetch a single opinion record from its API URL.

    Parameters
    ----------
    opinion_id : str or None
        Despite the name, this value is used verbatim as the request URL
        (it is not an ID that gets interpolated into a URL).

    Returns
    -------
    object or None
        The decoded JSON body when the response status is 200; ``None`` when
        the URL is missing/empty or the status is anything else.

    Notes
    -----
    Uses the module-level ``header`` dict for the request headers.
    """
    # A null or empty URL cannot be requested; requests would raise on it
    # before sending anything, so short-circuit to the "no data" result.
    if not opinion_id:
        return None

    # Send the GET request and keep the response object.
    r = requests.get(url=opinion_id, headers=header)

    return r.json() if r.status_code == 200 else None

In [None]:
# Call get_actual_opinion on each "SubOpinions" value (one HTTP request per
# call) and store the response in "Opinion_API".
cl = cl.with_columns(
    pl.col("SubOpinions").map_elements(get_actual_opinion).alias("Opinion_API")
)

In [None]:
def get_raw_opinion(opinion_data: dict):
    """Return the opinion text from the most preferred populated field.

    The fields are tried in a fixed preference order; the value of the first
    one that is present in ``opinion_data`` and truthy is returned. If none
    qualifies, the result is ``None``.
    """
    # Most preferred source first.
    preferred_fields = (
        "html_with_citations",
        "html_columbia",
        "html_lawbox",
        "xml_harvard",
        "html_anon_2020",
        "html",
        "plain_text",
    )

    # Lazily scan the fields and stop at the first present, non-empty one.
    populated = (
        opinion_data[field]
        for field in preferred_fields
        if field in opinion_data and opinion_data[field]
    )
    return next(populated, None)

In [None]:
# Pull the opinion "type" and the best available text out of each opinion
# response, then drop the raw response column.
cl = (
    cl.with_columns(
        pl.col("Opinion_API").map_elements(lambda opinion: opinion["type"]).alias("OpinionType"),
        pl.col("Opinion_API").map_elements(get_raw_opinion).alias("Document"),
    )
    .drop("Opinion_API")
)

In [None]:
# Persist the per-opinion table; a later cell reloads this same file.
cl.write_csv("./Cases/CourtListenerOpinions.csv")

In [None]:
def get_courts_from_docket(docket_id, header = header):
    """Fetch one docket record from the CourtListener v4 REST API.

    Parameters
    ----------
    docket_id : str or int
        Identifier; converted with ``str()`` and interpolated into the
        ``/dockets/{docket_id}`` URL.
    header : dict, optional
        HTTP headers sent with the request. Defaults to the module-level
        ``header`` as it was when this function was defined.

    Returns
    -------
    object or None
        The decoded JSON body when the response status is 200 (callers index
        it by key, so it is expected to be a dict); ``None`` for any other
        status.
    """
    docket_id = str(docket_id)
    url = f"https://www.courtlistener.com/api/rest/v4/dockets/{docket_id}"

    # Send the GET request with the supplied headers.
    r = requests.get(url=url, headers=header)

    # The failure branch used to return `{ None }`, a *set* containing None,
    # which is truthy and not subscriptable like the dict callers expect.
    # Return a plain None instead.
    if r.status_code != 200:
        return None

    return r.json()

In [24]:
def get_courts_from_court_id(court_url, header = header):
    """Fetch a court record from its API URL.

    Parameters
    ----------
    court_url : str or None
        Full URL of the court resource; used verbatim as the request URL.
    header : dict, optional
        HTTP headers sent with the request. Defaults to the module-level
        ``header`` as it was when this function was defined.

    Returns
    -------
    object or None
        The decoded JSON body when the response status is 200 (callers index
        it by key, so it is expected to be a dict); ``None`` when the URL is
        missing/empty or the status is anything else.
    """
    # A null or empty URL cannot be requested; requests would raise on it
    # before sending anything, so short-circuit to the "no data" result.
    if not court_url:
        return None

    # Send the GET request with the supplied headers.
    r = requests.get(url=court_url, headers=header)

    # The failure branch used to return `{ None }`, a *set* containing None,
    # which is truthy and not subscriptable like the dict callers expect.
    # Return a plain None instead.
    if r.status_code != 200:
        return None

    return r.json()

In [14]:
# Reload the opinions table from the CSV written by the earlier write_csv
# cell. This rebinds `df`, replacing the frame that was loaded from
# WestLawMatch.csv.
df = pl.read_csv("./Cases/CourtListenerOpinions.csv")

In [None]:
# Call get_courts_from_docket on each "DocketID" value (one HTTP request per
# call) and store the response in "Docket_API".
df = df.with_columns(
    pl.col("DocketID").map_elements(get_courts_from_docket).alias("Docket_API")
)

In [21]:
# Lift the court identifier and the court resource URL out of each docket response.
df = df.with_columns(
    pl.col("Docket_API")
    .map_elements(lambda docket: docket["court_id"])
    .alias("CourtID"),
    pl.col("Docket_API")
    .map_elements(lambda docket: docket["court"])
    .alias("CourtURL"),
)

In [25]:
# Call get_courts_from_court_id on each "CourtURL" value (one HTTP request
# per call) and store the response in "Court_API".
df = df.with_columns(
    pl.col("CourtURL").map_elements(get_courts_from_court_id).alias("Court_API")
)

In [31]:
# Add the court's "full_name" as a "Court" column, drop the two raw API
# response columns, and write the result to CSV. The result is not assigned
# back, so `df` itself is left unchanged by this cell.
df.with_columns(
    pl.col("Court_API").map_elements(lambda x: x["full_name"]).alias("Court")
).drop(["Docket_API", "Court_API"]).write_csv("./Cases/CourtListenerWithCourt.csv")