Python script that connects to Harvard's Caselaw Access Project (CAP) API, searches for copyright cases and fair use cases, and saves the results to a CSV dataset.

Install the dependencies first: `pip install requests pandas numpy`
<br>
For background on the API and its Python wrapper, take a look at: <br>
https://github.com/harvard-lil/cap-examples/blob/develop/README.md
<br>
https://github.com/harvard-lil/cap-examples/blob/develop/python_wrapper/README.md

In [1]:
import json
import requests
import pandas as pd
import numpy as np
import re

In [2]:
# Connection settings for the Caselaw Access Project API.
# An empty API_KEY is allowed: requests are then sent without an
# authorization header and opinion text is not collected.
API_KEY = ''  # Set your API key here or leave it blank if you don't have one but opinions won't be available
BASE_URL = 'https://api.case.law/v1/cases/?page_size=600'

# Only attach the token header when a key has actually been provided.
if API_KEY:
    HEADERS = {'AUTHORIZATION': f'Token {API_KEY}'}
else:
    HEADERS = {}

In [3]:
# Maps a disposition phrase that may appear in an opinion to the outcome label
# stored in the dataset.
# Key order is significant: extract_disposition() walks the keys in insertion
# order and returns the outcome of the first phrase it finds, so do not reorder
# entries without checking that function.
# Each key must appear only once; a repeated key in a dict literal silently
# overrides the earlier entry.
disposition_to_outcome = {
    'Transfer to another district': 'pending',
    'Remanded to state court': 'pending',
    'MDL Transfer': 'pending',
    'Remanded to U.S. Agency': 'pending',
    'Want of prosecution': 'copyright infringement not found',
    'Lack of jurisdiction': 'copyright infringement not found',
    'Voluntarily': 'copyright infringement not found',
    'Settled': 'copyright infringement not found',
    'Other': 'pending',
    'Default': 'copyright infringement found',
    'Consent': 'copyright infringement found',
    'Motion before trial': 'copyright infringement found',
    'Jury verdict': 'copyright infringement found',
    'Directed verdict': 'copyright infringement found',
    'Court trial': 'copyright infringement found',
    'Award of arbitrator': 'pending',
    'Stayed pending bankruptcy': 'pending',
    'Statistical closing': 'copyright infringement not found',
    'Appeal affirmed (magistrate judge)': 'copyright infringement not found',
    'Appeal denied (magistrate judge)': 'copyright infringement found',
    # Outcome labels map to themselves so text that already states an outcome
    # is recognised as well.
    'Copyright infringement not found': 'copyright infringement not found',
    'Copyright infringement found': 'copyright infringement found',
    'fair use found': 'fair use found',
    'fair use not found': 'fair use not found',
    'pending': 'pending',
    'unknown': 'unknown',
}

def extract_disposition(opinion_text, mapping=None):
    """Return the outcome label for the first disposition phrase found in the text.

    The phrases are tried in the mapping's insertion order and the search is
    case-insensitive. A phrase only counts when it appears as a whole phrase,
    i.e. not embedded in a longer word ("another" does not match "Other",
    "unsettled" does not match "Settled").

    Args:
        opinion_text: Text to scan. ``None`` or an empty string yields
            ``'unknown'``.
        mapping: Optional dict of disposition phrase -> outcome label.
            Defaults to the module-level ``disposition_to_outcome``.

    Returns:
        The outcome label of the first matching phrase, or ``'unknown'`` when
        no phrase matches.
    """
    if mapping is None:
        mapping = disposition_to_outcome
    if not opinion_text:
        return 'unknown'

    # Lowercase once instead of once per candidate phrase.
    haystack = opinion_text.lower()
    for disposition, outcome in mapping.items():
        # Lookarounds are used instead of \b because some phrases end in a
        # non-word character such as ')'.
        pattern = r'(?<!\w)' + re.escape(disposition.lower()) + r'(?!\w)'
        if re.search(pattern, haystack):
            return outcome
    # NOTE(review): very common standalone words such as "other" still match
    # most opinions; the phrase list itself may need tightening.
    return 'unknown'


In [4]:
def get_cases(query, case_type, timeout=30):
    """Fetch one page of search results from the API and flatten them into records.

    Args:
        query: Full-text search term sent as the ``search`` query parameter.
        case_type: Label stored in each record's ``case_type`` field.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``case_type``, ``year``, ``court``,
        ``jurisdiction``, ``opinions`` and ``outcome``. Cases that lack an
        expected field are reported on stdout and skipped.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Let requests encode the search term instead of splicing it into the URL,
    # so characters such as '&' or '#' in the query cannot corrupt the request.
    params = {'search': query}
    if API_KEY:
        # The case body is only included in results when explicitly requested.
        params['full_case'] = 'true'

    response = requests.get(BASE_URL, headers=HEADERS, params=params, timeout=timeout)
    # Surface HTTP errors instead of silently treating them as "no results".
    response.raise_for_status()
    cases = response.json().get('results', [])

    case_list = []

    for case in cases:
        try:
            if API_KEY:
                body = case['casebody']['data']
                text = ''.join(opinion['text'] for opinion in body['opinions'])
                # head_matter may be missing or null; treat both as empty.
                text += body.get('head_matter') or ''
            else:
                text = ""  # Opinion text is not collected without an API key

            outcome = extract_disposition(text)

            case_list.append({
                "case_type": case_type,
                "year": case['decision_date'][:4],
                "court": case['court']['name'],
                "jurisdiction": case['jurisdiction']['name'],
                "opinions": text,
                "outcome": outcome
            })
        except (KeyError, TypeError, AttributeError) as e:
            # Malformed record: report it and keep processing the rest.
            # .get() keeps the handler itself from failing when 'id' is absent.
            print(f"Error processing case ID {case.get('id')}: {e}")

    return case_list

In [5]:
# Keywords looked for in each case's text; _build_tags() records every keyword
# it finds as a tag. All entries are lowercase because the text is lowercased
# before the comparison.
word_list = [
    'audiovisual',
    'commentary',
    'computer program',
    'digitization',
    'drawing',
    'education',
    'film',
    'format shifting',
    'graphic',
    'internet',
    'music',
    'news reporting',
    'painting',
    'parody',
    'photograph',
    'research',
    # 'review',
    'satire',
    'scholarship',
    'sculpture',
    'space shifting',
    'textual',
    'unpublished',
    'used in government proceeding',
]

def _build_tags(cases_df, words=None):
    """Fill the ``tags`` column with the keywords found in each row's ``text``.

    The frame is modified in place. For every row whose ``text`` is not
    missing, ``tags`` is set to the matching keywords joined by ``', '`` in
    keyword-list order (an empty string when nothing matches). Rows with
    missing text are left untouched. Matching is a case-insensitive substring
    test.

    Args:
        cases_df: DataFrame with a ``text`` column; ``tags`` is written to it.
        words: Optional iterable of keywords. Defaults to the module-level
            ``word_list``.

    Returns:
        None.
    """
    if words is None:
        words = word_list
    # Normalise keywords once so caller-supplied lists need not be lowercase.
    keywords = [word.lower() for word in words]

    for idx, holding_text in cases_df['text'].items():
        if pd.isna(holding_text):
            continue
        # Lowercase once per row instead of once per keyword; str() keeps
        # non-string cells (e.g. numbers) from raising on .lower().
        lowered = str(holding_text).lower()
        cases_df.at[idx, 'tags'] = ', '.join(
            keyword for keyword in keywords if keyword in lowered
        )


In [6]:
def save_cases_to_csv(cases, filename):
    """Turn case records into the dataset table and write it to a CSV file.

    Steps: drop repeated rows (keeping one copy of each), add
    ``case_type_encoded`` (1 when ``case_type`` contains ``'copyright'``,
    else 0), rename ``opinions`` to ``text``, add a ``tags`` column filled by
    ``_build_tags``, and write the result without the index.

    Args:
        cases: List of record dicts with the keys ``case_type``, ``year``,
            ``court``, ``jurisdiction``, ``opinions`` and ``outcome``. An empty
            list produces a header-only file.
        filename: Destination path of the CSV file.

    Returns:
        None.
    """
    df = pd.DataFrame(cases)
    if df.empty:
        # No records: build an empty frame with the usual columns so the
        # column lookups below succeed and a header-only CSV is written.
        df = pd.DataFrame(columns=[
            'case_type', 'year', 'court', 'jurisdiction', 'opinions', 'outcome'
        ])

    # Keep the first copy of each repeated row. keep=False would discard
    # every copy, removing duplicated cases from the dataset entirely.
    df = df.drop_duplicates()
    df['case_type_encoded'] = df['case_type'].apply(lambda x: 1 if 'copyright' in x else 0)
    df = df.rename(columns={'opinions': 'text'})
    df['tags'] = ''
    _build_tags(df)
    df.to_csv(filename, index=False)

In [7]:
def main():
    """Download copyright and fair use cases and store them in one CSV file.

    Each search term is also used as the ``case_type`` label of the records it
    returns. The reported count is the number of records collected, before
    ``save_cases_to_csv`` removes duplicate rows.
    """
    all_cases = []
    # Copyright results first, then fair use, matching the order in the file.
    for search_term in ("copyright", "fair use"):
        all_cases.extend(get_cases(search_term, search_term))

    save_cases_to_csv(all_cases, 'fairuse_copyright_dataset.csv')
    print(f"Saved {len(all_cases)} cases to fairuse_copyright_dataset.csv")


# Run the pipeline when executed as a script or as a notebook cell.
if __name__ == '__main__':
    main()

Saved 1200 cases to fairuse_copyright_dataset.csv
