Python script that connects to Harvard's Caselaw Access Project API, searches for copyright cases and fair use cases, and saves the labeled results to a CSV file.

pip install requests


In [6]:
import json
import os
import re

import numpy as np
import pandas as pd
import requests

In [7]:
# Caselaw Access Project API token. It is read from the CAP_API_KEY environment
# variable so the secret never has to be saved in the notebook itself. When the
# variable is unset or empty, requests are sent without credentials and no
# opinion text is collected.
API_KEY = os.environ.get('CAP_API_KEY', '')
# Case-search endpoint. The page size is already part of the query string, so
# any further parameter has to be appended with '&'.
BASE_URL = 'https://api.case.law/v1/cases/?page_size=600'
# Token authentication header; left empty when no key is configured.
HEADERS = {'AUTHORIZATION': f'Token {API_KEY}'} if API_KEY else {}

In [8]:
def get_cases(query, case_type, timeout=60):
    """Search the case API for `query` and return one flat record per result.

    Args:
        query: Full-text search term sent as the `search` parameter.
        case_type: Label stored unchanged in every returned record.
        timeout: Seconds `requests` waits for the server before giving up.

    Returns:
        A list of dicts with the keys "case_type", "year" (first four
        characters of the decision date), "court", "jurisdiction" and
        "opinions". "opinions" is the concatenated opinion text followed by
        the head matter when an API key is configured, otherwise "".

    Raises:
        requests.HTTPError: If the API answers with an error status, instead
            of silently producing an empty result list.
    """
    # Let requests URL-encode the search term rather than splicing it into the
    # URL; the parameters are merged with the query string already in BASE_URL.
    params = {'search': query}
    if API_KEY:
        # NOTE(review): per the CAP API docs, `casebody` is only included when
        # full_case=true is requested; without it every case below fails with
        # a KeyError - confirm against the API reference.
        params['full_case'] = 'true'

    response = requests.get(BASE_URL, params=params, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    cases = response.json().get('results', [])

    case_list = []

    for case in cases:
        try:
            if API_KEY:
                body = case['casebody']['data']
                text = ''.join(opinion['text'] for opinion in body['opinions'])
                text += body.get('head_matter', '')
            else:
                text = ""  # opinion text is not available without an API key

            case_list.append({
                "case_type": case_type,
                "year": case['decision_date'][:4],
                "court": case['court']['name'],
                "jurisdiction": case['jurisdiction']['name'],
                "opinions": text,
            })
        except Exception as e:
            # Deliberately best-effort: one malformed case must not abort the
            # whole download. Look the id up defensively so that reporting the
            # problem cannot itself raise.
            case_id = case.get('id', '<unknown>') if isinstance(case, dict) else '<unknown>'
            print(f"Error processing case ID {case_id}: {e}")

    return case_list

In [9]:
def save_cases_to_csv(cases, filename):
    """Write case records to `filename` as a labelled CSV dataset.

    Args:
        cases: List of dicts with the keys "case_type", "year", "court",
            "jurisdiction" and "opinions".
        filename: Path of the CSV file to create or overwrite.

    The written file has the record columns with "opinions" renamed to "text",
    plus "case_type_encoded": 1 when the case type contains "copyright",
    otherwise 0. Exact duplicate rows are collapsed to a single row. An empty
    `cases` list produces a CSV that contains only the header line.
    """
    expected_columns = ['case_type', 'year', 'court', 'jurisdiction', 'opinions']
    cases_raw = pd.DataFrame(cases)
    if cases_raw.empty:
        # A frame built from no records has no columns at all, which would make
        # the 'case_type' lookup below fail; fall back to the known schema.
        cases_raw = pd.DataFrame(columns=expected_columns)

    dataset = (
        cases_raw
        # keep='first' retains one copy of each repeated row. keep=False would
        # discard every copy, silently dropping those cases from the dataset.
        .drop_duplicates(keep='first')
        .assign(
            case_type_encoded=lambda frame: frame['case_type'].apply(
                lambda kind: 1 if 'copyright' in kind else 0
            )
        )
        .rename(columns={'opinions': 'text'})
    )
    dataset.to_csv(filename, index=False)

In [10]:
def main():
    """Fetch copyright and fair-use cases and write them to a single CSV file."""
    output_path = 'fairuse_copyright_dataset.csv'
    # (search term, label stored on each record) for every category to download.
    searches = (("copyright", "copyright"), ("fair use", "fair use"))

    all_cases = []
    for query, label in searches:
        all_cases.extend(get_cases(query, label))

    save_cases_to_csv(all_cases, output_path)
    # Report the number fetched, not "saved": the save step drops duplicate
    # rows, so the file can contain fewer rows than were downloaded.
    print(f"Fetched {len(all_cases)} cases; saved de-duplicated rows to {output_path}")

if __name__ == '__main__':
    main()

Saved 1200 cases to fairuse_copyright_dataset.csv
