we're creating two models, to take a document description (from purchased dockets) and a document case_name + document_type (from RSS) to predict whether it's a search warrant or not.

we're going to get the training data in several batches:

1. a bunch of docs (like 20 per district court) that ARE NOT search warrants
2. a bunch of docs that match the term "search" and "warrant" that are hand-coded.
3. a bunch of case_names from RSS
4. case_names from RSS that are coded -sw- in the case number.

So, basically, we're oversampling on things that match "search" and "warrant". TODO: we might also want to oversample on cases with the United States as a party...

#1 is on sheet "TK" of https://docs.google.com/spreadsheets/d/1gyYAlYdL9o45pPIC0POXzKwvO5Ji7SZ9uwP46uXfMqA/edit#gid=1319449048

#2 is on sheet "matches search_warrants" of the same doc, https://docs.google.com/spreadsheets/d/1gyYAlYdL9o45pPIC0POXzKwvO5Ji7SZ9uwP46uXfMqA/edit#gid=1319449048

#3 is on "search warrant case_names from RSS" of the same doc.

## getting search warrant docs

so we can oversample on things that might be search warrants

In [2]:
from os import environ, makedirs
from os.path import join, exists, dirname
import requests
from urllib.parse import urlencode
import logging
import csv
import pandas as pd

API_KEY = environ.get("API_KEY")
STORAGE_PATH = "pdfs/"



In [17]:
def get_pdf(recap_filepath_local, type_of_file="other"):
    """Download a RECAP document into the local cache and return its path.

    The file is stored at ``STORAGE_PATH/<type_of_file>/<basename>``. If a file
    already exists at that path it is reused and nothing is downloaded.

    Args:
        recap_filepath_local: server-side path of the document; its
            ``/storage`` or ``/sata`` portion is replaced with the
            CourtListener host to build the download URL.
        type_of_file: name of the sub-folder of ``STORAGE_PATH`` to save into.

    Returns:
        The local path of the cached file.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    logging.basicConfig(level=logging.DEBUG)

    fn = recap_filepath_local.split("/")[-1]
    fp = join(STORAGE_PATH, type_of_file, fn)
    makedirs(dirname(fp), exist_ok=True)
    if exists(fp):
        return fp
    url = recap_filepath_local.replace("/storage", "https://www.courtlistener.com").replace("/sata", "https://www.courtlistener.com")
    logging.debug(f"actually getting PDF from the web {url}")
    # Download and validate BEFORE creating the file: the existence check above
    # is the cache test, so an empty file left by a failed request (or an error
    # page saved under a .pdf name) would be served as a "cached" document on
    # every later call.
    response = requests.get(url)
    response.raise_for_status()
    with open(fp, 'wb') as f:
        f.write(response.content)
    return fp

def get_search_warrant_pdf(recap_filepath_local):
    """Fetch a document via ``get_pdf`` using the ``search_warrant`` sub-folder."""
    local_path = get_pdf(recap_filepath_local, "search_warrant")
    return local_path

# def get_docket_entries():
#     "https://www.courtlistener.com/api/rest/v3/docket-entries/?docket__id=XXX"
def search_recap_with_url(url):
    """GET ``url`` with the API token and return the decoded JSON body.

    Args:
        url: full URL to request (e.g. a search URL or a response's ``next``
            link).

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    headers = {'content-type': 'application/json', "Authorization": f"Token {API_KEY}"}
    response = requests.get(url, headers=headers)
    # Fail here with the HTTP status instead of letting callers hit a confusing
    # KeyError on "results" when the API returns an error payload.
    response.raise_for_status()
    return response.json()

def search_recap(q=None, description=None, available_only=None, suit_nature=None):
    """Run a document-oriented RECAP search and return the first page of results.

    Results are ordered by ``entry_date_filed`` descending. Each filter is only
    added to the query string when a truthy value is supplied.

    Args:
        q: free-text query.
        description: value for the ``description`` filter.
        available_only: when truthy, ``available_only=on`` is sent.
        suit_nature: value for the ``suitNature`` filter.

    Returns:
        Whatever ``search_recap_with_url`` returns for the built search URL.
    """
    urlparams = {
        "type": "r",  # Document-oriented results from the RECAP Archive
        "order_by": "entry_date_filed desc",
    }
    # Send available_only only when it is requested, the same way
    # get_documents_by_court does, rather than sending an explicit "off".
    # NOTE(review): a checkbox-style parameter may be treated as set whenever
    # it is present, whatever its value - confirm against the API.
    if available_only:
        urlparams["available_only"] = "on"
    if suit_nature:
        urlparams["suitNature"] = suit_nature
    if description:
        urlparams["description"] = description
    if q:
        urlparams["q"] = q
    return search_recap_with_url("https://www.courtlistener.com/api/rest/v3/search/?{}".format(urlencode(urlparams)))
    


In [18]:
def find_search_warrant_documents(n=1000):
    """Collect up to ``n`` RECAP search results described as "search warrant".

    Issues the first search (available documents only), then follows each
    response's ``next`` link until ``n`` records have been gathered or there
    are no further pages.

    Equivalent site query:
    ?q=&type=r&order_by=entry_date_filed%20desc&available_only=on&description=search%20warrant

    TODO: for each case and document, make a record (in memory or in a DB) so
    we don't duplicate, and download the documents locally.

    Args:
        n: maximum number of result records to return.

    Returns:
        A list of at most ``n`` result dicts, in the order the API returned them.
    """
    if n <= 0:
        return []
    # The first request is made exactly once, outside the loop. Deciding
    # "first go" by checking for an empty record list re-issued the same query
    # forever whenever the first page came back with no results.
    search_result = search_recap(description="search warrant", available_only=True)
    records = list(search_result["results"])
    next_url = search_result["next"]
    # Strict "<" so no extra page is fetched once n records are in hand.
    while len(records) < n and next_url:
        search_result = search_recap_with_url(next_url)
        records += search_result["results"]
        next_url = search_result["next"]
    # The last page may overshoot n; trim so n is a real upper bound.
    return records[:n]


In [19]:
# Gather the search results with the default limit (n=1000); this pages through
# the API, so it can issue many requests.
search_results = find_search_warrant_documents()

KeyboardInterrupt: 

In [None]:
SEARCH_WARRANT_CSV_FN = 'search_warrants_model/search_warrants.csv'

# Write every search result to a CSV and download the matching PDF.
# newline='' is what the csv module requires of files it writes to (otherwise
# embedded newlines / line endings can be mangled), and an explicit encoding
# keeps the output independent of the platform default.
with open(SEARCH_WARRANT_CSV_FN, 'w', newline='', encoding='utf-8') as csvfile:
    writer = None
    for result in search_results:
        if writer is None:
            # The header comes from the keys of the first result.
            writer = csv.DictWriter(csvfile, fieldnames=result.keys())
            writer.writeheader()
        writer.writerow(result)
        get_search_warrant_pdf(result["filepath_local"])


# get non-search warrants (negative training data)

In [20]:
import courtlistener

ModuleNotFoundError: No module named 'courtlistener'

In [21]:
def get_courts():
    """Return every record from the CourtListener ``courts`` API endpoint.

    Requests the first page, then follows each response's ``next`` link until
    there are no further pages.

    Returns:
        A list with the concatenated ``results`` of every page.
    """
    # The first request is made exactly once, outside the loop. Deciding
    # "first go" by checking for an empty record list re-issued the same
    # request forever whenever the first page came back with no results.
    search_result = search_recap_with_url("https://www.courtlistener.com/api/rest/v3/courts/")
    records = list(search_result["results"])
    next_url = search_result["next"]
    while next_url:
        search_result = search_recap_with_url(next_url)
        records += search_result["results"]
        next_url = search_result["next"]
    return records

# Fetch the full list of court records (all pages) from the API.
courts = get_courts()


In [29]:
# Tabulate the court records, then keep only rows that have a non-null,
# non-empty fjc_court_id and whose id is two or three lowercase letters
# followed by a final "d".
courts = pd.DataFrame(courts)
district_courts = courts.loc[
    courts["fjc_court_id"].notna()
    & courts["fjc_court_id"].ne('')
    & courts["id"].str.match(r"[a-z]{2,3}d$")
]
district_courts

Unnamed: 0,resource_uri,id,pacer_court_id,pacer_has_rss_feed,pacer_rss_entry_types,date_last_pacer_contact,fjc_court_id,date_modified,in_use,has_opinion_scraper,has_oral_argument_scraper,position,citation_string,short_name,full_name,url,start_date,end_date,jurisdiction
140,https://www.courtlistener.com/api/rest/v3/cour...,dcd,45.0,True,all,,90,2020-07-27T23:10:06.610741-07:00,True,True,False,200.5,D.D.C.,District of Columbia,"District Court, District of Columbia",http://www.dcd.uscourts.gov/,,,FD
142,https://www.courtlistener.com/api/rest/v3/cour...,almd,17.0,True,"order,order-cr,motion",,27,2020-07-27T23:18:14.673226-07:00,True,False,False,201.0,M.D. Ala.,M.D. Alabama,"District Court, M.D. Alabama",http://www.almd.uscourts.gov/,,,FD
143,https://www.courtlistener.com/api/rest/v3/cour...,alnd,19.0,False,,,26,2013-08-14T22:35:30-07:00,True,False,False,201.5,N.D. Ala.,N.D. Alabama,"District Court, N.D. Alabama",http://www.alnd.uscourts.gov/,,,FD
144,https://www.courtlistener.com/api/rest/v3/cour...,alsd,21.0,True,all,,28,2020-07-27T23:14:09.738906-07:00,True,False,False,202.0,S.D. Ala.,S.D. Alabama,"District Court, S.D. Alabama",http://www.als.uscourts.gov/,,,FD
145,https://www.courtlistener.com/api/rest/v3/cour...,akd,23.0,True,all,,7-,2020-07-27T23:15:10.762327-07:00,True,False,False,202.5,D. Alaska,D. Alaska,"District Court, D. Alaska",http://www.akd.uscourts.gov/,,,FD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,https://www.courtlistener.com/api/rest/v3/cour...,wyd,207.0,True,all,,89,2020-07-27T23:15:13.163424-07:00,True,False,False,299.0,D. Wyo.,D. Wyoming,"District Court, D. Wyoming",http://www.wyd.uscourts.gov/,,,FD
261,https://www.courtlistener.com/api/rest/v3/cour...,gud,59.0,True,all,,93,2020-07-27T23:10:05.257570-07:00,True,False,False,299.3,D. Guam,D. Guam,"District Court, D. Guam",http://www.gud.uscourts.gov/,,,FD
262,https://www.courtlistener.com/api/rest/v3/cour...,nmid,139.0,True,"order,order-cr,motion",,94,2020-07-27T23:14:09.655978-07:00,True,False,False,299.4,N. Mar. I.,Northern Mariana Islands,"District Court, Northern Mariana Islands",http://www.nmid.uscourts.gov/,,,FD
263,https://www.courtlistener.com/api/rest/v3/cour...,prd,159.0,True,all,,04,2020-07-27T23:18:16.151084-07:00,True,False,False,299.5,D.P.R.,D. Puerto Rico,"District Court, D. Puerto Rico",http://www.prd.uscourts.gov/,,,FD


In [37]:
def get_documents_by_court(court, available_only=True, filed_after=None, party_name=None):
    """Search RECAP documents for one court, most recently filed entries first.

    Every filter is optional and is only added to the query string when its
    value is truthy. A single request is made; pagination is not followed here.

    Args:
        court: court id to filter on (e.g. "gand").
        available_only: when truthy, ``available_only=on`` is sent.
        filed_after: value for the ``filed_after`` filter.
        party_name: value for the ``party_name`` filter.

    Returns:
        Whatever ``search_recap_with_url`` returns for the built search URL.
    """
    # Listed in the order the parameters should appear in the query string.
    optional_filters = (
        ("available_only", "on" if available_only else None),
        ("filed_after", filed_after),
        ("court", court),
        ("party_name", party_name),
    )
    query = {
        "type": "r",  # Document-oriented results from the RECAP Archive
        "order_by": "entry_date_filed desc",
    }
    query.update({key: value for key, value in optional_filters if value})
    search_url = "https://www.courtlistener.com/api/rest/v3/search/?{}".format(urlencode(query))
    return search_recap_with_url(search_url)

# Smoke test: first page of available documents for one court id ("gand").
get_documents_by_court("gand")


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=gand HTTP/1.1" 200 None


{'count': 41279,
 'next': 'https://www.courtlistener.com/api/rest/v3/search/?available_only=on&court=gand&order_by=entry_date_filed+desc&page=2&type=r',
 'previous': None,
 'results': [{'absolute_url': '/docket/18241602/82/elite-integrated-medical-llc-v-new-world-communicatons-of-atlanta-inc/',
   'assignedTo': 'Amy Mil Totenberg',
   'assigned_to_id': 3247,
   'attachment_number': None,
   'attorney': None,
   'attorney_id': None,
   'caseName': 'Elite Integrated Medical, LLC v. New World Communicatons of Atlanta, Inc.',
   'cause': '',
   'court': 'District Court, N.D. Georgia',
   'court_citation_string': 'N.D. Ga.',
   'court_exact': 'gand',
   'court_id': 'gand',
   'dateArgued': None,
   'dateFiled': '2019-11-17T23:53:00-08:00',
   'dateTerminated': '2021-04-28T00:53:00-07:00',
   'description': '',
   'docketNumber': '1:19-cv-05214',
   'docket_absolute_url': '/docket/18241602/elite-integrated-medical-llc-v-new-world-communicatons-of-atlanta-inc/',
   'docket_entry_id': 16305744

In [35]:
SEARCH_WARRANT_CSV_FN = 'search_warrants_model/search_warrants_all_districts_no_query.csv'
# For every district court, write the first page of available documents
# (no search terms) to one CSV.
# newline='' is what the csv module requires of files it writes to, and an
# explicit encoding keeps the output independent of the platform default.
with open(SEARCH_WARRANT_CSV_FN, 'w', newline='', encoding='utf-8') as csvfile:
    writer = None

    for _, district_court in district_courts.iterrows():
        search_results = get_documents_by_court(district_court["id"])["results"]
        for result in search_results:
            if not writer:
                # The header comes from the keys of the first result written.
                writer = csv.DictWriter(csvfile, fieldnames=result.keys())
                writer.writeheader()
            writer.writerow(result)



DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=dcd HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=almd HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=alnd HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=kyed HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=kywd HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=laed HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=oked HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=oknd HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=okwd HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=vid HTTP/1.1" 200 None


In [38]:
SEARCH_WARRANT_CSV_FN = 'search_warrants_model/search_warrants_all_districts_united_states_as_party.csv'
# For every district court, write the first page of available documents whose
# party_name filter is "united states" to one CSV.
# newline='' is what the csv module requires of files it writes to, and an
# explicit encoding keeps the output independent of the platform default.
with open(SEARCH_WARRANT_CSV_FN, 'w', newline='', encoding='utf-8') as csvfile:
    writer = None

    for _, district_court in district_courts.iterrows():
        search_results = get_documents_by_court(district_court["id"], party_name="united states")["results"]
        for result in search_results:
            if not writer:
                # The header comes from the keys of the first result written.
                writer = csv.DictWriter(csvfile, fieldnames=result.keys())
                writer.writeheader()
            writer.writerow(result)



DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=dcd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=almd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=alnd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.

DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=iand&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=iasd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=ksd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&avai

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=ncmd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=ncwd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=ndd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.

DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=wvnd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=wvsd&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&available_only=on&court=wied&party_name=united+states HTTP/1.1" 200 None
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.courtlistener.com:443
DEBUG:urllib3.connectionpool:https://www.courtlistener.com:443 "GET /api/rest/v3/search/?type=r&order_by=entry_date_filed+desc&ava

# getting RSS data

for each court, get 20 random document_types that don't match 'warrant' (code them no)
and get 20 document_types that match 'warrant' (left uncoded)

In [3]:
from sqlalchemy import create_engine
import pandas as pd
from os import environ
from dotenv import load_dotenv
load_dotenv()

# Database connection; the URL is read from the LIVE_DATABASE_URL environment
# variable.
live_engine = create_engine(environ.get("LIVE_DATABASE_URL"))


# Rebuild the district court table. get_courts must already be defined in the
# running session, otherwise this raises NameError.
courts = pd.DataFrame(get_courts())
district_courts = courts.loc[
    courts["fjc_court_id"].notna()
    & courts["fjc_court_id"].ne('')
    & courts["id"].str.match(r"[a-z]{2,3}d$")
]
district_courts

NameError: name 'get_courts' is not defined

In [43]:
# For each district court, sample up to 20 random RSS docket entries whose
# document_type does not contain "warrant" and label them false.


SEARCH_WARRANT_CSV_FN = 'search_warrants_model_training/rss_document_types_all_courts_random.csv'
# newline='' is what the csv module requires of files it writes to, and an
# explicit encoding keeps the output independent of the platform default.
with open(SEARCH_WARRANT_CSV_FN, 'w', newline='', encoding='utf-8') as csvfile:
    writer = None

    for _, district_court in district_courts.iterrows():
        search_results = pd.read_sql("""
            select case_name, document_type, case_number, court, false as "is search warrant?" from rss_docket_entries where document_type not ilike '%%warrant%%' and court = %(district_court)s order by random() limit 20;
            """, live_engine, params={"district_court": district_court["id"]}).head(20)
        # A separate loop variable, so the outer iteration is not shadowed.
        for _, result in search_results.iterrows():
            row = result.to_dict()
            if not writer:
                # The header comes from the columns of the first row written.
                writer = csv.DictWriter(csvfile, fieldnames=row.keys())
                writer.writeheader()
            writer.writerow(row)


# For each district court, sample up to 20 random RSS docket entries whose
# document_type contains "warrant"; the label column is left null.
SEARCH_WARRANT_CSV_FN = 'search_warrants_model_training/rss_document_types_all_courts_matches_warrant.csv'
# newline='' is what the csv module requires of files it writes to, and an
# explicit encoding keeps the output independent of the platform default.
with open(SEARCH_WARRANT_CSV_FN, 'w', newline='', encoding='utf-8') as csvfile:
    writer = None

    for _, district_court in district_courts.iterrows():
        search_results = pd.read_sql("""
            select case_name, document_type, case_number, court, null as "is search warrant?" from rss_docket_entries where document_type ilike '%%warrant%%' and court = %(district_court)s order by random() limit 20;
            """, live_engine, params={"district_court" : district_court["id"]}).head(20)
        # A separate loop variable, so the outer iteration is not shadowed.
        for _, result in search_results.iterrows():
            row = result.to_dict()
            if not writer:
                # The header comes from the columns of the first row written.
                writer = csv.DictWriter(csvfile, fieldnames=row.keys())
                writer.writeheader()
            writer.writerow(row)



## sw cases

In [44]:
# Count, per court, the RSS docket entries whose case_number has "sw" as its
# 6th-7th characters (e.g. 1:21-sw-00156-1).
pd.read_sql("""select court, count(*) from rss_docket_entries where substring(case_number, 6, 2) = 'sw' group by court""", live_engine)

Unnamed: 0,court,count
0,cod,6
1,dcd,16
2,mowd,6
3,rid,37


In [4]:
# Pull every RSS docket entry whose case_number has "sw" or "sc" as its
# 6th-7th characters and label all of them as search warrants.
sw_docs = pd.read_sql("""select case_name,document_type,case_number,court, true as "is search warrant?" from rss_docket_entries where substring(case_number, 6, 2) = 'sw' or substring(case_number, 6, 2) = 'sc'""", live_engine)
# to_csv is called with its defaults, so the DataFrame index is written as an
# extra first column.
sw_docs.to_csv("search_warrants_model_training/sw_docs.csv")
sw_docs

Unnamed: 0,case_name,document_type,case_number,court,is search warrant?
0,INFORMATION ASSOCIATED WITH ONE ACCOUNT STORED...,Application for Search/Seizure Warrant- 18 U.S...,1:21-sc-01192,dcd,True
1,USA v. IN THE MATTER OF THE SEARCH OF THE PRE...,Case Assigned/Reassigned,1:21-sw-00156-1,rid,True
2,USA v. White iPhone with gray and blue case se...,Case Assigned/Reassigned,1:21-sw-00154-1,rid,True
3,USA v. IN THE MATTER OF THE SEARCH OF ONE CELL...,Order on Motion to Unseal Document,1:21-sw-00052-1,dcd,True
4,USA v. White iPhone with gray and blue case se...,Application for Warrant,1:21-sw-00154-1,rid,True
...,...,...,...,...,...
96,THE MONITORING OF GLOBAL POSITIONING SYSTEM IN...,Order on Motion to Unseal Case,1:21-sc-00352,dcd,True
97,"USA v. 4227 JENIFER STREET N.W. WASHINGTON, D....",Search and Seizure Warrant Returned Executed,1:21-sw-00145-1,dcd,True
98,USA v. A SILVER 2004 CHRYSLER TOWN AND COUNTRY...,Search and Seizure Warrant Returned Executed,1:21-sw-00109-1,dcd,True
99,THE MONITORING OF GLOBAL POSITIONING SYSTEM IN...,Case Unsealed,1:21-sc-00299,dcd,True


# sw docs for NER data labeling