# Cap to CL
This notebook reorganizes CourtListener (CL) bulk data into a structure similar to the case records published by the Caselaw Access Project (CAP) at https://case.law


In [3]:
import pandas as pd
import numpy as np
import time
import timeit
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
%matplotlib inline

In [259]:
# Short alias used in the type annotations further down.
DataFrame = pd.DataFrame

In [100]:
pd.set_option("display.max_columns", None)

In [8]:
# for large chunkfiles
def read_csv_as_dfs(filename, 
                    num_dfs=10, 
                    max_rows=10**5, 
                    dtype=None, 
                    parse_dates=None,
                    usecols=None
                   ):
    """Read the leading chunks of a CSV file as a list of DataFrames.

    Parameters
    ----------
    filename : path of the CSV file to read.
    num_dfs : maximum number of chunks (DataFrames) to return.
    max_rows : number of rows per chunk (passed to ``pd.read_csv`` as ``chunksize``).
    dtype, parse_dates, usecols : forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    list of DataFrame
        At most ``num_dfs`` frames, in file order; empty when ``num_dfs <= 0``.
    """
    chunks = []
    # The chunked reader keeps the file open; the context manager closes it
    # even when we stop before the end of the file.
    with pd.read_csv(filename, chunksize=max_rows, dtype=dtype,
                     parse_dates=parse_dates, usecols=usecols) as reader:
        if num_dfs > 0:
            for chunk in reader:
                chunks.append(chunk)
                # Stop as soon as we have enough, so no extra chunk is parsed.
                if len(chunks) >= num_dfs:
                    break
    return chunks

In [15]:
def csv_to_df(filename, dtype = None, parse_dates = None, max_gb=5):
    """Load a CSV file into one DataFrame, reading it in chunks when it is large.

    The file size (in GB, rounded to two decimals) is printed first. Files
    larger than ``max_gb`` are read through ``read_csv_as_dfs`` and the chunks
    concatenated; smaller files are read with a single ``pd.read_csv`` call.
    The elapsed time in whole minutes is printed before returning.
    """
    t0 = time.perf_counter()
    size_gb = round(os.path.getsize(filename) / 10**9, 2)
    print("File Size is :", size_gb, "GB")
    print(f'Importing {filename} as a dataframe')
    if size_gb <= max_gb:
        frame = pd.read_csv(filename, dtype=dtype, parse_dates=parse_dates)
    else:
        # Chunk limits are set high enough that the whole file is consumed.
        chunks = read_csv_as_dfs(filename, num_dfs=10**5, max_rows=10**7,
                                 dtype=dtype, parse_dates=parse_dates)
        frame = pd.concat(chunks)
    elapsed_minutes = int((time.perf_counter() - t0) / 60)
    print(f'{filename} read in {elapsed_minutes} minutes')
    return frame

## Courts

In [16]:
courts_filename = 'courts-2022-12-31.csv'
df_courts = csv_to_df(courts_filename)

File Size is : 0.0 GB
Importing courts-2022-12-31.csv as a dataframe
courts-2022-12-31.csv read in 0 minutes


In [18]:
df_courts.sample(3)

Unnamed: 0,id,pacer_court_id,pacer_has_rss_feed,fjc_court_id,date_modified,in_use,has_opinion_scraper,has_oral_argument_scraper,position,citation_string,short_name,full_name,url,start_date,end_date,jurisdiction
1497,texdistct202,,,,2022-06-13 13:38:09.25745+00,f,f,f,368.7227,,Tex. 201st Jud. Dist. Ct.,Texas 201st Judicial District Court,https://www.traviscountytx.gov/courts/civil/di...,,,ST
1329,texdistct33,,,,2022-06-13 13:38:08.900235+00,f,f,f,368.7058,,Tex. 32nd Jud. Dist. Ct.,Texas 32nd Judicial District Court,http://www.districtcourt32.org/,,,ST
959,calmunctla,,,,2021-08-20 21:54:53.114+00,f,f,f,350.719,Cal. Mun. Ct. (Los Angeles),"California Municipal Court, Los Angeles County","Municipal Court of California, County of Los A...",http://www.lacourt.org/,,2001-01-01,ST


## Dockets

In [19]:
# Docket columns handed to pd.read_csv(parse_dates=...).
parse_dates = [
    'date_cert_granted', 'date_cert_denied',
    'date_argued', 'date_reargued', 'date_reargument_denied',
    'date_filed', 'date_terminated', 'date_last_filing',
    'date_blocked',
]

# Docket columns read with the 'string' dtype; every entry maps to the same dtype.
my_types = dict.fromkeys(
    [
        'appeal_from_str',
        'assigned_to_str',
        'referred_to_str',
        'case_name_short',
        'case_name',
        'case_name_full',
        'court_id',
        'cause',
        'nature_of_suit',
        'jury_demand',
        'jurisdiction_type',
        'appellate_fee_status',
        'appellate_case_type_information',
        'mdl_status',
        'filepath_ia',
    ],
    'string',
)

In [21]:
# csv_to_df reports the file size itself, so no separate size lookup is needed here.
# Note: despite the plural name, dfs_dockets is a single DataFrame.
dockets_filename = 'dockets-2022-12-31.csv'
dfs_dockets = csv_to_df(dockets_filename, dtype=my_types, parse_dates=parse_dates)

File Size is : 18.49 GB
Importing dockets-2022-12-31.csv as a dataframe


  for df in pd.read_csv(filename, chunksize=max_rows, dtype=dtype,
  for df in pd.read_csv(filename, chunksize=max_rows, dtype=dtype,
  for df in pd.read_csv(filename, chunksize=max_rows, dtype=dtype,
  for df in pd.read_csv(filename, chunksize=max_rows, dtype=dtype,
  for df in pd.read_csv(filename, chunksize=max_rows, dtype=dtype,
  for df in pd.read_csv(filename, chunksize=max_rows, dtype=dtype,
  for df in pd.read_csv(filename, chunksize=max_rows, dtype=dtype,


dockets-2022-12-31.csv read in 16 minutes


In [671]:
dfs_dockets.dtypes

id                                           int64
date_created                                object
date_modified                               object
source                                       int64
appeal_from_str                             string
assigned_to_str                             string
referred_to_str                             string
panel_str                                   object
date_cert_granted                   datetime64[ns]
date_cert_denied                    datetime64[ns]
date_argued                         datetime64[ns]
date_reargued                       datetime64[ns]
date_reargument_denied              datetime64[ns]
date_filed                                  object
date_terminated                             object
date_last_filing                            object
case_name_short                             string
case_name                                   string
case_name_full                              string
slug                           

In [78]:
len(dfs_dockets)

65113781

In [23]:
dfs_dockets.head(3)

Unnamed: 0,id,date_created,date_modified,source,appeal_from_str,assigned_to_str,referred_to_str,panel_str,date_cert_granted,date_cert_denied,date_argued,date_reargued,date_reargument_denied,date_filed,date_terminated,date_last_filing,case_name_short,case_name,case_name_full,slug,docket_number,docket_number_core,pacer_case_id,cause,nature_of_suit,jury_demand,jurisdiction_type,appellate_fee_status,appellate_case_type_information,mdl_status,filepath_ia,filepath_ia_json,date_blocked,blocked,appeal_from_id,assigned_to_id,court_id,idb_data_id,originating_court_information_id,referred_to_id
0,4500079,2016-11-15 10:09:28.299713+00,2021-02-01 20:43:29.984241+00,1,,Roberta A. Colton,,,NaT,NaT,NaT,NaT,NaT,2016-08-22 00:00:00,2016-11-29,2016-12-01,,Inael Berrios and Joan Radaha,,inael-berrios-and-joan-radaha,6:16-bk-05556,1605556.0,1215335,,,,,,,,https://www.archive.org/download/gov.uscourts....,https://archive.org/download/gov.uscourts.flmb...,2017-02-18,t,,8743.0,flmb,,,
1,15096520,2019-05-10 19:46:15.337613+00,2021-01-30 09:44:08.297394+00,1,,Pamela S. Hollis,,,NaT,NaT,NaT,NaT,NaT,2005-05-31 00:00:00,2008-10-23,2008-10-23,,Matthew A Polich and Diane M Polich,,matthew-a-polich-and-diane-m-polich,05-21638,5021638.0,799218,,,,,,,,,,2019-05-10,t,,8793.0,ilnb,,,
2,4521872,2016-11-15 11:37:00.843754+00,2021-01-24 08:05:59.770187+00,1,,Martin Glenn,,,NaT,NaT,NaT,NaT,NaT,2011-11-14 00:00:00,2016-04-04,2016-04-04,Sivova,Sivova v. MF Global Holdings Ltd.,,sivova-v-mf-global-holdings-ltd,11-02881,11002881.0,221903,,,,,,,,https://www.archive.org/download/gov.uscourts....,https://archive.org/download/gov.uscourts.nysb...,2017-02-19,t,,8905.0,nysb,,,


## Opinion Clusters

In [24]:
opinion_clusters_filename ='opinion-clusters-2022-12-31.csv'
df_opinion_clusters = csv_to_df(opinion_clusters_filename)

File Size is : 6.7 GB
Importing opinion-clusters-2022-12-31.csv as a dataframe


  for df in pd.read_csv(filename, chunksize=max_rows, dtype=dtype,


opinion-clusters-2022-12-31.csv read in 2 minutes


In [25]:
df_opinion_clusters.columns

Index(['id', 'date_created', 'date_modified', 'judges', 'date_filed',
       'date_filed_is_approximate', 'slug', 'case_name_short', 'case_name',
       'case_name_full', 'scdb_id', 'scdb_decision_direction',
       'scdb_votes_majority', 'scdb_votes_minority', 'source',
       'procedural_history', 'attorneys', 'nature_of_suit', 'posture',
       'syllabus', 'headnotes', 'summary', 'disposition', 'history',
       'other_dates', 'cross_reference', 'correction', 'citation_count',
       'precedential_status', 'date_blocked', 'blocked', 'docket_id'],
      dtype='object')

## Citation Map

In [45]:
citation_map_filename = 'citation-map-2022-12-31.csv'
df_citation_map = csv_to_df(citation_map_filename)


File Size is : 0.79 GB
Importing citation-map-2022-12-31.csv as a dataframe
citation-map-2022-12-31.csv read in 0 minutes


In [47]:
df_citation_map.sample(3)

Unnamed: 0,id,depth,cited_opinion_id,citing_opinion_id
23391543,192016310,1,40554,40555
19823623,178094740,2,3888713,3393974
6292940,119869467,4,102604,1380429


## Get Opinions

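Read the opinions file in chunks of 100,000 rows. The cells below currently load only the first chunk (`num_dfs = 1`); set `num_dfs = 10` to read a million rows divided into 10 data frames.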

In [682]:
df_opinions.dtypes

id                       int64
date_created            object
date_modified           object
author_str              object
per_curiam              object
joined_by_str          float64
type                    object
sha1                    object
page_count             float64
download_url            string
local_path              string
plain_text              string
html                    string
html_lawbox             string
html_columbia           string
html_anon_2020          string
xml_harvard             object
html_with_citations     string
extracted_by_ocr        object
author_id              float64
cluster_id               int64
dtype: object

In [676]:
# Opinion columns read with the 'string' dtype.
# 'xml_harvard' is not listed, so pandas infers its dtype (it shows up as object).
opinion_dtypes = {
    'download_url': 'string',
    'local_path': 'string',
    'plain_text': 'string',
    'html': 'string',
    'html_lawbox': 'string',
    'html_columbia': 'string',
    'html_anon_2020': 'string',
    'html_with_citations': 'string',
}

In [677]:
# TODO: get memory size & limit the read to 1/3 of available memory (not implemented yet;
# nothing below measures memory).
# As written, only the first chunk is loaded: num_dfs=1 with read_csv_as_dfs'
# default max_rows of 10**5 rows.
opinions_filename = 'opinions-2022-12-31.csv'
num_dfs = 1
dfs_opinions = read_csv_as_dfs(opinions_filename, num_dfs=num_dfs, dtype=opinion_dtypes)

In [681]:
df_opinions = dfs_opinions[0]

In [None]:
df

In [None]:
#dfs_opinions[0].loc[dfs_opinions[0]['id'] == 7954541]

In [35]:
df_opinions = pd.concat(dfs_opinions)

In [695]:
# True for opinions that have a value in at least one of the text-bearing columns.
mask = df_opinions[[
    'plain_text',
    'html',
    'html_lawbox',
    'html_columbia',
    'xml_harvard',
    'html_anon_2020',
]].notna().any(axis=1)

In [696]:
# Text-bearing columns only, restricted to the rows selected by `mask`.
texts = df_opinions.loc[
    mask,
    ['plain_text', 'html', 'html_lawbox', 'html_columbia', 'xml_harvard', 'html_anon_2020']
]

In [697]:
# Names of the text-bearing opinion columns, in the same order used for `mask` and `texts`.
labels = pd.Series(['plain_text',
        'html',
        'html_lawbox',
        'html_columbia',
        'xml_harvard',
        'html_anon_2020'])

In [698]:
len(df_opinions)

100000

In [699]:
# Fraction of the rows in `texts` that have a value in each labelled column,
# positionally aligned with `labels`.
amt = texts[list(labels)].count().reset_index(drop=True) / len(texts)

In [700]:
# `labels` holds the column names that `amt` was computed for, in the same order.
df_opinions_chart = pd.DataFrame({'source': labels, 'amount': amt})

In [701]:
df_opinions_chart

Unnamed: 0,source,amount
0,plain_text,0.093191
1,html,0.03057
2,html_lawbox,0.050606
3,html_columbia,0.000691
4,xml_harvard,0.829408
5,html_anon_2020,0.001151


## Mapping

In [49]:
type(cap_json)

dict

In [39]:
cap_keys = list(cap_json.keys())

In [40]:
cap_keys

['id', 'url', 'name', 'name_abbreviation', 'decision_date', 'docket_number', 'first_page', 'last_page', 'citations', 'volume', 'reporter', 'court', 'jurisdiction', 'cites_to', 'frontend_url', 'frontend_pdf_url', 'preview', 'analysis', 'last_updated', 'provenance']

## CourtListener

In [634]:
# taxonomy = {
#     "id": 'opinions.id',
#     'url' : 'opinions.local_path',
#     'name_abbreviation': 'opinion_cluster.case_name',
#     'name': 'opinion_cluster.case_name_full',
#     'decision_date': 'docket.date_terminated',
#     'docket_number':'cluster.docket_id',
#     #'first_page': '',
#     #'last_page': '',
#     'citations': '', #citing_opinions that cited to this
#     'cites_to' : 'id', #id as citing_opinion_id -> cited_opinion_id
#     #'volume' : '',
#     #'reporter' : '',
#     'court' : {'name': 'courts.fullname'},
#     'jurisdiction' : {'name':'courts.jurisdiction'},
#     'casebody': {'status': 'ok',
#         'data': {'judges': [],
#             'head_matter': '',
#             'opinions': [{
#                 'text' : '',
#                 'author': '',
#                 'type' : ''
#             }]
                 
#         },
#     }
# }

opinions.csv fields

In [249]:
pd.isna(float(col_value(df_opinion['author_str'])))

True

In [50]:
def df_row_by_value(df, column, match):
    """Return the rows of `df` whose `column` equals `match` (as a DataFrame)."""
    is_match = df[column] == match
    return df.loc[is_match]

In [51]:
def get_columns_series(df):
    """Return one `df[col]` selection per column label of `df`, in column order."""
    selections = []
    for name in list(df.columns):
        selections.append(df[name])
    return selections

In [212]:
def col_value(col):
    """Return the first value of a Series, or None when it is missing.

    `col` is expected to be a pandas Series (typically one column of a
    single-row DataFrame). None is returned when the Series is empty or its
    first value is NA (NaN, None, pd.NA, NaT).
    """
    values = col.to_numpy()
    if len(values) == 0:
        return None
    first = values[0]
    # Test the scalar, not the Series: the truth value of a Series is ambiguous
    # and raises ValueError.
    if pd.isna(first):
        return None
    return first

In [529]:
def is_pd_series(col):
    """Tell whether `col` is a pandas Series."""
    series_type = pd.Series
    return isinstance(col, series_type)

In [711]:
# accepts a series -- a row from a dataframe
def get_opinion_text(opinion):
    """Pick the opinion body from the first text field that holds a string.

    `opinion` is one opinions row (a Series). The fields are checked in this
    order: plain_text, html, html_lawbox, html_columbia, xml_harvard,
    html_anon_2020. Returns '' when none of them is a str.
    """
    # All six fields are read up front, in priority order.
    candidates = (
        opinion.plain_text,
        opinion.html,
        opinion.html_lawbox,
        opinion.html_columbia,
        opinion.xml_harvard,
        opinion.html_anon_2020,
    )
    for candidate in candidates:
        if isinstance(candidate, str):
            return candidate
    return ''

In [712]:
df_opinions.dtypes

id                       int64
date_created            object
date_modified           object
author_str              object
per_curiam              object
joined_by_str          float64
type                    object
sha1                    object
page_count             float64
download_url            string
local_path              string
plain_text              string
html                    string
html_lawbox             string
html_columbia           string
html_anon_2020          string
xml_harvard             object
html_with_citations     string
extracted_by_ocr        object
author_id              float64
cluster_id               int64
dtype: object

In [713]:
def get_citations(opinion_id, df_citations):
    """Collect both citation directions for one opinion from the citation map.

    Returns a dict with
      'cites_to': cited_opinion_id values of rows where the opinion is the citing one,
      'cited_by': citing_opinion_id values of rows where the opinion is the cited one.
    """
    is_citing = df_citations['citing_opinion_id'] == opinion_id
    outgoing = df_citations.loc[is_citing, 'cited_opinion_id'].to_list()
    is_cited = df_citations['cited_opinion_id'] == opinion_id
    incoming = df_citations.loc[is_cited, 'citing_opinion_id'].to_list()
    return dict(cites_to=outgoing, cited_by=incoming)
    

In [714]:
# accepts dataframes
# opinion is a dataframe row! i.e. a dataframe with one row in it excluding headers
def process(taxonomy:dict, opinion: pd.Series, opinion_clusters: DataFrame, courts: DataFrame, 
            dockets: DataFrame, citations: DataFrame) -> "dict | None":
    """Build one case.law-style record for a single opinion.

    Parameters
    ----------
    taxonomy : accepted for interface compatibility; not read by this function.
    opinion : one row of the opinions table (a Series, e.g. as delivered by
        ``DataFrame.apply(..., axis=1)``). Must carry 'id', 'cluster_id',
        'download_url' and the text columns read by ``get_opinion_text``.
    opinion_clusters, courts, dockets : lookup tables, matched on their 'id' column.
    citations : citation map with 'citing_opinion_id' / 'cited_opinion_id'.

    Returns
    -------
    dict or None
        The assembled record, or None when the opinion's cluster, the cluster's
        docket, or the docket's court cannot be found.
    """
    opinion_id = opinion['id']
    cluster_id = opinion['cluster_id']

    # Follow the chain opinion -> cluster -> docket -> court, giving up as soon
    # as one link has no match.
    cluster_row = opinion_clusters[opinion_clusters['id'] == cluster_id]
    if cluster_row.empty:
        return None
    # Take the scalar explicitly; int() on a Series is deprecated and fails on
    # an empty match.
    docket_id = int(cluster_row['docket_id'].iloc[0])
    docket_row = dockets[dockets['id'] == docket_id]
    if docket_row.empty:
        return None
    court_row = courts[courts['id'] == docket_row['court_id'].iloc[0]]
    if court_row.empty:
        return None

    # Opinions this one cites, and opinions citing it.
    citation_info = get_citations(opinion_id, citations)

    # 'judges' is a comma-separated string when present, NA otherwise.
    judges_raw = cluster_row['judges'].iloc[0]
    judges = judges_raw.split(',') if isinstance(judges_raw, str) else []

    return {
        'id': cluster_id,
        'url': opinion['download_url'],
        'name_abbreviation': cluster_row['case_name'].iloc[0],
        'name': cluster_row['case_name_full'].iloc[0],
        'decision_date': docket_row['date_terminated'].iloc[0],
        # NOTE(review): this is the cluster's docket_id, not a court docket
        # number -- confirm whether the dockets table's docket_number is wanted.
        'docket_number': cluster_row['docket_id'].iloc[0],
        'citations': citation_info['cited_by'],
        'cites_to': citation_info['cites_to'],
        'court': {'name': court_row['full_name'].iloc[0]},
        'jurisdiction': {'name': court_row['jurisdiction'].iloc[0]},
        'casebody': {'data': {
            'judges': judges,
            'head_matter': '',  # Ask CL about copyright
            'opinions': [{
                'text': get_opinion_text(opinion),
                'author': '', 'type': ''}]
            }
        }
    }

In [715]:
#get_opinion_text(df_opinions[df_opinions['id'] == 6326345])

In [716]:
start = time.perf_counter()
# Columns process() and get_opinion_text() read from each opinion row.
opinion_columns = [
    'id',
    'local_path',
    'download_url',
    'cluster_id',
    'xml_harvard',
    'plain_text',
    'html',
    'html_lawbox',
    'html_columbia',
    'html_anon_2020',
]
# sample(5) is unseeded, so a different set of opinions is converted on each run.
jsonw = df_opinions.sample(5)[opinion_columns].apply(
    lambda row: process(
        None,  # process() never reads its taxonomy argument; no live cell visible here defines `taxonomy`
        row,  # with axis=1 each row is passed as a Series
        df_opinion_clusters,
        df_courts,
        dfs_dockets,
        df_citation_map,
    ),
    axis=1,
)
end = time.perf_counter()
print(end-start)


0.7072695269889664


In [717]:
dict(jsonw)

{95364: {'id': 6108113,
  'url': <NA>,
  'name_abbreviation': 'Ramos v. New York City Housing Authority',
  'name': 'Janet Ramos, an Infant, by Her Mother and Natural Guardian, Luz Ramos v. New York City Housing Authority',
  'decision_date': nan,
  'docket_number': 62468245,
  'citations': [],
  'cites_to': [],
  'court': {'name': 'Appellate Division of the Supreme Court of the State of New York'},
  'jurisdiction': {'name': 'SA'},
  'casebody': {'data': {'judges': [],
    'head_matter': '',
    'opinions': [{'text': '<opinion type="majority">\n<p id="ApmL">—In an action to recover damages for personal injuries, etc., the defendant New York City Housing Authority appeals from an order of the Supreme Court, Kings County (Greenstein, J.), dated March 23, 1992, which directed the appellant to produce its employee Heriberto Valentine for an examination before trial and to provide maintenance and/or repair records with regard to the claimed defective bench behind the premises at 725 Stanle

In [498]:
1515.3115051740024/60

25.25519175290004

In [579]:
dm = df_opinions[df_opinions['id'] == 6326345]
dm

Unnamed: 0,id,date_created,date_modified,author_str,per_curiam,joined_by_str,type,sha1,page_count,download_url,local_path,plain_text,html,html_lawbox,html_columbia,html_anon_2020,xml_harvard,html_with_citations,extracted_by_ocr,author_id,cluster_id
794866,6326345,2022-03-24 15:00:15.951702+00,2022-03-24 15:06:31.249458+00,,f,,010combined,5770e2f86271371c7c273163b056ed48038b4aae,4.0,http://www.ca2.uscourts.gov/decisions/isysquer...,pdf/2022/03/24/united_states_v._shafer.pdf,21-1334-cr\nUnited States v. Shafer\n\n ...,,,,,,"<pre class=""inline"">21-1334-cr\nUnited States ...",f,,6454234


In [581]:
get_opinion_text(dm)

> [0;32m/var/folders/rj/v64qwmcn24jgn090vgn_qr_c0000gq/T/ipykernel_3317/3219008094.py[0m(4)[0;36mget_opinion_text[0;34m()[0m
[0;32m      2 [0;31m    [0mtext[0m [0;34m=[0m [0;34m''[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      3 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 4 [0;31m    [0;32mif[0m [0mis_pd_series[0m[0;34m([0m[0mopinion[0m[0;34m.[0m[0mxml_harvard[0m[0;34m)[0m [0;32mand[0m [0;32mnot[0m [0misinstance[0m[0;34m([0m[0mopinion[0m[0;34m.[0m[0mxml_harvard[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m,[0m [0mfloat[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      5 [0;31m        [0mtext[0m [0;34m=[0m [0mopinion[0m[0;34m.[0m[0mxml_harvard[0m[0;34m.[0m[0miloc[0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m    [0;32melif[0

'21-1334-cr\nUnited States v. Shafer\n\n                             UNITED STATES COURT OF APPEALS\n                                FOR THE SECOND CIRCUIT\n\n                                      SUMMARY ORDER\nRulings by summary order do not have precedential effect. Citation to a summary order\nfiled on or after January 1, 2007, is permitted and is governed by Federal Rule of Appellate\nProcedure 32.1 and this Court’s Local Rule 32.1.1. When citing a summary order in a\ndocument filed with this Court, a party must cite either the Federal Appendix or an\nelectronic database (with the notation “summary order”). A party citing a summary order\nmust serve a copy of it on any party not represented by counsel.\n\n       At a stated term of the United States Court of Appeals for the Second Circuit, held at\nthe Thurgood Marshall United States Courthouse, 40 Foley Square, in the City of New York,\non the 24th day of March, two thousand twenty-two.\n\nPRESENT:    JOSÉ A. CABRANES,\n         

In [520]:
txt = get_opinion_text(dm)
txt

'NOTE: This order is nonprecedential\nUnited States Court of AppeaIs\nfor the Federal Circuit\nVERMONT YANKEE NUCLEAR POWER\nCORPORATION, -\nPlaintiff~C\'ross Appellant,\nV.\nENTERGY NUCLEAR VERMONT YANKEE, LLC\nAND ENTERGY NUCLEAR OPERATIONS, INC.,\nPlaintiffs-Cross Appellan,ts, o\nV.\nUNITED S\'I`ATES,\nDefendant-Appellan.t.\n2011-5033, -5034, -5042\nAppea1s from the United States Court of Federal\nC1aimS in consolidated case noS. 02-CV-898 and 03-CV-\n2663, Judge Thomas C. Whee1er.\nON MOTION\nORDER\n\nVERMONT YANKEE NUCLEAR POWER V. US 2\nThe State of Vermont moves for a 21-day extension of\ntime, until October 24, 2011 to file its amended amicus\ncuriae brief, or in the alternative, a 21-day extension of\ntime from the date of filing of this order. "This motion is\nun0pposed, provided that the United States is granted an\nenlargement of time such that the deadline for its re-\nsponse would be 30 days from the new deadline for the\nState of Vermont’s amended amicus brief."\nUpon co

In [322]:
# start = time.perf_counter()
# items = []
# json = [items.append(
#     process(taxonomy, opinion, df_opinion_clusters, df_courts, dfs_dockets, df_citation_map))
#  for index, opinion in df_opinions.sample(100).iterrows()]#in zip(*get_columns_series(dfs_opinions[0]))]
# end = time.perf_counter()
# print(end-start)

144.56284578799387


In [404]:
end-start

744.4305102559956

In [302]:
(time.perf_counter() - start)/60

5.895291273283329

In [None]:
json

In [None]:
for r in zip(*get_columns_series(dfs_opinions[0])):
    print(r[0])

In [62]:
df_cluster = df_row_by_value(df_opinion_clusters, 'id', col_value(df_opinion['cluster_id']))

In [63]:
df_cluster

Unnamed: 0,id,date_created,date_modified,judges,date_filed,date_filed_is_approximate,slug,case_name_short,case_name,case_name_full,...,disposition,history,other_dates,cross_reference,correction,citation_count,precedential_status,date_blocked,blocked,docket_id
7551920,8000490,2022-09-08 23:47:52.267828+00,2022-09-08 23:47:52.267841+00,,1991-08-30,f,people-v-ratliff,Ratliff,People v. Ratliff,People v. Ratliff,...,,,,,,0,Published,,f,65095903


In [185]:
dfs_opinions[0].loc[dfs_opinions[0]['id'] == 7954541]

Unnamed: 0,id,date_created,date_modified,author_str,per_curiam,joined_by_str,type,sha1,page_count,download_url,...,plain_text,html,html_lawbox,html_columbia,html_anon_2020,xml_harvard,html_with_citations,extracted_by_ocr,author_id,cluster_id
99989,7954541,2022-09-08 23:47:52.273386+00,2022-09-08 23:47:52.273399+00,,f,,020lead,,,,...,,,,,,"<opinion type=""majority"">\n<p id=""AIK"">Court o...",,t,,8000490


In [89]:
l = dfs_opinions[0]['id'], dfs_opinions[0]['type']

In [90]:
l[0]

0        5901618
1        5901619
2        5901620
3        5901621
4        5901622
          ...   
99995    5976242
99996    5976243
99997    4468255
99998    5976244
99999    5976245
Name: id, Length: 100000, dtype: int64

In [83]:
len(dfs_opinions[0].columns)

21

In [None]:
#result = [for print(1) for row in zip(*get_columns_series(dfs_opinions))]



get the corresponding opinion cluster row: case_name_full, docket_id, case_name
get the corresponding docket: date_terminated
get the corresponding court: full_name
get the citation map entries: cited_opinion_id, citing_opinion_id

In [63]:
dfs[0].head()

Unnamed: 0,id,date_created,date_modified,author_str,per_curiam,joined_by_str,type,sha1,page_count,download_url,...,plain_text,html,html_lawbox,html_columbia,html_anon_2020,xml_harvard,html_with_citations,extracted_by_ocr,author_id,cluster_id
0,5901618,2022-01-13 03:22:58.163752+00,2022-01-13 03:22:58.16378+00,,f,,020lead,,,,...,,,,,,"<opinion type=""majority"">\n<p id=""ADG"">—Appeal...",,t,,6038492
1,5901619,2022-01-13 03:22:58.445791+00,2022-01-13 03:22:58.445825+00,,f,,020lead,,,,...,,,,,,"<opinion type=""majority"">\n<p id=""AocO"">—Appea...",,t,,6038493
2,5901620,2022-01-13 03:22:58.704638+00,2022-01-13 03:22:58.704662+00,,f,,020lead,,,,...,,,,,,"<opinion type=""majority"">\n<p id=""AhO2"">—Appea...",,t,,6038494
3,5901621,2022-01-13 03:22:58.970362+00,2022-01-13 03:22:58.970386+00,,f,,020lead,,,,...,,,,,,"<opinion type=""majority"">\n<p id=""AnXL"">—Appea...",,t,,6038495
4,5901622,2022-01-13 03:22:59.217791+00,2022-01-13 03:22:59.217815+00,,f,,020lead,,,,...,,,,,,"<opinion type=""majority"">\n<p id=""b845-12"">In ...",,t,,6038496
