In [11]:
# Root of the local law-net checkout; its `code/` sub-directory is appended to
# sys.path further down in this cell.
# Set the LAW_NET_DIR environment variable to use a different checkout without
# editing the notebook; when it is unset the original location is used.
import os
top_directory = os.path.join(
    os.environ.get('LAW_NET_DIR', '/Users/iaincarmichael/Dropbox/Research/law/law-net/'),
    '')  # joining with '' guarantees the trailing separator that `+ 'code/'` relies on

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# our code
# Put the project's code directory on the import path only once: re-running this
# cell in a live kernel would otherwise append the same entry again each time.
if top_directory + 'code/' not in sys.path:
    sys.path.append(top_directory + 'code/')
from pipeline.download_data import *
from pipeline.make_raw_case_metadata import *


# directory set up
# data_dir is the root every cell below reads from and writes to (always with a
# trailing separator, because paths are built by plain string concatenation).
# Set the COURTLISTENER_DATA_DIR environment variable to relocate it; when it is
# unset the original location is used.
data_dir = os.path.join(
    os.environ.get('COURTLISTENER_DATA_DIR', '/Users/iaincarmichael/Documents/courtlistener/data/'),
    '')
# NOTE(review): unlike data_dir this value has no trailing slash -- confirm that
# whatever consumes it adds its own separator.
experiment_data_dir = data_dir + 'federal'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Court identifiers handed to the download and metadata helpers below: three
# named courts followed by the numbered ones, 'ca1' through 'ca11'.
courts = ['scotus', 'cafc', 'cadc'] + ['ca' + str(number) for number in range(1, 12)]

# download opinion and cluster files

In [15]:
# Fetch the bulk 'clusters' and then the bulk 'opinions' resource for every
# court into data_dir, reporting the wall-clock time per court (the %d format
# truncates it to whole seconds).
for court in courts:
    start = time.time()

    for resource in ('clusters', 'opinions'):
        download_bulk_resource(court, resource, data_dir)

    elapsed = time.time() - start
    print('%s took %d seconds' % (court, elapsed))

requesting metadata for scotus
Downloading clusters data for court SCOTUS...
requesting metadata for scotus
Downloading opinions data for court SCOTUS...
scotus took 1140 seconds
requesting metadata for cafc
Downloading clusters data for court CAFC...
requesting metadata for cafc
Downloading opinions data for court CAFC...
cafc took 362 seconds
requesting metadata for cadc
Downloading clusters data for court CADC...
requesting metadata for cadc
Downloading opinions data for court CADC...
cadc took 668 seconds
requesting metadata for ca1
Downloading clusters data for court CA1...
requesting metadata for ca1
Downloading opinions data for court CA1...
ca1 took 783 seconds
requesting metadata for ca2
Downloading clusters data for court CA2...
requesting metadata for ca2
Downloading opinions data for court CA2...
ca2 took 737 seconds
requesting metadata for ca3
Downloading clusters data for court CA3...
requesting metadata for ca3
Downloading opinions data for court CA3...
ca3 took 1014 sec

# download the master edgelist

In [None]:
# Disabled: the call is commented out, so this cell downloads nothing when run.
# NOTE(review): the sub-edgelist cell further down reads
# data_dir + 'raw/edgelist_master_r.csv', presumably the file this call
# produces -- uncomment on a fresh data directory; TODO confirm.
# download_master_edgelist(data_dir)

# make case metadata

In [30]:
start = time.time()

# Load the raw metadata frame of every court first and concatenate once at the
# end. Calling DataFrame.append inside the loop re-copies the whole accumulated
# frame on every iteration (quadratic in the number of rows), needs a special
# case for the first court, and DataFrame.append no longer exists in pandas 2.0.
court_frames = []
for court in courts:
    court_frames.append(get_raw_case_metadata_from_court(court, data_dir))

# Rows keep the index labels of the per-court frames, in `courts` order.
case_metadata = pd.concat(court_frames)

# elapsed wall-clock seconds for the whole load
print(time.time() - start)

2496.03245115


In [33]:
# Write the combined metadata, index column included, to the raw folder. This
# runs before the SCOTUS clean-up cell below, so the file holds the uncleaned frame.
raw_metadata_path = data_dir + 'raw/fed_case_metadata_r.csv'
case_metadata.to_csv(raw_metadata_path, index=True)

# clean scotus

In [60]:
# Labels of the cases with no SCDB id: the first column of no_scdb_link.csv is
# read as the index and its values are cast to str.
no_scdb_index = pd.read_csv(data_dir + 'raw/no_scdb_link.csv', index_col=0).index
no_scdb_link = no_scdb_index.astype(str).tolist()

# Remove the SCOTUS cases with no SCDB id. Both drops mutate case_metadata in
# place, so this cell is not re-runnable: on a second run the labels are already
# gone and drop() raises.
case_metadata.drop(labels=no_scdb_link, inplace=True)

# Remove the single case labelled '96405' (the "detroit lumber" entry).
case_metadata.drop(labels='96405', inplace=True)


In [61]:
# Write the cleaned metadata (after the drops above), index column included,
# to the federal folder.
federal_metadata_path = data_dir + 'federal/case_metadata.csv'
case_metadata.to_csv(federal_metadata_path, index=True)

# get the federal subedgelist

In [69]:
# load master edgelist
master_edgelist = pd.read_csv(data_dir + 'raw/edgelist_master_r.csv')

# only keep edges whose citing AND cited case both appear in case_metadata
case_ids = set(case_metadata.index)

# Compare ids by their string form. case_metadata appears to be indexed by
# string labels (the clean-up cell drops '96405', and the graph cell looks ids
# up via str(op_id)), whereas read_csv parses purely numeric columns as
# integers -- and an integer never equals its string form inside isin(), which
# would silently filter out every edge. When both sides already share a type,
# the string comparison selects exactly the same rows.
case_ids_as_str = set(case_metadata.index.astype(str))
citing_in_federal = master_edgelist.citing.astype(str).isin(case_ids_as_str)
cited_in_federal = master_edgelist.cited.astype(str).isin(case_ids_as_str)

# the mask only selects rows; the kept values are the untouched originals
edgelist = master_edgelist[citing_in_federal & cited_in_federal]

# save federal edgelist
edgelist.to_csv(data_dir + 'federal/edgelist.csv', index=False)

## make igraph object

In [75]:
# initialize a directed graph with one vertex per row of case_metadata
G = ig.Graph(n=case_metadata.shape[0], directed=True)

# add opinion names: vertex i is named after the i-th index label
G.vs['name'] = case_metadata.index

# opinion to ig index mapping.
# Names were just assigned in vertex order, so a name's position in G.vs['name']
# is that vertex's igraph index. One enumerate pass replaces a separate
# G.vs.find(name=...) search per vertex. setdefault keeps the first vertex for
# a repeated name, which is what find() returned.
op_to_ig = {}
for ig_index, op_id in enumerate(G.vs['name']):
    op_to_ig.setdefault(op_id, ig_index)

# convert edgelist to ig ids, column by column; ids are looked up by their
# string form, and an id missing from the graph raises KeyError
edgelist_ig = edgelist.apply(lambda c: [op_to_ig[str(op_id)] for op_id in c])

# add edges to graph, one [source, target] pair per edgelist row.
# .values replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0.
G.add_edges(edgelist_ig.values.tolist())

## make case text files

## make tf-idf / cosine similarity matrix

## make snapshots

## make edge dataframe