In [35]:
# Absolute, machine-specific path ending in law-net/. Below it is joined with
# 'code/' and 'explore/vertex_metrics_experiment/code/' and those directories
# are appended to sys.path so the project modules can be imported.
# Change this when running on another machine.
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# our code
sys.path.append(top_directory + 'code/')
from pipeline.download_data import *
from pipeline.make_raw_case_metadata import *
from load_data import case_info

sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *
from make_snapshots import *
from make_graph import *


# court
court = 'scotus'
network_name = 'scotus'

# directory set up
# NOTE: these are absolute, machine-specific paths. Every path in this
# notebook is built by plain string concatenation, so each directory string
# must end with '/'.
data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'

# working directory for this network, named after network_name instead of a
# second hard-coded 'scotus' (the same directory/name pair is passed to
# make_graph further down)
experiment_data_dir = data_dir + network_name + '/'
text_dir = experiment_data_dir + 'textfiles/'



# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# download opinion and cluster files

In [15]:
# fetch both bulk resource types for the configured court into data_dir
for resource_type in ('clusters', 'opinions'):
    download_bulk_resource(court, resource_type, data_dir)

requesting metadata for scotus
Downloading clusters data for court SCOTUS...
requesting metadata for scotus
Downloading opinions data for court SCOTUS...
scotus took 1140 seconds
requesting metadata for cafc
Downloading clusters data for court CAFC...
requesting metadata for cafc
Downloading opinions data for court CAFC...
cafc took 362 seconds
requesting metadata for cadc
Downloading clusters data for court CADC...
requesting metadata for cadc
Downloading opinions data for court CADC...
cadc took 668 seconds
requesting metadata for ca1
Downloading clusters data for court CA1...
requesting metadata for ca1
Downloading opinions data for court CA1...
ca1 took 783 seconds
requesting metadata for ca2
Downloading clusters data for court CA2...
requesting metadata for ca2
Downloading opinions data for court CA2...
ca2 took 737 seconds
requesting metadata for ca3
Downloading clusters data for court CA3...
requesting metadata for ca3
Downloading opinions data for court CA3...
ca3 took 1014 sec

# download the master edgelist

In [None]:
# Currently disabled: un-comment the call below to fetch the master edgelist.
# The sub-edgelist cell further down reads
# data_dir + 'raw/edgelist_master_r.csv', which is presumably the file this
# call produces -- TODO confirm against download_master_edgelist.
# download_master_edgelist(data_dir)

# make case metadata

In [3]:
# Build the raw case metadata for `court`, presumably from the bulk files
# downloaded into data_dir above (TODO confirm against
# get_raw_case_metadata_from_court). The result is written to CSV in the
# next cell. %time reports the cost; the saved run took about 2 min 44 s
# of wall time.
%time case_metadata = get_raw_case_metadata_from_court(court, data_dir)

CPU times: user 2min 1s, sys: 9.24 s, total: 2min 10s
Wall time: 2min 44s


In [5]:
# persist the raw metadata (index column included) so it can be reloaded
# later without rebuilding it
raw_metadata_path = data_dir + 'raw/scotus_case_metadata_r.csv'
case_metadata.to_csv(raw_metadata_path, index=True)

# clean scotus

Remove SCOTUS cases that have no SCDB id.

In [6]:
# reload the raw metadata, using the first CSV column as the index
raw_metadata_path = data_dir + 'raw/scotus_case_metadata_r.csv'
case_metadata = pd.read_csv(raw_metadata_path, index_col=0)

# case ids are handled as strings from here on
case_metadata.index = case_metadata.index.astype('str')

In [8]:
# scotus scdb ids
scdb_ids = case_metadata['scdb_id']

# scotus cases with no scdb id (this list is also passed to make_text_files
# as CLid_bad further down)
# NOTE: if this cell is re-run without reloading case_metadata, the list
# comes back empty because those rows have already been removed -- reload
# the metadata first if no_scdb_link is still needed.
no_scdb_link = scdb_ids.index[scdb_ids.isnull()].tolist()

# remove SCOTUS cases with no SCDB id
# (rebinds the name instead of mutating with inplace=True)
case_metadata = case_metadata.drop(no_scdb_link)

# kill detroit lumber (case id '96405')
# Filtering with a mask instead of drop() keeps the cell re-runnable: drop()
# raises once the label has already been removed, the mask is simply a no-op.
# The id is a string because the metadata index is cast to str when loaded.
detroit_lumber_id = '96405'
case_metadata = case_metadata[~case_metadata.index.isin([detroit_lumber_id])]

In [10]:
# write the cleaned metadata (index column included) into the experiment
# directory
clean_metadata_path = experiment_data_dir + 'case_metadata.csv'
case_metadata.to_csv(clean_metadata_path, index=True)

# get the SCOTUS subedgelist

In [11]:
# load master edgelist
master_edgelist = pd.read_csv(data_dir + 'raw/edgelist_master_r.csv')

# The metadata index is cast to str when it is loaded, whereas read_csv
# infers the dtype of the citing/cited columns (integers for purely numeric
# ids). pandas' isin treats 1 and '1' as different values, so a mixed
# comparison can silently match nothing. Compare both sides as strings.
# (assumes the citing/cited columns contain no missing values -- TODO confirm)
case_ids = set(case_metadata.index.astype(str))
citing_in_court = master_edgelist['citing'].astype(str).isin(case_ids)
cited_in_court = master_edgelist['cited'].astype(str).isin(case_ids)

# only keep edges within scotus, i.e. both endpoints are SCOTUS cases;
# the rows keep the dtypes they had in master_edgelist
edgelist = master_edgelist[citing_in_court & cited_in_court]

# save the SCOTUS edgelist
edgelist.to_csv(experiment_data_dir + 'edgelist.csv', index=False)

## make igraph object

In [20]:
# Presumably builds the citation graph for network_name from the
# case_metadata.csv / edgelist.csv files written to experiment_data_dir above
# and saves it there; the snapshots section later loads
# experiment_data_dir + 'scotus_network.graphml' -- TODO confirm against
# make_graph. The saved run took about 9 s of wall time.
%time make_graph(experiment_data_dir, network_name)

CPU times: user 6.05 s, sys: 281 ms, total: 6.33 s
Wall time: 8.8 s


## make case text files

In [32]:
# make the text files for the given court, written to text_dir
# CLid_bad=no_scdb_link passes the ids without an SCDB id that were computed
# in the cleaning cell, so that cell must have been run (exactly once since
# the metadata was loaded) in this kernel session.
# NOTE(review): case '96405' is dropped from case_metadata but is not part of
# CLid_bad -- confirm whether its text file should be excluded as well.
# The saved run took about 11 min 22 s of wall time.
%time make_text_files(data_dir, court, CLid_good=None, CLid_bad=no_scdb_link, output_path = text_dir)

CPU times: user 9min 31s, sys: 28.5 s, total: 10min
Wall time: 11min 22s


## make tf-idf similarity matrix

In [33]:
# Build the tf-idf data from the text files in text_dir; the second argument
# (experiment_data_dir + 'nlp/') is presumably the output directory -- TODO
# confirm against make_tf_idf.
# NOTE(review): if min_df / max_df are forwarded to scikit-learn's
# TfidfVectorizer, the integer max_df=1 is an absolute document count (terms
# found in more than one document are discarded), whereas the float 1.0 means
# "no upper limit". Confirm which one is intended.
# The saved run took about 6 min 4 s of wall time.
%time make_tf_idf(text_dir, experiment_data_dir + 'nlp/', min_df=0, max_df=1)

CPU times: user 4min 47s, sys: 17.2 s, total: 5min 4s
Wall time: 6min 4s


## make snapshots

In [47]:
# load the saved citation network from the experiment directory
graph_path = experiment_data_dir + 'scotus_network.graphml'
G = ig.Graph.Read_GraphML(graph_path)

In [48]:
# names of the vertex-level metrics to compute for every snapshot
vertex_metrics = ['indegree', 'outdegree', 'degree']
vertex_metrics += ['d_pagerank', 'authorities', 'hubs']

# one snapshot per year, 1900 through 2015 inclusive
first_year, last_year = 1900, 2015
active_years = range(first_year, last_year + 1)

In [50]:
# Compute the metrics listed in vertex_metrics on G for the years in
# active_years; results are presumably written under experiment_data_dir
# (TODO confirm against make_snapshot_vertex_metrics). The saved run printed
# progress lines of the form "year Y, (i/117) at HH:MM:SS" and took about
# 2 min 14 s of wall time.
%time make_snapshot_vertex_metrics(G, active_years, vertex_metrics, experiment_data_dir)

year 1900, (2/117) at 17:03:19
year 1902, (4/117) at 17:03:20
year 1906, (8/117) at 17:03:22
year 1914, (16/117) at 17:03:27
year 1930, (32/117) at 17:03:41
year 1962, (64/117) at 17:04:21
CPU times: user 1min 51s, sys: 4.36 s, total: 1min 56s
Wall time: 2min 14s
