In [1]:
# Root of the local law-net checkout. The project code directories under it
# ('code/' and 'explore/vertex_metrics_experiment/code/') are appended to
# sys.path further down, so this must match your machine.
#top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
top_directory = '/Users/Michael/Documents/GitHub/law-net/'

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# our code
sys.path.append(top_directory + 'code/')
from pipeline.download_data import *
from pipeline.make_raw_case_metadata import *
from load_data import case_info

sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *
from make_snapshots import *
from make_graph import *


# court
court = 'scotus'
network_name = 'scotus'

# directory set up
#
# Root folder holding the downloaded data. Set the LAW_NET_DATA_DIR
# environment variable to point at your own copy instead of editing this
# cell (e.g. /Users/iaincarmichael/Documents/courtlistener/data/). When the
# variable is unset or empty, the previous hard-coded location is used.
data_dir = os.environ.get('LAW_NET_DATA_DIR') or '/Users/Michael/Desktop/blah/'

# Later cells build paths by plain string concatenation
# (data_dir + 'raw/...'), so the directory must end with a separator.
if not data_dir.endswith('/'):
    data_dir += '/'

# everything specific to this experiment lives under <data_dir>/scotus/
experiment_data_dir = data_dir + 'scotus/'
text_dir = experiment_data_dir + 'textfiles/'



# jupyter notebook settings
# autoreload mode 2 re-imports modules before each cell runs, so edits to the
# project code on sys.path are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# render matplotlib figures inside the notebook output
%matplotlib inline

# download scdb files

In [2]:
# fetch the SCDB files; data_dir is handed to the helper, presumably as the
# download destination -- confirm in the helper's source
download_scdb(data_dir)

# download opinion and cluster files

In [3]:
# fetch both bulk resources for the court: clusters first, then opinions
for resource in ('clusters', 'opinions'):
    download_bulk_resource(court, resource, data_dir)

requesting metadata for scotus
Downloading clusters data for court SCOTUS...
requesting metadata for scotus
Downloading opinions data for court SCOTUS...


# download the master edgelist

In [4]:
# TODO(review): this call was commented out at some point and the reason was
# never recorded. The edgelist cell further down reads
# data_dir + 'raw/edgelist_master_r.csv', which is presumably what this
# download produces -- confirm before disabling it again.
download_master_edgelist(data_dir)

downloading edgelist gzip...


# make case metadata

In [5]:
# build the raw metadata table for this court; this is a slow step -- the
# recorded run took a little over 4 minutes of wall time
%time case_metadata = get_raw_case_metadata_from_court(court, data_dir)

Wall time: 4min 20s


In [6]:
# cache the raw metadata on disk; the index is written as the first CSV
# column so it can be restored with index_col=0 when the file is read back
case_metadata.to_csv(data_dir + 'raw/scotus_case_metadata_r.csv', index=True)

# clean scotus

Remove SCOTUS cases that have no SCDB id.

In [7]:
# reload the cached raw metadata, using the first CSV column as the index
raw_metadata_path = data_dir + 'raw/scotus_case_metadata_r.csv'
case_metadata = pd.read_csv(raw_metadata_path, index_col=0)

# case ids are used as string labels from here on (e.g. drop('96405'))
case_metadata.index = case_metadata.index.astype(str)

In [8]:
# NOTE: this cell mutates case_metadata in place and is not safe to re-run.
# A second run recomputes no_scdb_link from the already-filtered frame (so it
# comes out empty) and the drop of '96405' raises KeyError because the label
# is gone. Re-run the loading cell first if this cell has to be repeated.

# SCDB id column of the SCOTUS metadata
scdb_ids = case_metadata['scdb_id']

# index labels of the cases whose SCDB id is null
missing_scdb = scdb_ids.isnull()
no_scdb_link = scdb_ids[missing_scdb].index.tolist()

# discard every case that has no SCDB id
case_metadata.drop(labels=no_scdb_link, inplace=True)

# discard case 96405 (Detroit Lumber)
case_metadata.drop(labels='96405', inplace=True)

In [9]:
# write the filtered metadata (index included) into the experiment folder
case_metadata.to_csv(experiment_data_dir + 'case_metadata.csv', index=True)

# get the SCOTUS subedgelist

In [10]:
# load master edgelist
master_edgelist = pd.read_csv(data_dir + 'raw/edgelist_master_r.csv')

# only keep edges within scotus
# The case ids in case_metadata's index are strings, while read_csv infers an
# integer dtype for purely numeric id columns. Comparing ints against strings
# can make isin() match nothing, so both sides are compared as strings.
case_ids = set(case_metadata.index.astype(str))
citing_in_court = master_edgelist.citing.astype(str).isin(case_ids)
cited_in_court = master_edgelist.cited.astype(str).isin(case_ids)

# an edge is kept only when both of its endpoints are SCOTUS cases
edgelist = master_edgelist[citing_in_court & cited_in_court]

# save scotus edgelist
edgelist.to_csv(experiment_data_dir + 'edgelist.csv', index=False)

## make igraph object

In [11]:
# build the network for network_name from the files in experiment_data_dir --
# presumably the case_metadata.csv and edgelist.csv written above, with the
# result saved as the .graphml file loaded later; confirm in make_graph
%time make_graph(experiment_data_dir, network_name)

Wall time: 4.52 s


## make case text files

In [12]:
# make the text files for the given court, with text_dir as the output path.
# The cases without an SCDB id (no_scdb_link) are passed as CLid_bad,
# presumably the list of cases to skip -- confirm in make_text_files.
# NOTE(review): case '96405' is also dropped from case_metadata but is not in
# no_scdb_link -- check whether its text file should be excluded as well.
# NOTE(review): no_scdb_link comes from the metadata-cleaning cell; re-running
# that cell after the drop leaves the list empty, so run the notebook in order.
%time make_text_files(data_dir, court, CLid_good=None, CLid_bad=no_scdb_link, output_path = text_dir)

Wall time: 11min 30s


## make tf-idf similarity matrix

In [13]:
# build the tf-idf data from the case text files in text_dir; the second
# argument (experiment_data_dir + 'nlp/') is presumably the output folder
# NOTE(review): max_df=1 is an int. If make_tf_idf forwards min_df/max_df to
# scikit-learn's TfidfVectorizer, an integer max_df is an absolute document
# count (terms found in more than 1 document are ignored), whereas the float
# 1.0 means "no upper limit". Confirm which one is intended.
%time make_tf_idf(text_dir, experiment_data_dir + 'nlp/', min_df=0, max_df=1)

Wall time: 3min 41s


## make snapshots

In [14]:
# load the network stored as GraphML in the experiment folder
# NOTE(review): the file name hard-codes 'scotus'; presumably it should follow
# network_name -- confirm against how make_graph names its output
G = ig.Graph.Read_GraphML(experiment_data_dir + 'scotus_network.graphml')

In [15]:
# metric names handed to make_snapshot_vertex_metrics
vertex_metrics = [
    'indegree',
    'outdegree',
    'degree',
    'd_pagerank',
    'authorities',
    'hubs',
]

# snapshot years: 1900 through 2015, both ends included
first_year, last_year = 1900, 2015
active_years = range(first_year, last_year + 1)

In [16]:
# run the snapshot step for graph G over active_years with the chosen
# vertex_metrics; experiment_data_dir is presumably where the results are
# written -- confirm in make_snapshots. The recorded run took under 2 minutes.
%time make_snapshot_vertex_metrics(G, active_years, vertex_metrics, experiment_data_dir)

year 1900, (2/117) at 00:29:38
year 1902, (4/117) at 00:29:39
year 1906, (8/117) at 00:29:40
year 1914, (16/117) at 00:29:44
year 1930, (32/117) at 00:29:52
year 1962, (64/117) at 00:30:17
Wall time: 1min 41s
