In [4]:
import os

# Locations of the law-net repository checkout and of the CourtListener data.
# Either can be overridden without editing the notebook by exporting
# LAW_NET_REPO_DIR / LAW_NET_DATA_DIR before starting Jupyter; when unset (or
# empty) the original machine-specific paths are used.
# os.path.join(path, '') guarantees a trailing separator, which the rest of the
# notebook relies on when it builds sub-paths by plain string concatenation.
repo_directory = os.path.join(
    os.environ.get('LAW_NET_REPO_DIR')
    or '/Users/iaincarmichael/Dropbox/Research/law/law-net/',
    '')

data_dir = os.path.join(
    os.environ.get('LAW_NET_DATA_DIR')
    or '/Users/iaincarmichael/data/courtlistener/',
    '')

import numpy as np
import sys

# graph package
import igraph as ig

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from make_network_data import *
from make_graph import make_graph
from bag_of_words import make_tf_idf
from make_snapshots import make_snapshot_vertex_metrics, update_snapshot_vertex_metrics

# name of the court network this notebook builds data for
# (other accepted values include 'federal', 'ca1', etc)
network_name = 'scotus'


# derived locations used throughout the notebook; data_dir must already end
# with '/' because the pieces are joined without adding a separator
raw_dir = '{}raw/'.format(data_dir)
subnet_dir = '{}{}/'.format(data_dir, network_name)
text_dir = '{}textfiles/'.format(subnet_dir)


# jupyter notebook settings
# autoreload in mode 2 re-imports modules before each cell executes, so edits
# to the project code imported in this cell are picked up without a kernel restart
%load_ext autoreload
%autoreload 2
# draw matplotlib figures inline in the notebook output
%matplotlib inline

# set up the data directory

In [None]:
# set up the top-level data directory at data_dir
# (presumably creates the folder layout the later steps write into -- see setup_data_dir.py to confirm)
setup_data_dir(data_dir)

In [None]:
# set up the directory for the chosen network inside data_dir
# (presumably the folder this notebook refers to as subnet_dir -- confirm in setup_data_dir.py)
make_subnetwork_directory(data_dir, network_name)

# data download

## get opinion and cluster files from CourtListener

opinions/cluster files are saved in data_dir/raw/court/ 

In [None]:
# download the CourtListener opinion and cluster files for the chosen network
# NOTE(review): download_op_and_cl_files is not among the names explicitly
# imported in the setup cell (pipeline.download_data contributes
# download_bulk_resource instead); this call only resolves if
# `from make_network_data import *` exports it -- confirm on a fresh kernel
download_op_and_cl_files(data_dir, network_name)

## get the master edgelist from CL

master edgelist is saved in data_dir/raw/

In [None]:
# fetch the CourtListener master edgelist; per this section's notes it is saved under data_dir/raw/
download_master_edgelist(data_dir)

## download scdb data from SCDB

scdb data is saved in data_dir/scdb

In [None]:
# fetch the SCDB data; per this section's notes it is saved under data_dir/scdb
download_scdb(data_dir)

# network data

## make the case metadata and edgelist

- add the raw case metadata data frame to the raw/ folder
- remove cases missing SCDB ids
- remove the Detroit Lumber case
- get the edgelist of cases within the desired subnetwork
- save the case metadata and edgelist to the subnetwork directory (subnet_dir, i.e. data_dir/network_name/)

In [None]:
# create the raw case metadata data frame for the chosen network in the raw/ folder
# (built from the previously downloaded files -- presumably the opinion/cluster files; confirm in make_network_data.py)
make_subnetwork_raw_case_metadata(data_dir, network_name)

In [None]:
# create clean case metadata and edgelist from raw data
# (the cleaning steps are the ones listed in this section's bullet points;
# must run after the raw case metadata has been created)
clean_metadata_and_edgelist(data_dir, network_name)

## make graph

Creates the network with the desired case metadata and saves it as a .graphml file in subnet_dir (it is loaded later from `subnet_dir + network_name + '_network.graphml'`).

In [None]:
# build the citation network for network_name and save it as a .graphml file
# (note: takes subnet_dir rather than data_dir, unlike the earlier pipeline steps)
make_graph(subnet_dir, network_name)

# NLP data

## make case text files

Grabs the opinion text for each case in the network and saves each one as a text file in text_dir (`subnet_dir + 'textfiles/'`).

In [None]:
%%time
# write one text file of opinion text per case in the network
# (these are the '<case name>.txt' files under text_dir that the word-count cell reads later)
make_network_textfiles(data_dir, network_name)

## make tf-idf matrix

Creates the tf-idf matrix for the corpus of cases in the network and saves it to `subnet_dir + 'nlp/'`.

In [2]:
%%time
# build the tf-idf matrix from the case text files in text_dir and save it under subnet_dir + 'nlp/'
# (slow: the recorded run for this cell took roughly 39 minutes of wall time)
make_tf_idf(text_dir, subnet_dir + 'nlp/')

CPU times: user 38min 25s, sys: 14.4 s, total: 38min 39s
Wall time: 38min 57s


# Load network

In [5]:
# read the saved network back in from its GraphML file inside subnet_dir
G = ig.Graph.Read_GraphML('{}{}_network.graphml'.format(subnet_dir, network_name))

In [6]:
# sanity check: show igraph's one-line summary (graph type, vertex/edge counts, attribute names)
G.summary()

'IGRAPH DN-- 27885 234312 -- \n+ attr: court (v), id (v), issueArea (v), name (v), num_words (v), year (v)'

# compute snapshots

In [9]:
# names of the vertex metrics to compute for every snapshot, grouped by family
vertex_metrics = [
    # degree counts
    'indegree', 'outdegree', 'degree',
    # pagerank (directed / undirected)
    'd_pagerank', 'u_pagerank',
    # HITS scores
    'authorities', 'hubs',
    # eigenvector centrality (directed / undirected)
    'd_eigen', 'u_eigen',
    # betweenness (directed / undirected)
    'd_betweenness', 'u_betweenness',
    # closeness (directed / undirected)
    'd_closeness', 'u_closeness',
]

# recent-citation metrics: every window from 1 to 10, then a coarser grid up to 40
vertex_metrics.extend('recentcite_%d' % t
                      for t in list(range(1, 11)) + [15, 20, 25, 30, 35, 40])

# snapshot years, 1900 through 2015 inclusive
active_years = range(1900, 2016)

In [5]:
%%time
# compute every metric in vertex_metrics for each year in active_years; results go under subnet_dir
# (very slow: the recorded run took about 8 hours of wall time and printed
# "problem with d_eigen" messages -- NOTE(review): check the d_eigen values for the affected snapshots)
make_snapshot_vertex_metrics(G, active_years, vertex_metrics, subnet_dir)

year 1900, (2/117) at 10:18:49
year 1902, (4/117) at 10:19:43
year 1906, (8/117) at 10:22:09
year 1914, (16/117) at 10:30:38
year 1930, (32/117) at 10:56:53


  metric_column = G.eigenvector_centrality()


problem with d_eigen
year 1962, (64/117) at 12:29:54
problem with d_eigen
CPU times: user 8h 2min 56s, sys: 1min 19s, total: 8h 4min 15s
Wall time: 8h 6min 21s


# update snapshots

In [13]:
# extra metrics to append to the existing snapshots:
# two standalone metrics plus one citerank variant per parameter value
to_add = ['rev_pagerank', 'num_words'] + [
    'citerank_%d' % t for t in (1, 2, 5, 10, 20, 50)
]

In [19]:
%%time
# add the metrics listed in to_add to the existing snapshots for each year in active_years
# NOTE(review): 'num_words' is in to_add, so G presumably needs the num_words
# vertex attribute before this runs -- confirm in make_snapshots.py.
# NOTE(review): the recorded timing for this cell is a few microseconds, which
# suggests the call did not actually run when the output was saved -- re-run to confirm
update_snapshot_vertex_metrics(G, active_years, to_add, subnet_dir)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.2 µs


# add text length

Adds each case's word count (number of whitespace-separated tokens in its text file) as the `num_words` vertex attribute.
TODO: put this in the data pipeline

In [6]:
# Count the whitespace-separated tokens in each case's text file and store the
# counts as the 'num_words' vertex attribute.
# Counts are collected in vertex order and assigned in a single bulk write,
# which avoids a G.vs.find() lookup per case (quadratic over the whole graph).
# Assumes vertex names are unique, as the original per-name lookup did.
word_counts = []
for op_id in G.vs['name']:
    # the context manager closes each file promptly instead of leaking one
    # open handle per case
    with open(text_dir + op_id + '.txt', 'r') as text_file:
        word_counts.append(len(text_file.read().split()))

G.vs['num_words'] = word_counts

In [7]:
# persist the graph to the same GraphML path it was loaded from (overwrites that file),
# so vertex attributes added in this session are kept
G.write_graphml('{}{}_network.graphml'.format(subnet_dir, network_name))