In [2]:
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

from __future__ import division

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# NLP
from nltk.corpus import stopwords


# our code
# Extend sys.path with the project's code/ directory so the project-local
# modules imported right below can be resolved (presumably they live there --
# TODO confirm against the repository layout).
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource
from pipeline.make_clean_data import *
from viz import print_describe


# Same trick for the vertex-metrics experiment code.
# NOTE(review): the star imports below make it impossible to tell from this
# notebook which module supplies each helper used later (e.g. make_edge_df,
# get_td_idf, save_sparse_csr); a later star import can also silently shadow
# a name from an earlier one.
sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *

# directory set up -- both paths keep their trailing slash because later cells
# build file paths by plain string concatenation
data_dir = '{0}data/'.format(top_directory)
experiment_data_dir = '{0}vertex_metrics_experiment/'.format(data_dir)

# court identifier passed to the loading / download / text-file helpers below
court_name = 'scotus'

# jupyter notebook settings
# autoreload in mode 2 reloads all modules before each cell is executed, so
# edits to the project's .py files are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# draw matplotlib figures inline in the notebook output
%matplotlib inline

In [3]:
# Load the graph for `court_name` from `data_dir`. What "cleaning" is applied is
# decided inside load_and_clean_graph and is not visible from this notebook.
# Later cells use G like an igraph graph (G.vs, G.es(), G.subgraph).
G = load_and_clean_graph(data_dir, court_name)

# get edgelist and case metadata

## get raw edgelist

## get raw case metadata

## make clean case metadata (from raw)

In [2]:
# Build the clean case metadata from the raw metadata, timing the call.
# overwrite=False is passed through; presumably existing output is then left
# alone (TODO confirm in make_clean_case_metadata).
start = time.time()
make_clean_case_metadata(data_dir, overwrite=False)
runtime = time.time() - start

# elapsed wall-clock seconds
print(runtime)

## make clean edgelist (from raw)

In [1]:
# Build the clean edgelist from the raw one, timing the call.
start = time.time()
make_clean_edgelist(data_dir, overwrite=False)
runtime = time.time() - start

# elapsed wall-clock seconds
print(runtime)

## make graph

# make NLP similarity matrix

## download opinion files

In [4]:
# Request the bulk 'opinions' resource for the selected court, with data_dir
# as the data directory, and time the download.
start = time.time()
download_bulk_resource(court_name, resource='opinions', data_dir=data_dir)
runtime = time.time() - start

print('downloading opinion files took %d seconds' % runtime)

requesting metadata for scotus
Downloading opinions data for court SCOTUS...
downloading opinion files took 281 seconds


## make case text files

In [10]:
# Write the per-case text files, timing the call. The vertex names of G are
# passed as CLid_good (presumably the set of case ids to keep -- TODO confirm);
# no CLid_bad list is supplied.
start = time.time()
make_text_files(data_dir, court_name,
                CLid_good=G.vs['name'],
                CLid_bad=None)
runtime = time.time() - start

print('making case text files took %d seconds' % runtime)

making case text files took 559 seconds


## normalize corpus/put it into dict

In [3]:
# Read and normalize the corpus into a dict, timing the call. The exact
# key/value layout is defined by get_normalized_text_dict.
start = time.time()
normalized_text_dict = get_normalized_text_dict(experiment_data_dir)
runtime = time.time() - start

print('normalized text files took %d seconds' % runtime)

normalized text files took 90 seconds


## compute tf-idf matrix

In [4]:
# Turn the normalized corpus into a tf-idf matrix, timing the call.
# get_td_idf hands back three values: the matrix, the vocabulary and a
# CLid -> index mapping (names as unpacked here).
start = time.time()
tfidf_matrix, vocab, CLid_to_index = get_td_idf(normalized_text_dict)
runtime = time.time() - start

print('computing td-idf matrix took %d seconds' % runtime)

computing td-idf matrix took 147 seconds


In [5]:
# Hand the tf-idf matrix to save_sparse_csr with a target path inside
# experiment_data_dir (presumably it is written to disk there -- TODO confirm
# the on-disk format / extension). Note the file stem is spelled 'tdidf_matrix'.
save_sparse_csr(experiment_data_dir + 'tdidf_matrix', tfidf_matrix)

## compute pairwise cosine distances

In [6]:
# Build the similarity matrix from the tf-idf matrix, timing the call.
# The timer wraps the entire make_similarity_matrix call. Judging by the
# recorded output, that call reports its own "computing" and "saving" times,
# so this total is labelled with the function name (as the make_edge_df and
# make_snapshot_vertex_metrics cells do) instead of claiming to measure only
# the save step.
start = time.time()
make_similarity_matrix(experiment_data_dir, tfidf_matrix, CLid_to_index)
runtime = time.time() - start

print('make_similarity_matrix took %d seconds' % runtime)

computing cosine distances took 2287 seconds
saving the matrix took 12 seconds
saving similarity matrix took 2393 seconds


# compute snapshots and make edge data frame

## Make snapshots

In [25]:
# Names of the vertex metrics handed to make_snapshot_vertex_metrics, one
# family per line. The d_/u_ prefixes presumably stand for directed /
# undirected variants (TODO confirm in make_snapshots).
vertex_metrics = [
    'indegree', 'outdegree', 'degree',
    'd_pagerank', 'u_pagerank',
    'd_closeness', 'u_closeness',
    'd_betweenness', 'u_betweenness',
    'authorities', 'hubs',
    'd_eigen', 'u_eigen',
]

# years 1900 through 2015 inclusive
active_years = range(1900, 2016)

In [4]:
# Run make_snapshot_vertex_metrics on G for the chosen years and metrics,
# passing experiment_data_dir along, and time the call.
start = time.time()
make_snapshot_vertex_metrics(G, active_years, vertex_metrics, experiment_data_dir)
runtime = time.time() - start

print('make_snapshot_vertex_metrics took %d seconds' % runtime)

make_snapshot_vertex_metrics took 59 seconds


## make edge dataframe

In [5]:
# columns requested from make_edge_df
columns_to_use = [
    'indegree',
    'decayed_indegree',
    's_pagerank',
    'hubs',
    'age',
    'similarity',
]

# ask for as many non-edges as the graph has edges
num_non_edges_to_add = len(G.es())

# value passed to make_edge_df as seed=...
seed_edgedf = 2345

In [77]:
# Build the edge data frame with the parameters chosen above, timing the call.
start = time.time()
make_edge_df(G, experiment_data_dir, active_years,
             num_non_edges_to_add, columns_to_use,
             seed=seed_edgedf)
runtime = time.time() - start

print('make_edge_df took %d seconds' % runtime)

make_edge_df took 532 seconds


# surgery

In [5]:
# Scratch graph: subgraph of G built from its first 100 vertices (G.vs[:100]),
# presumably so the experiments in this section run quickly.
g = G.subgraph(G.vs[:100])

In [43]:
# Metric names for the scratch work in this section (same entries, same order,
# as the vertex_metrics list defined earlier in the notebook).
metrics = [
    # degree counts
    'indegree',
    'outdegree',
    'degree',
    # pagerank
    'd_pagerank',
    'u_pagerank',
    # closeness
    'd_closeness',
    'u_closeness',
    # betweenness
    'd_betweenness',
    'u_betweenness',
    # HITS scores
    'authorities',
    'hubs',
    # eigenvector centrality
    'd_eigen',
    'u_eigen',
]



mean 0.07 for indegree
mean 0.07 for outdegree
mean 0.14 for degree
mean 1.00 for d_pagerank
mean 1.00 for u_pagerank
mean 0.01 for d_closeness
mean 0.01 for u_closeness
mean 0.00 for d_betweenness
mean 0.04 for u_betweenness
mean 0.03 for authorities
mean 0.01 for hubs
mean 0.00 for d_eigen
mean 0.03 for u_eigen


  metric_column = G.eigenvector_centrality()
