In [11]:
# Root directory of the project on this machine ('code/', 'data/' and
# 'explore/' are resolved beneath it further down). The path is absolute and
# machine-specific, so edit it before running elsewhere. Keep the trailing
# '/': sub-paths are built by plain string concatenation.
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

from __future__ import division

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# NLP
from nltk.corpus import stopwords


# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource

sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *
from make_case_text_files import *
from bag_of_words import *

# directory set up: both paths hang off top_directory and keep a trailing '/'
data_dir = ''.join([top_directory, 'data/'])
experiment_data_dir = ''.join([data_dir, 'vertex_metrics_experiment/'])

# court identifier handed to the project loaders in later cells
court_name = 'scotus'

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Load the graph for `court_name` from `data_dir` with the project's
# load_and_clean_graph helper. G is reused by the snapshot, edge-dataframe
# and case-text cells below.
G = load_and_clean_graph(data_dir, court_name)

# Make snapshots

In [43]:
# names of the vertex metrics handed to make_snapshot_vertex_metrics
vertex_metrics = [
    'indegree',
    's_pagerank',
    'hubs',
]

# alternative kept for reference: every fifth year from 1850 through 2020
# snapshot_year_list = np.array([year for year in range(1850, 2021) if year % 5 == 0])

# one snapshot per year, 1900 through 2016 inclusive
snapshot_year_list = range(1900, 2017)

In [44]:
# Run make_snapshot_vertex_metrics for the configured snapshot years and
# metric names, and report the elapsed wall-clock time.
start = time.time()
make_snapshot_vertex_metrics(
    G,
    snapshot_year_list,
    vertex_metrics,
    experiment_data_dir,
)
runtime = time.time() - start
print('make_snapshot_vertex_metrics took %d seconds' % runtime)

make_snapshot_vertex_metrics took 77 seconds


# make edge dataframe

In [45]:
# seed forwarded to make_edge_df
seed_edgedf = 432

# ask for as many non-edges as the graph has edges
num_non_edges_to_add = len(G.es())

# column names forwarded to make_edge_df
columns_to_use = [
    'indegree',
    'decayed_indegree',
    's_pagerank',
    'hubs',
    'age',
    'similarity',
]

In [46]:
# Build the edge dataframe with the settings chosen in the previous cell and
# report the elapsed wall-clock time.
start = time.time()
make_edge_df(
    G,
    experiment_data_dir,
    snapshot_year_list,
    num_non_edges_to_add,
    columns_to_use,
    seed=seed_edgedf,
)
runtime = time.time() - start
print('make_edge_df took %d seconds' % runtime)

make_edge_df took 482 seconds


# download opinion files

In [None]:
# The download used to be disabled by commenting the call out, which left this
# cell timing an empty block and printing a meaningless duration. It is now an
# explicit opt-in: set download_opinions to True to fetch the opinion files.
# NOTE(review): presumably download_bulk_resource fetches the files over the
# network (judging by its name) - confirm before enabling.
download_opinions = False

if download_opinions:
    start = time.time()

    download_bulk_resource(court_name,
                           resource='opinions',
                           data_dir=data_dir)

    runtime = time.time() - start
    print('downloading opinion files took %d seconds' % runtime)

# make case text files

In [10]:
# Write the case text files for every vertex name in G (CLid_good) with no
# exclusion list (CLid_bad=None), and report the elapsed wall-clock time.
start = time.time()
make_text_files(data_dir, court_name, CLid_good=G.vs['name'], CLid_bad=None)
runtime = time.time() - start

print('making case text files took %d seconds' % runtime)

# normalize corpus/put it into dict

In [78]:
# Build the normalized-text dict from experiment_data_dir and report the
# elapsed wall-clock time.
start = time.time()
normalized_text_dict = get_normalized_text_dict(experiment_data_dir)
runtime = time.time() - start

print('normalized text files took %d seconds' % runtime)

normalized test files took 100 seconds


# compute tf-idf matrix

In [14]:
# get_td_idf returns three values: the tf-idf matrix, the vocabulary, and the
# CL id -> matrix index lookup. Report the elapsed wall-clock time.
start = time.time()
(tfidf_matrix,
 vocab,
 CLid_to_index) = get_td_idf(normalized_text_dict)
runtime = time.time() - start

print('computing td-idf matrix took %d seconds' % runtime)

computing td-idf matrix took 142 seconds


# compute pairwise cosine similarities

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
start = time.time()

# pairwise cosine similarity between the rows of the tf-idf matrix;
# dense_output=False keeps the result sparse
S = cosine_similarity(tfidf_matrix,
                      dense_output=False)

# Change data type to shrink the matrix. astype returns a NEW matrix rather
# than converting in place, so the result must be rebound to S; the previous
# bare `S.astype(...)` call threw the converted copy away.
# float32 is used because float16 is not among the dtypes scipy.sparse
# supports - TODO confirm against the installed scipy version.
S = S.astype(np.float32)

runtime = time.time() - start
print('computing parwise distances took %d seconds' % runtime)

computing parwise distances took 950 seconds


In [17]:
# Persist both artefacts of the similarity computation and time the writes.
start = time.time()

# similarity matrix -> <experiment_data_dir>cosine_sims
save_sparse_csr(filename=experiment_data_dir + 'cosine_sims', array=S)

# CL id -> matrix index lookup, pickled next to the matrix
with open(experiment_data_dir + 'CLid_to_index.p', 'wb') as fp:
    pickle.dump(CLid_to_index, fp)

runtime = time.time() - start
print('saving pairwise matrix took %d seconds' % runtime)

saving pairwise matrix took 178 seconds


# get NLP similarity

In [36]:
# Rebuild the CL id -> position lookup from `cases`, replacing the mapping
# that get_td_idf returned in an earlier cell.
# NOTE(review): `cases` is not defined in any cell shown in this notebook -
# confirm where it comes from before relying on a fresh-kernel run.
CLid_to_index = {cases[i] : i for i in range(len(cases))}

In [47]:
def get_similarities(similarity_matrix, CLid_A, CLid_B, CLid_to_index,
                     skip_missing=False):
    """
    Look up the similarity of each (CLid_A[i], CLid_B[i]) pair.

    Parameters
    ----------
    similarity_matrix: object indexable as similarity_matrix[row, col]

    CLid_A, CLid_B: equal-length sequences of CL ids; pair i is
    (CLid_A[i], CLid_B[i])

    CLid_to_index: dict mapping a CL id to its index in similarity_matrix

    skip_missing: if False (default), an id that is not a key of
    CLid_to_index raises KeyError. If True, such a pair gets float('nan')
    instead, so the output stays aligned with the input pairs.

    Returns
    -------
    list with one similarity per pair, in input order

    Raises
    ------
    ValueError: CLid_A and CLid_B differ in length
    KeyError: an id is missing from CLid_to_index and skip_missing is False
    """
    if len(CLid_A) != len(CLid_B):
        raise ValueError('lists not the same length')

    similarities = []
    for clid_a, clid_b in zip(CLid_A, CLid_B):
        if skip_missing and not (clid_a in CLid_to_index and
                                 clid_b in CLid_to_index):
            # keep a placeholder so positions still line up with the inputs
            similarities.append(float('nan'))
            continue

        # convert CL ids to matrix indices
        row = CLid_to_index[clid_a]
        col = CLid_to_index[clid_b]
        similarities.append(similarity_matrix[row, col])

    return similarities

In [58]:
# For every edge of G, in edge order, collect the 'name' attribute of its
# endpoints: source vertices go to CLid_ing, target vertices to CLid_ed
# (presumably the citing and cited case, going by the variable names).
CLid_ing = [G.vs[e.source]['name'] for e in G.es]
CLid_ed = [G.vs[e.target]['name'] for e in G.es]


In [59]:
# Look up the similarity of every edge's endpoint pair and time the lookup.
# NOTE(review): the recorded run of this cell failed with KeyError: '3181043',
# i.e. at least one id in CLid_ing/CLid_ed has no entry in CLid_to_index, so
# `sims` and `runtime` were never assigned by that run.
start = time.time()
sims = get_similarities(S, CLid_ing, CLid_ed, CLid_to_index)
runtime = time.time() - start

KeyError: '3181043'

In [None]:
# surgery

In [74]:
# Shape of the tf-idf matrix, shown so its row count can be compared with the
# vertex count of G in the next cell.
# NOTE(review): presumably (number of case texts, vocabulary size) - confirm
# against get_td_idf.
tfidf_matrix.shape

(33157, 206044)

In [77]:
# Number of vertices in G, for comparison with the tf-idf matrix row count.
len(G.vs)

33247