In [3]:
# Root of the local project checkout; every other path in this notebook is
# built by string concatenation from it, so it must keep its trailing slash.
# NOTE(review): machine-specific absolute path -- has to be edited to run on
# any other machine.
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

# Python 2: make `/` perform true division.
from __future__ import division

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# numerical / tabular data
import numpy as np
import pandas as pd

# plotting
import matplotlib.pyplot as plt


# graph library (G below is used through its .vs / .es accessors)
import igraph as ig


# NLP
from nltk.corpus import stopwords


# project code: make <top_directory>/code/ importable, then pull in the
# data-loading and cleaning helpers used by the cells below.
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource
from pipeline.make_clean_data import *
from viz import print_describe


# experiment code: make the vertex_metrics_experiment code directory importable.
# NOTE(review): these are wildcard imports, so a later module can silently
# shadow a name from an earlier one -- the order of these lines matters and
# the origin of helper names used below cannot be read off this cell.
sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *

# directory set up: shared data directory and this experiment's sub-directory
data_dir = top_directory + 'data/'
experiment_data_dir = data_dir + 'vertex_metrics_experiment/'

# court identifier passed to the loading / downloading helpers below
court_name = 'scotus'

# jupyter notebook settings: auto-reload edited modules before each cell runs
# and render matplotlib figures inline
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
# Load the citation graph for `court_name` from `data_dir`. Later cells read
# G.vs['name'] (used as the list of case ids) and G.es, so G is presumably an
# igraph Graph -- TODO confirm against load_data.load_and_clean_graph.
# NOTE(review): this cell runs before the "make clean edgelist" step below, so
# G must be reloaded if the clean edgelist is regenerated.
G = load_and_clean_graph(data_dir, court_name)

# get edgelist and case metadata

## get raw edgelist

## get raw case metadata

## make clean case metadata (from raw)

In [2]:
# Build the clean case metadata from the raw files under data_dir and report
# the elapsed wall-clock time in seconds.
start = time.time()
make_clean_case_metadata(data_dir, overwrite=False)
runtime = time.time() - start

print(runtime)

## make clean edgelist (from raw)

In [None]:
# Build the clean edgelist from the raw files under data_dir and report the
# elapsed wall-clock time in seconds.
start = time.time()
make_clean_edgelist(data_dir, overwrite=False)
runtime = time.time() - start

print(runtime)

## make graph

# make NLP similarity matrix

## download opinion files

In [4]:
# Fetch the bulk 'opinions' resource for the selected court into data_dir,
# timing the whole download.
start = time.time()
download_bulk_resource(court_name, resource='opinions', data_dir=data_dir)
runtime = time.time() - start

print('downloading opinion files took %d seconds' % runtime)

requesting metadata for scotus
Downloading opinions data for court SCOTUS...
downloading opinion files took 281 seconds


## make case text files

In [10]:
# Write the case text files, passing the graph's vertex names as the list of
# case ids to keep (CLid_good) and no exclusion list (CLid_bad=None).
start = time.time()
make_text_files(data_dir, court_name, CLid_good=G.vs['name'], CLid_bad=None)
runtime = time.time() - start

print('making case text files took %d seconds' % runtime)

making case text files took 559 seconds


## normalize corpus/put it into dict

In [91]:
# Load the normalized corpus from the experiment directory; the result is fed
# to get_td_idf in the next step.
start = time.time()
normalized_text_dict = get_normalized_text_dict(experiment_data_dir)
runtime = time.time() - start

print('normalized text files took %d seconds' % runtime)

normalized text files took 97 seconds


## compute tf-idf matrix

In [92]:
# Turn the normalized corpus into a tf-idf matrix. The helper also returns the
# vocabulary and a CLid -> index map (pickled later alongside the similarity
# matrix).
start = time.time()
tfidf_matrix, vocab, CLid_to_index = get_td_idf(normalized_text_dict)
runtime = time.time() - start

print('computing td-idf matrix took %d seconds' % runtime)

computing td-idf matrix took 131 seconds


## compute pairwise cosine similarities

In [93]:
from sklearn.metrics.pairwise import cosine_similarity

In [94]:
# Pairwise cosine similarity between the rows of the tf-idf matrix.
# dense_output=False asks scikit-learn to keep the result sparse when the
# input is sparse (tfidf_matrix is presumably sparse -- TODO confirm).
start = time.time()
S = cosine_similarity(tfidf_matrix, dense_output=False)

# Downcast to shrink the matrix before it is saved. astype() returns a new
# matrix instead of converting in place, so the result must be re-bound; the
# previous bare `S.astype(...)` call was a no-op and S stayed at its original
# dtype. float32 is used rather than float16 because scipy.sparse does not
# list float16 among its supported dtypes.
# NOTE(review): confirm float32 precision is acceptable for downstream use.
S = S.astype(np.float32)

runtime = time.time() - start
print('computing parwise distances took %d seconds' % runtime)

computing parwise distances took 992 seconds


In [95]:
# Persist the two artifacts later steps need: the sparse similarity matrix and
# the CLid -> row/column index map that goes with it.
start = time.time()

# similarity matrix (written by the project's save_sparse_csr helper)
save_sparse_csr(filename=os.path.join(experiment_data_dir, 'cosine_sims'),
                array=S)

# CLid -> index map, pickled next to the matrix
with open(os.path.join(experiment_data_dir, 'CLid_to_index.p'), 'wb') as fp:
    pickle.dump(CLid_to_index, fp)

runtime = time.time() - start
print('saving pairwise matrix took %d seconds' % runtime)

saving pairwise matrix took 180 seconds


# compute snapshots and make edge data frame

## Make snapshots

In [7]:
# Vertex metrics to compute on every snapshot.
vertex_metrics = [
    'indegree',
    's_pagerank',
    'hubs',
]

# One snapshot per year, 1900 through 2016 inclusive.
# (An earlier alternative used every fifth year from 1850 to 2020.)
snapshot_year_list = range(1900, 2017)

In [44]:
# Compute the chosen vertex metrics for each snapshot year; results go under
# experiment_data_dir.
start = time.time()
make_snapshot_vertex_metrics(G,
                             snapshot_year_list,
                             vertex_metrics,
                             experiment_data_dir)
runtime = time.time() - start

print('make_snapshot_vertex_metrics took %d seconds' % runtime)

make_snapshot_vertex_metrics took 77 seconds


## make edge dataframe

In [8]:
# Columns to include in the edge data frame.
columns_to_use = [
    'indegree',
    'decayed_indegree',
    's_pagerank',
    'hubs',
    'age',
    'similarity',
]

# Add as many non-edges as there are edges in the graph.
num_non_edges_to_add = G.ecount()

# Seed passed to make_edge_df.
seed_edgedf = 432

In [None]:
# Build the edge data frame (present edges plus num_non_edges_to_add non-edges)
# with the requested columns, timing the run.
start = time.time()
make_edge_df(G,
             experiment_data_dir,
             snapshot_year_list,
             num_non_edges_to_add,
             columns_to_use,
             seed=seed_edgedf)
runtime = time.time() - start

print('make_edge_df took %d seconds' % runtime)

# Surgery

In [None]:
# Read the edge data frame back from edge_data.csv in the experiment directory,
# using the first CSV column as the index.
df = pd.read_csv(os.path.join(experiment_data_dir, 'edge_data.csv'),
                 index_col=0)

In [None]:
# Peek at the first rows only (enough to check which columns, e.g.
# 'similarity', are present) instead of rendering the whole edge table.
df.head()

Overnight I ran
- make_edge_df
    - print df above to see if similarities have been added
- make_clean_edgelist

TODO:
- update graph (and everything) after making new clean edgelist
