In [1]:
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

from __future__ import division

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig



# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource
from pipeline.make_clean_data import *
from viz import print_describe


sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_snapshots import *
from make_edge_df import *
from attachment_model_inference import *
from compute_ranking_metrics import *
from pipeline_helper_functions import *
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *

# directory set up
data_dir = top_directory + 'data/'
experiment_data_dir = data_dir + 'vertex_metrics_experiment/'

court_name = 'scotus'

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Load the graph for `court_name` from `data_dir`. Later cells read
# G.vs['name'] and G.es(), so G is presumably an igraph Graph -- confirm in
# load_data.load_and_clean_graph.
%time G = load_and_clean_graph(data_dir, court_name)

# get edgelist and case metadata

## get raw edgelist

In [None]:
# Get the raw master edgelist; only data_dir is passed, so the output
# presumably lands somewhere under it -- confirm.
# NOTE(review): download_master_edgelist is not imported by name in the setup
# cell; presumably it comes from one of the star imports -- confirm.
%time download_master_edgelist(data_dir)

## get raw case metadata

In [4]:
# Build the raw case metadata for `court_name`.
# NOTE(review): remove=True presumably deletes intermediate downloaded files
# once they are processed -- confirm before running against data you need.
%time make_raw_case_metadata_court(court_name, data_dir, remove=True)

In [None]:
# in make_raw_case_metadata.py
# get_raw_case_metadata_from_court(court_name, data_dir, remove=True)


## make clean case metadata (from raw)

In [None]:
# Derive the clean case metadata from the raw metadata.
# NOTE(review): overwrite=False presumably leaves an existing clean file
# untouched -- confirm in pipeline.make_clean_data.
%time make_clean_case_metadata(data_dir, overwrite=False)

## make clean edgelist (from raw)

In [None]:
# Derive the clean edgelist from the raw edgelist.
# NOTE(review): overwrite=False presumably leaves an existing clean file
# untouched -- confirm in pipeline.make_clean_data.
%time make_clean_edgelist(data_dir, overwrite=False)

## make graph

# make NLP similarity matrix

## download opinion files

In [None]:
%time download_bulk_resource(court_name,
                           resource='opinions',
                           data_dir=data_dir)

## make case text files

In [None]:
%time make_text_files(data_dir,
                        court_name,
                        CLid_good=G.vs['name'],
                        CLid_bad=None)

## normalize the corpus and store it in a dict

In [5]:
# Build the normalized-text dict from data under experiment_data_dir; it is
# the input to get_tf_idf below. The recorded run took about 1.5 minutes.
# NOTE(review): presumably keyed by case id -- confirm in bag_of_words.
%time normalized_text_dict = get_normalized_text_dict(experiment_data_dir)

CPU times: user 1min 12s, sys: 5.84 s, total: 1min 18s
Wall time: 1min 30s


## compute tf-idf matrix

In [6]:
# Compute the tf-idf representation of the normalized corpus. The three
# results are consumed below: tfidf_matrix is saved and passed, together with
# CLid_to_index, to make_similarity_matrix.
# NOTE(review): min_df/max_df presumably are document-frequency cut-offs
# expressed as fractions of the corpus -- confirm in bag_of_words.
%time tfidf_matrix, vocab, CLid_to_index = get_tf_idf(normalized_text_dict, min_df=.2, max_df=.8)

CPU times: user 2min, sys: 1.96 s, total: 2min 2s
Wall time: 2min 2s


In [7]:
# Persist the tf-idf matrix under experiment_data_dir.
# NOTE(review): the file stem is spelled 'tdidf_matrix' (not 'tfidf'); any
# code that loads this file must use the same spelling.
save_sparse_csr(experiment_data_dir + 'tdidf_matrix', tfidf_matrix)

## compute pairwise cosine distances

In [8]:
# Build the pairwise similarity matrix from the tf-idf matrix. The return
# value is not captured, so results are presumably written under
# experiment_data_dir -- confirm in similarity_matrix.
# The recorded run took about 12 minutes of wall time.
%time make_similarity_matrix(experiment_data_dir, tfidf_matrix, CLid_to_index)

CPU times: user 6min 11s, sys: 2min 38s, total: 8min 50s
Wall time: 11min 57s


# compute snapshots and make edge data frame

## Make snapshots

In [18]:
# Wider candidate list kept from an earlier version of this cell (disabled):
#   indegree, outdegree, degree,
#   d_pagerank, u_pagerank,
#   d_closeness, u_closeness,
#   d_betweenness, u_betweenness,
#   authorities, hubs,
#   d_eigen, u_eigen

# vertex metrics computed for every snapshot
vertex_metrics = [
    'indegree',
    'outdegree',
    'degree',
    'd_pagerank',
    'authorities',
    'hubs',
]

# snapshot years: 1900 through 2015 inclusive
active_years = range(1900, 2016)

In [19]:
# Compute the chosen vertex metrics on G for each year in active_years. The
# return value is not captured, so the snapshots are presumably written under
# experiment_data_dir -- confirm in make_snapshots.
# The call prints progress lines; the recorded run took about 1.5 minutes.
%time make_snapshot_vertex_metrics(G, active_years, vertex_metrics, experiment_data_dir)

year 1900, (2/117) at 11:25:21
year 1902, (4/117) at 11:25:22
year 1906, (8/117) at 11:25:23
year 1914, (16/117) at 11:25:25
year 1930, (32/117) at 11:25:32
year 1962, (64/117) at 11:25:56
CPU times: user 1min 28s, sys: 2.76 s, total: 1min 31s
Wall time: 1min 33s


## make edge dataframe

In [20]:
# Columns for the edge data frame: every snapshot vertex metric plus the
# 'age' and 'similarity' columns (a new list; vertex_metrics is not modified).
columns_to_use = list(vertex_metrics) + ['age', 'similarity']

# ask for as many non-edges as the graph has edges
num_non_edges_to_add = len(G.es)

# seed passed to make_edge_df
seed_edgedf = 7655

In [23]:
# Build the edge data frame for the active years using the columns, non-edge
# count and seed configured in the previous cell. The return value is not
# captured, so the result is presumably written under experiment_data_dir --
# confirm in make_edge_df.
# The recorded run took about 6.5 minutes of wall time.
%time make_edge_df(G, experiment_data_dir, active_years, num_non_edges_to_add, columns_to_use, seed=seed_edgedf)

CPU times: user 4min 26s, sys: 41.7 s, total: 5min 8s
Wall time: 6min 21s


# surgery