In [4]:
import sys
sys.path.append('../../../code/')
sys.path.append('../../../code/michael')
import os
import json
from datetime import datetime
import time
from pipeline.download_data import url_to_dict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import networkx as nx
from load_data import load_citation_network, case_info
import operator
import scotus_viz_functions_michael as viz

from collections import OrderedDict


%load_ext autoreload
%autoreload 2
%matplotlib inline

# Location of the input data and the court whose citation network is loaded.
data_dir = '../../../data/'
court_name = 'scotus'

# Project root: three directory levels above the notebook's working directory.
proj_cwd = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# Nested output folders for figures: <proj_cwd>/visualization/scotus/michael
viz_dir = os.path.join(proj_cwd, 'visualization')
viz_scotus_dir = os.path.join(viz_dir, 'scotus')
viz_scotus_michael_dir = os.path.join(viz_scotus_dir, 'michael')

# Folder for the centrality CSV files, next to this notebook.
csvs_dir = os.path.join(os.getcwd(), 'csvs')

# os.makedirs creates any missing parent directories, so only the leaf
# folders have to be requested. csvs_dir is created here as well so the
# CSV-writing cells do not fail on a fresh checkout.
for _out_dir in (viz_scotus_michael_dir, csvs_dir):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Create SCOTUS Network (Directed Graph)

In [5]:
# Build the directed citation graph for the configured court.
# Expect the first run of this cell to be somewhat slow.
load_started = time.time()
G = load_citation_network(data_dir, court_name)
load_elapsed = time.time() - load_started

print('loading scotus network in networkx took %d seconds' % load_elapsed)
print('loaded %s network with %d cases and %d edges'
      % (court_name, G.number_of_nodes(), G.number_of_edges()))

loading scotus network in networkx took 23 seconds
loaded scotus network with 33248 cases and 250465 edges


### Remove Case 96405

In [7]:
# Drop case 96405 from the citation graph before any centrality is computed.
# NOTE(review): the reason this case is excluded is not recorded in this
# notebook -- confirm and document it here.
case_to_remove = 96405

# Guarded so that re-running this cell is a no-op: Graph.remove_node raises
# NetworkXError when the node is no longer in the graph.
if case_to_remove in G:
    G.remove_node(case_to_remove)

### U = Undirected SCOTUS

In [9]:
# Undirected version of the citation graph G. The closeness, eigenvector and
# betweenness centralities below are computed on U, while PageRank and HITS
# use the directed graph G.
# The recorded run reports fewer edges for U (249239) than for G (249826);
# presumably reciprocal citations collapse into one undirected edge -- verify.
U = G.to_undirected()

### Acquire Dictionaries for Centrality Measures of SCOTUS

In [11]:
def _run_timed(measure, graph, report_fmt):
    """Compute measure(graph), print the elapsed time and graph size, return the result.

    report_fmt must contain three %d placeholders: seconds, node count, edge count.
    """
    started = time.time()
    result = measure(graph)
    elapsed = time.time() - started
    print(report_fmt % (elapsed, graph.number_of_nodes(), graph.number_of_edges()))
    return result


# Measures on the undirected graph U.
# In the recorded run closeness took about an hour and betweenness about five hours.
close_cent_dict = _run_timed(
    nx.closeness_centrality, U,
    'closeness centrality took %d seconds to compute on %d nodes and %d edges')
eigen_cent_dict = _run_timed(
    nx.eigenvector_centrality, U,
    'eigenvector centrality took %d seconds to compute on %d nodes and %d edges')
between_cent_dict = _run_timed(
    nx.betweenness_centrality, U,
    'betweenness centrality took %d seconds to compute on %d nodes and %d edges')

# Measures on the directed graph G.
page_rank_dict = _run_timed(
    nx.pagerank, G,
    'page rank took %d seconds to compute on %d nodes and %d edges')
# Despite the name this is a pair: index 0 is read later as the hubs scores
# and index 1 as the authorities scores.
hubs_auths_dict = _run_timed(
    nx.hits, G,
    'hubs and authorities took %d seconds to compute on %d nodes and %d edges')


closeness centrality took 3542 seconds to compute on 33247 nodes and 249239 edges
eigenvector centrality took 16 seconds to compute on 33247 nodes and 249239 edges
betweenness centrality took 18773 seconds to compute on 33247 nodes and 249239 edges
page rank took 4 seconds to compute on 33247 nodes and 249826 edges
hubs and authorities took 41 seconds to compute on 33247 nodes and 249826 edges


### Store Centrality Measures for Undirected SCOTUS into CSV File

In [12]:
# Take the node order once so every column lines up row-for-row with 'Case ID'.
undirected_nodes = list(U.nodes())

close_cent_list = [close_cent_dict[n] for n in undirected_nodes]
eigen_cent_list = [eigen_cent_dict[n] for n in undirected_nodes]
between_cent_list = [between_cent_dict[n] for n in undirected_nodes]

data = {'Case ID': undirected_nodes,
        'Closeness Centrality': close_cent_list,
        'Eigenvector Centrality': eigen_cent_list,
        'Betweenness Centrality': between_cent_list
       }

# `columns` pins the column order in the written CSV.
df = pd.DataFrame(data, columns=['Case ID', 'Closeness Centrality', 'Eigenvector Centrality', 'Betweenness Centrality'])

# Make sure the output folder exists before writing into it.
if not os.path.exists(csvs_dir):
    os.makedirs(csvs_dir)

# The filename must not start with '/': os.path.join discards every earlier
# component when a later one is absolute, which sent the file to the
# filesystem root instead of csvs_dir.
df.to_csv(os.path.join(csvs_dir, 'undirected_scotus_centralities_96405_del.csv'))

### Store Centrality Measures for Directed SCOTUS into CSV File

In [15]:
# Take the node order once so every column lines up row-for-row with 'Case ID'.
directed_nodes = list(G.nodes())

# hubs_auths_dict is a pair: hubs scores first, authorities scores second.
hubs_dict, auths_dict = hubs_auths_dict[0], hubs_auths_dict[1]

page_rank_list = [page_rank_dict[n] for n in directed_nodes]
hubs_list = [hubs_dict[n] for n in directed_nodes]
auths_list = [auths_dict[n] for n in directed_nodes]

data = {'Case ID': directed_nodes,
        'Page Rank': page_rank_list,
        'Hubs': hubs_list,
        'Authorities': auths_list
       }

# `columns` pins the column order in the written CSV.
df = pd.DataFrame(data, columns=['Case ID', 'Page Rank', 'Hubs', 'Authorities'])

# Make sure the output folder exists before writing into it.
if not os.path.exists(csvs_dir):
    os.makedirs(csvs_dir)

# The filename must not start with '/': os.path.join discards every earlier
# component when a later one is absolute, which sent the file to the
# filesystem root instead of csvs_dir.
df.to_csv(os.path.join(csvs_dir, 'directed_scotus_centralities_96405_del.csv'))