In [16]:
import sys
sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import networkx as nx

from load_data import load_citation_network, case_info

import operator

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load SCOTUS Network (Directed Graph)

In [10]:
# Build the citation graph for the configured court. Per the original note,
# this is a little slow the first time it runs.
G = load_citation_network(data_dir, court_name)

# Report the size of the graph that was loaded.
network_summary = 'loaded %s network with %d cases and %d edges' % (
    court_name, G.number_of_nodes(), G.number_of_edges())
print(network_summary)

loaded scotus network with 33248 cases and 250465 edges


### Acquire Dictionaries for Centrality Measures

In [59]:
# Compute every centrality measure on the full graph, timing each in turn.
# A recorded run of this cell printed 594 s for closeness and 5018 s for
# betweenness, so expect the whole cell to take well over an hour.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()

time1 = time.time()
close_cent_dict = nx.closeness_centrality(G)
time2 = time.time()
print('closeness centrality took %d seconds to compute on %d nodes and %d edges'
      % (time2 - time1, n_nodes, n_edges))

eigen_cent_dict = nx.eigenvector_centrality(G)
time3 = time.time()
print('eigenvector centrality took %d seconds to compute on %d nodes and %d edges'
      % (time3 - time2, n_nodes, n_edges))

between_cent_dict = nx.betweenness_centrality(G)
time4 = time.time()
print('betweenness centrality took %d seconds to compute on %d nodes and %d edges'
      % (time4 - time3, n_nodes, n_edges))

page_rank_dict = nx.pagerank(G)
time5 = time.time()
print('page rank took %d seconds to compute on %d nodes and %d edges'
      % (time5 - time4, n_nodes, n_edges))

# Despite the name this is indexed positionally: later cells read [0] for the
# hub scores and [1] for the authority scores.
hubs_auths_dict = nx.hits(G)
time6 = time.time()
print('hubs and authorities took %d seconds to compute on %d nodes and %d edges'
      % (time6 - time5, n_nodes, n_edges))


closeness centrality took 594 seconds to compute on 33248 nodes and 250465 edges
eigenvector centrality took 24 seconds to compute on 33248 nodes and 250465 edges
betweenness centrality took 5018 seconds to compute on 33248 nodes and 250465 edges
page rank took 4 seconds to compute on 33248 nodes and 250465 edges
hubs and authorities took 33 seconds to compute on 33248 nodes and 250465 edges


### Top 10 In-Degree for SCOTUS (Directed)

In [47]:
# Rank every case by in-degree and keep the ten highest, largest first.
ind = sorted(G.in_degree().items(), key=lambda pair: pair[1], reverse=True)[:10]
ind_cases = [case_id for case_id, degree in ind]
ind_values = [degree for case_id, degree in ind]

# Attach each case's 'date' node attribute for context.
ind_dates = [G.node[case_id]['date'] for case_id in ind_cases]

# One row per case, indexed by case id, with columns in display order.
column_order = ['In-Degree', 'Date']
data = {
    'In-Degree': ind_values,
    'Date': ind_dates,
}
df = pd.DataFrame(data, index=ind_cases, columns=column_order)
df

Unnamed: 0,In-Degree,Date
96405,608,1906-02-19
109532,294,1976-07-22
107252,267,1966-06-13
91573,234,1886-02-01
106545,229,1963-03-18
111221,220,1984-06-25
102605,214,1936-02-17
103012,200,1938-04-25
103355,198,1940-05-20
106761,186,1964-03-09


### Top 10 Out-Degree for SCOTUS (Directed)

In [48]:
# Rank every case by out-degree and keep the ten highest, largest first.
outd = sorted(G.out_degree().items(), key=lambda pair: pair[1], reverse=True)[:10]
outd_cases = [case_id for case_id, degree in outd]
outd_values = [degree for case_id, degree in outd]

# Attach each case's 'date' node attribute for context.
outd_dates = [G.node[case_id]['date'] for case_id in outd_cases]

# One row per case, indexed by case id, with columns in display order.
column_order = ['Out-Degree', 'Date']
data = {
    'Out-Degree': outd_values,
    'Date': outd_dates,
}
df = pd.DataFrame(data, index=outd_cases, columns=column_order)
df

Unnamed: 0,Out-Degree,Date
105210,193,1954-05-03
104616,182,1949-02-14
106366,178,1962-03-26
102224,163,1934-03-05
108329,163,1971-05-03
101864,158,1932-02-23
106267,154,1961-10-09
106548,153,1963-03-18
97966,150,1913-06-09
108221,143,1970-12-21


### Top 10 Closeness Centrality for SCOTUS (Directed)

In [49]:
# Ten cases with the highest closeness centrality, best first.
ccd = sorted(close_cent_dict.items(), key=lambda pair: pair[1], reverse=True)[:10]
ccd_cases = [case_id for case_id, score in ccd]
ccd_values = [score for case_id, score in ccd]

# Per-case context: the 'date' node attribute plus in- and out-degree.
ccd_dates = [G.node[case_id]['date'] for case_id in ccd_cases]
ccd_indegs = [G.in_degree(case_id) for case_id in ccd_cases]
ccd_outdegs = [G.out_degree(case_id) for case_id in ccd_cases]

# One row per case, indexed by case id, with columns in display order.
column_order = ['Closeness Centrality', 'Date', 'In-Degree', 'Out-Degree']
data = {
    'Closeness Centrality': ccd_values,
    'Date': ccd_dates,
    'In-Degree': ccd_indegs,
    'Out-Degree': ccd_outdegs,
}
df = pd.DataFrame(data, index=ccd_cases, columns=column_order)
df

Unnamed: 0,Closeness Centrality,Date,In-Degree,Out-Degree
1741,0.186302,2010-01-21,4,135
803267,0.186093,2012-06-28,0,118
809122,0.186093,2012-06-28,0,118
118011,0.183343,1996-03-27,42,134
149702,0.182886,2010-06-28,8,122
145795,0.182269,2008-06-12,7,99
149008,0.179324,2010-06-21,4,68
131149,0.178671,2003-12-10,15,102
145706,0.177511,2007-06-25,9,67
1269289,0.176388,2001-06-28,12,50


### Top 10 Eigenvector Centrality for SCOTUS (Directed)

In [50]:
# Ten cases with the highest eigenvector centrality, best first.
ecd = sorted(eigen_cent_dict.items(), key=lambda pair: pair[1], reverse=True)[:10]
ecd_cases = [case_id for case_id, score in ecd]
ecd_values = [score for case_id, score in ecd]

# Per-case context: the 'date' node attribute plus in- and out-degree.
ecd_dates = [G.node[case_id]['date'] for case_id in ecd_cases]
ecd_indegs = [G.in_degree(case_id) for case_id in ecd_cases]
ecd_outdegs = [G.out_degree(case_id) for case_id in ecd_cases]

# One row per case, indexed by case id, with columns in display order.
column_order = ['Eigenvector Centrality', 'Date', 'In-Degree', 'Out-Degree']
data = {
    'Eigenvector Centrality': ecd_values,
    'Date': ecd_dates,
    'In-Degree': ecd_indegs,
    'Out-Degree': ecd_outdegs,
}
df = pd.DataFrame(data, index=ecd_cases, columns=column_order)
df

Unnamed: 0,Eigenvector Centrality,Date,In-Degree,Out-Degree
85131,0.371655,1816-02-16,181,0
85534,0.353776,1827-03-12,124,0
86696,0.279975,1852-03-18,133,3
85160,0.239934,1816-03-20,94,0
85283,0.239103,1820-02-16,27,1
87748,0.233147,1866-01-29,65,1
89173,0.230106,1876-01-17,78,2
86480,0.189831,1849-02-18,66,10
88605,0.186612,1873-03-18,60,0
88075,0.153327,1869-11-18,49,0


### Top 10 Betweenness Centrality for SCOTUS (Directed)

In [51]:
# Ten cases with the highest betweenness centrality, best first.
bcd = sorted(between_cent_dict.items(), key=lambda pair: pair[1], reverse=True)[:10]
bcd_cases = [case_id for case_id, score in bcd]
bcd_values = [score for case_id, score in bcd]

# Per-case context: the 'date' node attribute plus in- and out-degree.
bcd_dates = [G.node[case_id]['date'] for case_id in bcd_cases]
bcd_indegs = [G.in_degree(case_id) for case_id in bcd_cases]
bcd_outdegs = [G.out_degree(case_id) for case_id in bcd_cases]

# One row per case, indexed by case id, with columns in display order.
column_order = ['Betweenness Centrality', 'Date', 'In-Degree', 'Out-Degree']
data = {
    'Betweenness Centrality': bcd_values,
    'Date': bcd_dates,
    'In-Degree': bcd_indegs,
    'Out-Degree': bcd_outdegs,
}
df = pd.DataFrame(data, index=bcd_cases, columns=column_order)
df

Unnamed: 0,Betweenness Centrality,Date,In-Degree,Out-Degree
102605,0.00318,1936-02-17,214,103
101864,0.00308,1932-02-23,120,158
106366,0.002482,1962-03-26,161,178
103012,0.00237,1938-04-25,200,49
97966,0.002271,1913-06-09,140,150
101894,0.002236,1932-04-11,78,133
106545,0.002167,1963-03-18,229,50
106170,0.002022,1961-02-20,133,85
104616,0.001794,1949-02-14,15,182
104894,0.001769,1951-04-30,86,138


### Top 10 Page Rank for SCOTUS (Directed)

In [52]:
# Ten cases with the highest PageRank score, best first.
prd = sorted(page_rank_dict.items(), key=lambda pair: pair[1], reverse=True)[:10]
prd_cases = [case_id for case_id, score in prd]
prd_values = [score for case_id, score in prd]

# Per-case context: the 'date' node attribute plus in- and out-degree.
prd_dates = [G.node[case_id]['date'] for case_id in prd_cases]
prd_indegs = [G.in_degree(case_id) for case_id in prd_cases]
prd_outdegs = [G.out_degree(case_id) for case_id in prd_cases]

# One row per case, indexed by case id, with columns in display order.
column_order = ['Page Rank', 'Date', 'In-Degree', 'Out-Degree']
data = {
    'Page Rank': prd_values,
    'Date': prd_dates,
    'In-Degree': prd_indegs,
    'Out-Degree': prd_outdegs,
}
df = pd.DataFrame(data, index=prd_cases, columns=column_order)
df

Unnamed: 0,Page Rank,Date,In-Degree,Out-Degree
85131,0.001539,1816-02-16,181,0
91573,0.001348,1886-02-01,234,0
85534,0.001194,1827-03-12,124,0
88661,0.000977,1873-04-14,124,0
85160,0.000895,1816-03-20,94,0
89675,0.000883,1878-01-18,107,3
98094,0.000859,1914-02-24,138,11
88804,0.000854,1874-01-30,93,0
85330,0.000847,1821-03-18,157,0
87010,0.000752,1856-02-19,91,7


### Top 10 Hubs for SCOTUS (Directed)

In [53]:
# Element 0 of the nx.hits result is used here as the hub scores.
# Keep the ten cases with the highest hub score, best first.
hub_scores = hubs_auths_dict[0]
hd = sorted(hub_scores.items(), key=lambda pair: pair[1], reverse=True)[:10]
hd_cases = [case_id for case_id, score in hd]
hd_values = [score for case_id, score in hd]

# Per-case context: the 'date' node attribute plus in- and out-degree.
hd_dates = [G.node[case_id]['date'] for case_id in hd_cases]
hd_indegs = [G.in_degree(case_id) for case_id in hd_cases]
hd_outdegs = [G.out_degree(case_id) for case_id in hd_cases]

# One row per case, indexed by case id, with columns in display order.
column_order = ['Hubs', 'Date', 'In-Degree', 'Out-Degree']
data = {
    'Hubs': hd_values,
    'Date': hd_dates,
    'In-Degree': hd_indegs,
    'Out-Degree': hd_outdegs,
}
df = pd.DataFrame(data, index=hd_cases, columns=column_order)
df

Unnamed: 0,Hubs,Date,In-Degree,Out-Degree
108611,0.002173,1972-06-29,52,133
107082,0.002,1965-06-07,104,105
109380,0.0019,1976-02-27,159,127
106267,0.001889,1961-10-09,25,154
108839,0.001831,1973-10-09,81,125
108798,0.001769,1973-05-29,39,86
109505,0.001766,1976-10-04,49,80
109836,0.001761,1978-06-26,70,90
149702,0.00176,2010-06-28,8,122
108329,0.001679,1971-05-03,52,163


### Top 10 Authorities for SCOTUS (Directed)

In [54]:
# Element 1 of the nx.hits result is used here as the authority scores.
# Keep the ten cases with the highest authority score, best first.
authority_scores = hubs_auths_dict[1]
ad = sorted(authority_scores.items(), key=lambda pair: pair[1], reverse=True)[:10]
ad_cases = [case_id for case_id, score in ad]
ad_values = [score for case_id, score in ad]

# Per-case context: the 'date' node attribute plus in- and out-degree.
ad_dates = [G.node[case_id]['date'] for case_id in ad_cases]
ad_indegs = [G.in_degree(case_id) for case_id in ad_cases]
ad_outdegs = [G.out_degree(case_id) for case_id in ad_cases]

# One row per case, indexed by case id, with columns in display order.
column_order = ['Authorities', 'Date', 'In-Degree', 'Out-Degree']
data = {
    'Authorities': ad_values,
    'Date': ad_dates,
    'In-Degree': ad_indegs,
    'Out-Degree': ad_outdegs,
}
df = pd.DataFrame(data, index=ad_cases, columns=column_order)
df

Unnamed: 0,Authorities,Date,In-Degree,Out-Degree
103355,0.00373,1940-05-20,198,9
106514,0.003242,1963-01-14,175,63
103243,0.003122,1939-11-22,150,10
106761,0.003095,1964-03-09,186,43
103347,0.003023,1940-04-22,151,26
105746,0.002707,1958-06-30,169,25
105751,0.002447,1958-06-30,151,56
102991,0.002431,1938-03-28,112,18
103870,0.002395,1943-06-14,143,17
101097,0.002288,1927-05-16,135,59
