In [1]:

import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig
from collections import *
from __future__ import division

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Directory holding the data files; handed to load_citation_network_igraph.
data_dir = '../../data/'
# Court identifier handed to the loader and echoed in the load summary.
court_name = 'scotus'

In [2]:
# Heads-up: expect the very first run of this loader call to be a bit slow.
G = load_citation_network_igraph(data_dir, court_name)

print('loaded %s network with %d cases and %d edges' % (
    court_name,
    len(G.vs),
    len(G.es),
))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


### SubGraph Code (vertices = only past cases)

In [4]:
# Build one directed subgraph per cutoff year, holding only the cases whose
# 'year' attribute is strictly less than that cutoff (i.e. the "past" cases).
time1 = time.time()

min_year = 1754
max_year = 2016

subgraph_dict = {}
for cutoff_year in range(min_year, max_year + 2):
    past_vertices = G.vs.select(year_lt=cutoff_year)
    subgraph_dict[cutoff_year] = G.subgraph(past_vertices)

time2 = time.time()

# The last cutoff should reproduce the whole network; print both for comparison.
print(subgraph_dict[2017].summary())
print(G.summary())
print("This took " + str(time2 - time1) + " seconds")

IGRAPH DN-- 33248 250465 -- 
+ attr: court (v), name (v), year (v)
IGRAPH DN-- 33248 250465 -- 
+ attr: court (v), name (v), year (v)
This took 5.94799995422 seconds


In [5]:
# Compare one case's in-degree in the full network against its in-degree in
# the pre-1973 subgraph. The case is located in the subgraph by 'name'
# rather than by vertex index.
probe_vertex = G.vs[8000]
probe_name = probe_vertex['name']
print("In the whole network case " + str(probe_name) + " has an indegree of: " + str(probe_vertex.indegree()))

for v in subgraph_dict[1973].vs:
    if v['name'] != probe_name:
        continue
    print("In the sub-network of cases before 1973 case " + str(v['name']) + " has an indegree of: " + str(v.indegree()))

In the whole network case 108465 has an indegree of: 11
In the sub-network of cases before 1973 case 108465 has an indegree of: 2


# Build a dictionary of all past cases, where key = year and value = the list of past cases as tuples (sorted by the metric)

In [6]:
time1 = time.time()  # start time is recorded, but this cell never reports the elapsed time

# Maps cutoff year -> list of (subgraph vertex index, case name, year, in-degree)
# tuples for every case before that year, sorted by in-degree (highest first).
case_tuple_dict = {}

for i in range(min_year, max_year+2):
    sub_G = subgraph_dict[i]

    if sub_G.vcount() == 0:
        # No past cases for this cutoff year: the ranking is empty.
        case_tuple_dict[i] = []
        continue

    # Bulk accessors return one value per vertex in vertex-index order,
    # replacing the slower per-vertex Python loop.
    igraph_index_list = range(sub_G.vcount())  # subgraph-local indices (not the indices used in G)
    id_list = sub_G.vs['name']
    year_list = sub_G.vs['year']
    metric_list = sub_G.indegree()

    tuple_list = zip(igraph_index_list, id_list, year_list, metric_list)
    # sorted() is stable, so cases with equal in-degree stay in vertex-index order.
    sorted_tuple_list = sorted(tuple_list, key=lambda tup: tup[3], reverse=True)

    case_tuple_dict[i] = sorted_tuple_list

# 2nd part: rank the cases that case c actually cites

In [7]:
# G.neighbors(10, mode='OUT') are the cases that case c actually cites

#ranks = []

def calculate_score_for_case(case_index, case_tuple_dict, graph=None):
    """Score how highly the ranking places the cases that one case cites.

    The ranking consulted is ``case_tuple_dict[year]``, where ``year`` is the
    citing case's own 'year' attribute. Each cited case found at 1-based rank
    ``r`` in that ranking of ``n`` entries contributes ``1 - r/n``; cited
    cases that do not appear in the ranking contribute nothing.

    Parameters
    ----------
    case_index : int
        Vertex index of the citing case in the graph.
    case_tuple_dict : dict
        Maps a year to a list of tuples whose element ``[1]`` is a case name;
        list position 0 is treated as rank 1.
    graph : optional
        Citation graph providing ``vs`` and ``neighbors``. Defaults to the
        notebook-level ``G``.

    Returns
    -------
    The sum of the per-citation scores (0 if no cited case is ranked).

    Raises
    ------
    KeyError
        If the case's year is not a key of ``case_tuple_dict``.
    """
    if graph is None:
        graph = G

    all_past_cases = case_tuple_dict[graph.vs[case_index]['year']]

    # Match on 'name' because graph and subgraph index their vertices
    # differently. A set gives O(1) membership tests instead of a list scan
    # for every ranked case.
    cited_names = set(graph.vs[n]['name']
                      for n in graph.neighbors(case_index, mode='OUT'))

    # float() keeps this a true division even when the notebook's
    # `from __future__ import division` cell has not been executed.
    total = float(len(all_past_cases))

    return sum(1 - (position + 1) / total
               for position, entry in enumerate(all_past_cases)
               if entry[1] in cited_names)
    
# Spot check: score the case at vertex index 10 of G against the ranking built above.
calculate_score_for_case(10, case_tuple_dict)

15.380239325181453

In [8]:
# Add up the per-case scores over every vertex of G and report the runtime.
time1 = time.time()

score_M = sum(
    calculate_score_for_case(vertex.index, case_tuple_dict)
    for vertex in G.vs()
)

time2 = time.time()
total_time = str(time2 - time1)
print('took ' + total_time)
print('')
print(score_M)

took 220.588000059

174094.901772


# For Page Rank (directed)

In [9]:
time1 = time.time()  # start time is recorded, but this cell never reports the elapsed time

# Maps cutoff year -> list of (subgraph vertex index, case name, year, PageRank)
# tuples for every case before that year, sorted by PageRank (highest first).
case_tuple_dict = {}

for i in range(min_year, max_year+2):
    sub_G = subgraph_dict[i]

    if sub_G.vcount() == 0:
        # No past cases for this cutoff year: the ranking is empty.
        case_tuple_dict[i] = []
        continue

    # One PageRank value per vertex, in vertex-index order.
    metric_list = sub_G.pagerank()

    # Bulk attribute access replaces the per-vertex Python loop.
    igraph_index_list = range(sub_G.vcount())  # subgraph-local indices (not the indices used in G)
    id_list = sub_G.vs['name']
    year_list = sub_G.vs['year']

    tuple_list = zip(igraph_index_list, id_list, year_list, metric_list)
    # sorted() is stable, so cases with equal scores stay in vertex-index order.
    sorted_tuple_list = sorted(tuple_list, key=lambda tup: tup[3], reverse=True)

    case_tuple_dict[i] = sorted_tuple_list

{}


In [10]:
# G.neighbors(10, mode='OUT') are the cases that case c actually cites

#ranks = []

def calculate_score_for_case(case_index, case_tuple_dict, graph=None):
    """Score how highly the ranking places the cases that one case cites.

    The ranking consulted is ``case_tuple_dict[year]``, where ``year`` is the
    citing case's own 'year' attribute. Each cited case found at 1-based rank
    ``r`` in that ranking of ``n`` entries contributes ``1 - r/n``; cited
    cases that do not appear in the ranking contribute nothing.

    Parameters
    ----------
    case_index : int
        Vertex index of the citing case in the graph.
    case_tuple_dict : dict
        Maps a year to a list of tuples whose element ``[1]`` is a case name;
        list position 0 is treated as rank 1.
    graph : optional
        Citation graph providing ``vs`` and ``neighbors``. Defaults to the
        notebook-level ``G``.

    Returns
    -------
    The sum of the per-citation scores (0 if no cited case is ranked).

    Raises
    ------
    KeyError
        If the case's year is not a key of ``case_tuple_dict``.
    """
    if graph is None:
        graph = G

    all_past_cases = case_tuple_dict[graph.vs[case_index]['year']]

    # Match on 'name' because graph and subgraph index their vertices
    # differently. A set gives O(1) membership tests instead of a list scan
    # for every ranked case.
    cited_names = set(graph.vs[n]['name']
                      for n in graph.neighbors(case_index, mode='OUT'))

    # float() keeps this a true division even when the notebook's
    # `from __future__ import division` cell has not been executed.
    total = float(len(all_past_cases))

    return sum(1 - (position + 1) / total
               for position, entry in enumerate(all_past_cases)
               if entry[1] in cited_names)
    
# Spot check: score the case at vertex index 10 of G against the PageRank ranking.
calculate_score_for_case(10, case_tuple_dict)

15.49048584319623

In [11]:
# Total score over all cases in G for the PageRank ranking, with wall-clock timing.
time1 = time.time()

per_case_scores = (calculate_score_for_case(vertex.index, case_tuple_dict)
                   for vertex in G.vs())
score_M = sum(per_case_scores)

time2 = time.time()
total_time = str(time2 - time1)
print('took ' + total_time)
print('')
print(score_M)

took 182.688999891

149192.224771


# For Hubs (directed)

In [12]:
time1 = time.time()  # start time is recorded, but this cell never reports the elapsed time

# Maps cutoff year -> list of (subgraph vertex index, case name, year, hub score)
# tuples for every case before that year, sorted by hub score (highest first).
case_tuple_dict = {}

for i in range(min_year, max_year+2):
    sub_G = subgraph_dict[i]

    if sub_G.vcount() == 0:
        # No past cases for this cutoff year: the ranking is empty.
        case_tuple_dict[i] = []
        continue

    # One hub score per vertex, in vertex-index order.
    metric_list = sub_G.hub_score()

    # Bulk attribute access replaces the per-vertex Python loop.
    igraph_index_list = range(sub_G.vcount())  # subgraph-local indices (not the indices used in G)
    id_list = sub_G.vs['name']
    year_list = sub_G.vs['year']

    tuple_list = zip(igraph_index_list, id_list, year_list, metric_list)
    # sorted() is stable, so cases with equal scores stay in vertex-index order.
    sorted_tuple_list = sorted(tuple_list, key=lambda tup: tup[3], reverse=True)

    case_tuple_dict[i] = sorted_tuple_list

In [13]:
# G.neighbors(10, mode='OUT') are the cases that case c actually cites

#ranks = []

def calculate_score_for_case(case_index, case_tuple_dict, graph=None):
    """Score how highly the ranking places the cases that one case cites.

    The ranking consulted is ``case_tuple_dict[year]``, where ``year`` is the
    citing case's own 'year' attribute. Each cited case found at 1-based rank
    ``r`` in that ranking of ``n`` entries contributes ``1 - r/n``; cited
    cases that do not appear in the ranking contribute nothing.

    Parameters
    ----------
    case_index : int
        Vertex index of the citing case in the graph.
    case_tuple_dict : dict
        Maps a year to a list of tuples whose element ``[1]`` is a case name;
        list position 0 is treated as rank 1.
    graph : optional
        Citation graph providing ``vs`` and ``neighbors``. Defaults to the
        notebook-level ``G``.

    Returns
    -------
    The sum of the per-citation scores (0 if no cited case is ranked).

    Raises
    ------
    KeyError
        If the case's year is not a key of ``case_tuple_dict``.
    """
    if graph is None:
        graph = G

    all_past_cases = case_tuple_dict[graph.vs[case_index]['year']]

    # Match on 'name' because graph and subgraph index their vertices
    # differently. A set gives O(1) membership tests instead of a list scan
    # for every ranked case.
    cited_names = set(graph.vs[n]['name']
                      for n in graph.neighbors(case_index, mode='OUT'))

    # float() keeps this a true division even when the notebook's
    # `from __future__ import division` cell has not been executed.
    total = float(len(all_past_cases))

    return sum(1 - (position + 1) / total
               for position, entry in enumerate(all_past_cases)
               if entry[1] in cited_names)
    
# Spot check: score the case at vertex index 10 of G against the hub-score ranking.
calculate_score_for_case(10, case_tuple_dict)

18.62204930360296

In [14]:
# Total score over all cases in G for the hub-score ranking, with wall-clock timing.
time1 = time.time()

score_M = sum(
    calculate_score_for_case(vertex.index, case_tuple_dict)
    for vertex in G.vs()
)

time2 = time.time()
total_time = str(time2 - time1)
print('took ' + total_time)
print('')
print(score_M)

took 203.955000162

193326.407438


# For Authorities (directed)

In [15]:
time1 = time.time()  # start time is recorded, but this cell never reports the elapsed time

# Maps cutoff year -> list of (subgraph vertex index, case name, year, authority score)
# tuples for every case before that year, sorted by authority score (highest first).
case_tuple_dict = {}

for i in range(min_year, max_year+2):
    sub_G = subgraph_dict[i]

    if sub_G.vcount() == 0:
        # No past cases for this cutoff year: the ranking is empty.
        case_tuple_dict[i] = []
        continue

    # One authority score per vertex, in vertex-index order.
    metric_list = sub_G.authority_score()

    # Bulk attribute access replaces the per-vertex Python loop.
    igraph_index_list = range(sub_G.vcount())  # subgraph-local indices (not the indices used in G)
    id_list = sub_G.vs['name']
    year_list = sub_G.vs['year']

    tuple_list = zip(igraph_index_list, id_list, year_list, metric_list)
    # sorted() is stable, so cases with equal scores stay in vertex-index order.
    sorted_tuple_list = sorted(tuple_list, key=lambda tup: tup[3], reverse=True)

    case_tuple_dict[i] = sorted_tuple_list

In [16]:
# G.neighbors(10, mode='OUT') are the cases that case c actually cites

#ranks = []

def calculate_score_for_case(case_index, case_tuple_dict, graph=None):
    """Score how highly the ranking places the cases that one case cites.

    The ranking consulted is ``case_tuple_dict[year]``, where ``year`` is the
    citing case's own 'year' attribute. Each cited case found at 1-based rank
    ``r`` in that ranking of ``n`` entries contributes ``1 - r/n``; cited
    cases that do not appear in the ranking contribute nothing.

    Parameters
    ----------
    case_index : int
        Vertex index of the citing case in the graph.
    case_tuple_dict : dict
        Maps a year to a list of tuples whose element ``[1]`` is a case name;
        list position 0 is treated as rank 1.
    graph : optional
        Citation graph providing ``vs`` and ``neighbors``. Defaults to the
        notebook-level ``G``.

    Returns
    -------
    The sum of the per-citation scores (0 if no cited case is ranked).

    Raises
    ------
    KeyError
        If the case's year is not a key of ``case_tuple_dict``.
    """
    if graph is None:
        graph = G

    all_past_cases = case_tuple_dict[graph.vs[case_index]['year']]

    # Match on 'name' because graph and subgraph index their vertices
    # differently. A set gives O(1) membership tests instead of a list scan
    # for every ranked case.
    cited_names = set(graph.vs[n]['name']
                      for n in graph.neighbors(case_index, mode='OUT'))

    # float() keeps this a true division even when the notebook's
    # `from __future__ import division` cell has not been executed.
    total = float(len(all_past_cases))

    return sum(1 - (position + 1) / total
               for position, entry in enumerate(all_past_cases)
               if entry[1] in cited_names)
    
# Spot check: score the case at vertex index 10 of G against the authority-score ranking.
calculate_score_for_case(10, case_tuple_dict)

16.64755116720068

In [17]:
# Total score over all cases in G for the authority-score ranking, with wall-clock timing.
time1 = time.time()

per_case_scores = (calculate_score_for_case(vertex.index, case_tuple_dict)
                   for vertex in G.vs())
score_M = sum(per_case_scores)

time2 = time.time()
total_time = str(time2 - time1)
print('took ' + total_time)
print('')
print(score_M)

took 185.203000069

184295.706751


# Convert subgraphs to undirected

In [3]:
# Rebuild the per-year subgraphs of past cases (year strictly below the
# cutoff), this time converting each one to its undirected form.
time1 = time.time()

min_year = 1754
max_year = 2016

subgraph_dict = {}
for cutoff_year in range(min_year, max_year + 2):
    past_vertices = G.vs.select(year_lt=cutoff_year)
    directed_sub = G.subgraph(past_vertices)
    subgraph_dict[cutoff_year] = directed_sub.as_undirected()

time2 = time.time()

# Print the final undirected subgraph next to the full directed graph for comparison.
print(subgraph_dict[2017].summary())
print(G.summary())
print("This took " + str(time2 - time1) + " seconds")

IGRAPH UN-- 33248 249878 -- 
+ attr: court (v), name (v), year (v)
IGRAPH DN-- 33248 250465 -- 
+ attr: court (v), name (v), year (v)
This took 21.0690000057 seconds


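### NOTE: the undirected graph has fewer edges than the directed one (249878 vs. 250465, see the summaries above). This is probably because pairs of cases that cite each other collapse into a single undirected edge — likely the same problem seen before, where the same cases cite each other but appear under different case IDs.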

# For Eigenvector Centrality (Undirected)

In [4]:
time1 = time.time()

# Maps cutoff year -> list of (subgraph vertex index, case name, year, eigenvector centrality)
# tuples for every case before that year, sorted by centrality (highest first).
case_tuple_dict = {}

for i in range(min_year, max_year+2):
    sub_G = subgraph_dict[i]

    if sub_G.vcount() == 0:
        # No past cases for this cutoff year: the ranking is empty.
        case_tuple_dict[i] = []
        continue

    # One centrality value per vertex, in vertex-index order.
    metric_list = sub_G.eigenvector_centrality()

    # Bulk attribute access replaces the per-vertex Python loop.
    igraph_index_list = range(sub_G.vcount())  # subgraph-local indices (not the indices used in G)
    id_list = sub_G.vs['name']
    year_list = sub_G.vs['year']

    tuple_list = zip(igraph_index_list, id_list, year_list, metric_list)
    # sorted() is stable, so cases with equal scores stay in vertex-index order.
    sorted_tuple_list = sorted(tuple_list, key=lambda tup: tup[3], reverse=True)

    case_tuple_dict[i] = sorted_tuple_list

time2 = time.time()

print('took ' + str(time2-time1) + ' seconds')

took 94.4189999104 seconds


In [5]:
# G.neighbors(10, mode='OUT') are the cases that case c actually cites

#ranks = []

def calculate_score_for_case(case_index, case_tuple_dict, graph=None):
    """Score how highly the ranking places the cases that one case cites.

    The ranking consulted is ``case_tuple_dict[year]``, where ``year`` is the
    citing case's own 'year' attribute. Each cited case found at 1-based rank
    ``r`` in that ranking of ``n`` entries contributes ``1 - r/n``; cited
    cases that do not appear in the ranking contribute nothing.

    Parameters
    ----------
    case_index : int
        Vertex index of the citing case in the graph.
    case_tuple_dict : dict
        Maps a year to a list of tuples whose element ``[1]`` is a case name;
        list position 0 is treated as rank 1.
    graph : optional
        Citation graph providing ``vs`` and ``neighbors``. Defaults to the
        notebook-level ``G``.

    Returns
    -------
    The sum of the per-citation scores (0 if no cited case is ranked).

    Raises
    ------
    KeyError
        If the case's year is not a key of ``case_tuple_dict``.
    """
    if graph is None:
        graph = G

    all_past_cases = case_tuple_dict[graph.vs[case_index]['year']]

    # Match on 'name' because graph and subgraph index their vertices
    # differently. A set gives O(1) membership tests instead of a list scan
    # for every ranked case.
    cited_names = set(graph.vs[n]['name']
                      for n in graph.neighbors(case_index, mode='OUT'))

    # float() keeps this a true division even when the notebook's
    # `from __future__ import division` cell has not been executed.
    total = float(len(all_past_cases))

    return sum(1 - (position + 1) / total
               for position, entry in enumerate(all_past_cases)
               if entry[1] in cited_names)
    
# Spot check: score the case at vertex index 10 of G against the eigenvector-centrality ranking.
calculate_score_for_case(10, case_tuple_dict)

18.329039429804485

In [6]:
# Total score over all cases in G for the eigenvector-centrality ranking, with wall-clock timing.
time1 = time.time()

score_M = sum(
    calculate_score_for_case(vertex.index, case_tuple_dict)
    for vertex in G.vs()
)

time2 = time.time()
total_time = str(time2 - time1)
print('took ' + total_time)
print('')
print(score_M)

took 366.523000002

198567.34934


# For Closeness Centrality (undirected)

In [None]:
time1 = time.time()

# Maps cutoff year -> list of (subgraph vertex index, case name, year, closeness)
# tuples for every case before that year, sorted by closeness (highest first).
case_tuple_dict = {}

for i in range(min_year, max_year+2):
    sub_G = subgraph_dict[i]

    if sub_G.vcount() == 0:
        # No past cases for this cutoff year: the ranking is empty.
        case_tuple_dict[i] = []
        continue

    # One closeness value per vertex, in vertex-index order.
    metric_list = sub_G.closeness()

    # Bulk attribute access replaces the per-vertex Python loop.
    igraph_index_list = range(sub_G.vcount())  # subgraph-local indices (not the indices used in G)
    id_list = sub_G.vs['name']
    year_list = sub_G.vs['year']

    tuple_list = zip(igraph_index_list, id_list, year_list, metric_list)
    # sorted() is stable, so cases with equal scores stay in vertex-index order.
    sorted_tuple_list = sorted(tuple_list, key=lambda tup: tup[3], reverse=True)

    case_tuple_dict[i] = sorted_tuple_list

time2 = time.time()

print('took ' + str(time2-time1) + ' seconds')

In [None]:
# G.neighbors(10, mode='OUT') are the cases that case c actually cites

#ranks = []

def calculate_score_for_case(case_index, case_tuple_dict, graph=None):
    """Score how highly the ranking places the cases that one case cites.

    The ranking consulted is ``case_tuple_dict[year]``, where ``year`` is the
    citing case's own 'year' attribute. Each cited case found at 1-based rank
    ``r`` in that ranking of ``n`` entries contributes ``1 - r/n``; cited
    cases that do not appear in the ranking contribute nothing.

    Parameters
    ----------
    case_index : int
        Vertex index of the citing case in the graph.
    case_tuple_dict : dict
        Maps a year to a list of tuples whose element ``[1]`` is a case name;
        list position 0 is treated as rank 1.
    graph : optional
        Citation graph providing ``vs`` and ``neighbors``. Defaults to the
        notebook-level ``G``.

    Returns
    -------
    The sum of the per-citation scores (0 if no cited case is ranked).

    Raises
    ------
    KeyError
        If the case's year is not a key of ``case_tuple_dict``.
    """
    if graph is None:
        graph = G

    all_past_cases = case_tuple_dict[graph.vs[case_index]['year']]

    # Match on 'name' because graph and subgraph index their vertices
    # differently. A set gives O(1) membership tests instead of a list scan
    # for every ranked case.
    cited_names = set(graph.vs[n]['name']
                      for n in graph.neighbors(case_index, mode='OUT'))

    # float() keeps this a true division even when the notebook's
    # `from __future__ import division` cell has not been executed.
    total = float(len(all_past_cases))

    return sum(1 - (position + 1) / total
               for position, entry in enumerate(all_past_cases)
               if entry[1] in cited_names)
    
# Spot check: score the case at vertex index 10 of G against the closeness ranking.
calculate_score_for_case(10, case_tuple_dict)

In [None]:
# Total score over all cases in G for the closeness ranking, with wall-clock timing.
time1 = time.time()

per_case_scores = (calculate_score_for_case(vertex.index, case_tuple_dict)
                   for vertex in G.vs())
score_M = sum(per_case_scores)

time2 = time.time()
total_time = str(time2 - time1)
print('took ' + total_time)
print('')
print(score_M)