In [1]:

import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig
from collections import *
from __future__ import division

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

In [2]:
# Load the citation network for the configured court.
# (Original note: this is a little slow the first time it is run.)
G = load_citation_network_igraph(data_dir, court_name)

print('loaded {0} network with {1:d} cases and {2:d} edges'.format(court_name, len(G.vs), len(G.es)))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


In [3]:
print G.vs[0]

igraph.Vertex(<igraph.Graph object at 0x000000000B652138>,0,{'court': 'scotus', 'name': 100000, 'year': 1922})


In [4]:
G.vs[0]['court']

'scotus'

In [5]:
G.vs[0].indegree()

1

### Subgraph construction (for each year, keep only the cases decided in earlier years)

In [9]:
# Build, for every year, the subgraph of G restricted to cases whose 'year'
# attribute is strictly smaller than that year. subgraph_dict[y] is therefore
# the network of "past" cases as seen from year y.
time1 = time.time()
subgraph_dict = {}

min_year = 1754
max_year = 2016

# Keys run from min_year through max_year + 1. The extra final key selects
# year < max_year + 1, i.e. every case dated max_year or earlier.
for i in range(min_year, max_year + 2):
    sub_vs = G.vs.select(year_lt=i)  # only the past cases
    sub_G = G.subgraph(sub_vs)
    subgraph_dict[i] = sub_G

time2 = time.time()

# Sanity check: compare the last subgraph with the full network. The key is
# derived from max_year so it cannot drift out of sync with the loop above.
print(subgraph_dict[max_year + 1].summary())
print(G.summary())
print("This took " + str(time2 - time1) + " seconds")

IGRAPH DN-- 33248 250465 -- 
+ attr: court (v), name (v), year (v)
IGRAPH DN-- 33248 250465 -- 
+ attr: court (v), name (v), year (v)
This took 4.51099991798 seconds


In [10]:
# Compare one case's in-degree in the full network with its in-degree in the
# subgraph of cases decided before 1973.
example_case = G.vs[8000]
example_name = example_case['name']
print("In the whole network case " + str(example_name) + " has an indegree of: " + str(example_case.indegree()))

# The case is located in the subgraph by its 'name' attribute rather than by
# vertex index, since subgraphs renumber their vertices.
for v in subgraph_dict[1973].vs:
    if v['name'] != example_name:
        continue
    print("In the sub-network of cases before 1973 case " + str(v['name']) + " has an indegree of: " + str(v.indegree()))

In the whole network case 108465 has an indegree of: 11
In the sub-network of cases before 1973 case 108465 has an indegree of: 2


In [39]:
# For every year, rank the cases that precede it.
# case_tuple_dict[year] is a list of
#     (vertex index within that year's subgraph, case id, case year, metric)
# tuples ordered from the highest metric to the lowest, so a case's rank is its
# position in the list + 1.
case_tuple_dict = {}

# Iterate over the subgraphs that were actually built, so this cell cannot fall
# out of sync with the year range used to create subgraph_dict.
for i, sub_G in subgraph_dict.items():
    # The metric is in-degree. To rank by PageRank instead, take the metric
    # values from sub_G.pagerank().
    # Iterating sub_G.vs visits the vertices in index order, so no explicit
    # index loop is needed.
    tuple_list = [
        (vertex.index, vertex['name'], vertex['year'], vertex.indegree())
        for vertex in sub_G.vs
    ]

    # sorted() is stable (also with reverse=True), so cases with an equal
    # metric keep their vertex order.
    sorted_tuple_list = sorted(tuple_list, key=lambda tup: tup[3], reverse=True)

    case_tuple_dict[i] = sorted_tuple_list

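# Note: the igraph index stored in each tuple is only valid inside that year's subgraph,
# because G.subgraph() renumbers the vertices. Match cases across graphs by case id ('name').

# Ad-hoc sanity checks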

In [40]:
case_tuple_dict[1759]

[(0, 84581, 1754, 0)]

In [41]:
G.vs[0]

igraph.Vertex(<igraph.Graph object at 0x000000000B6E5138>,0,{'court': 'scotus', 'name': 100000, 'year': 1922})

In [42]:
G.vs[10]

igraph.Vertex(<igraph.Graph object at 0x000000000B6E5138>,10,{'court': 'scotus', 'name': 100010, 'year': 1922})

In [43]:
# Inspect the out-neighbors of vertex 10 (the cases it cites): for each one,
# record its vertex index, case id, year and in-degree in the full network.
cited_indices = G.neighbors(10, mode='OUT')
print(cited_indices)

test_tuple_list = [
    (i, G.vs[i]['name'], G.vs[i]['year'], G.vs[i].indegree())
    for i in cited_indices
]

print('')
print(test_tuple_list)
print('')
print(len(test_tuple_list))

[17550, 17551, 23415, 24367, 26214, 26215, 27003, 27524, 28745, 28994, 29328, 29379, 29382, 29592, 30042, 30135, 30632, 30814, 30862, 31150, 31247, 31331, 31805, 32339, 32840, 32878, 33161]

[(17550, 2516136, 1914, 13), (17551, 2516269, 1902, 28), (23415, 90006, 1879, 24), (24367, 90965, 1883, 39), (26214, 92867, 1890, 19), (26215, 92868, 1890, 7), (27003, 93665, 1893, 88), (27524, 94194, 1895, 16), (28745, 95430, 1901, 4), (28994, 95682, 1902, 9), (29328, 96023, 1904, 3), (29379, 96076, 1904, 14), (29382, 96079, 1904, 4), (29592, 96296, 1905, 35), (30042, 96750, 1908, 25), (30135, 96845, 1908, 2), (30632, 97348, 1911, 33), (30814, 97533, 1912, 3), (30862, 97581, 1912, 11), (31150, 97876, 1913, 18), (31247, 97975, 1913, 27), (31331, 98060, 1914, 15), (31805, 98536, 1915, 7), (32339, 99079, 1918, 4), (32840, 99588, 1920, 1), (32878, 99626, 1920, 15), (33161, 99911, 1922, 7)]

27


# Part 2: rank the cases that case c actually cites, then score c from those ranks

In [44]:
# Worked example of the citation score for a single case c (vertex 10).
# G.neighbors(10, mode='OUT') are the cases that case c actually cites.

# Ranked list of the cases from years before case c's year.
all_past_cases = case_tuple_dict[G.vs[10]['year']]

print(len(all_past_cases))
print(all_past_cases[0])
print('')


neighbors = G.neighbors(10, mode='OUT')
neighbors_names = [G.vs[idx]['name'] for idx in neighbors]
print(neighbors_names)

# Rank = 1-based position in the ranked list (enumerate() starts at 0, hence +1).
# Cases are matched by case id (tuple field 1), not by vertex index.
# A cited case that is missing from all_past_cases gets no rank; this happens
# for a cited case from the same year, which the year_lt filter leaves out.
ranks = [
    position + 1
    for position, case_tuple in enumerate(all_past_cases)
    if case_tuple[1] in neighbors_names
]

print(len(ranks))
print(ranks)

print('')

# score_i = 1 - r_i/ |B_c| (where |B_c| is the number of cases that came before case c)
# The division is true division: `from __future__ import division` is in effect.
scores = [1 - some_rank/len(all_past_cases) for some_rank in ranks]

final_score = sum(scores)

print(len(scores))
print(scores)
print('')
print(final_score)


15293
(1107, 85131, 1816, 107)

[2516136, 2516269, 90006, 90965, 92867, 92868, 93665, 94194, 95430, 95682, 96023, 96076, 96079, 96296, 96750, 96845, 97348, 97533, 97581, 97876, 97975, 98060, 98536, 99079, 99588, 99626, 99911]
26
[77, 269, 814, 867, 1296, 2497, 2540, 3128, 3782, 3871, 4528, 4848, 5913, 5924, 7270, 7280, 7471, 7657, 9148, 9149, 9591, 9848, 10197, 14346, 15035, 15062]

26
[0.9949650166742954, 0.9824102530569542, 0.9467730334139802, 0.9433073955404433, 0.9152553455829464, 0.8367226835807232, 0.8339109396455895, 0.7954619760674818, 0.7526973124959131, 0.7468776564441247, 0.7039168246910351, 0.682992218662133, 0.6133525142221932, 0.6126332308899496, 0.5246191067808801, 0.523965212842477, 0.5114758386189759, 0.49931341136467666, 0.40181782514876085, 0.4017524357549206, 0.3728503236774995, 0.3560452494605375, 0.33322435101026615, 0.061923755966782235, 0.016870463610802355, 0.015104949977113757]

15.3802393252


In [45]:
# G.neighbors(10, mode='OUT') are the cases that case c actually cites

#ranks = []

def calculate_score_for_case(case_index, case_tuple_dict, graph=None):
    """Score how highly ranked the cases cited by one case are.

    Every cited case that appears in the ranked list for the citing case's
    year contributes ``1 - rank / |B_c|``, where ``rank`` is its 1-based
    position in that list and ``|B_c|`` is the length of the list. The
    contributions are summed.

    Parameters
    ----------
    case_index : int
        Vertex index of the citing case in ``graph``.
    case_tuple_dict : dict
        Maps a year to a list of tuples whose second field is a case id,
        ordered from best-ranked to worst-ranked.
    graph : optional
        Citation network to read from. It must provide ``vs[index]`` with
        ``'name'`` and ``'year'`` attributes and ``neighbors(index, mode='OUT')``.
        Defaults to the notebook-level graph ``G``.

    Returns
    -------
    number
        Sum of the per-citation scores; ``0`` when no cited case is found in
        the ranked list (including when that list is empty).

    Raises
    ------
    KeyError
        If the citing case's year is not a key of ``case_tuple_dict``.
    """
    if graph is None:
        graph = G  # fall back to the notebook's global citation network

    all_past_cases = case_tuple_dict[graph.vs[case_index]['year']]

    # Case ids of the cited cases, held in a set so that each membership test
    # below is O(1) instead of a scan over the list of citations.
    cited_names = set(
        graph.vs[i]['name'] for i in graph.neighbors(case_index, mode='OUT')
    )

    # float() keeps this a true division even without
    # `from __future__ import division`.
    num_past_cases = float(len(all_past_cases))

    # Cited cases that are absent from all_past_cases are skipped.
    return sum(
        1 - (position + 1) / num_past_cases
        for position, case_tuple in enumerate(all_past_cases)
        if case_tuple[1] in cited_names
    )
    
calculate_score_for_case(10, case_tuple_dict)

15.380239325181453

In [47]:
# Total score: add up calculate_score_for_case over every vertex of G, and
# report how long the pass took.
time1 = time.time()

score_M = 0
for vertex in G.vs:
    score_M += calculate_score_for_case(vertex.index, case_tuple_dict)

time2 = time.time()
total_time = str(time2 - time1)
print('took ' + total_time)
print('')
print(score_M)

took 164.760999918

174094.901772
