In [2]:

import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig
from collections import *
from __future__ import division

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# this will be a little slow the first time you run it
G = load_citation_network_igraph(data_dir, court_name)

print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


In [4]:
time1 = time.time()

U = G.as_undirected()

poop = U.closeness()

time2 = time.time()

print time2-time1
print len(poop)


438.924000025
33248


In [5]:
time1 = time.time()
poop2 = U.betweenness()
time2 = time.time()

print time2-time1
print len(poop2)

805.056999922
33248


In [6]:
time1 = time.time()
poop3 = U.eigenvector_centrality()
time2 = time.time()

print time2-time1
print len(poop3)

0.677000045776
33248


In [3]:
print G.vs[0]

igraph.Vertex(<igraph.Graph object at 0x000000000B652138>,0,{'court': 'scotus', 'name': 100000, 'year': 1922})


In [4]:
G.vs[0]['court']

'scotus'

In [5]:
G.vs[0].indegree()

1

In [6]:
year_list = []
indegree_list = []
id_list = []
igraph_index_list = []
for vertex in G.vs():
    year_list.append(vertex['year'])
    indegree_list.append(vertex.indegree())
    id_list.append(vertex['name'])
    igraph_index_list.append(vertex.index)

tuple_list = zip(igraph_index_list, id_list, year_list, indegree_list)

# keys = all the years, values = metadata of all past cases

In [21]:
min_year = np.amin(year_list)
max_year = np.amax(year_list)

past_now_cases_dict = OrderedDict()
for i in range (min_year,max_year+1):
    past_now_cases_dict[i] = []

for case in tuple_list:
    past_now_cases_dict[case[2]].append(case)

for i in range (min_year+1,max_year+1):
    past_now_cases_dict[i].extend(past_now_cases_dict[i-1])
    past_now_cases_dict[i] = sorted(past_now_cases_dict[i], key=lambda tup: tup[3], reverse=True)

past_cases_dict = past_now_cases_dict
'''
past_cases_dict = OrderedDict()
for i in range (min_year,max_year+1):
    past_cases_dict[i+1] = past_now_cases_dict[i]
'''

'\npast_cases_dict = OrderedDict()\nfor i in range (min_year,max_year+1):\n    past_cases_dict[i+1] = past_now_cases_dict[i]\n'

# random testing

In [23]:
past_cases_dict[1759]

[(18427, 84581, 1754, 2), (18428, 84582, 1759, 0)]

In [24]:
print G.neighbors(10, mode='OUT')

test_tuple_list = []
for i in G.neighbors(10, mode='OUT'):
    test_tuple_list.append((i, G.vs[i]['name'], G.vs[i]['year'], G.vs[i].indegree()))
    
print
print test_tuple_list
print
print len(test_tuple_list)

[17550, 17551, 23415, 24367, 26214, 26215, 27003, 27524, 28745, 28994, 29328, 29379, 29382, 29592, 30042, 30135, 30632, 30814, 30862, 31150, 31247, 31331, 31805, 32339, 32840, 32878, 33161]

[(17550, 2516136, 1914, 13), (17551, 2516269, 1902, 28), (23415, 90006, 1879, 24), (24367, 90965, 1883, 39), (26214, 92867, 1890, 19), (26215, 92868, 1890, 7), (27003, 93665, 1893, 88), (27524, 94194, 1895, 16), (28745, 95430, 1901, 4), (28994, 95682, 1902, 9), (29328, 96023, 1904, 3), (29379, 96076, 1904, 14), (29382, 96079, 1904, 4), (29592, 96296, 1905, 35), (30042, 96750, 1908, 25), (30135, 96845, 1908, 2), (30632, 97348, 1911, 33), (30814, 97533, 1912, 3), (30862, 97581, 1912, 11), (31150, 97876, 1913, 18), (31247, 97975, 1913, 27), (31331, 98060, 1914, 15), (31805, 98536, 1915, 7), (32339, 99079, 1918, 4), (32840, 99588, 1920, 1), (32878, 99626, 1920, 15), (33161, 99911, 1922, 7)]

27


# 2nd part: get the cases that case c actually cites

# Problem: past_(now)_cases dict can include future cases..., so the ranking is a bit skewed to the right, when observing the cited cases for case C

In [25]:
# G.neighbors(10, mode='OUT') are the cases that case c actually cites

#ranks = []
for key, value in past_cases_dict.iteritems():
    if key == G.vs[10]['year']:
        all_past_cases = value

print len(all_past_cases)
print all_past_cases[0]



ranks = [i+1 for i, v in enumerate(all_past_cases) if v[0] in G.neighbors(10, mode='OUT')]

print len(ranks)
print ranks

scores = []
# score_i = 1 - r_i/ |B_c| (where |B_c| is the number of cases that came before case c)
for some_rank in ranks:
    some_score = 1-some_rank/len(all_past_cases)
    scores.append(some_score)
    
final_score = sum(scores)

print len(scores)
print scores
print 
print final_score


15487
(29701, 96405, 1906, 608)
27
[39, 323, 392, 450, 634, 669, 792, 887, 1310, 1378, 1690, 1769, 1784, 2006, 2173, 2676, 3396, 4146, 4211, 4444, 6235, 6455, 6487, 7311, 7427, 8572, 9846]
27
[0.9974817588945567, 0.9791437980241493, 0.9746884483760574, 0.9709433718602699, 0.9590624394653581, 0.9568024794989346, 0.948860334474075, 0.9427261574223542, 0.915412926971008, 0.911022147607671, 0.8908762187641247, 0.8857751662684832, 0.8848066119971589, 0.8704720087815587, 0.8596887712274811, 0.8272099179957384, 0.7807193129721702, 0.7322915994059533, 0.7280945308968813, 0.7130496545489766, 0.5974042745528507, 0.5831988119067605, 0.5811325627946019, 0.5279266481565184, 0.5204364951249435, 0.4465035190805191, 0.36424097630270547]

21.3499709434


In [26]:
# G.neighbors(10, mode='OUT') are the cases that case c actually cites

#ranks = []

def calculate_score_for_case(case_index, past_cases_dict):
    for key, value in past_cases_dict.iteritems():
        if key == G.vs[case_index]['year']:
            all_past_cases = value

    ranks = [i+1 for i, v in enumerate(all_past_cases) if v[0] in G.neighbors(case_index, mode='OUT')]

    scores = []
    for some_rank in ranks:
        some_score = 1-some_rank/len(all_past_cases)
        scores.append(some_score)

    final_score = sum(scores)
    
    return final_score
    
calculate_score_for_case(10, past_cases_dict)

21.349970943371865

In [27]:
time1 = time.time()

score_M = 0
for i in G.vs():
    score_M += calculate_score_for_case(i.index, past_cases_dict)
    
time2=time.time()
total_time = str(time2-time1)
print 'took ' + total_time
print
print score_M

took 697.003000021

213733.884905
