In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from collections import *

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl
#import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

In [11]:
def time_decay_indegree(graph, vertex, threshold=10):
    """Count the in-neighbors of ``vertex`` that fall inside a year window.

    An in-neighbor is counted when its ``"year"`` attribute minus the
    ``"year"`` attribute of ``vertex`` is less than or equal to ``threshold``.
    The difference is signed, not absolute, so an in-neighbor with an earlier
    year than ``vertex`` is always counted.

    Parameters
    ----------
    graph : object exposing ``neighbors(index, mode='IN')`` and an indexable
        ``vs`` sequence whose items support ``["year"]``.
    vertex : object exposing ``.index`` and ``["year"]``.
    threshold : maximum allowed year difference (inclusive); defaults to 10.

    Returns
    -------
    int
        Number of in-neighbors satisfying the year condition.
    """
    cited_year = vertex["year"]
    citing_ids = graph.neighbors(vertex.index, mode='IN')
    # Each qualifying in-edge contributes exactly 1 to the total.
    return sum(
        1
        for citing_id in citing_ids
        if graph.vs[citing_id]["year"] - cited_year <= threshold
    )

In [12]:
# Load the citation network for the configured court and report its size.
G = load_citation_network_igraph(data_dir, court_name)
n_cases = len(G.vs)
n_edges = len(G.es)
print('loaded %s network with %d cases and %d edges' % (court_name, n_cases, n_edges))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


In [13]:
# Create a sub-network of cases to reduce runtime: keep only vertices whose
# year is strictly between 1939 and 1951 (1940-1950 when years are integers).
sub_vs = G.vs.select(year_lt=1951)
sub_vs = sub_vs.select(year_gt=1939)
sub_G = G.subgraph(sub_vs)
# ig.summary() writes the summary to stdout itself and returns None, so it
# must not be wrapped in print (that emitted a stray "None" line).
ig.summary(sub_G)

IGRAPH DN-- 1593 5635 -- 
+ attr: court (v), name (v), year (v)
None


In [14]:
time1 = time.time()
# Per-vertex statistics for the whole sub-network; both lists are looked up
# by vertex index below.
indegrees = sub_G.indegree()
pageranks = sub_G.pagerank()

# Build one record per case, laid out as
#   (index, out-neighbors, year, indegree, pagerank,
#    decay indegree, log indegree, log decay indegree)
# Later cells read these fields by position, so the layout must not change.
case_tuples = []
for vertex in sub_G.vs:
    index = vertex.index
    # Deliberately not called `neighbors`: that name is the sklearn module
    # imported at the top of the notebook, and this cell should not rebind it.
    out_neighbors = sub_G.neighbors(index, mode='OUT')
    year = vertex["year"]
    indegree = indegrees[index]
    # The +1 keeps a zero count finite: log10(0 + 1) == 0.
    log_indegree = np.log10(indegree + 1)
    decay_indegree = time_decay_indegree(sub_G, vertex)
    log_decay_indegree = np.log10(decay_indegree + 1)
    pagerank = pageranks[index]
    case_tuples.append(
        (index, out_neighbors, year, indegree, pagerank,
         decay_indegree, log_indegree, log_decay_indegree)
    )

# Sort the cases by year (field 2) so that earlier cases come first.
sorted_case_tuples = sorted(case_tuples, key=lambda case: case[2])

time2 = time.time()
print("this took " + str(time2 - time1) + " seconds")

this took 0.0260000228882 seconds


In [None]:
time1 = time.time()
# Use the year-sorted list of case tuples to build one row per candidate
# (citing, cited) pair, labelled 1 when the citing case has an out-edge to the
# cited case and 0 otherwise.
# Pairs whose cases share the same year are skipped, so the result is an
# approximation of the full set of possible edges.
edge_count = 0
edge_tuples = []

# Go down the sorted list; for each citing case walk back up through every
# earlier entry (only cases with a lower year can possibly be cited).
for i in range(len(sorted_case_tuples)):
    citing_case = sorted_case_tuples[i]
    citing_index = citing_case[0]
    # A set gives constant-time membership tests in the inner loop. It is not
    # named `neighbors` so the sklearn module of that name is not rebound.
    citing_out_neighbors = set(citing_case[1])
    citing_year = citing_case[2]

    # Start at i - 1: the entry at i is the citing case itself, whose time
    # difference is 0 and which was therefore always discarded.
    # Stop at index 0 inclusive: the earlier range(i, 0, -1) never reached
    # the first entry, so the earliest case was never a candidate.
    for j in range(i - 1, -1, -1):
        cited_case = sorted_case_tuples[j]
        cited_index = cited_case[0]
        cited_year = cited_case[2]

        time_difference = citing_year - cited_year
        # Same-year pairs are excluded from the data set entirely.
        if time_difference == 0:
            continue

        cited_indegree = cited_case[3]
        cited_pagerank = cited_case[4]
        cited_decay_indegree = cited_case[5]
        cited_log_indegree = cited_case[6]
        cited_log_decay_indegree = cited_case[7]

        # An out-edge of the citing case pointing at the candidate means the
        # citation exists. edge_count only counts edges that are recorded, so
        # it always equals the number of rows labelled 1.
        if cited_index in citing_out_neighbors:
            edge = 1
            edge_count += 1
        else:
            edge = 0

        edge_tuples.append(
            (citing_index, cited_index, time_difference,
             cited_indegree, cited_log_indegree,
             cited_decay_indegree, cited_log_decay_indegree,
             cited_pagerank, edge)
        )

print("number of vertices: " + str(len(sorted_case_tuples)))
print("number of edges: " + str(edge_count))
print("number of possible edges: " + str(len(edge_tuples)))
time2 = time.time()
print("this took " + str(time2 - time1) + " seconds")

In [None]:
time1 = time.time()
# Column labels, one per field of an edge tuple and in the same order.
list_of_column_names = [
    "citing index",
    "cited index",
    "time difference",
    "cited indegree",
    "cited log indegree",
    "cited decay indegree",
    "cited log decay indegree",
    "cited pagerank",
    "edge",
]
# Build the frame once from the full list of tuples.
df = pd.DataFrame(data=edge_tuples, columns=list_of_column_names)
print(df)
time2 = time.time()
print("this took " + str(time2 - time1) + " seconds")

In [None]:
time1 = time.time()
# Set up training data: the label is the 0/1 `edge` column, the features are
# the year gap and the cited case's time-decayed indegree.
y_train = df['edge']
x_train = df[['time difference', 'cited decay indegree']]

# Fit the logistic regression.
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(x_train, y_train)

# Single-argument print calls produce the same text as the former
# comma-separated print statements.
print('classes:  %s' % (clf.classes_,))
print('coefficients:  %s' % (clf.coef_,))
print('intercept : %s' % (clf.intercept_,))
time2 = time.time()
print("this took " + str(time2 - time1) + " seconds")

time1 = time.time()
# Probability matrix with one column per entry of clf.classes_; with 0/1
# labels, column 0 is "no edge" and column 1 is "edge".
prob = clf.predict_proba(x_train)

# Predicted probability of an edge (class 1) for every row, kept as a column.
prob_up = prob[:, 1:2]
# The same probabilities as a flat Python list of floats.
prob_up2 = prob[:, 1].tolist()

# Predict an edge only when its probability is strictly above 0.5.
y_predicted = [1 if p > 0.5 else 0 for p in prob_up2]
time2 = time.time()
print("this took " + str(time2 - time1) + " seconds")

time1 = time.time()
# Calculate the 0-1 loss: the fraction of rows whose prediction is wrong.
# The earlier code divided the number of correct predictions by the total,
# which is the accuracy (1 - loss), not the loss.
right_prediction = [actual for actual, predicted in zip(y_train, y_predicted)
                    if actual == predicted]
number_right = len(right_prediction)
number_wrong = len(y_predicted) - number_right
# float() keeps this a true division even without `from __future__ import division`.
zero_one_loss = number_wrong / float(len(y_predicted))
print("L1 (0-1 loss):  %s" % (zero_one_loss,))
time2 = time.time()
print("this took " + str(time2 - time1) + " seconds")