In [26]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from collections import *

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl
#import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
G = load_citation_network_igraph(data_dir, court_name)
print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


# Focus on 1940-1950

In [28]:
#create a sub-network of cases to reduce runtime
sub_vs = G.vs.select(year_lt=1951)
sub_vs = sub_vs.select(year_gt=1939)
sub_G = G.subgraph(sub_vs)
print ig.summary(sub_G)

IGRAPH DN-- 1593 5635 -- 
+ attr: court (v), name (v), year (v)
None


# time decay indegree method

In [29]:
def time_decay_indegree(graph, vertex, threshold=10):
    td_indeg = 0
    vertex_year = vertex["year"]
    neighbors = graph.neighbors(vertex.index, mode='IN')
    for neighbor in neighbors:
        neighbor_year = graph.vs[neighbor]["year"]
        if neighbor_year - vertex_year <= threshold:
            td_indeg += 1
    return td_indeg

# Get each node's information (index, neighbors node cites, indeg, pagerank, etc.)

In [30]:
time1 = time.time()
#get lists of statistics for all cases
indegrees = sub_G.indegree()
pageranks = sub_G.pagerank()

#fill a list of cases with required info that will be sorted by year
case_tuples = []
for vertex in sub_G.vs:
    index = vertex.index
    neighbors = sub_G.neighbors(index, mode='OUT')
    year = vertex["year"]
    indegree = indegrees[index]
    decay_indegree = time_decay_indegree(sub_G, vertex, 5) # make sure this threshold is <range of years we're evaluating scotus on
    pagerank = pageranks[index]
    case_tuple = (index, neighbors, year, indegree, pagerank, decay_indegree)
    case_tuples.append(case_tuple)

#sort the list of cases by year
sorted_case_tuples = sorted(case_tuples, key=lambda x: x[2])

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 0.0119998455048 seconds


# Create giant tuple with all the information for data table, later used for logsitic regression

In [31]:
time1 = time.time()
#uses the list of case tuples to fill a list of (all possible) edge tuples with approriate information
#currently does not include cases with the same year (so it is an approximation)
edge_count = 0
edge_tuples = []

#go down the list of cases, then create possible edges by going back up the list
#(since only cases with a lower year can be possibly cited)
for i in range(0,len(sorted_case_tuples)):
    citing_case = sorted_case_tuples[i]
    citing_index = citing_case[0]
    neighbors = citing_case[1]
    citing_year = citing_case[2]
    for j in range(i,0,-1):
        cited_case = sorted_case_tuples[j]
        cited_index = cited_case[0]
        cited_year = cited_case[2]
        
        time_difference = citing_year - cited_year
        cited_indegree = cited_case[3]
        cited_pagerank = cited_case[4]
        cited_decay_indegree = cited_case[5]
        #if one of the out-edges of the citing case points to the (possible) cited case, then an edge is there (1)
        if cited_index in neighbors:
            edge = 1
            edge_count += 1
        else:
            edge = 0
        
        edge_tuple = (citing_index, cited_index, time_difference, cited_indegree, cited_pagerank, cited_decay_indegree, edge)
        if not time_difference == 0:
            edge_tuples.append(edge_tuple)

print "number of vertices: " + str(len(sorted_case_tuples))
print "number of edges: " + str(edge_count)
print "number of possible edges: " + str(len(edge_tuples))
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

number of vertices: 1593
number of edges: 5564
number of possible edges: 1149579
this took 1.52799987793 seconds


# Create and Save Giant Data Table

In [32]:
time1 = time.time()
list_of_column_names = ["citing index","cited index", "time difference", "cited indegree", "cited pagerank", 
                        "cited decay indegree", "edge"]
df = pd.DataFrame(edge_tuples, columns=list_of_column_names)
df.to_csv('1940-1950_scotus_logreg_table.csv')
print df
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

         citing index  cited index  time difference  cited indegree  \
0                 126         1583                1               4   
1                 126         1582                1               2   
2                 126         1581                1               5   
3                 126         1571                1               1   
4                 126         1570                1              12   
5                 126          168                1               0   
6                 126          152                1               9   
7                 126          148                1               2   
8                 126          147                1               5   
9                 126          146                1               1   
10                126          145                1               1   
11                126          143                1               0   
12                126          142                1               1   
13    

# Logistic Regression for Training Set -> Training Set (No Future Prediction, Just Accuracy Measurement) 

In [33]:
def logreg(x_train_list):
    time1 = time.time()
    #set up training data
    y_train = df['edge']
    x_train = df[x_train_list]

    #calculate logistical regression
    clf = skl_lm.LogisticRegression(solver='newton-cg')
    clf.fit(x_train, y_train)

    print 'classes: ',clf.classes_
    print 'coefficients: ',clf.coef_
    print 'intercept :', clf.intercept_
    time2 = time.time()
    print "this took " + str(time2-time1) + " seconds"

    time1 = time.time()
    # Matrix, where column = probability for no edge (0), probability for edge (1)--reference: clf.classes_
    prob = clf.predict_proba(x_train)

    # predicted probabilities for ALL case for edge present (1)
    prob_up = prob[:,1:2]
    # convert to list
    prob_up2 = [i.tolist()[0] for i in prob_up]
    
    print prob_up2[0]
    print prob_up2[1000]
    print prob_up2[-1]
    print prob_up2[500]

    y_predicted = []
    for i in prob_up2:
        if i>0.5:
            y_predicted.append(1)
        else:
            y_predicted.append(0)
    time2 = time.time()
    print "this took " + str(time2-time1) + " seconds"
    
    print y_predicted[0]
    print y_predicted[1000]
    print y_predicted[-1]
    print y_predicted[500]

    time1 = time.time()
    #calculate 0-1 loss from predicted values
    
    #right_prediction = [i for i,j in zip(y_train, y_predicted) if i==j]
    #number_right = len(right_prediction)
    
    #right_prediction = set(y_train).intersection(set(y_predicted))
    #number_right = len(list(right_prediction))
    
    #number_right = len(set(y_train) & set(y_predicted))
    
    right_prediction = y_train - y_predicted
    x = np.array(right_prediction)
    y = np.absolute(x)
    number_wrong = sum(y)
    
    print "predicted wrong:", number_wrong
    print "total number of edges:", len(y_predicted)
    zero_one_loss = number_wrong/len(y_predicted)
    print "L1 (0-1 loss): ", zero_one_loss
    time2 = time.time()
    print "this took " + str(time2-time1) + " seconds"

# indegree

In [34]:
logreg(['cited indegree'])

classes:  [0 1]
coefficients:  [[ 0.08204973]]
intercept : [-5.99903272]
this took 6.867000103 seconds
0.00343316438562
0.00715767776996
0.00372563981016
0.00268607922802
this took 0.867000102997 seconds
0
0
0
0
predicted wrong: 4893
total number of edges: 1149579
L1 (0-1 loss):  0.00425634079954
this took 0.202999830246 seconds


# time difference, indegree

In [36]:
logreg(['time difference', 'cited indegree'])

classes:  [0 1]
coefficients:  [[-0.10724407  0.08517454]]
intercept : [-5.62755891]
this took 8.37400007248 seconds
0.00452273747329
0.00968414414529
0.00188089107708
0.00350648526729
this took 0.705999851227 seconds
0
0
0
0
predicted wrong: 4893
total number of edges: 1149579
L1 (0-1 loss):  0.00425634079954
this took 0.180999994278 seconds


# time difference, indegree, page rank

In [37]:
logreg(['time difference', 'cited indegree', 'cited pagerank'])

classes:  [0 1]
coefficients:  [[-0.1072013   0.08532187 -0.9637516 ]]
intercept : [-5.62763195]
this took 10.9910001755 seconds
0.00452174522569
0.00967690923894
0.00187652840926
0.00350392430246
this took 0.876999855042 seconds
0
0
0
0
predicted wrong: 4893
total number of edges: 1149579
L1 (0-1 loss):  0.00425634079954
this took 0.18799996376 seconds


# indegree, page rank

In [38]:
logreg(['cited indegree', 'cited pagerank'])

classes:  [0 1]
coefficients:  [[ 0.08222015 -1.09717052]]
intercept : [-5.99894378]
this took 8.08700013161 seconds
0.0034327649367
0.00715269966914
0.00371470619598
0.00268418165752
this took 0.830999851227 seconds
0
0
0
0
predicted wrong: 4893
total number of edges: 1149579
L1 (0-1 loss):  0.00425634079954
this took 0.169000148773 seconds


# time difference, time decay indegree, page rank

In [39]:
logreg(['time difference', 'cited decay indegree', 'cited pagerank'])

classes:  [0 1]
coefficients:  [[-0.0928293   0.12355491 -0.51027   ]]
intercept : [-5.74305634]
this took 9.62800002098 seconds
0.00476311607452
0.0112265998783
0.00161621761415
0.00329259611397
this took 0.727999925613 seconds
0
0
0
0
predicted wrong: 4893
total number of edges: 1149579
L1 (0-1 loss):  0.00425634079954
this took 0.171000003815 seconds


# time decay indegree, page rank

In [40]:
logreg(['cited decay indegree', 'cited pagerank'])

classes:  [0 1]
coefficients:  [[ 0.1212854  -0.72496938]]
intercept : [-6.07022932]
this took 7.53399991989 seconds
0.00373720130545
0.00867948607378
0.0029287995357
0.00260014765824
this took 0.8140001297 seconds
0
0
0
0
predicted wrong: 4893
total number of edges: 1149579
L1 (0-1 loss):  0.00425634079954
this took 0.176999807358 seconds


# page rank

In [41]:
logreg(['cited pagerank'])

classes:  [0 1]
coefficients:  [[ 2.91239562]]
intercept : [-5.45748565]
this took 3.69199991226 seconds
0.00425612829057
0.00428001881023
0.00429005211398
0.00425703493814
this took 0.819000005722 seconds
0
0
0
0
predicted wrong: 4893
total number of edges: 1149579
L1 (0-1 loss):  0.00425634079954
this took 0.165999889374 seconds


# time decay indegree

In [42]:
logreg(['cited decay indegree'])
'''
coefficients:  [[ 0.08204973]]
intercept : [-5.99903272]
0.00343316438562
0.00715767776996
0.00372563981016
0.00268607922802
'''

classes:  [0 1]
coefficients:  [[ 0.12114879]]
intercept : [-6.07032492]
this took 6.04399991035 seconds
0.00373699385152
0.008682819182
0.00293523750991
0.00260120434589
this took 0.738000154495 seconds
0
0
0
0
predicted wrong: 4893
total number of edges: 1149579
L1 (0-1 loss):  0.00425634079954
this took 0.251999855042 seconds


'\ncoefficients:  [[ 0.08204973]]\nintercept : [-5.99903272]\n0.00343316438562\n0.00715767776996\n0.00372563981016\n0.00268607922802\n'