In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from collections import *

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl
#import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

In [2]:
G = load_citation_network_igraph(data_dir, court_name)
print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


# Focus on 1925-1950

In [3]:
#create a sub-network of cases to reduce runtime
sub_vs = G.vs.select(year_lt=1951)
sub_vs = sub_vs.select(year_gt=1924)
sub_G = G.subgraph(sub_vs)
print ig.summary(sub_G)

IGRAPH DN-- 4245 20006 -- 
+ attr: court (v), name (v), year (v)
None


# time decay indegree method

In [4]:
def time_decay_indegree(graph, vertex, threshold=10):
    td_indeg = 0
    vertex_year = vertex["year"]
    neighbors = graph.neighbors(vertex.index, mode='IN')
    for neighbor in neighbors:
        neighbor_year = graph.vs[neighbor]["year"]
        if neighbor_year - vertex_year <= threshold:
            td_indeg += 1
    return td_indeg

# Get each node's information (index, neighbors node cites, indeg, pagerank, etc.)

In [5]:
time1 = time.time()
#get lists of statistics for all cases
indegrees = sub_G.indegree()
pageranks = sub_G.pagerank()

#fill a list of cases with required info that will be sorted by year
case_tuples = []
for vertex in sub_G.vs:
    index = vertex.index
    neighbors = sub_G.neighbors(index, mode='OUT')
    year = vertex["year"]
    indegree = indegrees[index]
    decay_indegree = time_decay_indegree(sub_G, vertex, 10)
    pagerank = pageranks[index]
    case_tuple = (index, neighbors, year, indegree, pagerank, decay_indegree)
    case_tuples.append(case_tuple)

#sort the list of cases by year
sorted_case_tuples = sorted(case_tuples, key=lambda x: x[2])

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 0.0329999923706 seconds


# Create giant tuple with all the information for data table, later used for logsitic regression

In [6]:
time1 = time.time()
#uses the list of case tuples to fill a list of (all possible) edge tuples with approriate information
#currently does not include cases with the same year (so it is an approximation)
edge_count = 0
edge_tuples = []

#go down the list of cases, then create possible edges by going back up the list
#(since only cases with a lower year can be possibly cited)
for i in range(0,len(sorted_case_tuples)):
    citing_case = sorted_case_tuples[i]
    citing_index = citing_case[0]
    neighbors = citing_case[1]
    citing_year = citing_case[2]
    for j in range(i,0,-1):
        cited_case = sorted_case_tuples[j]
        cited_index = cited_case[0]
        cited_year = cited_case[2]
        
        time_difference = citing_year - cited_year
        cited_indegree = cited_case[3]
        cited_pagerank = cited_case[4]
        cited_decay_indegree = cited_case[5]
        #if one of the out-edges of the citing case points to the (possible) cited case, then an edge is there (1)
        if cited_index in neighbors:
            edge = 1
            edge_count += 1
        else:
            edge = 0
        
        edge_tuple = (citing_index, cited_index, time_difference, cited_indegree, cited_pagerank, cited_decay_indegree, edge)
        if not time_difference == 0:
            edge_tuples.append(edge_tuple)

print "number of vertices: " + str(len(sorted_case_tuples))
print "number of edges: " + str(edge_count)
print "number of possible edges: " + str(len(edge_tuples))
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

number of vertices: 4245
number of edges: 19862
number of possible edges: 8648189
this took 12.9449999332 seconds


# Create and Save Giant Data Table

In [7]:
time1 = time.time()
list_of_column_names = ["citing index","cited index", "time difference", "cited indegree", "cited pagerank", 
                        "cited decay indegree", "edge"]
df = pd.DataFrame(edge_tuples, columns=list_of_column_names)
df.to_csv('1925-1950_scotus_logreg_table.csv')
print df
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

         citing index  cited index  time difference  cited indegree  \
0                 219         4215                1               1   
1                 219         4214                1               0   
2                 219         4213                1               2   
3                 219         4197                1              19   
4                 219         4180                1               6   
5                 219         4179                1               1   
6                 219         4178                1              34   
7                 219          240                1               8   
8                 219          218                1               0   
9                 219          217                1               5   
10                219          216                1               6   
11                219          215                1               5   
12                219          214                1               5   
13    

# Logistic Regression for Training Set -> Training Set (No Future Prediction, Just Accuracy Measurement) 

In [28]:
def logreg(x_train_list):
    time1 = time.time()
    #set up training data
    y_train = df['edge']
    x_train = df[x_train_list]

    #calculate logistical regression
    clf = skl_lm.LogisticRegression(solver='newton-cg')
    clf.fit(x_train, y_train)

    print 'classes: ',clf.classes_
    print 'coefficients: ',clf.coef_
    print 'intercept :', clf.intercept_
    time2 = time.time()
    print "this took " + str(time2-time1) + " seconds"

    time1 = time.time()
    # Matrix, where column = probability for no edge (0), probability for edge (1)--reference: clf.classes_
    prob = clf.predict_proba(x_train)

    # predicted probabilities for ALL case for edge present (1)
    prob_up = prob[:,1:2]
    # convert to list
    prob_up2 = [i.tolist()[0] for i in prob_up]
    
    print prob_up2[0]
    print prob_up2[1000]
    print prob_up2[-1]
    print prob_up2[500]

    y_predicted = []
    for i in prob_up2:
        if i>0.5:
            y_predicted.append(1)
        else:
            y_predicted.append(0)
    time2 = time.time()
    print "this took " + str(time2-time1) + " seconds"
    
    print y_predicted[0]
    print y_predicted[1000]
    print y_predicted[-1]
    print y_predicted[500]

    time1 = time.time()
    #calculate 0-1 loss from predicted values
    
    right_prediction = [i for i,j in zip(y_train, y_predicted) if i==j]
    number_right = len(right_prediction)
    
    #right_prediction = set(y_train).intersection(set(y_predicted))
    #number_right = len(list(right_prediction))
    
    #number_right = len(set(y_train) & set(y_predicted))
    
    print "predicted right:", number_right
    print "total number of edges:", len(y_predicted)
    zero_one_loss = number_right/len(y_predicted)
    print "L1 (0-1 loss): ", 1-zero_one_loss
    time2 = time.time()
    print "this took " + str(time2-time1) + " seconds"

# indegree

In [29]:
logreg(['cited indegree'])

classes:  [0 1]
coefficients:  [[ 0.08114617]]
intercept : [-6.77684979]
this took 61.6100001335 seconds
0.00123468541422
0.00157446331471
0.00113856224892
0.00185137648818
this took 6.24799990654 seconds
0
0
0
0
predicted right: 8629604
total number of edges: 8648189
L1 (0-1 loss):  0.00214900483789
this took 4.01700019836 seconds


# time difference, indegree

In [15]:
logreg(['time difference', 'cited indegree'])

classes:  [0 1]
coefficients:  [[-0.08410068  0.08061678]]
intercept : [-6.1307243]
this took 84.0429999828 seconds
0.00216275309746
0.00275286112175
0.00026560326156
0.00323294284994
this took 5.95799994469 seconds
0
0
0
0
predicted right: 8629604
total number of edges: 8648189
L1 (0-1 loss):  0.00214900483789
this took 4.07799983025 seconds


# time difference, indegree, page rank

In [30]:
logreg(['time difference', 'cited indegree', 'cited pagerank'])

classes:  [0 1]
coefficients:  [[-0.08408354  0.0806531  -1.08291797]]
intercept : [-6.13070418]
this took 90.9830000401 seconds
0.00216158387036
0.00275161087229
0.000265697847989
0.00323145855961
this took 6.13800001144 seconds
0
0
0
0
predicted right: 8629604
total number of edges: 8648189
L1 (0-1 loss):  0.00214900483789
this took 4.16899991035 seconds


# indegree, page rank

In [27]:
logreg(['cited indegree', 'cited pagerank'])

classes:  [0 1]
coefficients:  [[ 0.08121607 -1.90701937]]
intercept : [-6.77660668]
this took 79.9149999619 seconds
this took 5.66700005531 seconds
L1 (0-1 loss):  0.00214900483789
this took 3.44899988174 seconds


# time difference, time decay indegree, page rank

In [14]:
logreg(['time difference', 'cited decay indegree', 'cited pagerank'])

classes:  [0 1]
coefficients:  [[-0.07570508  0.09178308  0.21549962]]
intercept : [-6.107215]
this took 142.355999947 seconds
this took 7.01900005341 seconds
L1 (0-1 loss):  0.00214900483789
this took 4.43799996376 seconds


# time decay indegree, page rank

In [23]:
logreg(['cited decay indegree', 'cited pagerank'])

classes:  [0 1]
coefficients:  [[ 0.0948969  -0.52392089]]
intercept : [-6.70127577]
this took 70.4949998856 seconds
this took 6.68099999428 seconds
L1 (0-1 loss):  0.00214900483789
this took 3.70000004768 seconds


# page rank

In [24]:
logreg(['cited pagerank'])

classes:  [0 1]
coefficients:  [[ 3.06984896]]
intercept : [-6.14153267]
this took 27.1399998665 seconds
this took 5.98100018501 seconds
L1 (0-1 loss):  0.00214900483789
this took 3.56799983978 seconds


# time decay indegree

In [26]:
logreg(['cited decay indegree'])

classes:  [0 1]
coefficients:  [[ 0.09488172]]
intercept : [-6.70139034]
this took 58.0340001583 seconds
this took 5.17499995232 seconds
L1 (0-1 loss):  0.00214900483789
this took 3.96300005913 seconds
