In [47]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from collections import *

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl
#import seaborn as sns

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
G = load_citation_network_igraph(data_dir, court_name)

print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


In [None]:
#scotus_adjacency_full = G.get_adjacency()

In [59]:
sub_vs = G.vs.select(year_lt=1900)
sub_G = G.subgraph(sub_vs)
print ig.summary(sub_G)

IGRAPH DN-- 10446 25674 -- 
+ attr: court (v), name (v), year (v)
None


In [61]:
time1 = time.time()
scotus_adjacency = sub_G.get_adjacency()
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 13.9709999561 seconds


In [62]:
time1 = time.time()
number_of_edges = 0
edge_tuples = []
for i in range(0,scotus_adjacency.shape[1]):
    for j in range(0,scotus_adjacency.shape[1]):
        citing_index = i
        cited_index = j
        time_diff = sub_G.vs[i]["year"]-sub_G.vs[j]["year"]
        cited_indegree = sub_G.vs[j].indegree()
        edge = scotus_adjacency[i,j]
        edge_tuple = (citing_index, cited_index, time_diff, cited_indegree, edge)
        if edge == 1:
            number_of_edges += 1
        if not i==j and time_diff >= 0:
            edge_tuples.append(edge_tuple)
print "number of edges: " + str(number_of_edges)
print "number of possible edges: " + str(len(edge_tuples))
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

number of edges: 25674
number of possible edges: 55491372
this took 587.805000067 seconds


In [63]:
time1 = time.time()
list_of_column_names = ["citing index","cited index", "time difference", "cited indegree", "edge"]
df = pd.DataFrame(edge_tuples, columns=list_of_column_names)
print df
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

          citing index  cited index  time difference  cited indegree  edge
0                    0            1               35               1     0
1                    0            2               56               3     0
2                    0            9               13               4     0
3                    0           10               13               5     0
4                    0           11               13               2     0
5                    0           12               13               4     0
6                    0           13               64               0     0
7                    0           14               22              12     0
8                    0           15               16               2     0
9                    0           16               16               4     0
10                   0           22               49               1     0
11                   0           26               24               1     0
12                   0   

# Logistic Regression on Training Set = All Years

In [64]:
y_train = df['edge']

# dataset --for training set
x_train = df[['time difference', 'cited indegree']]

clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(x_train, y_train)

print 'classes: ',clf.classes_
print 'coefficients: ',clf.coef_
print 'intercept :', clf.intercept_



classes:  [0 1]
coefficients:  [[-0.02953166  0.10705958]]
intercept : [-7.5718848]


# Acquire Probabilities of Direction == Yes for Training Set = All Years

In [65]:
# Matrix, where column = probability for no edge (0), probability for edge (1)--reference: clf.classes_
prob = clf.predict_proba(x_train)

# predicted probabilities for ALL years for UP direction
prob_up = prob[:,1:2]

# convert to list
prob_up2 = [i.tolist()[0] for i in prob_up]

y_predicted = []
for i in prob_up2:
    if i>0.5:
        y_predicted.append(1)
    else:
        y_predicted.append(0)

# L1: 0-1 Loss

In [66]:
right_prediction = [i for i,j in zip(y_train, y_predicted) if i==j]
number_right = len(right_prediction)
zero_one_loss = number_right/len(y_predicted)
print "L1 (0-1 loss): ", zero_one_loss

L1 (0-1 loss):  0.9995373515


# L2: Cross Entropy Loss

In [None]:
# source: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html
#         https://en.wikipedia.org/wiki/Cross_entropy

# using sci-kit:
y_train2 = []
for i in y_train:
    if i==1:
        y_train2.append(1)
    else:
        y_train2.append(0)

print "L2 (Cross Entropy Loss) by sci-kit: ", log_loss(y_train2, prob_up2)

# manually:
def ln(x):
    return np.log(x)

cross_entropy_losses = [-i*ln(j)-(1-i)*ln(1-j) for i,j in zip(y_train2, prob_up2)]
print "L2 (Cross Entropy Loss) by manually: ", sum(cross_entropy_losses)/len(cross_entropy_losses)

L2 (Cross Entropy Loss) by sci-kit:  0.00375218860171
L2 (Cross Entropy Loss) by manually:  0.00375218860174


# L3: Logistic Loss

In [None]:
# source (with ln(2)): https://en.wikipedia.org/wiki/Loss_functions_for_classification

# source (without ln(2)): https://people.eecs.berkeley.edu/~russell/classes/cs194/f11/lectures/CS194%20Fall%202011%20Lecture%2006.pdf
#                         https://github.com/JohnLangford/vowpal_wabbit/wiki/Loss-functions
#                         http://www.cs.cmu.edu/~yandongl/loss.html

logistic_losses = [(1/ln(2))*(ln(1+np.exp(-i*j))) for i,j in zip(y_train2, prob_up2)]        
print "L3 (Logistic Loss) with ln(2): ", sum(logistic_losses)/len(logistic_losses)