In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from collections import *

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

In [2]:
G = load_citation_network_igraph(data_dir, court_name)
print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


# Temporary: Focus on up to 1900 Sub-Graph

In [3]:
#create a sub-network of cases to reduce runtime
sub_vs = G.vs.select(year_lt=1901)
#sub_vs = sub_vs.select(year_gt=1939)
sub_G = G.subgraph(sub_vs)
G = sub_G
print ig.summary(G)

IGRAPH DN-- 10676 27111 -- 
+ attr: court (v), name (v), year (v)
None


# Get each node's information (index, neighbors, year)

In [4]:
time1 = time.time()

#fill a list of cases with required info that will be sorted by year
case_tuples = []
for vertex in G.vs:
    index = vertex.index
    #neighbors is the cases that this case has cited (i.e. vertex is citing, neighbors are cited)
    neighbors = G.neighbors(index, mode='OUT')
    year = vertex["year"]
    case_tuple = (index, neighbors, year)
    case_tuples.append(case_tuple)

#sort the list of cases by year
sorted_case_tuples = sorted(case_tuples, key=lambda x: x[2])

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 0.066999912262 seconds


# Create list of tuples of every potential edge with info (indices, years)

In [5]:
time1 = time.time()
#uses the list of case tuples to fill a list of (all possible) edge tuples with approriate information
#currently does not include cases with the same year (so it is an approximation)
edge_count = 0
edge_tuples = []

#go down the list of cases, then create possible edges by going back up the list
#(since only cases with a lower year can be possibly cited)
for i in range(0,len(sorted_case_tuples)):
    citing_case = sorted_case_tuples[i]
    citing_index = citing_case[0]
    neighbors = citing_case[1]
    citing_year = citing_case[2]
    for j in range(i,0,-1):
        cited_case = sorted_case_tuples[j]
        cited_index = cited_case[0]
        cited_year = cited_case[2]
        
        age = citing_year - cited_year
        #if one of the out-edges of the citing case points to the (possible) cited case, then an edge is there (1)
        if cited_index in neighbors:
            edge = 1
            edge_count += 1
        else:
            edge = 0
        
        edge_tuple = (edge, citing_index, cited_index, citing_year, cited_year)
        if not age == 0:
            edge_tuples.append(edge_tuple)

print "number of vertices: " + str(len(sorted_case_tuples))
print "number of edges: " + str(edge_count)
print "number of possible edges: " + str(len(edge_tuples))
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

number of vertices: 10676
number of edges: 26736
number of possible edges: 56009003
this took 59.9459998608 seconds


# Tidy Data Frame

In [6]:
time1 = time.time()
list_of_column_names = ["edge", "citing_index", "cited_index", "citing_year", "cited_year"]

df = pd.DataFrame(edge_tuples, columns=list_of_column_names)
print df
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"



          edge  citing_index  cited_index  citing_year  cited_year
0            0           467          469         1760        1759
1            0           470          469         1760        1759
2            0           471          470         1763        1760
3            0           471          467         1763        1760
4            0           471          469         1763        1759
5            0           271          471         1764        1763
6            0           271          470         1764        1760
7            0           271          467         1764        1760
8            0           271          469         1764        1759
9            0           472          471         1764        1763
10           0           472          470         1764        1760
11           0           472          467         1764        1760
12           0           472          469         1764        1759
13           0           473          471         1764        

In [None]:
#save tidy df
#df.to_csv('1925_scotus_logreg_table.csv')

# Below is experimental code

In [None]:
def make_random_subset_df(df):
    time1 = time.time()
    df1 = df[df['edge'] == 1]
    df0 = df[df['edge'] == 0]
    frac_val = len(df1)/len(df0)
    df0_random_subset = df0.sample(frac=frac_val, replace=False)
    
    df_subset = df1.append(df0_random_subset, ignore_index=True)
    
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df_subset

In [None]:
def add_cited_indegree(df, G):
    time1 = time.time()
    igraph_indices = df['cited_index']
    indegree = G.indegree()
    new_column = []
    for i in range(0,len(igraph_indices)):
        new_column.append(indegree[igraph_indices[i]])
    df['cited_indegree'] = new_column
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df

In [None]:
def add_cited_pagerank(df, G):
    time1 = time.time()
    igraph_indices = df['cited index']
    pagerank = G.pagerank()
    new_column = []
    for i in range(0,len(igraph_indices)):
        new_column.append(pagerank[igraph_indices[i]])
    df['pagerank'] = new_column
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df

In [None]:
new_df = make_random_subset_df(df)

In [None]:
print new_df

In [None]:
new_df = add_cited_indegree(new_df, G)
print new_df

In [None]:
import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
time1 = time.time()
#set up training data
y_train = new_df['edge']
x_train = new_df[['cited_indegree']]

#calculate logistical regression
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(x_train, y_train)

print 'classes: ',clf.classes_
print 'coefficients: ',clf.coef_
print 'intercept :', clf.intercept_
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

time1 = time.time()
# Matrix, where column = probability for no edge (0), probability for edge (1)--reference: clf.classes_
prob = clf.predict_proba(x_train)

# predicted probabilities for ALL case for edge present (1)
prob_edge = prob[:,1:2]
# convert to list
prob_edge = [i.tolist()[0] for i in prob_edge]


In [None]:
index_prob = zip(new_df['cited_index'],prob_edge)

cited_indexes = []
attachment_probs = []
for cited_tuple in index_prob:
    if cited_tuple[0] not in cited_indexes:
        cited_indexes.append(cited_tuple[0])
        attachment_probs.append(cited_tuple[1])

index_prob_unique = zip(cited_indexes, attachment_probs)

print len(index_prob)
print len(index_prob_unique)

In [None]:
index_indegree = zip(new_df['cited_index'],new_df['cited_indegree'])

cited_indexes = []
indegrees = []
for cited_tuple in index_indegree:
    if cited_tuple[0] not in cited_indexes:
        cited_indexes.append(cited_tuple[0])
        indegrees.append(cited_tuple[1])

index_indegree_unique = zip(cited_indexes, indegrees)

print len(index_indegree)
print len(index_indegree_unique)

In [None]:
rank_diff = 0
sorted_by_prob = sorted(index_prob_unique, key=lambda tup: tup[1], reverse=True)
sorted_by_indegree = sorted(index_indegree_unique, key=lambda tup: tup[1], reverse=True)
for i in range(0,len(sorted_by_prob)):
    rank_diff += abs(sorted_by_prob[i][0]-sorted_by_indegree[i][0])
print rank_diff

In [1]:
a = [1,2,3,4]

In [6]:
a = (1,2,3)

In [7]:
a[0]

1