In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from collections import *

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

In [2]:
G = load_citation_network_igraph(data_dir, court_name)
print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


# --Temporary: Focus on 1940-1950 Sub-Graph--

In [3]:

#create a sub-network of cases to reduce runtime
sub_vs = G.vs.select(year_lt=1951)
sub_vs = sub_vs.select(year_gt=1939)
sub_G = G.subgraph(sub_vs)
G = sub_G
print ig.summary(G)


IGRAPH DN-- 1593 5635 -- 
+ attr: court (v), name (v), year (v)
None


# Thresholded-Indegree

In [4]:
def thresholded_indegree(graph, vertex, threshold=10):
    td_indeg = 0
    vertex_year = vertex["year"]
    neighbors = graph.neighbors(vertex.index, mode='IN')
    for neighbor in neighbors:
        neighbor_year = graph.vs[neighbor]["year"]
        if neighbor_year - vertex_year <= threshold:
            td_indeg += 1
    return td_indeg

# Get each node's information (index, neighbors node cites, indeg, pagerank, etc.)

In [5]:
time1 = time.time()
#get lists of statistics for all cases
indegrees = G.indegree()
pageranks = G.pagerank()

#fill a list of cases with required info that will be sorted by year
case_tuples = []
for vertex in G.vs:
    index = vertex.index
    neighbors = G.neighbors(index, mode='OUT')
    year = vertex["year"]
    indegree = indegrees[index]
    thresholded_indegree_value = thresholded_indegree(G, vertex, 5) # make sure this threshold is <range of years we're evaluating scotus on
    pagerank = pageranks[index]
    case_tuple = (index, neighbors, year, indegree, pagerank, thresholded_indegree_value)
    case_tuples.append(case_tuple)

#sort the list of cases by year
sorted_case_tuples = sorted(case_tuples, key=lambda x: x[2])

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 0.0540001392365 seconds


# Create giant tuple with all the information for data table, later used for logsitic regression

In [6]:
time1 = time.time()
#uses the list of case tuples to fill a list of (all possible) edge tuples with approriate information
#currently does not include cases with the same year (so it is an approximation)
edge_count = 0
edge_tuples = []

#go down the list of cases, then create possible edges by going back up the list
#(since only cases with a lower year can be possibly cited)
for i in range(0,len(sorted_case_tuples)):
    citing_case = sorted_case_tuples[i]
    citing_index = citing_case[0]
    neighbors = citing_case[1]
    citing_year = citing_case[2]
    for j in range(i,0,-1):
        cited_case = sorted_case_tuples[j]
        cited_index = cited_case[0]
        cited_year = cited_case[2]
        
        age = citing_year - cited_year
        cited_indegree = cited_case[3]
        cited_pagerank = cited_case[4]
        cited_thresholded_indegree = cited_case[5]
        #if one of the out-edges of the citing case points to the (possible) cited case, then an edge is there (1)
        if cited_index in neighbors:
            edge = 1
            edge_count += 1
        else:
            edge = 0
        
        #edge_tuple = (edge, citing_index, cited_index, cited_indegree, cited_pagerank, cited_thresholded_indegree,
        #              age, citing_year, cited_year)
        edge_tuple = (edge, citing_index, cited_index)
        if not age == 0:
            edge_tuples.append(edge_tuple)

print "number of vertices: " + str(len(sorted_case_tuples))
print "number of edges: " + str(edge_count)
print "number of possible edges: " + str(len(edge_tuples))
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

number of vertices: 1593
number of edges: 5564
number of possible edges: 1149579
this took 1.33799982071 seconds


# Tidy Data Frame

In [46]:
time1 = time.time()
#list_of_column_names = ["edge", "citing index","cited index", "cited indegree", "cited pagerank", 
#                        "cited thresholded indegree", "age", "citing_year", "cited_year"]
list_of_column_names = ["edge", "citing index", "cited index"]

'''
index_is_edge = []
for i in range(0, len(edge_tuples)):
    #"id_citing"_"id_cited" (12_42)
    label = str(edge_tuples[i][1]) + '_' + str(edge_tuples[i][2])
    index_is_edge.append(label)
'''
    

#df = pd.DataFrame(edge_tuples, columns=list_of_column_names, index=index_is_edge)
df = pd.DataFrame(edge_tuples, columns=list_of_column_names)
#df.to_csv('1940-1950_scotus_logreg_table.csv')
print df
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"



         edge  citing index  cited index
0           0           126         1583
1           0           126         1582
2           0           126         1581
3           0           126         1571
4           0           126         1570
5           0           126          168
6           0           126          152
7           0           126          148
8           0           126          147
9           0           126          146
10          0           126          145
11          0           126          143
12          0           126          142
13          0           126          141
14          0           126          140
15          0           126          139
16          0           126          138
17          0           126          137
18          0           126          136
19          0           126          135
20          0           126          134
21          0           126          133
22          0           126          130
23          0   

In [8]:
len(df)

1149579

In [11]:
count = 0
for index, row in df.iterrows():
    if row['edge']==0:
        count+=1

print count

KeyboardInterrupt: 

In [9]:
df0 = df.query('edge == 0')
df1 = df.query('edge == 1')
print len(df1)
print len(df0)
print len(df1) + len(df0) == len(df)



4893
1144686
True


In [50]:
df1 = df[df['edge'] == 1]
df0 = df[df['edge'] == 0]

print len(df1)
print len(df0)
print len(df1) + len(df0) == len(df)

4893
1144686
True


In [35]:
df0_random = df0.sample(frac=0.1, replace=False)
df1_random = df1.sample(frac=0.1, replace=False)

print len(df0_random)
print len(df1_random)

df_random_final = df1_random.append(df0_random, ignore_index=True)
#print df_random_final
#print df1_random
print df1_random


114469
489
         edge  citing index  cited index
55468       1           404           32
837993      1          1330          452
793616      1          1299         1140
308625      1           842          312
775278      1          1284          865
837014      1          1329           77
1094575     1          1513         1211
608948      1          1143          174
655633      1          1186          972
1082484     1          1505         1361
741786      1          1257          823
147647      1           616          195
21963       1           296           87
304348      1           837          645
144358      1           609          173
179695      1           666          446
407042      1           948          792
464838      1          1008          547
157092      1           630          575
654814      1          1185          707
409592      1           950          178
717744      1          1237           48
533358      1          1073          617
83622

In [30]:
test_indices = df_random_final['cited index']
print test_indices[1089]

481


In [13]:
def add_cited_indegree(dataframe, G):
    time1 = time.time()
    igraph_indices = dataframe['cited index']
    indegree = G.indegree()
    new_column = []
    for i in range(0,len(igraph_indices)):
        new_column.append(indegree[igraph_indices[i]])
        if i==1:
            print new_column
    dataframe['indegree'] = new_column
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return dataframe

def add_cited_pagerank(dataframe, G):
    time1 = time.time()
    igraph_indices = df['cited index']
    pagerank = G.pagerank()
    new_column = []
    for i in range(0,len(igraph_indices)):
        new_column.append(pagerank[igraph_indices[i]])
    df['pagerank'] = new_column
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df

In [39]:
# test
#add_cited_indegree(df, G)
#add_cited_indegree(df_random_final, G)
#df0
#df1
#df0_random
#df1_random
#add_cited_indegree(df1, G)

df_test = df.sample(frac=0.1, replace=False)
df_test

print add_cited_indegree(df, G)

[4, 2]
This took 20.9060001373 seconds
         edge  citing index  cited index  indegree
0           0           126         1583         4
1           0           126         1582         2
2           0           126         1581         5
3           0           126         1571         1
4           0           126         1570        12
5           0           126          168         0
6           0           126          152         9
7           0           126          148         2
8           0           126          147         5
9           0           126          146         1
10          0           126          145         1
11          0           126          143         0
12          0           126          142         1
13          0           126          141         3
14          0           126          140         5
15          0           126          139         3
16          0           126          138        12
17          0           126          137   

In [49]:
add_cited_indegree(df1, G)

KeyError: 0L

In [54]:
#df_random_final
new_df_random_final = add_cited_indegree(df_random_final, G)
new_df_random_final2 = add_cited_pagerank(new_df_random_final, G)

print new_df_random_final2

KeyError: 0L