In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

import random as random

from collections import *

from load_data import load_citation_network_igraph, case_info

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors


%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

In [2]:
G = load_citation_network_igraph(data_dir, court_name)
print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


# SCOTUS Subgraphs by decade

In [3]:
# start from 1800

sub_G_dict = {}
scotus_years = range(1800, 2021) # scotus years actually from 1754-2016
scotus_decades = [year for year in scotus_years if year % 10 == 0] # 1800, ... , 2000, 2010, 2020

dict_decades_subgraphs = OrderedDict() # key: decade, value: subgraph with vertices less than that decade year
for i in scotus_decades:
    sub_vs = G.vs.select(year_lt=i)
    sub_G = G.subgraph(sub_vs) # IMPORTANT NOTE: THESE NEW SUBGRAPHS RE-INDEX (DIFFERENT INDICES THAN G_2)
    dict_decades_subgraphs[i] = sub_G
    print i, ": ", len(sub_G.vs), "vertices and ", len(sub_G.es), " edges"

1800 :  252 vertices and  17  edges
1810 :  478 vertices and  48  edges
1820 :  803 vertices and  120  edges
1830 :  1148 vertices and  156  edges
1840 :  1584 vertices and  294  edges
1850 :  1956 vertices and  474  edges
1860 :  2680 vertices and  831  edges
1870 :  3475 vertices and  884  edges
1880 :  5319 vertices and  2271  edges
1890 :  7902 vertices and  10255  edges
1900 :  10446 vertices and  25674  edges
1910 :  12463 vertices and  37864  edges
1920 :  14880 vertices and  52274  edges
1930 :  16887 vertices and  67361  edges
1940 :  18585 vertices and  86576  edges
1950 :  20079 vertices and  106644  edges
1960 :  21329 vertices and  118369  edges
1970 :  23642 vertices and  136684  edges
1980 :  25734 vertices and  166573  edges
1990 :  27848 vertices and  199818  edges
2000 :  29206 vertices and  221713  edges
2010 :  32505 vertices and  238179  edges
2020 :  33248 vertices and  250465  edges


# Create list of tuples of every present edge (1) with info (indices, years)

In [4]:
time1 = time.time()
edge_tuple_dict = {}
#for each subgraph makes a list of present edges (1)
for key in dict_decades_subgraphs:
    decade_sub_G = dict_decades_subgraphs[key]
    edge_tuple = decade_sub_G.get_edgelist() #returns a list of edges of the form (citing_index, cited_index)
    for i in range(0,len(edge_tuple)):
        #adds two items to each tuple which corresponds to the citing year and cited year
        #each list item is now in the form (1, citing_index, cited_index, citing_year, cited_year)
        edge_tuple[i] = (1, decade_sub_G.vs(edge_tuple[i][0])['name'][0]) + edge_tuple[i] + (decade_sub_G.vs(edge_tuple[i][0])['year'][0], decade_sub_G.vs(edge_tuple[i][1])['year'][0])
    edge_tuple_dict[key] = edge_tuple
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 397.924999952 seconds


# Tidy Data Frame (N1)

In [5]:
time1 = time.time()
df_dict = {}
list_of_column_names = ["edge", "citing_name" , "citing_index", "cited_index", "citing_year", "cited_year"]
#for each list of edges converts it into a df
for key in edge_tuple_dict:
    edge_tuples = edge_tuple_dict[key]
    df = pd.DataFrame(edge_tuples, columns=list_of_column_names)
    df_dict[key] = df 
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 1.73600006104 seconds


In [None]:
#save tidy df
#df.to_csv('1925_scotus_logreg_table.csv')

In [6]:
print len(G.get_edgelist())
print len(set(G.get_edgelist()))

250465
250465


# Create list of tuples of not present edges (0) with info (indices, years)

# old code, where we had set(vertices) in the while loop -- 0.45 seconds

In [70]:
time1 = time.time()
edge_tuple_dict_0 = {}

# for each subgraph make a list of non-present edges
decade_sub_G = dict_decades_subgraphs[1850]

vertices = decade_sub_G.vs

one_edges = set(decade_sub_G.get_edgelist())
zero_edges = set([])
time3 = time.time()

while len(zero_edges) < len(one_edges):
    new_edge_list = random.sample(set(vertices), 2)
    
    new_edge = (new_edge_list[0],new_edge_list[1])
    final_edge = (0, decade_sub_G.vs(new_edge[0].index)['name'][0], new_edge[0].index, new_edge[1].index, decade_sub_G.vs(new_edge[0].index)['year'][0], decade_sub_G.vs(new_edge[1].index)['year'][0])
    # todo: explain this long if statement (last conditional makes assumption that we will not use edge in same year)
    if new_edge not in one_edges and final_edge not in zero_edges and final_edge[4]>final_edge[5]:
        zero_edges.add(final_edge)
time4 = time.time()
print 1850, ":", len(zero_edges), "NON-present edges", "--took", time4-time3, "seconds--"
edge_tuple_dict_0[1850] = zero_edges

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

1850 : 474 NON-present edges --took 0.458000183105 seconds--
this took 0.460000038147 seconds


# np.random.choice(vertices)--5.8 seconds vs. 
# random.sample(vertices_set)--0.06 seconds

In [93]:
time1 = time.time()
edge_tuple_dict_0 = {}

# for each subgraph make a list of non-present edges
decade_sub_G = dict_decades_subgraphs[1850]

vertices = decade_sub_G.vs
vertices_set = set(vertices)

one_edges = set(decade_sub_G.get_edgelist())
zero_edges = set([])
time3 = time.time()

while len(zero_edges) < len(one_edges):
    #temp = np.random.choice(vertices, size=2, replace=False)
    temp = random.sample(vertices_set, 2)
    
    new_edge = (temp[0], temp[1])
    final_edge = (0, decade_sub_G.vs(new_edge[0].index)['name'][0], new_edge[0].index, new_edge[1].index, decade_sub_G.vs(new_edge[0].index)['year'][0], decade_sub_G.vs(new_edge[1].index)['year'][0])
    # todo: explain this long if statement (last conditional makes assumption that we will not use edge in same year)
    if new_edge not in one_edges and final_edge not in zero_edges and final_edge[4]>final_edge[5]:
        zero_edges.add(final_edge)
time4 = time.time()
print 1850, ":", len(zero_edges), "NON-present edges", "--took", time4-time3, "seconds--"
edge_tuple_dict_0[1850] = zero_edges

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

1850 : 474 NON-present edges --took 5.58799982071 seconds--
this took 5.59100008011 seconds


# np.random.choice(indices)-- 0.28 vs.
# random.sample(indices_set)--0.08

In [92]:
time1 = time.time()
edge_tuple_dict_0 = {}

# for each subgraph make a list of non-present edges
decade_sub_G = dict_decades_subgraphs[1850]

vertices = decade_sub_G.vs
indices = range(0, len(vertices))
indices_set = set(indices)

one_edges = set(decade_sub_G.get_edgelist())
zero_edges = set([])
time3 = time.time()

while len(zero_edges) < len(one_edges):
    temp = np.random.choice(indices, size=2, replace=False)
    #temp = random.sample(indices_set, 2)
    
    new_edge = (temp[0], temp[1])
    final_edge = (0, decade_sub_G.vs(new_edge[0])['name'][0], new_edge[0], new_edge[1], decade_sub_G.vs(new_edge[0])['year'][0], decade_sub_G.vs(new_edge[1])['year'][0])
    # todo: explain this long if statement (last conditional makes assumption that we will not use edge in same year)
    if new_edge not in one_edges and final_edge not in zero_edges and final_edge[4]>final_edge[5]:
        zero_edges.add(final_edge)
time4 = time.time()
print 1850, ":", len(zero_edges), "NON-present edges", "--took", time4-time3, "seconds--"
edge_tuple_dict_0[1850] = zero_edges

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

1850 : 474 NON-present edges --took 0.27999997139 seconds--
this took 0.281999826431 seconds


# Tidy Data Frame (N0)

In [7]:
time1 = time.time()
df_dict_0 = {}
list_of_column_names = ["edge", "citing_name", "citing_index", "cited_index", "citing_year", "cited_year"]
#for each list of edges converts it into a df
for key in edge_tuple_dict_0:
    edge_tuples = edge_tuple_dict_0[key]
    df = pd.DataFrame(edge_tuples, columns=list_of_column_names)
    df_dict_0[key] = df 
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 0.125999927521 seconds


# combine the dataframes in the dictionaries containing dataframes for N0 and N1

In [10]:
df_dict_final = {}
for key in df_dict:
    df_dict_final[key] = df_dict[key].append(df_dict_0[key], ignore_index=True)

# add ed in-degree, ed PageRank etc

In [12]:
def add_cited_indegree(dataframe, G):
    igraph_indices = dataframe['cited_index']
    indegree = G.indegree()
    new_column = []
    for i in igraph_indices:
        new_column.append(indegree[i])
    dataframe['indegree'] = new_column
    return dataframe

In [13]:
def add_cited_pagerank(dataframe, G):
    igraph_indices = dataframe['cited_index']
    pagerank = G.pagerank()
    new_column = []
    for i in igraph_indices:
        new_column.append(pagerank[i])
    dataframe['pagerank'] = new_column
    return dataframe

In [14]:
for i in scotus_decades:
    temp_df = add_cited_indegree(df_dict_final[i], dict_decades_subgraphs[i])
    df_dict_final[i] = add_cited_pagerank(temp_df, dict_decades_subgraphs[i])

# run logistic regression and attachment probabilities to DF

In [16]:
def logreg(dataframe, x_train_list):
    #set up training data
    y_train = dataframe['edge']
    x_train = dataframe[x_train_list]

    #calculate logistical regression
    clf = skl_lm.LogisticRegression(solver='newton-cg')
    clf.fit(x_train, y_train)

    
    # Matrix, where column = probability for no edge (0), probability for edge (1)--reference: clf.classes_
    prob = clf.predict_proba(x_train)

    # predicted probabilities for ALL case for edge present (1)
    prob_up = prob[:,1:2]
    # convert to list
    prob_up2 = [i.tolist()[0] for i in prob_up]
    
    return prob_up2

In [17]:
for i in scotus_decades:
    df_dict_final[i]['indegree_attachment_p'] = logreg(df_dict_final[i], ['indegree'])
    df_dict_final[i]['pagerank_attachment_p'] = logreg(df_dict_final[i], ['pagerank'])

In [18]:
df_dict_final[1900]

      edge  citing_name  citing_index  cited_index  citing_year  cited_year  \
0        1      1087705             2         2364         1899        1894   
1        1      1087706             3         2480         1899        1895   
2        1      1087839             7         1609         1883        1883   
3        1      1087846             9         1367         1894        1879   
4        1      1100582            10           12         1896        1896   
5        1      1100676            11         1408         1896        1879   
6        1      1100694            12           10         1896        1896   
7        1      1100810            13          522         1896        1845   
8        1      1100810            13         1642         1896        1883   
9        1      1206937            17         2276         1895        1893   
10       1      1206937            17         2369         1895        1894   
11       1      1206937            17         2437  

# Score-Rank

In [59]:
for i in scotus_decades:
    df_dict_final[i] = df_dict_final[i].sort_values('indegree', ascending=False, kind='mergesort').reset_index(drop=True)

df_dict_final[2020]
# pagerank attachment probability ranges between 0.49 and 0.51 (weird)
# indegree has attachment probability 1 for indeg=70 (repeating ones because they're all the same cited index)
# make the cited vertices unique (so there's only one row that matches each indegree/PR/indeg P/PR P)

       edge  citing_name  citing_index  cited_index  citing_year  cited_year  \
0         1       107318          1704         1694         1966        1966   
1         1       107342          1714         1694         1966        1966   
2         1       107650          1778         1694         1968        1966   
3         1       107900          1843         1694         1969        1966   
4         1       107913          1848         1694         1969        1966   
5         1       107949          1856         1694         1969        1966   
6         1       107952          1859         1694         1969        1966   
7         1       108139          1900         1694         1970        1966   
8         1       108231          1920         1694         1971        1966   
9         1       108302          1939         1694         1971        1966   
10        1       108419          1964         1694         1971        1966   
11        1       108515          1986  

In [62]:
vertex_name = G_2.vs(0)['name'][0]
vertex_year = G_2.vs(0)['year'][0]
print vertex_year
decade = vertex_year + (10-vertex_year%10)
print decade
decade_sub_G = dict_decades_subgraphs[decade]
decade_sub_df = df_dict_final[decade]

row_index = decade_sub_df[decade_sub_df['citing_name'] == vertex_name].index.tolist()
if decade >= 1800 and len(row_index) != 0:
    case_index = decade_sub_df.iloc[row_index[0]]['citing_index']
    neighbors = decade_sub_G.neighbors(case_index, mode='OUT')
    
    
case_index = decade_sub_df.iloc[row_index[0]]['citing_index']
#neighbors = decade_sub_G.neighbors(case_index, mode='OUT')
case_index

1922
1930


IndexError: list index out of range

In [63]:
print G_2.neighbors(0, mode='OUT')

[]


# Below is experimental code

In [None]:
def make_random_subset_df(df):
    time1 = time.time()
    df1 = df[df['edge'] == 1]
    df0 = df[df['edge'] == 0]
    frac_val = len(df1)/len(df0)
    df0_random_subset = df0.sample(frac=frac_val, replace=False)
    
    df_subset = df1.append(df0_random_subset, ignore_index=True)
    
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df_subset

In [None]:
def add_cited_indegree(df, G):
    time1 = time.time()
    igraph_indices = df['cited_index']
    indegree = G.indegree()
    new_column = []
    for i in range(0,len(igraph_indices)):
        new_column.append(indegree[igraph_indices[i]])
    df['cited_indegree'] = new_column
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df

In [None]:
def add_cited_pagerank(df, G):
    time1 = time.time()
    igraph_indices = df['cited index']
    pagerank = G.pagerank()
    new_column = []
    for i in range(0,len(igraph_indices)):
        new_column.append(pagerank[igraph_indices[i]])
    df['pagerank'] = new_column
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df

In [None]:
new_df = make_random_subset_df(df)

In [None]:
print new_df

In [None]:
new_df = add_cited_indegree(new_df, G)
print new_df

In [None]:
import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
time1 = time.time()
#set up training data
y_train = new_df['edge']
x_train = new_df[['cited_indegree']]

#calculate logistical regression
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(x_train, y_train)

print 'classes: ',clf.classes_
print 'coefficients: ',clf.coef_
print 'intercept :', clf.intercept_
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

time1 = time.time()
# Matrix, where column = probability for no edge (0), probability for edge (1)--reference: clf.classes_
prob = clf.predict_proba(x_train)

# predicted probabilities for ALL case for edge present (1)
prob_edge = prob[:,1:2]
# convert to list
prob_edge = [i.tolist()[0] for i in prob_edge]


In [None]:
index_prob = zip(new_df['cited_index'],prob_edge)

cited_indexes = []
attachment_probs = []
for cited_tuple in index_prob:
    if cited_tuple[0] not in cited_indexes:
        cited_indexes.append(cited_tuple[0])
        attachment_probs.append(cited_tuple[1])

index_prob_unique = zip(cited_indexes, attachment_probs)

print len(index_prob)
print len(index_prob_unique)

In [None]:
index_indegree = zip(new_df['cited_index'],new_df['cited_indegree'])

cited_indexes = []
indegrees = []
for cited_tuple in index_indegree:
    if cited_tuple[0] not in cited_indexes:
        cited_indexes.append(cited_tuple[0])
        indegrees.append(cited_tuple[1])

index_indegree_unique = zip(cited_indexes, indegrees)

print len(index_indegree)
print len(index_indegree_unique)

In [None]:
rank_diff = 0
sorted_by_prob = sorted(index_prob_unique, key=lambda tup: tup[1], reverse=True)
sorted_by_indegree = sorted(index_indegree_unique, key=lambda tup: tup[1], reverse=True)
for i in range(0,len(sorted_by_prob)):
    rank_diff += abs(sorted_by_prob[i][0]-sorted_by_indegree[i][0])
print rank_diff

In [None]:
a = [1,2,3,4]

In [None]:
a = (1,2,3)

In [None]:
a[0]