In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

import random as random

from collections import *

from load_data import load_citation_network_igraph, case_info

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors


%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

In [2]:
G = load_citation_network_igraph(data_dir, court_name)
print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


# SCOTUS Subgraphs by decade -- randomized subset of vertices

In [3]:
# start from 1800
# get randomized subgraph of entire scotus with 1/4 of vertices

G_vs_index_list = []
for j in G.vs:
    G_vs_index_list.append(j.index)
G_vs_subset_index_list = random.sample(set(G_vs_index_list), int(round(len(G_vs_index_list)/4)))

G_2 = G.subgraph(G_vs_subset_index_list) # get randomized subgraph of entire scotus with 1/4 of vertices

sub_G_dict = {}
scotus_years = range(1800, 2021) # scotus years actually from 1754-2016
scotus_decades = [year for year in scotus_years if year % 10 == 0] # 1800, ... , 2000, 2010, 2020

dict_decades_subgraphs = OrderedDict() # key: decade, value: subgraph with vertices less than that decade year
for i in scotus_decades:
    sub_vs = G_2.vs.select(year_lt=i)
    sub_G = G_2.subgraph(sub_vs)
    dict_decades_subgraphs[i] = sub_G
    print i, ": ", len(sub_G.vs), "vertices and ", len(sub_G.es), " edges"

1800 :  67 vertices and  4  edges
1810 :  124 vertices and  5  edges
1820 :  204 vertices and  7  edges
1830 :  284 vertices and  8  edges
1840 :  392 vertices and  24  edges
1850 :  478 vertices and  30  edges
1860 :  661 vertices and  51  edges
1870 :  890 vertices and  55  edges
1880 :  1356 vertices and  126  edges
1890 :  1974 vertices and  592  edges
1900 :  2608 vertices and  1586  edges
1910 :  3123 vertices and  2378  edges
1920 :  3704 vertices and  3285  edges
1930 :  4234 vertices and  4305  edges
1940 :  4705 vertices and  5667  edges
1950 :  5060 vertices and  6841  edges
1960 :  5361 vertices and  7483  edges
1970 :  5908 vertices and  8618  edges
1980 :  6450 vertices and  10545  edges
1990 :  6983 vertices and  12794  edges
2000 :  7357 vertices and  14419  edges
2010 :  8127 vertices and  15449  edges
2020 :  8312 vertices and  16235  edges


# Create list of tuples of every present edge (1) with info (indices, years)

In [4]:
time1 = time.time()
edge_tuple_dict = {}
#for each subgraph makes a list of present edges (1)
for key in dict_decades_subgraphs:
    decade_sub_G = dict_decades_subgraphs[key]
    edge_tuple = decade_sub_G.get_edgelist() #returns a list of edges of the form (citing_index, cited_index)
    for i in range(0,len(edge_tuple)):
        #adds two items to each tuple which corresponds to the citing year and cited year
        #each list item is now in the form (1, citing_index, cited_index, citing_year, cited_year)
        edge_tuple[i] = (1,decade_sub_G.vs(edge_tuple[i][0])['name'][0]) + edge_tuple[i] + (decade_sub_G.vs(edge_tuple[i][0])['year'][0],decade_sub_G.vs(edge_tuple[i][1])['year'][0])
    edge_tuple_dict[key] = edge_tuple
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 6.82100009918 seconds


# Tidy Data Frame (N1)

In [5]:
time1 = time.time()
df_dict = {}
list_of_column_names = ["edge", "citing_name","citing_index", "cited_index", "citing_year", "cited_year"]
#for each list of edges converts it into a df
for key in edge_tuple_dict:
    edge_tuples = edge_tuple_dict[key]
    df = pd.DataFrame(edge_tuples, columns=list_of_column_names)
    df_dict[key] = df 
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 0.128000020981 seconds


In [None]:
#save tidy df
#df.to_csv('1925_scotus_logreg_table.csv')

# Create list of tuples of not present edges (0) with info (indices, years)

In [22]:
time1 = time.time()
edge_tuple_dict_0 = {}

# for each subgraph make a list of non-present edges
for key in dict_decades_subgraphs:
    decade_sub_G = dict_decades_subgraphs[key]
    vertices = decade_sub_G.vs
    one_edges = decade_sub_G.get_edgelist()
    zero_edges = []
    time3 = time.time()
    while len(zero_edges) < len(one_edges):
        temp = random.sample(set(vertices), 2)
        new_edge = (temp[0].index, temp[1].index)
        final_new_edge = (0,decade_sub_G.vs(new_edge[0])['name'][0]) + new_edge + (decade_sub_G.vs(new_edge[0])['year'][0], decade_sub_G.vs(new_edge[1])['year'][0])
        # todo: explain this long if statement (last conditional makes assumption that we will not use edge in same year)
        if not new_edge[0]==new_edge[1] and new_edge not in one_edges and final_new_edge not in zero_edges and final_new_edge[4]>final_new_edge[5]:
            zero_edges.append(final_new_edge)
    time4 = time.time()
    print key, ":", len(zero_edges), "NON-present edges", "--took", time4-time3, "seconds--"
    edge_tuple_dict_0[key] = zero_edges

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

1800 : 4 NON-present edges --took 0.000999927520752 seconds--
1810 : 5 NON-present edges --took 0.00100016593933 seconds--
1820 : 7 NON-present edges --took 0.0 seconds--
1830 : 8 NON-present edges --took 0.000999927520752 seconds--
1840 : 24 NON-present edges --took 0.00500011444092 seconds--
1850 : 30 NON-present edges --took 0.00999999046326 seconds--
1860 : 51 NON-present edges --took 0.0160000324249 seconds--
1870 : 55 NON-present edges --took 0.029000043869 seconds--
1880 : 126 NON-present edges --took 0.0659999847412 seconds--
1890 : 592 NON-present edges --took 0.570999860764 seconds--
1900 : 1586 NON-present edges --took 2.56700015068 seconds--
1910 : 2378 NON-present edges --took 3.55300021172 seconds--
1920 : 3285 NON-present edges --took 6.99900007248 seconds--
1930 : 4305 NON-present edges --took 9.1210000515 seconds--
1940 : 5667 NON-present edges --took 15.007999897 seconds--
1950 : 6841 NON-present edges --took 22.628000021 seconds--
1960 : 7483 NON-present edges --took

# Tidy Data Frame (N0)

In [23]:
time1 = time.time()
df_dict_0 = {}
list_of_column_names = ["edge", "citing_name","citing_index", "cited_index", "citing_year", "cited_year"]
#for each list of edges converts it into a df
for key in edge_tuple_dict_0:
    edge_tuples = edge_tuple_dict_0[key]
    df = pd.DataFrame(edge_tuples, columns=list_of_column_names)
    df_dict_0[key] = df 
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 0.131999969482 seconds


# combine the dataframes in the dictionaries containing dataframes for N0 and N1

In [24]:
df_dict_final = {}
for key in df_dict:
    df_dict_final[key] = df_dict[key].append(df_dict_0[key], ignore_index=True)
    
df_dict_final[1850]

    edge  citing_name  citing_index  cited_index  citing_year  cited_year
0      1      1303460             4           71         1794        1794
1      1      2527082            35          194         1817        1817
2      1      2620861            49           87         1794        1794
3      1        84671            71            4         1794        1794
4      1        84715            87           49         1794        1794
5      1        84845           119          120         1807        1807
6      1        85186           194           35         1817        1817
7      1        85565           269          265         1828        1827
8      1        85797           323          234         1832        1823
9      1        85820           327          248         1833        1824
10     1        85828           329          250         1833        1824
11     1        85853           333          313         1834        1832
12     1        85889           340   

# add ed in-degree, ed PageRank etc

In [25]:
def add_cited_indegree(dataframe, G):
    igraph_indices = dataframe['cited_index']
    indegree = G.indegree()
    new_column = []
    for i in igraph_indices:
        new_column.append(indegree[i])
    dataframe['cited_indegree'] = new_column
    return dataframe

In [26]:
def add_cited_pagerank(dataframe, G):
    igraph_indices = dataframe['cited_index']
    pagerank = G.pagerank()
    new_column = []
    for i in igraph_indices:
        new_column.append(pagerank[i])
    dataframe['cited_pagerank'] = new_column
    return dataframe

In [27]:
for i in scotus_decades:
    temp_df = add_cited_indegree(df_dict_final[i], dict_decades_subgraphs[i])
    df_dict_final[i] = add_cited_pagerank(temp_df, dict_decades_subgraphs[i])

In [28]:
df_dict_final[1900]

      edge  citing_name  citing_index  cited_index  citing_year  cited_year  \
0        1      1087706             2          461         1899        1842   
1        1      1087706             2         2401         1899        1895   
2        1      1087731             3         1330         1880        1878   
3        1      1087734             4          605         1880        1853   
4        1      1087734             4          814         1880        1865   
5        1      1087745             5         1398         1897        1880   
6        1      1087745             5         2287         1897        1894   
7        1      1087768             6         1079         1898        1873   
8        1      1087768             6         1105         1898        1873   
9        1      1087768             6         1604         1898        1884   
10       1      1087768             6         1890         1898        1888   
11       1      1087768             6         2019  

# run logistic regression

In [29]:
def logreg(dataframe, x_train_list):
    #set up training data
    y_train = dataframe['edge']
    x_train = dataframe[x_train_list]

    #calculate logistical regression
    clf = skl_lm.LogisticRegression(solver='newton-cg')
    clf.fit(x_train, y_train)

    
    # Matrix, where column = probability for no edge (0), probability for edge (1)--reference: clf.classes_
    prob = clf.predict_proba(x_train)

    # predicted probabilities for ALL case for edge present (1)
    prob_up = prob[:,1:2]
    # convert to list
    prob_up2 = [i.tolist()[0] for i in prob_up]
    
    return prob_up2

In [30]:
for i in scotus_decades:
    df_dict_final[i]['indegree_attachment_p'] = logreg(df_dict_final[i], ['indegree'])
    df_dict_final[i]['pagerank_attachment_p'] = logreg(df_dict_final[i], ['pagerank'])

In [31]:
df_dict_final[1900]

      edge  citing_name  citing_index  cited_index  citing_year  cited_year  \
0        1      1087706             2          461         1899        1842   
1        1      1087706             2         2401         1899        1895   
2        1      1087731             3         1330         1880        1878   
3        1      1087734             4          605         1880        1853   
4        1      1087734             4          814         1880        1865   
5        1      1087745             5         1398         1897        1880   
6        1      1087745             5         2287         1897        1894   
7        1      1087768             6         1079         1898        1873   
8        1      1087768             6         1105         1898        1873   
9        1      1087768             6         1604         1898        1884   
10       1      1087768             6         1890         1898        1888   
11       1      1087768             6         2019  

# Start of Ranking Code By Indegree

In [79]:
for i in scotus_decades:
    df_dict_final[i] = df_dict_final[i].sort_values('indegree', ascending=False, kind='mergesort').reset_index(drop=True)

In [80]:
df_dict_final[2020]

       edge  citing_name  citing_index  cited_index  citing_year  cited_year  \
0         1      1087951          2165         2789         1994        1984   
1         1       111557          2880         2789         1986        1984   
2         1       111789          2934         2789         1987        1984   
3         1       111790          2935         2789         1987        1984   
4         1       111838          2945         2789         1987        1984   
5         1       111943          2977         2789         1987        1984   
6         1       112043          3005         2789         1988        1984   
7         1       112272          3061         2789         1989        1984   
8         1       112359          3084         2789         1990        1984   
9         1       112379          3088         2789         1990        1984   
10        1       112405          3094         2789         1990        1984   
11        1       112466          3109  

In [76]:
case_index_array = []
for v in G_2.vs:
    i = v.index
    vertex_name = G_2.vs(i)['name'][0]
    vertex_year = G_2.vs(i)['year'][0]
    decade = vertex_year + (10-vertex_year%10)
    if decade >= 1800:
        decade_sub_G = dict_decades_subgraphs[decade]
        decade_sub_df = df_dict_final[decade]

        row_index = decade_sub_df[decade_sub_df['citing_name'] == vertex_name].index.tolist()
        if not row_index == []:
            row_index = row_index[0]
        case_index = decade_sub_df.iloc[row_index]['citing_index']
        #neighbors = decade_sub_G.neighbors(case_index, mode='OUT')
        case_index_array.append(case_index)
case_index_array

[0.0,
 1.0,
 Series([], Name: citing_index, dtype: int64),
 3.0,
 4.0,
 Series([], Name: citing_index, dtype: int64),
 6.0,
 7.0,
 8.0,
 9.0,
 10.0,
 11.0,
 12.0,
 Series([], Name: citing_index, dtype: int64),
 14.0,
 15.0,
 16.0,
 17.0,
 18.0,
 19.0,
 20.0,
 21.0,
 22.0,
 23.0,
 24.0,
 25.0,
 26.0,
 27.0,
 28.0,
 29.0,
 30.0,
 31.0,
 32.0,
 Series([], Name: citing_index, dtype: int64),
 34.0,
 35.0,
 36.0,
 37.0,
 38.0,
 39.0,
 40.0,
 41.0,
 42.0,
 43.0,
 44.0,
 45.0,
 46.0,
 47.0,
 48.0,
 49.0,
 50.0,
 51.0,
 52.0,
 53.0,
 54.0,
 Series([], Name: citing_index, dtype: int64),
 56.0,
 57.0,
 58.0,
 59.0,
 60.0,
 61.0,
 62.0,
 63.0,
 64.0,
 65.0,
 66.0,
 67.0,
 68.0,
 69.0,
 70.0,
 71.0,
 72.0,
 73.0,
 74.0,
 75.0,
 76.0,
 77.0,
 78.0,
 79.0,
 80.0,
 81.0,
 82.0,
 Series([], Name: citing_index, dtype: int64),
 84.0,
 85.0,
 86.0,
 87.0,
 88.0,
 89.0,
 90.0,
 91.0,
 92.0,
 93.0,
 94.0,
 95.0,
 96.0,
 97.0,
 98.0,
 99.0,
 100.0,
 101.0,
 102.0,
 103.0,
 104.0,
 105.0,
 106.0,
 107.0,
 108

# Below is experimental code

In [None]:
def make_random_subset_df(df):
    time1 = time.time()
    df1 = df[df['edge'] == 1]
    df0 = df[df['edge'] == 0]
    frac_val = len(df1)/len(df0)
    df0_random_subset = df0.sample(frac=frac_val, replace=False)
    
    df_subset = df1.append(df0_random_subset, ignore_index=True)
    
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df_subset

In [None]:
def add_cited_indegree(df, G):
    time1 = time.time()
    igraph_indices = df['cited_index']
    indegree = G.indegree()
    new_column = []
    for i in range(0,len(igraph_indices)):
        new_column.append(indegree[igraph_indices[i]])
    df['cited_indegree'] = new_column
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df

In [None]:
def add_cited_pagerank(df, G):
    time1 = time.time()
    igraph_indices = df['cited index']
    pagerank = G.pagerank()
    new_column = []
    for i in range(0,len(igraph_indices)):
        new_column.append(pagerank[igraph_indices[i]])
    df['pagerank'] = new_column
    time2 = time.time()
    print "This took " + str(time2-time1) + " seconds"
    return df

In [None]:
new_df = make_random_subset_df(df)

In [None]:
print new_df

In [None]:
new_df = add_cited_indegree(new_df, G)
print new_df

In [None]:
import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
time1 = time.time()
#set up training data
y_train = new_df['edge']
x_train = new_df[['cited_indegree']]

#calculate logistical regression
clf = skl_lm.LogisticRegression(solver='newton-cg')
clf.fit(x_train, y_train)

print 'classes: ',clf.classes_
print 'coefficients: ',clf.coef_
print 'intercept :', clf.intercept_
time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

time1 = time.time()
# Matrix, where column = probability for no edge (0), probability for edge (1)--reference: clf.classes_
prob = clf.predict_proba(x_train)

# predicted probabilities for ALL case for edge present (1)
prob_edge = prob[:,1:2]
# convert to list
prob_edge = [i.tolist()[0] for i in prob_edge]


In [None]:
index_prob = zip(new_df['cited_index'],prob_edge)

cited_indexes = []
attachment_probs = []
for cited_tuple in index_prob:
    if cited_tuple[0] not in cited_indexes:
        cited_indexes.append(cited_tuple[0])
        attachment_probs.append(cited_tuple[1])

index_prob_unique = zip(cited_indexes, attachment_probs)

print len(index_prob)
print len(index_prob_unique)

In [None]:
index_indegree = zip(new_df['cited_index'],new_df['cited_indegree'])

cited_indexes = []
indegrees = []
for cited_tuple in index_indegree:
    if cited_tuple[0] not in cited_indexes:
        cited_indexes.append(cited_tuple[0])
        indegrees.append(cited_tuple[1])

index_indegree_unique = zip(cited_indexes, indegrees)

print len(index_indegree)
print len(index_indegree_unique)

In [None]:
rank_diff = 0
sorted_by_prob = sorted(index_prob_unique, key=lambda tup: tup[1], reverse=True)
sorted_by_indegree = sorted(index_indegree_unique, key=lambda tup: tup[1], reverse=True)
for i in range(0,len(sorted_by_prob)):
    rank_diff += abs(sorted_by_prob[i][0]-sorted_by_indegree[i][0])
print rank_diff

In [None]:
a = [1,2,3,4]

In [None]:
a = (1,2,3)

In [None]:
a[0]