In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

import random as random

from collections import *

from load_data import load_citation_network_igraph, case_info

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors


%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

In [2]:
G = load_citation_network_igraph(data_dir, court_name)
print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


# Delete bad edges (cases that cite forward in time)

In [3]:
all_edges = G.get_edgelist() # list of tuples

bad_edges = []
for edge in all_edges:
    citing_year = G.vs(edge[0])['year'][0]
    cited_year = G.vs(edge[1])['year'][0]
    
    if citing_year < cited_year:
        bad_edges.append(edge)

G.delete_edges(bad_edges)
print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))

loaded scotus network with 33248 cases and 250449 edges


# create subgraphs

In [4]:
# start from 1760
scotus_years = range(1760, 2021) # scotus years actually from 1754-2016
scotus_decades = [year for year in scotus_years if year % 10 == 0] # 1760, ... , 2000, 2010, 2020

subgraph_dict = OrderedDict() # key: decade, value: subgraph with vertices less than that decade year
for i in scotus_decades:
    sub_vs = G.vs.select(year_lt=i)
    sub_G = G.subgraph(sub_vs) # IMPORTANT NOTE: THESE NEW SUBGRAPHS RE-INDEX (DIFFERENT INDICES THAN G_2)
    subgraph_dict[i] = sub_G
    print i, ": ", len(sub_G.vs), "vertices and ", len(sub_G.es), " edges"

1760 :  2 vertices and  0  edges
1770 :  12 vertices and  3  edges
1780 :  19 vertices and  3  edges
1790 :  109 vertices and  6  edges
1800 :  252 vertices and  17  edges
1810 :  478 vertices and  48  edges
1820 :  803 vertices and  120  edges
1830 :  1148 vertices and  156  edges
1840 :  1584 vertices and  294  edges
1850 :  1956 vertices and  474  edges
1860 :  2680 vertices and  831  edges
1870 :  3475 vertices and  884  edges
1880 :  5319 vertices and  2271  edges
1890 :  7902 vertices and  10255  edges
1900 :  10446 vertices and  25673  edges
1910 :  12463 vertices and  37863  edges
1920 :  14880 vertices and  52273  edges
1930 :  16887 vertices and  67360  edges
1940 :  18585 vertices and  86575  edges
1950 :  20079 vertices and  106643  edges
1960 :  21329 vertices and  118368  edges
1970 :  23642 vertices and  136683  edges
1980 :  25734 vertices and  166571  edges
1990 :  27848 vertices and  199816  edges
2000 :  29206 vertices and  221711  edges
2010 :  32505 vertices and  2

# create vertex DFs (with metrics)

In [5]:
vertex_df_dict = OrderedDict()
for key in scotus_decades:
    #get subgraph
    subgraph = subgraph_dict[key]
    
    #get metrics in lists
    vertex_name = subgraph.vs['name']
    vertex_year = subgraph.vs['year']
    indegree = subgraph.indegree()
    pagerank = subgraph.pagerank()
    
    #build df from lists
    column_names = ['name','year','indegree','pagerank']
    vertex_tuples = zip(vertex_name,vertex_year,indegree,pagerank)
    vertex_df = pd.DataFrame(vertex_tuples, columns=column_names)
    
    #add df to dict
    vertex_df_dict[key] = vertex_df

In [6]:
vertex_df_dict[1950]

         name  year  indegree  pagerank
0      100000  1922         1  0.000022
1      100001  1922         1  0.000025
2      100002  1922         4  0.000022
3      100003  1922         3  0.000023
4      100004  1922         4  0.000028
5      100005  1922         6  0.000029
6      100006  1922         5  0.000053
7      100007  1922        16  0.000061
8      100008  1922         0  0.000020
9      100009  1922        16  0.000056
10     100010  1922        21  0.000055
11     100011  1922        23  0.000090
12     100012  1922         1  0.000020
13     100013  1922         4  0.000034
14     100014  1922         4  0.000028
15     100015  1922         9  0.000045
16     100016  1922         2  0.000026
17     100017  1922         0  0.000020
18     100018  1922        28  0.000161
19     100019  1922         0  0.000020
20     100020  1922        16  0.000074
21     100021  1922         3  0.000026
22     100022  1922         8  0.000047
23     100023  1922        17  0.000061


# create edgelist -- 'ed' metric in 'ing' year

# create N1 (present edges)

In [7]:
time1 = time.time()

#list of tuples that will become N1 df
#each tuple in the form: 0 (non-present edge), citing_name, cited_name, cited metrics (indegree, pagerank, etc.)
n1_list = []

#get all present edges
n1 = subgraph_dict[scotus_decades[-1]].get_edgelist() #list of tuples

for edge in n1:
    #get info from edge
    citing_year = G.vs(edge[0])['year'][0]
    cited_year = G.vs(edge[1])['year'][0]
    citing_name = G.vs(edge[0])['name'][0]
    cited_name = G.vs(edge[1])['name'][0]

    #determine which vertex_df to retrieve
    decade = citing_year + (10 - citing_year%10)
    vertex_df = vertex_df_dict[decade]
    
    #get row from df using cited_name
    row = vertex_df.loc[vertex_df['name']==cited_name].values.tolist()
    
    edge_tuple = (1, citing_name) + tuple(row[0])
    n1_list.append(edge_tuple)
        
column_names = ['edge','citing_name','cited_name','cited_year','cited_indegree','cited_pagerank']
n1_df = pd.DataFrame(n1_list, columns=column_names)

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 199.434000015 seconds


In [8]:
# Fix some columns that are type double
integer_columns = ['edge', 'citing_name', 'cited_name', 'cited_year', 'cited_indegree'] # keep pagerank as float (obviously)
n1_df[integer_columns] = n1_df[integer_columns].astype(np.int64) # turn double columns into integer columns

# create N0 (non-present edges)

In [9]:
time1 = time.time()

# set of tuples that will become N0 df
# each tuple in the form: 0 (non-present edge), citing_name, cited_name, cited metrics (indegree, pagerank, etc.)

n0_set = set([]) # set makes adding 'edge_tuple' unique in the while loop (need b/c random sampling can return duplicates)
n1_edges = set(subgraph_dict[scotus_decades[-1]].get_edgelist()) # when searching for element, set is faster than list
n1_vertices_set = set(subgraph_dict[scotus_decades[-1]].vs) # just for consistency

while len(n0_set) < len(n1_edges):
    # get random_edge
    temp = random.sample(n1_vertices_set, 2) # default: without replacement
    random_edge = (temp[0].index, temp[1].index)
    
    # get info from edge
    citing_year = G.vs(random_edge[0])['year'][0]
    cited_year = G.vs(random_edge[1])['year'][0]
    citing_name = G.vs(random_edge[0])['name'][0]
    cited_name = G.vs(random_edge[1])['name'][0]
    
    if random_edge not in n1_edges and citing_year >= cited_year:
        # determine which vertex_df to retrieve
        decade = citing_year + (10 - citing_year%10)
        vertex_df = vertex_df_dict[decade]

        # get row from df using cited name
        row = vertex_df.loc[vertex_df['name']==cited_name].values.tolist()

        edge_tuple = (0, citing_name) + tuple(row[0])
        n0_set.add(edge_tuple)

time2 = time.time()
print "this took " + str(time2-time1) + " seconds"

this took 556.496999979 seconds


In [10]:
column_names = ['edge','citing_name','cited_name','cited_year','cited_indegree','cited_pagerank']
n0_df = pd.DataFrame(list(n0_set), columns=column_names)

# Fix some columns that are type double
integer_columns = ['edge', 'citing_name', 'cited_name', 'cited_year', 'cited_indegree'] # keep pagerank as float (obviously)
n0_df[integer_columns] = n0_df[integer_columns].astype(np.int64) # turn double columns into integer columns

# Combine n0_df and n1_df into edges_df

In [11]:
edges_df = n0_df.append(n1_df)

In [12]:
edges_df

        edge  citing_name  cited_name  cited_year  cited_indegree  \
0          0       105146       94388        1896               2   
1          0       111030       99966        1922              20   
2          0       106778       84925        1809               0   
3          0       110601      109319        1975               0   
4          0       104605       87232        1859               1   
5          0      1087630      101060        1927               0   
6          0       145708       87720        1866              13   
7          0       111872      106361        1962               1   
8          0       107721     1964908        1921               6   
9          0       104397      102881        1937               0   
10         0       110958       95957        1903              10   
11         0       106207      105235        1954              13   
12         0        96289       86537        1850              12   
13         0        99592       94

# run logistic regression for different training sets (combinations of metrics)

In [13]:
def createLogReg(dataframe, x_train_list):
    #set up training data
    y_train = dataframe['edge']
    x_train = dataframe[x_train_list]

    #calculate logistical regression
    clf = skl_lm.LogisticRegression(solver='newton-cg')
    clf.fit(x_train, y_train)
    return clf

def getProb(clf, x_test_df):
    # get attachment probabilities on testing set
    prob = clf.predict_proba(x_test_df)
    
    # predicted probabilities for ALL case for edge present (1)
    prob_present = prob[:,1:2]
    # convert to list
    prob_present2 = [i.tolist()[0] for i in prob_present]
    
    return prob_present2

In [15]:
def calcRankScore(R_list, edges_df, vertex_df_dict, x_train_columns, x_test_columns):
    ### get attachment probabilities added into dataframe for one vertex

    time1 = time.time()

    # get year of each case in R
    # attachment probabilities for all vertices in that year
    # rank the dataframe
    # get neighbors of each case 
    # see how each neighbor ranks
    # get each neighbor's score
    # sum up scores for one case in R
    # final score = sum of all 1000 scores

    #The logistical regression is calculated outside the for loop and saved as clf
    x_train_list = x_train_columns
    clf = createLogReg(edges_df, x_train_list)

    final_scores = [] # sum of scores for each case
    for vertex_r in R_list:
        # get year of each case in R
        year = vertex_r['year']

        # determine which vertex_df to retrieve
        decade = year + (10 - year%10)
        vertex_df = vertex_df_dict[decade]

        # attachment probabilities for all vertices in that year
        #use the logreg already calculated and just apply a new test set
        x_test_df = vertex_df[x_test_columns]
        indeg_attach_p = getProb(clf, x_test_df)

        # add the attachment probabilities as column
        vertex_df['indegree_attachment_p'] = indeg_attach_p
        # sort by attachment probabilities
        vertex_df = vertex_df.sort_values('indegree_attachment_p', ascending=False, kind='mergesort').reset_index(drop=True)

        # get neighbors
        neighbors = G.neighbors(vertex_r.index, mode='OUT')

        # rank and score neighbors using dataframe indices
        scores = [] # list of scores for each vertex
        for i in neighbors:
            rank = vertex_df.loc[vertex_df['name']==G.vs[i]['name']].index[0] + 1
            score = 1-rank/len(indeg_attach_p)
            scores.append(score)

        sum_scores = sum(scores) # sum up the scores for each case
        final_scores.append(sum_scores)

    score_M = sum(final_scores) # score of metric

    time2 = time.time()
    print "this took " + str(time2-time1) + " seconds"
    print "Score: " + str(score_M)
    print

# Pick R Cases

In [17]:
R = 500 # number of random cases
n1_vertices_set = set(G.vs)
R_list = random.sample(n1_vertices_set, R)

# Test R cases

In [18]:
x_train_columns = ['cited_indegree']
x_test_columns = ['indegree']
print "Indegree"
calcRankScore(R_list, edges_df, vertex_df_dict, x_train_columns, x_test_columns)

x_train_columns = ['cited_pagerank']
x_test_columns = ['pagerank']
print "Pagerank"
calcRankScore(R_list, edges_df, vertex_df_dict, x_train_columns, x_test_columns)

x_train_columns = ['cited_indegree','cited_pagerank']
x_test_columns = ['indegree','pagerank']
print "Indegree and Pagerank"
calcRankScore(R_list, edges_df, vertex_df_dict, x_train_columns, x_test_columns)

Indegree
this took 9.77799987793 seconds
Score: 2998.94088151

Pagerank
this took 9.22500014305 seconds
Score: 2563.98054996

Indegree and Pagerank
this took 10.3090000153 seconds
Score: 2970.85944083

