In [17]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

import random as random

from collections import *

from load_data import load_citation_network_igraph, case_info

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score, log_loss
from sklearn import preprocessing
from sklearn import neighbors


%load_ext autoreload
%autoreload 2
%matplotlib inline

# Project data folder, given relative to this notebook's working directory.
data_dir = '../../data/'
# Court identifier handed to load_citation_network_igraph together with data_dir.
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

import glob

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [39]:
def compute_ranking_metrics(G, logistic_regression_object, columns_to_use, path_to_vertex_metrics_folder, year_interval, R):
    '''
    Computes the rank score metric for a given logistic regression object.

    R cases are sampled at random from G. For each sampled case, every row of
    the matching vertex metrics snapshot is ranked by predicted attachment
    probability (highest first, ties kept in file order). Each case cited by
    the sampled case (its out-neighbors in G) contributes 1 - rank/N, where N
    is the number of rows in the snapshot. A case's score is the SUM of those
    contributions, and the function returns the mean of the per-case sums.

    Parameters
    ------------
    G: network (so we can get each case's out-neighbors, i.e. the cases it cites)

    logistic_regression_object: a logistic regression object (i.e. the output of fit_logistic_regression)

    columns_to_use: list of column names of the vertex metrics data frames that are fed to the model

    path_to_vertex_metrics_folder: folder containing the vertex_metrics_<year>.csv snapshot files

    year_interval: the year interval between each vertex metric .csv file

    R: how many cases to compute ranking metrics for

    Output
    -------
    The mean, over the R sampled cases, of each case's summed rank score

    Raises
    -------
    ValueError: if R is larger than the number of vertices in G
    KeyError: if no snapshot file exists for a case's year bucket, or a cited
              case is missing from the snapshot index
    '''

    # select cases for sample; sample from a list because random.sample no
    # longer accepts a set on recent Python versions
    cases_to_test = random.sample(list(G.vs), R)

    # load all the vertex metric dataframes into a dict so they only have to be read in once;
    # key by bare file name so the look-up below does not depend on the OS path separator
    snapshot_pattern = os.path.join(path_to_vertex_metrics_folder, 'vertex_metrics*.csv')
    vertex_metric_dict = {}
    for snapshot_path in glob.glob(snapshot_pattern):
        vertex_metric_dict[os.path.basename(snapshot_path)] = pd.read_csv(snapshot_path, index_col=0)

    cases_to_test_rank_scores = []

    # calculate each case's score
    for case in cases_to_test:

        # round the case year UP to the next snapshot year; note that a year which is
        # already a multiple of year_interval maps to the following snapshot
        # (e.g. 1990 -> 2000 when year_interval is 10)
        case_year = case['year']
        snapshot_year = case_year + (year_interval - case_year % year_interval)
        vertex_df = vertex_metric_dict['vertex_metrics_' + str(snapshot_year) + '.csv']

        # predicted probability that the sampled case attaches to (cites) each row
        attachment_p = get_attachment_probabilty(logistic_regression_object, vertex_df[columns_to_use])

        # rank on a copy (assign) so the cached snapshot is never modified;
        # mergesort is stable, so tied probabilities keep their file order
        ranked_df = vertex_df.assign(attachment_p=attachment_p).sort_values(
            'attachment_p', ascending=False, kind='mergesort')

        # float so the score uses true division even without `from __future__ import division`
        num_candidates = float(len(ranked_df))

        # cases actually cited by this case
        cited_vertices = G.neighbors(case.index, mode='OUT')

        # 1-based rank of each cited case, converted to a score in [0, 1)
        scores = []
        for cited in cited_vertices:
            rank = ranked_df.index.get_loc(G.vs[cited]['name']) + 1
            scores.append(1 - rank / num_candidates)

        # a case's score is the sum of the scores of the cases it cites
        cases_to_test_rank_scores.append(sum(scores))

    return np.mean(cases_to_test_rank_scores)

In [57]:
def get_attachment_probabilty(logistic_regression_object, x_test_df):
    '''
    Scores a dataset with a fitted logistic regression model.

    Parameters
    ------------
    logistic_regression_object: a logistic regression object (i.e. the output of fit_logistic_regression)

    x_test_df: the feature columns to score, one row per candidate

    Output
    ------
    returns a plain Python list with, for every row of x_test_df, the value in
    the second column of predict_proba (the "edge present" class, 1)
    '''

    # predict_proba yields one row per input row and one column per class
    class_probabilities = logistic_regression_object.predict_proba(x_test_df)

    # column 1 holds the probability of the edge being present; hand it back as a list
    return class_probabilities[:, 1].tolist()

Testing above defs

In [23]:
def fit_logistic_regression(path_to_edge_data_frame, columns_to_use):
    '''
    Fits the logistic regression model on the edge data frame. Everything the
    regression needs (predictors and the 'edge' response) is read from that file.

    Parameters
    ------------
    path_to_edge_data_frame: path to the edge data .csv file; its first column is used as the index

    columns_to_use: list of column names of the edge data frame used as predictors

    Output
    ------
    returns the fitted logistic regression object
    '''
    # training data: predictors are the requested columns, response is the 'edge' column
    edge_df = pd.read_csv(path_to_edge_data_frame, index_col=0)
    predictors = edge_df[columns_to_use]
    response = edge_df['edge']

    # build the model, fit it, and hand the fitted object back
    model = skl_lm.LogisticRegression(solver='newton-cg')
    model.fit(predictors, response)
    return model

In [24]:
# This helper is not required; it just keeps the executed cells below concise.
def load_scotus_graph():
    '''
    Loads the citation network for court_name from data_dir and removes every
    edge whose citing case has an earlier year than the case it cites.

    Output
    ------
    returns the cleaned igraph graph
    '''
    G = load_citation_network_igraph(data_dir, court_name)

    # read the year attribute once instead of running a vertex selection per edge endpoint
    years = G.vs['year']

    # get_edgelist() lists (source, target) pairs in edge-id order, so the position
    # of a pair is its edge id; deleting by id removes exactly the offending edges
    # even if the same (source, target) pair occurs more than once
    bad_edge_ids = [edge_id
                    for edge_id, (citing, cited) in enumerate(G.get_edgelist())
                    if years[citing] < years[cited]]

    if bad_edge_ids:
        G.delete_edges(bad_edge_ids)
    return G

In [55]:
# Location of the edge data .csv used to train the model.
# NOTE(review): absolute, machine-specific Windows path; this cell will not run elsewhere
# without editing. Consider building it from a configurable base folder.
path_to_edge_data_frame = 'C:\\Research\\law-net\\explore\\James\\edge_data.csv'
# Predictor columns; the same names are later selected from the vertex metric
# snapshots inside compute_ranking_metrics.
columns_to_use = ['indegree','pagerank']
# Fit the model once so the cells below can reuse it.
logistic_regression_object = fit_logistic_regression(path_to_edge_data_frame, columns_to_use)

In [13]:
# Load the citation graph; load_scotus_graph drops edges whose citing year precedes the cited year.
G = load_scotus_graph()
# Folder searched for vertex_metrics*.csv snapshot files.
# NOTE(review): absolute, machine-specific Windows path; adjust before running elsewhere.
path_to_vertex_metrics_folder = 'C:\\Research\\law-net\\explore\\James'
# Number of years between consecutive vertex metric snapshot files.
year_interval = 10
# Number of randomly sampled cases to score.
R = 500

0 seconds for 250465 edges


In [59]:
# Mean over R randomly sampled cases of each case's summed rank score.
# No random seed is set in the visible cells, so the value can differ between runs.
compute_ranking_metrics(G, logistic_regression_object, columns_to_use, path_to_vertex_metrics_folder, year_interval, R)

6.9000145966315207