In [1]:
import os

# Locations of the law-net repository checkout and of the courtlistener data.
# The defaults are the original author's machine; set the LAW_NET_REPO_DIR /
# LAW_NET_DATA_DIR environment variables to run the notebook elsewhere without
# editing this cell.
# os.path.join(path, '') guarantees a trailing separator, which the rest of the
# notebook relies on because it builds paths by plain string concatenation
# (e.g. repo_directory + 'code/', data_dir + 'raw/').
repo_directory = os.path.join(
    os.environ.get('LAW_NET_REPO_DIR',
                   '/Users/iaincarmichael/Dropbox/Research/law/law-net/'),
    '')

data_dir = os.path.join(
    os.environ.get('LAW_NET_DATA_DIR',
                   '/Users/iaincarmichael/Documents/courtlistener/data/'),
    '')

import sys

# graph package
import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import rankdata

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from rankscore_experiment_sort import *
from rankscore_experiment_LR import *
from rankscore_experiment_search import *

from make_tr_edge_df import *


# sub-network this notebook works on
# (other values seen in the project: 'federal', 'ca1', etc)
network_name = 'scotus'


# sub directories derived from data_dir; every one keeps its trailing slash
raw_dir = ''.join([data_dir, 'raw/'])
subnet_dir = ''.join([data_dir, network_name, '/'])
text_dir = ''.join([subnet_dir, 'textfiles/'])


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Read the network graph of the selected sub-network from
# <subnet_dir>/<network_name>_network.graphml
G = ig.Graph.Read_GraphML(
    ''.join([subnet_dir, network_name, '_network.graphml'])
)

## parameters from make snapshots

In [3]:
# Names of the vertex metrics to evaluate, built as one expression:
#   1. structural metrics ('d_eigen' is left out because it is being problematic),
#   2. recent-citation metrics for 1..10, then 15, 20, ..., 40,
#   3. age and similarity.
vertex_metrics = (
    ['indegree', 'outdegree', 'degree',
     'd_pagerank', 'u_pagerank',
     'authorities', 'hubs',
     'u_eigen',
     'd_betweenness', 'u_betweenness',
     'd_closeness', 'u_closeness']
    + ['recentcite_' + str(t)
       for t in list(range(1, 11)) + list(range(15, 41, 5))]
    + ['age', 'similarity']
)

# 1900 through 2015, inclusive
active_years = range(1900, 2016)

## test parameters

In [None]:
# Settings handed to every get_rankscores_* experiment below:
# the years to consider, the random seed, and how many test cases to use.
test_params = dict(
    active_years=active_years,
    seed=4332,
    num_test_cases=1000
)

# rank by sorting

In [None]:
%%time 

# timing note from an earlier run: about 35 min for 1000

# rank scores obtained by sorting on each vertex metric
scores_sort = get_rankscores_sort(
    G,
    test_params,
    vertex_metrics,
    subnet_dir
)

In [None]:
# column-wise mean of the sort scores, largest mean first
sort_rankscore = (
    scores_sort
    .mean()
    .sort_values(ascending=False)
)
sort_rankscore

In [None]:
# histogram of scores
# NOTE(review): this plotting code is disabled. Before re-enabling it, check two
# names it uses: `sort_mean` is not assigned anywhere in the visible notebook
# (the previous cell stores the mean scores in `sort_rankscore`), and `ceil` is
# not imported explicitly above (np.ceil or math.ceil would be needed unless a
# wildcard import happens to supply it).

# plt.figure(figsize=[20, 20])
# k = 1
# h = ceil(scores_sort.shape[1] / 4.0)
# for c in sort_mean.index:
#     plt.subplot(h, 4, k)
#     plt.hist(scores_sort[c])
#     plt.xlabel(c)
    
#     k += 1

# Search

In [None]:
# passed as the last argument to get_rankscores_search in the next cell
# NOTE(review): presumably the number of candidates kept per search — confirm
# against rankscore_experiment_search
num_to_keep = 5000

In [None]:
%%time

# timing note from an earlier run: about 15 min
scores_search = get_rankscores_search(
    G,
    test_params,
    vertex_metrics,
    subnet_dir,
    num_to_keep
)

In [None]:
# column-wise mean of the search scores, largest mean first
search_rankscore = (
    scores_search
    .mean()
    .sort_values(ascending=False)
)
search_rankscore

# logistic regression

## make training data for logistic regression

In [None]:
# number of absent edges to add: set equal to the number of edges in G
num_absent_edges = G.ecount()
# seed for building the training edge data frame
seed_edge_df = 32432

# normalization applied to the yearly metrics
metric_normalization = 'mean'

In [None]:
%%time 

# NOTE(review): the call that builds the training edge data frame is disabled,
# so this cell (only %%time plus comments) currently does nothing. Presumably the
# output of an earlier run is picked up from subnet_dir by the logistic-regression
# step — confirm before running against a fresh data directory.
# make_tr_edge_df(G, subnet_dir,
#                 active_years, num_absent_edges,
#                 vertex_metrics, metric_normalization,
#                 seed=seed_edge_df)

## rank by logistic regression

In [None]:
# NOTE(review): disabled. If re-enabled, be aware that `+=` appends 'all' again
# every time this cell is re-run, and that the results tables further down are
# sized/indexed by vertex_metrics.
# vertex_metrics += ['all']

In [None]:
%%time

# Run the logistic-regression experiment. The call returns three objects.
# NOTE(review): going by the names, these are rank scores, log-loss scores and
# the fitted regressions — verify against get_rankscores_LR.
(scores_LR,
 scores_LR_logloss,
 LogRegs) = get_rankscores_LR(
    G,
    test_params,
    vertex_metrics,
    subnet_dir,
    metric_normalization
)

In [None]:
# column-wise mean of the log-loss scores, smallest mean first
LR_logloss = (
    scores_LR_logloss
    .mean()
    .sort_values(ascending=True)
)
LR_logloss

In [None]:
# column-wise mean of the logistic-regression rank scores, largest mean first
LR_rankscore = (
    scores_LR
    .mean()
    .sort_values(ascending=False)
)
LR_rankscore

# results

In [None]:
# Side-by-side table of metric names: each column lists the metrics in the
# order produced by one method (row 0 = first entry of that method's ranking).
df_metric = pd.DataFrame(index=range(len(vertex_metrics)),
                         columns=['sort', 'search', 'LR', 'LR_logloss'])

for method, ranking in [('sort', sort_rankscore),
                        ('search', search_rankscore),
                        ('LR', LR_rankscore),
                        ('LR_logloss', LR_logloss)]:
    # .index carries the metric names, already ordered by the earlier sort_values calls
    df_metric[method] = ranking.index

In [None]:
# display the per-method metric orderings assembled in the previous cell
df_metric

In [None]:
# Mean rank score table: one row per vertex metric, one column per method.
rankscores = pd.DataFrame(index=vertex_metrics,
                          columns=['sort', 'search', 'LR'])

for method, mean_scores in [('sort', sort_rankscore),
                            ('search', search_rankscore),
                            ('LR', LR_rankscore)]:
    # assigning a pandas Series lines values up by metric name, not by position
    rankscores[method] = mean_scores

In [None]:
# display the table ordered by the 'sort' column, largest value first
rankscores.sort_values(by='sort', ascending=False)

In [None]:
# Replace each column of scores by its ranks.
# scipy's rankdata gives rank 1 to the smallest value, so a larger score
# receives a larger rank number here.
rs_ranking = rankscores.apply(rankdata)

In [None]:
# display the rank table ordered by the 'sort' column, smallest rank first
rs_ranking.sort_values(by='sort')