In [1]:
# The path roots are resolved before the main import block runs, so `os` is
# imported right here.
import os

# Root of the law-net repository checkout; it is used further down to extend
# sys.path.  Set LAW_NET_REPO_DIR to run the notebook on another machine; the
# original location is kept as the default.  os.path.join(..., '') guarantees
# the trailing separator that the `+ 'code/'` style concatenations rely on.
repo_directory = os.path.join(
    os.environ.get('LAW_NET_REPO_DIR',
                   '/Users/iaincarmichael/Dropbox/Research/law/law-net/'),
    '')

# Root of the data directory (raw/ and the per-network directory are derived
# from it); override with LAW_NET_DATA_DIR.
data_dir = os.path.join(
    os.environ.get('LAW_NET_DATA_DIR',
                   '/Users/iaincarmichael/Documents/courtlistener/data/'),
    '')

import os
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import rankdata

# graph package
import igraph as ig

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from rankscore_experiment_sort import *
from rankscore_experiment_LR import *
from rankscore_experiment_search import *

from make_tr_edge_df import *


# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# read the chosen network's graph from its GraphML file in subnet_dir
graphml_path = subnet_dir + network_name + '_network.graphml'
G = ig.Graph.Read_GraphML(graphml_path)

## parameters from make snapshots

In [3]:
# metric names that do not carry a numeric suffix
# ('d_eigen' is left out on purpose: it is being problematic)
network_metrics = ['indegree', 'outdegree', 'degree',
                   'd_pagerank', 'u_pagerank',
                   'authorities', 'hubs',
                   'u_eigen',
                   'd_betweenness', 'u_betweenness',
                   'd_closeness', 'u_closeness']

# recent citation metrics: suffixes 1 through 10, then coarser steps up to 40
recentcite_suffixes = list(range(1, 10 + 1)) + [15, 20, 25, 30, 35, 40]
recentcite_metrics = ['recentcite_' + str(t) for t in recentcite_suffixes]

# full list, in the same order as before: network, recent citations, age, similarity
vertex_metrics = network_metrics + recentcite_metrics + ['age', 'similarity']

# years 1900 through 2015 inclusive
active_years = range(1900, 2015 + 1)

## test parameters

In [4]:
# settings handed to each get_rankscores_* call below
test_params = dict(active_years=active_years,
                   seed=4332,
                   num_test_cases=1000)

# rank by sorting

In [5]:
%%time 

# this took 35 min before for 1000
# (the run recorded in this cell's output took about 51 min of wall time)

# Scores for the "sort" method.  The result has one column per vertex metric:
# the next cell averages it with .mean() and gets one value per metric.
scores_sort = get_rankscores_sort(G, test_params, vertex_metrics, subnet_dir)

CPU times: user 43min 12s, sys: 6min 6s, total: 49min 19s
Wall time: 51min 4s


In [6]:
# average each metric's column, then list the metrics from highest mean down
mean_scores_sort = scores_sort.mean()
sort_rankscore = mean_scores_sort.sort_values(ascending=False)
sort_rankscore

similarity       0.956440
age              0.811348
u_eigen          0.774729
hubs             0.774541
recentcite_10    0.771521
recentcite_9     0.770481
degree           0.769954
recentcite_8     0.769766
outdegree        0.768459
recentcite_7     0.767913
recentcite_6     0.765021
recentcite_15    0.763191
recentcite_5     0.762005
recentcite_4     0.757937
u_closeness      0.755328
recentcite_20    0.751888
recentcite_3     0.749053
u_pagerank       0.744059
recentcite_25    0.741185
recentcite_2     0.733322
recentcite_30    0.729977
recentcite_35    0.721077
u_betweenness    0.717126
recentcite_40    0.713362
recentcite_1     0.704219
authorities      0.692724
d_betweenness    0.681532
indegree         0.653403
d_pagerank       0.573857
d_closeness      0.441688
dtype: float64

In [7]:
# histogram of scores

# NOTE(review): disabled.  As written it uses `sort_mean` and a bare `ceil`,
# neither of which is defined in the visible cells (the sorted means are
# called `sort_rankscore` in this notebook); fix both before re-enabling.
# plt.figure(figsize=[20, 20])
# k = 1
# h = ceil(scores_sort.shape[1] / 4.0)
# for c in sort_mean.index:
#     plt.subplot(h, 4, k)
#     plt.hist(scores_sort[c])
#     plt.xlabel(c)
    
#     k += 1

# Search

In [8]:
# passed as the last argument to get_rankscores_search in the next cell
# NOTE(review): presumably how many candidates the search step keeps — confirm
num_to_keep = 5000

In [9]:
%%time

# this took 15 min before
# (the run recorded in this cell's output took about 1h 10min of wall time)

# Scores for the "search" method; same arguments as the sort run plus num_to_keep.
scores_search = get_rankscores_search(G, test_params,
                                      vertex_metrics, subnet_dir, num_to_keep)

CPU times: user 58min 9s, sys: 9min 44s, total: 1h 7min 53s
Wall time: 1h 9min 46s


In [10]:
# average each metric's column, then list the metrics from highest mean down
mean_scores_search = scores_search.mean()
search_rankscore = mean_scores_search.sort_values(ascending=False)
search_rankscore

similarity       0.908962
age              0.740020
recentcite_8     0.693931
recentcite_7     0.692373
recentcite_9     0.691630
recentcite_6     0.689617
recentcite_10    0.689482
recentcite_5     0.687246
recentcite_4     0.681658
recentcite_3     0.675257
recentcite_15    0.674934
recentcite_20    0.658743
recentcite_2     0.658147
hubs             0.650419
recentcite_25    0.644732
outdegree        0.644196
u_eigen          0.639943
recentcite_30    0.631488
degree           0.631385
recentcite_1     0.628595
recentcite_35    0.621037
u_pagerank       0.616345
recentcite_40    0.612278
u_closeness      0.599810
u_betweenness    0.588220
authorities      0.577197
d_betweenness    0.571101
indegree         0.555301
d_pagerank       0.500283
d_closeness      0.390356
dtype: float64

# logistic regression

## make training data for logistic regression

In [11]:
# how to normalize yearly metrics
metric_normalization = 'mean'

# how many absent edges to add: as many as there are edges in G
num_absent_edges = G.ecount()

# seed used when building the training edge data frame
seed_edge_df = 32432

In [12]:
%%time 

# Disabled: this call makes the training data for the logistic regression.
# NOTE(review): presumably get_rankscores_LR relies on what this call writes
# under subnet_dir, so it must have been run at least once — confirm.
# make_tr_edge_df(G, subnet_dir,
#                 active_years, num_absent_edges,
#                 vertex_metrics, metric_normalization,
#                 seed=seed_edge_df)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 6.91 µs


## rank by logistic regression

In [13]:
# Disabled.  If re-enabled, note that `+=` appends 'all' again every time this
# cell is run, so vertex_metrics would grow on each re-run.
# vertex_metrics += ['all']

In [14]:
%%time

# Logistic-regression run.  Three results come back: scores_LR and
# scores_LR_logloss (each averaged per metric in the following cells) and
# LogRegs, which is pickled in the "save results" cell.
scores_LR, scores_LR_logloss, LogRegs = get_rankscores_LR(G, test_params, vertex_metrics,
                                                            subnet_dir,
                                                            metric_normalization)              

CPU times: user 1h 33min 9s, sys: 7min 19s, total: 1h 40min 29s
Wall time: 1h 11min 9s


In [15]:
# average each metric's log loss, then list the metrics from lowest mean up
mean_scores_LR_logloss = scores_LR_logloss.mean()
LR_logloss = mean_scores_LR_logloss.sort_values(ascending=True)
LR_logloss

recentcite_7      0.298579
recentcite_8      0.298626
recentcite_9      0.299490
recentcite_6      0.299898
recentcite_10     0.300850
recentcite_5      0.303925
similarity        0.306924
age               0.307392
recentcite_15     0.311331
recentcite_4      0.314003
recentcite_20     0.323574
recentcite_25     0.335564
recentcite_3      0.341446
recentcite_30     0.346579
recentcite_35     0.356572
recentcite_40     0.365597
recentcite_2      0.418001
outdegree         0.423174
indegree          0.433238
recentcite_1      0.616313
degree            0.623308
d_betweenness     0.687134
u_betweenness     0.688606
hubs              5.054075
authorities       5.601875
u_eigen           5.723755
d_closeness      12.071317
d_pagerank       27.008041
u_closeness      30.478725
u_pagerank       31.515806
dtype: float64

In [16]:
# average each metric's column, then list the metrics from highest mean down
mean_scores_LR = scores_LR.mean()
LR_rankscore = mean_scores_LR.sort_values(ascending=False)
LR_rankscore

age              0.962134
recentcite_3     0.961799
recentcite_4     0.960019
recentcite_2     0.957479
similarity       0.956440
recentcite_5     0.952114
recentcite_1     0.944626
recentcite_6     0.940091
recentcite_7     0.928503
recentcite_8     0.919384
indegree         0.912858
recentcite_9     0.912024
recentcite_10    0.906739
outdegree        0.895467
recentcite_40    0.895136
recentcite_35    0.892602
degree           0.890659
recentcite_30    0.890374
recentcite_15    0.889667
recentcite_25    0.887724
recentcite_20    0.886310
hubs             0.854870
u_eigen          0.842713
authorities      0.822620
d_pagerank       0.617973
u_pagerank       0.602601
u_closeness      0.589533
d_closeness      0.560804
d_betweenness    0.402120
u_betweenness    0.311704
dtype: float64

# save results

In [24]:
# Write the four score tables, plus the LogRegs object returned by
# get_rankscores_LR, into <subnet_dir>/results/.
results_dir = subnet_dir + 'results/'

# Neither to_csv nor open creates missing directories, so make sure the
# target exists before writing anything.
if not os.path.isdir(results_dir):
    os.makedirs(results_dir)

scores_sort.to_csv(results_dir + 'scores_sort.csv', index=True)
scores_search.to_csv(results_dir + 'scores_search.csv', index=True)
scores_LR.to_csv(results_dir + 'scores_LR.csv', index=True)
scores_LR_logloss.to_csv(results_dir + 'scores_LR_logloss.csv', index=True)

# LogRegs is stored as a pickle; only load it back from a trusted location.
with open(results_dir + 'LogRegs.p', 'wb') as fp:
    pickle.dump(LogRegs, fp)


# results

In [17]:
# Metric names in the order each method ranks them, side by side:
# row i holds the metric in position i of every ordering.
df_metric = pd.DataFrame(columns=['sort', 'search', 'LR', 'LR_logloss'],
                         index=range(len(vertex_metrics)))

orderings = [('sort', sort_rankscore),
             ('search', search_rankscore),
             ('LR', LR_rankscore),
             ('LR_logloss', LR_logloss)]

for method, ordered_scores in orderings:
    df_metric[method] = ordered_scores.index

In [18]:
# Show the orderings: 'sort', 'search' and 'LR' run from highest mean score
# down, 'LR_logloss' from lowest mean log loss up.
df_metric

Unnamed: 0,sort,search,LR,LR_logloss
0,similarity,similarity,age,recentcite_7
1,age,age,recentcite_3,recentcite_8
2,u_eigen,recentcite_8,recentcite_4,recentcite_9
3,hubs,recentcite_7,recentcite_2,recentcite_6
4,recentcite_10,recentcite_9,similarity,recentcite_10
5,recentcite_9,recentcite_6,recentcite_5,recentcite_5
6,degree,recentcite_10,recentcite_1,similarity
7,recentcite_8,recentcite_5,recentcite_6,age
8,outdegree,recentcite_4,recentcite_7,recentcite_15
9,recentcite_7,recentcite_3,recentcite_8,recentcite_4


In [19]:
# Mean score of every metric under each method, one row per metric in
# vertex_metrics order (each series is aligned on the metric names).
rankscores = pd.DataFrame({'sort': sort_rankscore,
                           'search': search_rankscore,
                           'LR': LR_rankscore},
                          index=vertex_metrics,
                          columns=['sort', 'search', 'LR'])

In [20]:
# Show the mean scores with the highest 'sort' score first.
rankscores.sort_values(by='sort', ascending=False)

Unnamed: 0,sort,search,LR
similarity,0.95644,0.908962,0.95644
age,0.811348,0.74002,0.962134
u_eigen,0.774729,0.639943,0.842713
hubs,0.774541,0.650419,0.85487
recentcite_10,0.771521,0.689482,0.906739
recentcite_9,0.770481,0.69163,0.912024
degree,0.769954,0.631385,0.890659
recentcite_8,0.769766,0.693931,0.919384
outdegree,0.768459,0.644196,0.895467
recentcite_7,0.767913,0.692373,0.928503


In [21]:
# Replace every column of scores by its ranks; rankdata gives rank 1 to the
# smallest value in the column.
rs_ranking = rankscores.apply(rankdata)

In [22]:
# Show the ranks ordered by the 'sort' column; rank 1 (listed first) is the
# metric with the lowest mean 'sort' score.
rs_ranking.sort_values(by='sort')

Unnamed: 0,sort,search,LR
d_closeness,1.0,1.0,3.0
d_pagerank,2.0,2.0,6.0
indegree,3.0,3.0,20.0
d_betweenness,4.0,4.0,2.0
authorities,5.0,5.0,7.0
recentcite_1,6.0,11.0,24.0
recentcite_40,7.0,8.0,16.0
u_betweenness,8.0,6.0,1.0
recentcite_35,9.0,10.0,15.0
recentcite_30,10.0,13.0,13.0
