In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from collections import *

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

# %load ../standard_import.txt
from __future__ import division
import matplotlib as mpl

pd.set_option('display.notebook_repr_html', False)

%matplotlib inline
plt.style.use('seaborn-white')

# Definitions to manipulate the Tidy Dataframe

In [2]:
#takes in a df and randomly samples it so new df will include all present edges and an equal number of missing edges
def make_random_subset_df(df, random_state=None):
    """Build a balanced sample: every present edge plus as many missing edges.

    Parameters
    ----------
    df : pandas.DataFrame
        Tidy edge table with an 'edge' column; rows with edge == 1 are kept
        in full, rows with edge == 0 are randomly sampled.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample``. Pass a value to make the draw
        reproducible; the default (None) leaves the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        The edge == 1 rows followed by the sampled edge == 0 rows, with a
        fresh 0..n-1 index. If there are fewer missing edges than present
        edges, all missing edges are included.
    """
    time1 = time.time()
    present = df[df['edge'] == 1]
    missing = df[df['edge'] == 0]

    # Request an explicit row count rather than a fraction: a fraction
    # divides by len(missing) (ZeroDivisionError when it is empty), exceeds 1
    # when present edges outnumber missing ones, and floors to 0 under
    # Python 2 integer division.
    n_missing = min(len(present), len(missing))
    if n_missing:
        missing_sample = missing.sample(n=n_missing, replace=False,
                                        random_state=random_state)
    else:
        # Nothing to draw; keep an empty slice so the columns still line up.
        missing_sample = missing.iloc[:0]

    # pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
    df_subset = pd.concat([present, missing_sample], ignore_index=True)

    time2 = time.time()
    print("This took " + str(time2-time1) + " seconds")
    return df_subset

In [3]:
#adds cited indegree column to a df based on graph G
def add_cited_indegree(df, G):
    """Add a 'cited_indegree' column looked up from ``G.indegree()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'cited_index' column; each value is used as a position
        into the sequence returned by ``G.indegree()``.
    G : graph object
        Anything exposing ``indegree()`` that returns an indexable sequence
        (here an igraph graph).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in; the column is assigned on
        it in place.
    """
    time1 = time.time()
    #get indegree list
    indegree = G.indegree()
    # Walk the column's values, not its labels: looking rows up with
    # series[i] is label-based and fails (or picks the wrong row) whenever
    # the frame's index is not exactly 0..n-1.
    cited_positions = df['cited_index'].tolist()
    #add that new column
    df['cited_indegree'] = [indegree[position] for position in cited_positions]
    time2 = time.time()
    print("This took " + str(time2-time1) + " seconds")
    return df

In [12]:
def add_cited_pagerank(df, G):
    """Add a 'pagerank' column looked up from ``G.pagerank()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'cited_index' column; each value is used as a position
        into the sequence returned by ``G.pagerank()``.
    G : graph object
        Anything exposing ``pagerank()`` that returns an indexable sequence
        (here an igraph graph).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in; the column is assigned on
        it in place.
    """
    time1 = time.time()
    #get pagerank list
    pagerank = G.pagerank()
    # Walk the column's values, not its labels: looking rows up with
    # series[i] is label-based and fails (or picks the wrong row) whenever
    # the frame's index is not exactly 0..n-1.
    cited_positions = df['cited_index'].tolist()
    #add that new column
    df['pagerank'] = [pagerank[position] for position in cited_positions]
    time2 = time.time()
    print("This took " + str(time2-time1) + " seconds")
    return df

# Load Graph and Tidy Dataframe

In [5]:
# Load the citation graph for the configured court and report its size.
G = load_citation_network_igraph(data_dir, court_name)
print('loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es)))

0 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges


In [6]:
# Read the tidy edge table, using the first CSV column as the index.
# NOTE(review): the source is a URL on localhost:8888 (presumably the local
# Jupyter server), so this cell only works while something is serving that
# address -- consider reading the CSV from a path on disk instead.
df = pd.read_csv('http://localhost:8888/notebooks/law-net/explore/James/1900_scotus_tidy.csv', index_col=0)

In [7]:
# Inspect the loaded tidy dataframe.
print(df)

          edge  citing_index  cited_index  citing_year  cited_year
0            0           467          469         1760        1759
1            0           470          469         1760        1759
2            0           471          470         1763        1760
3            0           471          467         1763        1760
4            0           471          469         1763        1759
5            0           271          471         1764        1763
6            0           271          470         1764        1760
7            0           271          467         1764        1760
8            0           271          469         1764        1759
9            0           472          471         1764        1763
10           0           472          470         1764        1760
11           0           472          467         1764        1760
12           0           472          469         1764        1759
13           0           473          471         1764        

# Manipulate Tidy Dataframe

__Make Subset Dataframe__

In [8]:
#create a subset from tidy df
subset_df = make_random_subset_df(df)
print subset_df

This took 8.6740000248 seconds
       edge  citing_index  cited_index  citing_year  cited_year
0         1           969          964         1817        1816
1         1           980          958         1817        1816
2         1           994          955         1817        1816
3         1            57          978         1818        1817
4         1          1004          955         1818        1816
5         1          1015          961         1818        1816
6         1          1020          966         1818        1817
7         1          1029          992         1818        1817
8         1          1033          955         1818        1816
9         1          1035          948         1819        1816
10        1          1039          161         1819        1816
11        1          1044         1003         1819        1817
12        1          1048          964         1819        1816
13        1          1051          949         1819        1816
14       

__Add Indegree Column__ 

In [9]:
# add_cited_indegree assigns the new column on the frame it is given and
# returns that same frame, so subset_df_indegree and subset_df refer to one
# object (subset_df gains the 'cited_indegree' column too).
subset_df_indegree = add_cited_indegree(subset_df, G)

This took 0.604000091553 seconds


In [10]:
# Inspect the subset after the in-degree column was added.
print(subset_df_indegree)

       edge  citing_index  cited_index  citing_year  cited_year  \
0         1           969          964         1817        1816   
1         1           980          958         1817        1816   
2         1           994          955         1817        1816   
3         1            57          978         1818        1817   
4         1          1004          955         1818        1816   
5         1          1015          961         1818        1816   
6         1          1020          966         1818        1817   
7         1          1029          992         1818        1817   
8         1          1033          955         1818        1816   
9         1          1035          948         1819        1816   
10        1          1039          161         1819        1816   
11        1          1044         1003         1819        1817   
12        1          1048          964         1819        1816   
13        1          1051          949         1819        181

__Add Pagerank Column__

In [13]:
# add_cited_pagerank assigns the 'pagerank' column on the frame it is given
# and returns that same frame, so subset_df_indegree_pagerank is another name
# for subset_df_indegree rather than a separate copy.
subset_df_indegree_pagerank = add_cited_pagerank(subset_df_indegree, G)

This took 0.883000135422 seconds


In [14]:
# Inspect the subset after the pagerank column was added.
print(subset_df_indegree_pagerank)

       edge  citing_index  cited_index  citing_year  cited_year  \
0         1           969          964         1817        1816   
1         1           980          958         1817        1816   
2         1           994          955         1817        1816   
3         1            57          978         1818        1817   
4         1          1004          955         1818        1816   
5         1          1015          961         1818        1816   
6         1          1020          966         1818        1817   
7         1          1029          992         1818        1817   
8         1          1033          955         1818        1816   
9         1          1035          948         1819        1816   
10        1          1039          161         1819        1816   
11        1          1044         1003         1819        1817   
12        1          1048          964         1819        1816   
13        1          1051          949         1819        181