In [2]:
import sys
sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import networkx as nx

from load_data import load_citation_network, case_info

import operator

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

### Load SCOTUS Network (Directed)

In [3]:
# this will be a little slow the first time you run it
G = load_citation_network(data_dir, court_name)
print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.nodes()), len(G.edges()))

loaded scotus network with 33248 cases and 250465 edges


### Convert SCOTUS Network into Undirected Form

In [6]:
G2 = G.to_undirected()
print 'loaded %s network (undirected) with %d cases and %d edges' % (court_name, len(G2.nodes()), len(G2.edges()))

loaded scotus network (undirected) with 33248 cases and 249878 edges


In [11]:
print 'number of missing edges', 250465-249878

number of missing edges 587


### What are the missing edges:

In [14]:
from collections import Counter
original_edges = Counter(frozenset((u,v)) for (u,v) in G.edges())
undirected_edges = Counter(frozenset((u,v)) for (u,v) in G2.edges())
missing_edges = original_edges - undirected_edges
counter = 0
for ((u,v), count) in missing_edges.items():
    if count != 0:
        print('Missing edge: {} - {}'.format(u, v))
    counter += 1

print 'number of missing edges', counter

Missing edge: 1521208 - 98215
Missing edge: 103985 - 104013
Missing edge: 1745828 - 94614
Missing edge: 2527082 - 85186
Missing edge: 1490516 - 1490284
Missing edge: 1882017 - 1880474
Missing edge: 98653 - 2620695
Missing edge: 103073 - 103074
Missing edge: 92841 - 2465894
Missing edge: 1087609 - 104169
Missing edge: 102572 - 102573
Missing edge: 2620816 - 2620815
Missing edge: 2503240 - 92308
Missing edge: 1087640 - 107141
Missing edge: 1335001 - 107757
Missing edge: 84884 - 2620975
Missing edge: 2540239 - 2540047
Missing edge: 1087650 - 1087635
Missing edge: 85252 - 85253
Missing edge: 1491546 - 1490404
Missing edge: 1860043 - 1860647
Missing edge: 130139 - 130140
Missing edge: 1434315 - 92773
Missing edge: 139329 - 139330
Missing edge: 107070 - 2510431
Missing edge: 93971 - 93972
Missing edge: 93426 - 2620739
Missing edge: 1246952 - 90010
Missing edge: 2533674 - 2533575
Missing edge: 1382048 - 1380068
Missing edge: 101592 - 1333165
Missing edge: 2000099 - 1998276
Missing edge: 14570

### Top 10 In-Degree for SCOTUS (Undirected) -- SHOULDN'T EXIST!

In [10]:
ind = sorted(G.in_degree().iteritems(), key=operator.itemgetter(1), reverse=True)[:10]
ind_cases = [x[0] for x in ind]
ind_values = [x[1] for x in ind]
ind_dates = [G.node[each_case]['date'] for each_case in ind_cases]

data = {'Case ID': ind_cases,
        'In-Degree': ind_values,
        'Date': ind_dates
       }
df = pd.DataFrame(data, columns = ['In-Degree', 'Case ID', 'Date'],
                 index=[1,2,3,4,5,6,7,8,9,10])
df

AttributeError: 'Graph' object has no attribute 'in_degree'

In [11]:
page_rank_dict = nx.pagerank(G)

### Top 10 Page Rank for SCOTUS (Undirected)

In [13]:
prd = sorted(page_rank_dict.iteritems(), key=operator.itemgetter(1), reverse=True)[:10]
prd_cases = [x[0] for x in prd]
prd_values = [x[1] for x in prd]
prd_dates = [G.node[each_case]['date'] for each_case in prd_cases]

data = {'Case ID': prd_cases,
        'Page Rank': prd_values,
        'Date': prd_dates
       }
df = pd.DataFrame(data, columns = ['Page Rank', 'Case ID', 'Date'],
                 index=[1,2,3,4,5,6,7,8,9,10])
df

Unnamed: 0,Page Rank,Case ID,Date
1,0.00094,96405,1906-02-19
2,0.000767,112790,1992-11-02
3,0.000584,109532,1976-07-22
4,0.000474,106545,1963-03-18
5,0.00046,106366,1962-03-26
6,0.000413,107252,1966-06-13
7,0.000411,103012,1938-04-25
8,0.000407,101864,1932-02-23
9,0.00039,102605,1936-02-17
10,0.000385,106548,1963-03-18
