In [42]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

import random as random

from collections import *

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2

data_dir = '../../data/'
court_name = 'scotus'

from __future__ import division

pd.set_option('display.notebook_repr_html', False)

import glob

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [67]:
def _snapshot_rows_for_edges(edges, label, years, names, year_interval, vertex_metric_dict):
    """
    Builds one row tuple per edge: (label, <metrics of the cited vertex>).

    Parameters
    ----------
    edges: iterable of (citing_index, cited_index) vertex index pairs

    label: value stored in the first position of every row (1 = edge, 0 = non-edge)

    years: list with the 'year' attribute of every vertex, ordered by vertex index

    names: list with the 'name' attribute of every vertex, ordered by vertex index

    year_interval: the year interval between each vertex metric .csv file

    vertex_metric_dict: dict mapping snapshot file name -> vertex metric data frame

    Output
    --------
    List of tuples, one per edge

    Raises
    --------
    KeyError if the snapshot file needed for an edge was not loaded
    """
    rows = []
    for citing_index, cited_index in edges:
        citing_year = years[citing_index]

        # Round up to the next multiple of year_interval. A citing year that is
        # already a multiple maps to the FOLLOWING snapshot (1920 -> 1930 when
        # year_interval is 10).
        # NOTE(review): confirm that the boundary case is intended.
        snapshot_year = citing_year + (year_interval - citing_year % year_interval)

        snapshot_key = 'vertex_metrics_' + str(snapshot_year) + '.csv'
        if snapshot_key not in vertex_metric_dict:
            raise KeyError('vertex metrics snapshot %r was not found' % snapshot_key)
        vertex_df = vertex_metric_dict[snapshot_key]

        # metrics of the cited case, looked up by its name in the snapshot index
        row = vertex_df.loc[names[cited_index]].values.tolist()
        rows.append((label,) + tuple(row))
    return rows


def get_snapshot_edge_metrics(G, path_to_vertex_metrics_folder, year_interval, num_non_edges_to_add):
    """
    Creates the edge data frame. Rows are edges. Columns are the metrics of cited case
    taken from the snapshot selected by the citing case's year.

    Includes all edges that exist in the network (column 'edge' = 1) and a random
    sample of nonexistent edges (column 'edge' = 0).

    Parameters
    ----------
    G: network; its vertices must carry 'year' and 'name' attributes

    path_to_vertex_metrics_folder: folder holding the vertex_metrics*.csv files

    year_interval: the year interval between each vertex metric .csv file

    num_non_edges_to_add: how many nonexistent edges to add

    Output
    --------
    Saves edge data frame as 'edge_data.csv' in the current working directory

    Raises
    --------
    KeyError if an edge needs a snapshot file that is not in the folder
    """
    # Load all the vertex metric dataframes once. They are keyed by bare file
    # name, so look-ups do not depend on the platform's path separator.
    snapshot_pattern = os.path.join(path_to_vertex_metrics_folder, 'vertex_metrics*.csv')
    vertex_metric_dict = {}
    for snapshot_path in glob.glob(snapshot_pattern):
        vertex_metric_dict[os.path.basename(snapshot_path)] = pd.read_csv(snapshot_path, index_col=0)

    # read the vertex attributes once instead of once per edge
    years = G.vs['year']
    names = G.vs['name']

    # rows for every present edge ...
    rows = _snapshot_rows_for_edges(G.get_edgelist(), 1, years, names,
                                    year_interval, vertex_metric_dict)

    # ... followed by rows for a sample of non-present edges
    non_edges_to_add = sample_non_edges(G, year_interval, num_non_edges_to_add)
    rows += _snapshot_rows_for_edges(non_edges_to_add, 0, years, names,
                                     year_interval, vertex_metric_dict)

    # Column names come from any loaded snapshot (all df should have same column
    # names). Not relying on a loop variable keeps this valid when there are no rows.
    column_names = ['edge']
    for snapshot_df in vertex_metric_dict.values():
        column_names += list(snapshot_df.columns.values)
        break
    df = pd.DataFrame(rows, columns=column_names)

    # columns of df are the vertex metrics of cited case in the selected snapshot
    df.to_csv('edge_data.csv')

In [68]:
def sample_non_edges(G, year_interval, num_non_edges_to_add):
    '''
    Samples a number of nonexistent edges from the network G

    A sampled pair (citing, cited) is kept only if it is not an edge of G and
    the citing vertex's year is greater than or equal to the cited vertex's year.

    Parameters
    ----------
    G: network; its vertices must carry a 'year' attribute

    year_interval: the year interval between each vertex metric .csv file
                   (not used here; kept so existing callers keep working)

    num_non_edges_to_add: how many nonexistent edges to add

    Output
    --------
    List of non-present edges as (citing_index, cited_index) tuples, no duplicates

    Raises
    --------
    ValueError if more non-edges are requested than the network can supply
    '''
    years = G.vs['year']  # read once; indexed by vertex index below
    edges = set(G.get_edgelist())  # when searching for element, set is faster than list

    # Count the eligible pairs up front. Without this check the rejection
    # sampling loop below would spin forever on an impossible request.
    # n_at_most[y] ends up as the number of vertices whose year is <= y.
    n_at_most = {}
    for position, year in enumerate(sorted(years), 1):
        n_at_most[year] = position
    n_ordered_pairs = sum(n_at_most[year] - 1 for year in years)
    n_existing = sum(1 for (source, target) in edges
                     if source != target and years[source] >= years[target])
    n_available = n_ordered_pairs - n_existing
    if num_non_edges_to_add > n_available:
        raise ValueError('requested %d non-edges but only %d exist'
                         % (num_non_edges_to_add, n_available))

    # Sample vertex indices from a sequence: random.sample() on a set is
    # rejected by Python 3.11+, and no Vertex objects have to be built.
    vertex_ids = range(len(years))

    non_edge_set = set()  # set drops duplicates returned by the random sampling
    while len(non_edge_set) < num_non_edges_to_add:
        # two distinct vertices (sampling is without replacement)
        citing_index, cited_index = random.sample(vertex_ids, 2)

        # the citing case may not be older than the case it cites
        if years[citing_index] < years[cited_index]:
            continue

        random_edge = (citing_index, cited_index)
        if random_edge in edges:
            continue

        non_edge_set.add(random_edge)

    return list(non_edge_set)

Testing the functions defined above

In [69]:
# This helper is not required; it only keeps the executed cells concise.
def load_scotus_graph():
    """
    Loads the citation network for `court_name` from `data_dir` and deletes
    every edge whose citing vertex has an earlier year than its cited vertex.

    Output
    --------
    The network with those edges removed
    """
    graph = load_citation_network_igraph(data_dir, court_name)

    def _year_of(vertex_index):
        # 'year' attribute of a single vertex
        return graph.vs(vertex_index)['year'][0]

    # edges that point forward in time (citing case older than cited case)
    anachronistic_edges = [
        (citing, cited)
        for citing, cited in graph.get_edgelist()
        if _year_of(citing) < _year_of(cited)
    ]

    graph.delete_edges(anachronistic_edges)
    return graph

In [70]:
# Inputs for get_snapshot_edge_metrics.
G = load_scotus_graph()
# NOTE(review): absolute, machine-specific Windows path; presumably the folder
# that holds the vertex_metrics_<year>.csv files. Consider a path relative to
# the notebook so the cell runs on other machines.
path_to_vertex_metrics_folder = 'C:\\Research\\law-net\\explore\\James'
year_interval = 10
# request as many sampled non-edges as there are edges in G
num_non_edges_to_add = len(G.get_edgelist())

0 seconds for 250465 edges


In [71]:
# Build the edge table; the function writes it to 'edge_data.csv'.
get_snapshot_edge_metrics(G, path_to_vertex_metrics_folder, year_interval, num_non_edges_to_add)
# Parenthesised form prints the same text on Python 2 and Python 3.
print('done')

done


In [72]:
# Read back the file written by get_snapshot_edge_metrics, using the first CSV column as the index.
pd.read_csv('edge_data.csv', index_col=0)

        edge    year  indegree  pagerank
0          1  1921.0       7.0  0.000060
1          1  1921.0       4.0  0.000037
2          1  1921.0       3.0  0.000035
3          1  1917.0       4.0  0.000050
4          1  1895.0      34.0  0.000343
5          1  1918.0       6.0  0.000062
6          1  1877.0      43.0  0.000372
7          1  1886.0      18.0  0.000117
8          1  1888.0      28.0  0.000168
9          1  1891.0      33.0  0.000197
10         1  1896.0      15.0  0.000077
11         1  1899.0      27.0  0.000187
12         1  1904.0      22.0  0.000174
13         1  1912.0       1.0  0.000027
14         1  1912.0       5.0  0.000045
15         1  1912.0       8.0  0.000052
16         1  1920.0       1.0  0.000027
17         1  1922.0       3.0  0.000046
18         1  1921.0       9.0  0.000082
19         1  1921.0       7.0  0.000075
20         1  1890.0       3.0  0.000035
21         1  1895.0       7.0  0.000087
22         1  1898.0       6.0  0.000070
23         1  19