# Summarize Modularity Clusters

In [31]:
# Machine-specific locations. Either edit the defaults below or, preferably,
# set the LAW_NET_REPO_DIR / LAW_NET_DATA_DIR environment variables so the
# notebook does not have to be modified on every computer.
import os

# os.path.join(path, '') guarantees a trailing separator, which the string
# concatenation used to build sub-directory paths further down relies on
repo_directory = os.path.join(
    os.environ.get('LAW_NET_REPO_DIR', '/Users/Michael/Documents/GitHub/law-net/'), '')

data_dir = os.path.join(
    os.environ.get('LAW_NET_DATA_DIR', '/Users/Michael/Desktop/network_data/'), '')

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle as pickle
from collections import OrderedDict


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import scipy.sparse
import random
import itertools


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used (all built by concatenation, so each one ends with '/')
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' #tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' #bag-of-words matrix (and other info, i.e. vocab)

# all the file paths for .txt files
# glob returns matches in arbitrary (filesystem-dependent) order; sorting keeps
# the list, and everything derived from it, reproducible between runs and machines
file_paths = sorted(glob.glob(text_dir + '*.txt'))

# all opinions
# NOTE(review): all_opinions is project code pulled in by a star import; presumably it
# derives the opinion ids from the text file names -- confirm against its definition
all_the_opinions = all_opinions(file_paths)


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## load tf-idf vectors
**tfidf_matrix** = (row_index, column_index): tf_idf value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in tfidf_matrix (correspond to column indices)

In [None]:
# Load the pre-computed tf-idf data stored under nlp_sub_dir.
# NOTE(review): load_tf_idf is not defined in this notebook (presumably it comes from one of the
# star imports); the meaning of the three returned values is taken from the markdown above -- confirm
tfidf_matrix, op_id_to_bow_id, vocab = load_tf_idf(nlp_sub_dir)

# Clustering Work:
focus on largest connected component of **undirected scotus**

In [3]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

# limit ourselves to cases up to and including 2015 since we are missing some textfiles from 2016
G = G.subgraph(G.vs.select(year_le=2015))

# make graph undirected
# as_undirected() returns a new graph and leaves G untouched, so no explicit copy() is needed first
Gud = G.as_undirected()

# get largest connected component
components = Gud.clusters(mode='STRONG')
# giant() builds only the largest component (first one on ties, same as np.argmax)
# instead of materialising a subgraph for every component just to keep one of them
g = components.giant()

# CL ids of cases in largest connected component
CLids = g.vs['name']

## modularity on undirected scotus
"For a given division of the network's vertices into some modules, modularity reflects the concentration of edges within modules compared with random distribution of links between all nodes regardless of modules"--*Wikipedia*

In [4]:
%%time 

# modularity clustering
cd_modularity = g.community_fastgreedy() # .as_clustering().membership

mod_clust = cd_modularity.as_clustering()

print mod_clust.summary()

# save clusters in pandas
graph_clusters = pd.Series(mod_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 126 clusters
Wall time: 1min 36s


## get the top 5 biggest clusters

In [23]:
# Select the 5 largest modularity clusters.
# NOTE(review): get_top_n_clusters is project code not shown here; judging by how the results are
# used in the cells below, the first value is indexed by cluster id and the second is an iterable
# of the ids of the biggest clusters -- confirm against its definition
dict_top_n_clusters, biggest_n_clusters = get_top_n_clusters(5, len(mod_clust), graph_clusters)

sizes of top 5 biggest clusters:

cluster 2 : 9273 opinions
cluster 0 : 6870 opinions
cluster 1 : 6234 opinions
cluster 3 : 1458 opinions
cluster 15 : 76 opinions


# Summarize Cluster Function 1: "top_k_words"
This function summarizes a set of opinions by returning the words that appear in these opinions with the highest tf-idf scores.

In [56]:
%%time

k=20 # number of words to get

print "top", k, "words for..."
print ''
for i in biggest_n_clusters:
    top_words = top_k_words(dict_top_n_clusters[i], k, tfidf_matrix, op_id_to_bow_id, vocab)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ":", [x.encode('utf-8') for x in top_words]

top 20 words for...

[1;31mcluster 2[0m: ['squier', 'reinsur', 'scophoni', 'reinsur', 'graeff', 'arbitr', 'arbitr', 'sugg', 'passport', 'arbitr', 'wenzel', 'vetterlein', 'meserv', 'tray', 'arbitr', 'softlit', 'metsker', 'artwar', 'vichi', 'huntley']
[1;31mcluster 0[0m: ['dispensari', 'pension', 'fslic', 'wass', 'bicknel', 'milk', 'jumel', 'boom', 'merriam', 'ch', 'cheroke', 'cheroke', 'tea', 'vs', 'pilkey', 'cheroke', 'menard', 'airco', 'vs', 'aaf']
[1;31mcluster 1[0m: ['baal', 'stumpf', 'lagrand', 'ree', 'lesag', 'bail', 'kaupp', 'ashcraft', 'penri', 'mazzei', 'juvenil', 'copyright', 'deport', 'partin', 'maloney', 'anastaplo', 'church', 'flag', 'fior', 'vaccin']
[1;31mcluster 3[0m: ['shaeffer', 'wool', 'carusi', 'toy', 'seed', 'jen', 'paper', 'renfrow', 'pearl', 'cork', 'tile', 'cadet', 'pardon', 'postmast', 'hilsman', 'ore', 'collector', 'nail', 'hoppl', 'frerich']
[1;31mcluster 15[0m: ['flaglor', 'dippold', 'nailor', 'goff', 'turpin', 'reeder', 'bilsland', 'shappirio', 'ra

# Summarize Cluster Function 2: "top_k_words_from_mean_vector"
compute the mean tf-idf vector of the cluster, return the top K words from this mean vector

In [55]:
%%time

k=20 # number of words to get

print "top", k, "words (from mean vector) for..."
print ''
for i in biggest_n_clusters:
    top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[i], k, tfidf_matrix, op_id_to_bow_id, vocab)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ":", [x.encode('utf-8') for x in top_words_from_mean]

 top 20 words (from mean vector) for...

[1;31mcluster 2[0m: ['court', 'state', 'v', 'case', 'compani', 'plaintiff', 'defend', 'act', 'upon', 'law', 'said', 'co', 'jurisdict', 'u', 'judgment', '\xc2\xa7', 'unit', 'decre', 'suit', 'contract']
[1;31mcluster 0[0m: ['state', 'tax', 'court', 'land', 'v', 'act', 'compani', 'upon', 'case', 'unit', 'l', 'u', 'law', 'properti', 'ct', 'railroad', 'commiss', 'said', 'ed', 'co']
[1;31mcluster 1[0m: ['court', 'state', 'v', 'sct', 'us', 'led2d', 'case', 'petition', '\xc2\xa7', 'unit', 'constitut', 'law', 'see', 'feder', 'trial', 'act', 'amend', 'would', 'juri', 'right']
[1;31mcluster 3[0m: ['court', 'state', 'act', 'unit', 'case', 'v', 'upon', 'said', 'contract', 'offic', 'law', 'made', 'shall', 'defend', 'plaintiff', 'error', 'duti', 'u', 'section', 'claim']
[1;31mcluster 15[0m: ['deed', 'court', 'properti', 'wife', 'convey', 'husband', 'estat', 'said', 'land', 'upon', 'case', 'v', 'titl', 'decre', 'bill', 'defend', 'made', 'plaintiff', '

# Summarize Cluster Function 3: "top_k_words_from_difference"
compute the mean tf-idf vector of the cluster and also of the complement of the cluster,  
take the difference mu_cluster - mu_complement, return the top K words in this difference

In [57]:
%%time

k=20 # number of words to get

print "top", k, "words (from mean difference) for..."
print ''
for i in biggest_n_clusters:
    top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[i], all_the_opinions, 
                                                      k, tfidf_matrix, op_id_to_bow_id, vocab)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ":", [x.encode('utf-8') for x in top_words_from_diff]

top 20 words (from mean difference) for...

[1;31mcluster 2[0m: ['plaintiff', 'bankruptci', 'court', 'suit', 'patent', 'jurisdict', 'compani', 'creditor', 'decre', 'defend', 'bank', 'circuit', 'co', 'neglig', 'employe', 'action', 'bond', 'bankrupt', 'mortgag', 'parti']
[1;31mcluster 0[0m: ['tax', 'land', 'state', 'compani', 'commiss', 'indian', 'railroad', 'rate', 'l', 'ct', 'incom', 'interst', 'commerc', 'ed', 'corpor', 'properti', 'act', 'carrier', 'taxat', 'water']
[1;31mcluster 1[0m: ['led2d', 'sct', 'us', 'petition', 'v', 'convict', 'constitut', 'sentenc', 'state', 'crimin', 'see', 'amend', 'trial', 'feder', 'juri', '\xc2\xa7', 'search', 'habea', 'court', 'school']
[1;31mcluster 3[0m: ['indict', 'offic', 'collector', 'duti', 'unit', 'contract', 'navi', 'claimant', 'treasuri', 'act', 'servic', 'depart', 'articl', 'cent', 'shall', 'apprais', 'govern', 'section', 'charg', 'made']
[1;31mcluster 15[0m: ['deed', 'wife', 'convey', 'husband', 'estat', 'properti', 'titl', 'lot', 

# Summarize Cluster Function 4: "document_closest_to_mean"
compute the mean tf-idf vector, return the document in the cluster closest to the mean  

In [51]:
%%time

print "most relevant document for..."
print ''
for i in biggest_n_clusters:
    most_relev_op = document_closest_to_mean(dict_top_n_clusters[i], tfidf_matrix, op_id_to_bow_id)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ": opinion " + most_relev_op

most relevant document for...

[1;31mcluster 2[0m: opinion 89905
[1;31mcluster 0[0m: opinion 95354
[1;31mcluster 1[0m: opinion 104135
[1;31mcluster 3[0m: opinion 86062
[1;31mcluster 15[0m: opinion 87645
Wall time: 1min 57s
