# Summarize Modularity Clusters

In [72]:
# modify these for your own computer
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle as pickle
from collections import OrderedDict


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import scipy.sparse
import random
import itertools


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# network analysed in this notebook
network_name = 'scotus'  # other options: 'federal', 'ca1', etc

# directory layout under data_dir; every path keeps its trailing '/'
# because the paths below are built by plain string concatenation
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
nlp_dir = subnet_dir + 'nlp/'
text_dir = subnet_dir + 'textfiles/'
nlp_bow_dir = nlp_dir + 'bow/'  # bag-of-words matrix (and other info, i.e. vocab)
nlp_sub_dir = nlp_dir + 'bow_tfidf/'  # tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix

# every opinion text file of the chosen network
text_file_pattern = text_dir + '*.txt'
file_paths = glob.glob(text_file_pattern)

# all opinions
# NOTE(review): all_opinions comes from one of the star imports above;
# presumably it derives the opinion ids from the file names -- confirm.
all_the_opinions = all_opinions(file_paths)


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## load tf-idf vectors
**tfidf_matrix** = (row_index, column_index): tf_idf value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in tfidf_matrix (correspond to column indices)

In [None]:
# Load the three tf-idf artefacts described in the markdown cell above from
# nlp_sub_dir (the 'bow_tfidf/' directory). load_tf_idf is not imported by name,
# so it must come from one of the star imports in the setup cell.
# NOTE(review): this cell shows "In [None]" in the saved notebook even though
# every later cell uses these three names -- re-run it after a kernel restart.
tfidf_matrix, op_id_to_bow_id, vocab = load_tf_idf(nlp_sub_dir)

# Clustering Work:
We focus on the largest connected component of the **undirected scotus** network.

In [3]:
# load the citation graph of the selected network
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

# limit ourselves to cases up to and including 2015 since we are missing some textfiles from 2016
G = G.subgraph(G.vs.select(year_le=2015))

# make graph undirected
# as_undirected() already returns an undirected *copy*, so G stays directed
# and an explicit G.copy() beforehand is unnecessary
Gud = G.as_undirected()

# get largest connected component
# (strong and weak components coincide on an undirected graph)
components = Gud.clusters(mode='STRONG')

# giant() builds only the largest component instead of materialising every
# component subgraph and then indexing with argmax; on ties it returns the
# first largest component, which is also what np.argmax selected
g = components.giant()

# CL ids of cases in largest connected component
CLids = g.vs['name']

## modularity on undirected scotus
"For a given division of the network's vertices into some modules, modularity reflects the concentration of edges within modules compared with random distribution of links between all nodes regardless of modules"--*Wikipedia*

In [4]:
%%time 

# modularity clustering
cd_modularity = g.community_fastgreedy() # .as_clustering().membership

mod_clust = cd_modularity.as_clustering()

print mod_clust.summary()

# save clusters in pandas
graph_clusters = pd.Series(mod_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 126 clusters
Wall time: 1min 36s


## get the top 5 biggest clusters

In [73]:
# how many of the largest clusters to summarise below
n_biggest = 5
# total number of clusters found by the modularity step
n_clusters_total = len(mod_clust)

# per the saved output, the helper also prints the size of each selected cluster
dict_top_n_clusters, biggest_n_clusters = get_top_n_clusters(n_biggest, n_clusters_total, graph_clusters)

cluster 2 : 9273 opinions
cluster 0 : 6870 opinions
cluster 1 : 6234 opinions
cluster 3 : 1458 opinions
cluster 15 : 76 opinions


# Top K Words of Each Cluster
This function summarizes a set of opinions by returning the words that appear in these opinions with the highest tf-idf scores.

In [63]:
%%time

k=10 # number of words to get

for i in biggest_n_clusters:
    top_words = top_k_words(dict_top_n_clusters[i], k, tfidf_matrix, op_id_to_bow_id, vocab)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ":", [x.encode('utf-8') for x in top_words]

[1;31mcluster 2[0m: ['squier', 'reinsur', 'scophoni', 'reinsur', 'graeff', 'arbitr', 'arbitr', 'sugg', 'passport', 'arbitr']
[1;31mcluster 0[0m: ['dispensari', 'pension', 'fslic', 'wass', 'bicknel', 'milk', 'jumel', 'boom', 'merriam', 'ch']
[1;31mcluster 1[0m: ['baal', 'stumpf', 'lagrand', 'ree', 'lesag', 'bail', 'kaupp', 'ashcraft', 'penri', 'mazzei']
[1;31mcluster 3[0m: ['shaeffer', 'wool', 'carusi', 'toy', 'seed', 'jen', 'paper', 'renfrow', 'pearl', 'cork']
[1;31mcluster 15[0m: ['flaglor', 'dippold', 'nailor', 'goff', 'turpin', 'reeder', 'bilsland', 'shappirio', 'randel', 'forsyth']
Wall time: 1min 37s


# Top K Words ($\mu_{cluster}$) of Each Cluster
compute the mean tf-idf vector of the cluster, return the top K words from this mean vector

In [64]:
%%time

k=10 # number of words to get

for i in biggest_n_clusters:
    top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[i], k, tfidf_matrix, op_id_to_bow_id, vocab)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ":", [x.encode('utf-8') for x in top_words_from_mean]

[1;31mcluster 2[0m: ['court', 'state', 'v', 'case', 'compani', 'plaintiff', 'defend', 'act', 'upon', 'law']
[1;31mcluster 0[0m: ['state', 'tax', 'court', 'land', 'v', 'act', 'compani', 'upon', 'case', 'unit']
[1;31mcluster 1[0m: ['court', 'state', 'v', 'sct', 'us', 'led2d', 'case', 'petition', '\xc2\xa7', 'unit']
[1;31mcluster 3[0m: ['court', 'state', 'act', 'unit', 'case', 'v', 'upon', 'said', 'contract', 'offic']
[1;31mcluster 15[0m: ['deed', 'court', 'properti', 'wife', 'convey', 'husband', 'estat', 'said', 'land', 'upon']
Wall time: 4.54 s


# Top K Words ($\mu_{cluster} - \mu_{complement}$ ) of Each Cluster
compute the mean tf-idf vector of the cluster and also of the complement of the cluster,  
take the difference mu_cluster - mu_complement, return the top K words in this difference

In [65]:
%%time

k=10 # number of words to get

for i in biggest_n_clusters:
    top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[i], all_the_opinions, 
                                                      k, tfidf_matrix, op_id_to_bow_id, vocab)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ":", [x.encode('utf-8') for x in top_words_from_diff]

[1;31mcluster 2[0m: ['plaintiff', 'bankruptci', 'court', 'suit', 'patent', 'jurisdict', 'compani', 'creditor', 'decre', 'defend']
[1;31mcluster 0[0m: ['tax', 'land', 'state', 'compani', 'commiss', 'indian', 'railroad', 'rate', 'l', 'ct']
[1;31mcluster 1[0m: ['led2d', 'sct', 'us', 'petition', 'v', 'convict', 'constitut', 'sentenc', 'state', 'crimin']
[1;31mcluster 3[0m: ['indict', 'offic', 'collector', 'duti', 'unit', 'contract', 'navi', 'claimant', 'treasuri', 'act']
[1;31mcluster 15[0m: ['deed', 'wife', 'convey', 'husband', 'estat', 'properti', 'titl', 'lot', 'said', 'complain']
Wall time: 26.1 s


# Most Relevant Opinion of Each Cluster
compute the mean tf-idf vector of the cluster, return the document in the cluster closest to that mean  

In [66]:
%%time

for i in biggest_n_clusters:
    most_relev_op = document_closest_to_mean(dict_top_n_clusters[i], tfidf_matrix, op_id_to_bow_id)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ": opinion " + most_relev_op

[1;31mcluster 2[0m: opinion 89905
[1;31mcluster 0[0m: opinion 95354
[1;31mcluster 1[0m: opinion 104135
[1;31mcluster 3[0m: opinion 86062
[1;31mcluster 15[0m: opinion 87645
Wall time: 1min 59s


# Cluster 2 Summary (9273 opinions)

In [71]:
%%time

k=10 # number of words to get

top_words = top_k_words(dict_top_n_clusters[2], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[2], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[2], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[2], tfidf_matrix, op_id_to_bow_id)


print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', [x.encode('utf-8') for x in top_words]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_mean]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_diff]
print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op


[1;31mTop K Words:[0m ['squier', 'reinsur', 'scophoni', 'reinsur', 'graeff', 'arbitr', 'arbitr', 'sugg', 'passport', 'arbitr']
[1;31mTop K Words (Mu_Cluster):[0m ['court', 'state', 'v', 'case', 'compani', 'plaintiff', 'defend', 'act', 'upon', 'law']
[1;31mTop K Words (Mu_Cluster - Mu_Complement):[0m ['plaintiff', 'bankruptci', 'court', 'suit', 'patent', 'jurisdict', 'compani', 'creditor', 'decre', 'defend']
[1;31mMost Relevent Opinion:[0m 89905
Wall time: 1min 41s


# Cluster 0 Summary (6870 opinions)

In [74]:
%%time

k=10 # number of words to get

top_words = top_k_words(dict_top_n_clusters[0], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[0], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[0], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[0], tfidf_matrix, op_id_to_bow_id)


print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', [x.encode('utf-8') for x in top_words]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_mean]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_diff]
print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op


[1;31mTop K Words:[0m ['dispensari', 'pension', 'fslic', 'wass', 'bicknel', 'milk', 'jumel', 'boom', 'merriam', 'ch']
[1;31mTop K Words (Mu_Cluster):[0m ['state', 'tax', 'court', 'land', 'v', 'act', 'compani', 'upon', 'case', 'unit']
[1;31mTop K Words (Mu_Cluster - Mu_Complement):[0m ['tax', 'land', 'state', 'compani', 'commiss', 'indian', 'railroad', 'rate', 'l', 'ct']
[1;31mMost Relevent Opinion:[0m 95354
Wall time: 1min 1s


# Cluster 1 Summary (6234 opinions)

In [75]:
%%time

k=10 # number of words to get

top_words = top_k_words(dict_top_n_clusters[1], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[1], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[1], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[1], tfidf_matrix, op_id_to_bow_id)


print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', [x.encode('utf-8') for x in top_words]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_mean]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_diff]
print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op


[1;31mTop K Words:[0m ['baal', 'stumpf', 'lagrand', 'ree', 'lesag', 'bail', 'kaupp', 'ashcraft', 'penri', 'mazzei']
[1;31mTop K Words (Mu_Cluster):[0m ['court', 'state', 'v', 'sct', 'us', 'led2d', 'case', 'petition', '\xc2\xa7', 'unit']
[1;31mTop K Words (Mu_Cluster - Mu_Complement):[0m ['led2d', 'sct', 'us', 'petition', 'v', 'convict', 'constitut', 'sentenc', 'state', 'crimin']
[1;31mMost Relevent Opinion:[0m 104135
Wall time: 1min 15s


# Cluster 3 Summary (1458 opinions)

In [76]:
%%time

k=10 # number of words to get

top_words = top_k_words(dict_top_n_clusters[3], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[3], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[3], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[3], tfidf_matrix, op_id_to_bow_id)


print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', [x.encode('utf-8') for x in top_words]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_mean]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_diff]
print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op


[1;31mTop K Words:[0m ['shaeffer', 'wool', 'carusi', 'toy', 'seed', 'jen', 'paper', 'renfrow', 'pearl', 'cork']
[1;31mTop K Words (Mu_Cluster):[0m ['court', 'state', 'act', 'unit', 'case', 'v', 'upon', 'said', 'contract', 'offic']
[1;31mTop K Words (Mu_Cluster - Mu_Complement):[0m ['indict', 'offic', 'collector', 'duti', 'unit', 'contract', 'navi', 'claimant', 'treasuri', 'act']
[1;31mMost Relevent Opinion:[0m 86062
Wall time: 14.9 s


# Cluster 15 Summary (76 opinions)

In [77]:
%%time

k=10 # number of words to get

top_words = top_k_words(dict_top_n_clusters[15], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[15], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[15], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[15], tfidf_matrix, op_id_to_bow_id)


print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', [x.encode('utf-8') for x in top_words]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_mean]
print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', [x.encode('utf-8') for x in top_words_from_diff]
print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op


[1;31mTop K Words:[0m ['flaglor', 'dippold', 'nailor', 'goff', 'turpin', 'reeder', 'bilsland', 'shappirio', 'randel', 'forsyth']
[1;31mTop K Words (Mu_Cluster):[0m ['deed', 'court', 'properti', 'wife', 'convey', 'husband', 'estat', 'said', 'land', 'upon']
[1;31mTop K Words (Mu_Cluster - Mu_Complement):[0m ['deed', 'wife', 'convey', 'husband', 'estat', 'properti', 'titl', 'lot', 'said', 'complain']
[1;31mMost Relevent Opinion:[0m 87645
Wall time: 4.22 s
