# Summarize Modularity Clusters

In [1]:
# modify these for your own computer
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle  as pickle
from collections import OrderedDict
import json


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import mutual_info_score as mi
from sklearn.metrics import adjusted_rand_score as ar
from sklearn.metrics import calinski_harabaz_score as ch # (X, labels)
from sklearn.metrics import completeness_score as cs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import fowlkes_mallows_score as fm
from sklearn.metrics import homogeneity_completeness_v_measure as hcvm
from sklearn.metrics import homogeneity_score as hs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import silhouette_score as ss # (X, labels)
from sklearn.metrics import silhouette_samples as ss2 # (X, labels)
from sklearn.metrics import v_measure_score as vm

import scipy.sparse
import random
import itertools
from itertools import combinations


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *
from helpful_functions import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' #tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' #bag-of-words matrix (and other info, i.e. vocab)
nlp_df_sub_dir = nlp_dir + 'bow_tfidf_df/'

# csv location
csv_dir = "C:/Users/Michael/Documents/GitHub/law-net/csv/"
csv_dir_mod = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_modularity/"
csv_dir_mod_info = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_modularity/info/"
csv_dir_mod_summary = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_modularity/summary/"
csv_dir_walk = "C:/Users/Michael/Documents/GitHub/law-net/csv/summarize_walktrap/"

# all the file paths for .txt files
file_paths = glob.glob(text_dir + '*.txt')

# all opinions
all_the_opinions = all_opinions(file_paths)

# clusters directory
clusters_dir = "C:/Users/Michael/Desktop/network_data/raw/scotus/clusters/"

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

## load tf-idf vectors
**tfidf_matrix** = (row_index, column_index): tf_idf value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in tfidf_matrix (correspond to column indices)

In [2]:
tfidf_matrix, op_id_to_bow_id, vocab = load_tf_idf(nlp_sub_dir)

In [3]:
tfidf_matrix

<27885x567570 sparse matrix of type '<type 'numpy.float64'>'
	with 20817470 stored elements in Compressed Sparse Row format>

In [4]:
print type(tfidf_matrix)

<class 'scipy.sparse.csr.csr_matrix'>


# Clustering Work:
focus on largest connected component of **undirected scotus**

In [5]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

# limit ourselves to cases upto and including 2015 since we are missing some textfiles from 2016
G = G.subgraph(G.vs.select(year_le=2015))

# make graph undirected
Gud = G.copy()
Gud = Gud.as_undirected()

# get largest connected componenet
components = Gud.clusters(mode='STRONG')
g = components.subgraphs()[np.argmax(components.sizes())]

# CL ids of cases in largest connected component
CLids = g.vs['name']

## modularity on undirected scotus
"For a given division of the network's vertices into some modules, modularity reflects the concentration of edges within modules compared with random distribution of links between all nodes regardless of modules"--*Wikipedia*

In [6]:
%%time 

# modularity clustering
cd_modularity = g.community_fastgreedy() # .as_clustering().membership

mod_clust = cd_modularity.as_clustering()

print mod_clust.summary()

# save clusters in pandas
graph_clusters = pd.Series(mod_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 126 clusters
Wall time: 1min 29s


## get the top 5 biggest clusters

In [7]:
'''
dict_top_n_clusters = dictionary of top K clusters 
                      (key=cluster #, value=opinions in cluster)
                      
biggest_n_clusters = list of top K clusters (int)
'''

dict_top_n_clusters, biggest_n_clusters = get_top_n_clusters(len(mod_clust), len(mod_clust), graph_clusters)

cluster 2 : 9273 opinions
cluster 0 : 6870 opinions
cluster 1 : 6234 opinions
cluster 3 : 1458 opinions
cluster 15 : 76 opinions
cluster 7 : 75 opinions
cluster 8 : 69 opinions
cluster 4 : 52 opinions
cluster 31 : 50 opinions
cluster 17 : 36 opinions
cluster 48 : 20 opinions
cluster 36 : 18 opinions
cluster 13 : 16 opinions
cluster 40 : 16 opinions
cluster 23 : 15 opinions
cluster 51 : 15 opinions
cluster 54 : 14 opinions
cluster 16 : 13 opinions
cluster 45 : 12 opinions
cluster 42 : 11 opinions
cluster 10 : 10 opinions
cluster 28 : 10 opinions
cluster 19 : 9 opinions
cluster 47 : 8 opinions
cluster 65 : 8 opinions
cluster 6 : 7 opinions
cluster 26 : 7 opinions
cluster 43 : 7 opinions
cluster 46 : 7 opinions
cluster 79 : 7 opinions
cluster 82 : 7 opinions
cluster 94 : 7 opinions
cluster 24 : 6 opinions
cluster 49 : 6 opinions
cluster 21 : 5 opinions
cluster 29 : 5 opinions
cluster 41 : 5 opinions
cluster 44 : 5 opinions
cluster 50 : 5 opinions
cluster 55 : 5 opinions
cluster 58 : 5 opi

## print 'p' opinions in biggest cluster

In [16]:
cluster = 2
p = 100 # to print all opinions in the cluster, let p = len(dict_top_n_clusters[cluster])

cluster_opinions = dict_top_n_clusters[cluster]
print "printing", p, "out of", len(cluster_opinions), "opinions in cluster", cluster, "..."
print ''
print cluster_opinions[0:p]

printing 100 out of 9273 opinions in cluster 2 ...

['89374', '89375', '89376', '89378', '89379', '91138', '106304', '103549', '103543', '103542', '103547', '97818', '102819', '88397', '88391', '88390', '88398', '97159', '93172', '97157', '102812', '97155', '89377', '88023', '88022', '88021', '88024', '88029', '100014', '91070', '95798', '91132', '97029', '85513', '90312', '90313', '90314', '90316', '90317', '90319', '97027', '96133', '97025', '94542', '94545', '94546', '97934', '97021', '87996', '92260', '89580', '107895', '92887', '92884', '92885', '92883', '92880', '92889', '93139', '104278', '104279', '104276', '104275', '104271', '98102', '97152', '97749', '97156', '98641', '99019', '105639', '105633', '105636', '102946', '102947', '94276', '99013', '94275', '102367', '102364', '102365', '102363', '107704', '112230', '108107', '108103', '109859', '109858', '92600', '92602', '92603', '92605', '103706', '103704', '103703', '107161', '103709', '111955', '104715', '145772']


# Top K Words of Each Cluster
This function summarizes a set of opinions by returning the words that appear in these opinions with the highest tf-idf scores.

In [32]:
%%time

k=10 # number of words to get

for i in biggest_n_clusters:
    top_words = top_k_words(dict_top_n_clusters[i], k, tfidf_matrix, op_id_to_bow_id, vocab)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ":", [x.encode('utf-8') for x in top_words]

[1;31mcluster 2[0m: ['squier', 'reinsur', 'scophoni', 'graeff', 'arbitr', 'sugg', 'passport', 'wenzel', 'vetterlein', 'meserv']
[1;31mcluster 0[0m: ['dispensari', 'pension', 'fslic', 'wass', 'bicknel', 'milk', 'jumel', 'boom', 'merriam', 'ch']
[1;31mcluster 1[0m: ['baal', 'stumpf', 'lagrand', 'ree', 'lesag', 'bail', 'kaupp', 'ashcraft', 'penri', 'mazzei']
[1;31mcluster 3[0m: ['shaeffer', 'wool', 'carusi', 'toy', 'seed', 'jen', 'paper', 'renfrow', 'pearl', 'cork']
[1;31mcluster 15[0m: ['flaglor', 'dippold', 'nailor', 'goff', 'turpin', 'reeder', 'bilsland', 'shappirio', 'randel', 'forsyth']
Wall time: 1min 44s


# Top K Words ($\mu_{cluster}$) of Each Cluster
compute the mean tf-idf vector of the cluster, return the top K words from this mean vector

In [33]:
%%time

k=10 # number of words to get

for i in biggest_n_clusters:
    top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[i], k, tfidf_matrix, op_id_to_bow_id, vocab)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ":", [x.encode('utf-8') for x in top_words_from_mean]

[1;31mcluster 2[0m: ['court', 'state', 'v', 'case', 'compani', 'plaintiff', 'defend', 'act', 'upon', 'law']
[1;31mcluster 0[0m: ['state', 'tax', 'court', 'land', 'v', 'act', 'compani', 'upon', 'case', 'unit']
[1;31mcluster 1[0m: ['court', 'state', 'v', 'sct', 'us', 'led2d', 'case', 'petition', '\xc2\xa7', 'unit']
[1;31mcluster 3[0m: ['court', 'state', 'act', 'unit', 'case', 'v', 'upon', 'said', 'contract', 'offic']
[1;31mcluster 15[0m: ['deed', 'court', 'properti', 'wife', 'convey', 'husband', 'estat', 'said', 'land', 'upon']
Wall time: 4.79 s


# Top K Words ($\mu_{cluster} - \mu_{complement}$ ) of Each Cluster
compute the mean tf-idf vector of the cluster and also of the complement of the cluster,  
take the difference mu_cluster - mu_complement, return the top K words in this difference

In [34]:
%%time

k=10 # number of words to get

for i in biggest_n_clusters:
    top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[i], all_the_opinions, 
                                                      k, tfidf_matrix, op_id_to_bow_id, vocab)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ":", [x.encode('utf-8') for x in top_words_from_diff]

[1;31mcluster 2[0m: ['plaintiff', 'bankruptci', 'court', 'suit', 'patent', 'jurisdict', 'compani', 'creditor', 'decre', 'defend']
[1;31mcluster 0[0m: ['tax', 'land', 'state', 'compani', 'commiss', 'indian', 'railroad', 'rate', 'l', 'ct']
[1;31mcluster 1[0m: ['led2d', 'sct', 'us', 'petition', 'v', 'convict', 'constitut', 'sentenc', 'state', 'crimin']
[1;31mcluster 3[0m: ['indict', 'offic', 'collector', 'duti', 'unit', 'contract', 'navi', 'claimant', 'treasuri', 'act']
[1;31mcluster 15[0m: ['deed', 'wife', 'convey', 'husband', 'estat', 'properti', 'titl', 'lot', 'said', 'complain']
Wall time: 27.1 s


# Most Relevant Opinion of Each Cluster
compute the mean tf-idf vector, return the document in the cluster closet to the mean  

In [35]:
%%time

for i in biggest_n_clusters:
    most_relev_op = document_closest_to_mean(dict_top_n_clusters[i], tfidf_matrix, op_id_to_bow_id)
    print '\x1b[1;31m' + 'cluster ' + str(i) + '\x1b[0m' + ": opinion " + most_relev_op

[1;31mcluster 2[0m: opinion 89905
[1;31mcluster 0[0m: opinion 95354
[1;31mcluster 1[0m: opinion 104135
[1;31mcluster 3[0m: opinion 86062
[1;31mcluster 15[0m: opinion 87645
Wall time: 1min 55s


In [13]:
def cluster_infos_csv(biggest_n_clusters, dict_top_n_clusters, clusters_dir, csv_dir_mod_info):
    
    for i in biggest_n_clusters:
        
        opinion_names = []
        opinion_dates = []
        opinion_links = []
        
        for j in dict_top_n_clusters[i]:
            try:
                with open(clusters_dir + j + ".json") as data_file:
                    data = json.load(data_file)
            except IOError:
                pass
                #name, date, link = case_info2(i)
                #opinion_names.append(name)
                #opinion_dates.append(date)
                #opinion_links.append(link)
            
            name = data['case_name'].encode('utf-8')
            date = data['date_filed'].encode('utf-8')
            link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')

            opinion_names.append(name)
            opinion_dates.append(date)
            opinion_links.append(link)

        cluster_info = pd.DataFrame()
        cluster_info['names'] = opinion_names
        cluster_info['dates'] = opinion_dates
        cluster_info['url'] = opinion_links

        cluster_info.to_csv(csv_dir_mod_info + "cluster_"+str(i)+".csv")
        print i

In [14]:
%%time
cluster_infos_csv(biggest_n_clusters, dict_top_n_clusters, clusters_dir, csv_dir_mod_info)

2
0
1
3
15
7
8
4
31
17
48
36
13
40
23
51
54
16
45
42
10
28
19
47
65
6
26
43
46
79
82
94
24
49
21
29
41
44
50
55
58
64
77
96
9
12
20
56
62
80
88
89
90
95
97
98
99
104
113
117
5
14
32
33
34
35
38
39
52
57
61
68
71
72
75
81
83
84
92
103
105
108
109
114
115
119
121
123
124
11
18
22
25
27
30
37
53
59
60
63
66
67
69
70
73
74
76
78
85
86
87
91
93
100
101
102
106
107
110
111
112
116
118
120
122
125
Wall time: 5.13 s


In [22]:
def cluster_summaries_csv(k, biggest_n_clusters, dict_top_n_clusters, tfidf_matrix, op_id_to_bow_id, vocab, csv_dir_mod_summary):
    for i in biggest_n_clusters:
        top_words = top_k_words(dict_top_n_clusters[i], k, tfidf_matrix, op_id_to_bow_id, vocab)
        top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[i], k, tfidf_matrix, op_id_to_bow_id, vocab)
        top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[i], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
        most_relev_op = document_closest_to_mean(dict_top_n_clusters[i], tfidf_matrix, op_id_to_bow_id)
        
        top_words = [x.encode('utf-8') for x in top_words]
        top_words_from_mean = [x.encode('utf-8') for x in top_words_from_mean]
        top_words_from_diff = [x.encode('utf-8') for x in top_words_from_diff]

        len_old_top_words = len(top_words)
        len_old_top_words_from_mean = len(top_words_from_mean)
        len_old_top_words_from_diff = len(top_words_from_diff)

        if len(top_words) != len(top_words_from_mean):
            for j in range(0,len(top_words_from_mean)-len(top_words)):
                top_words.append(np.nan)

        cluster_summary = pd.DataFrame()
        cluster_summary['top_words'] = top_words
        cluster_summary['top_words_from_mean'] = top_words_from_mean
        cluster_summary['top_words_from_diff'] = top_words_from_diff
        cluster_summary['most_relev_op'] = most_relev_op

        cluster_summary.to_csv(csv_dir_mod_summary + "cluster_"+str(i)+"_summary.csv")
        print "cluster", i, "is done", "(", len(dict_top_n_clusters[i]), "opinions )", len_old_top_words, len_old_top_words_from_mean, len_old_top_words_from_diff

In [23]:
%%time
cluster_summaries_csv(1000, biggest_n_clusters, dict_top_n_clusters, tfidf_matrix, op_id_to_bow_id, vocab, csv_dir_mod_summary)

cluster 2 is done ( 9273 opinions ) 1000 1000 1000
cluster 0 is done ( 6870 opinions ) 1000 1000 1000
cluster 1 is done ( 6234 opinions ) 1000 1000 1000
cluster 3 is done ( 1458 opinions ) 1000 1000 1000
cluster 15 is done ( 76 opinions ) 1000 1000 1000
cluster 7 is done ( 75 opinions ) 1000 1000 1000
cluster 8 is done ( 69 opinions ) 1000 1000 1000
cluster 4 is done ( 52 opinions ) 1000 1000 1000
cluster 31 is done ( 50 opinions ) 1000 1000 1000
cluster 17 is done ( 36 opinions ) 1000 1000 1000
cluster 48 is done ( 20 opinions ) 1000 1000 1000
cluster 36 is done ( 18 opinions ) 1000 1000 1000
cluster 13 is done ( 16 opinions ) 1000 1000 1000
cluster 40 is done ( 16 opinions ) 1000 1000 1000
cluster 23 is done ( 15 opinions ) 1000 1000 1000
cluster 51 is done ( 15 opinions ) 1000 1000 1000
cluster 54 is done ( 14 opinions ) 1000 1000 1000
cluster 16 is done ( 13 opinions ) 1000 1000 1000
cluster 45 is done ( 12 opinions ) 1000 1000 1000
cluster 42 is done ( 11 opinions ) 1000 1000 1000

# Cluster 2 Summary (9273 opinions)

In [102]:
%%time

opinion_names = []
opinion_dates = []
opinion_links = []

for i in dict_top_n_clusters[2]:
    try:
        with open(clusters_dir + i + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        pass
        #name, date, link = case_info2(i)
        #opinion_names.append(name)
        #opinion_dates.append(date)
        #opinion_links.append(link)
        
    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')
    
    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)
    
cluster_info = pd.DataFrame()
cluster_info['names'] = opinion_names
cluster_info['dates'] = opinion_dates
cluster_info['url'] = opinion_links

cluster_info.to_csv(csv_dir_mod + "cluster_2.csv")

print cluster_info.shape

(9273, 3)
Wall time: 1.47 s


In [98]:
cluster_info = pd.read_csv(csv_dir_mod + 'cluster_2.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,Dalton v. Jennings,1876-12-18,https://www.courtlistener.com/opinion/89374/da...
1,1,Windsor v. McVeigh,1876-12-11,https://www.courtlistener.com/opinion/89375/wi...
2,2,Bigelow v. Berkshire Life Ins. Co.,1876-12-11,https://www.courtlistener.com/opinion/89376/bi...
3,3,Indianapolis & St. Louis R. Co. v. Horst,1876-12-18,https://www.courtlistener.com/opinion/89378/in...
4,4,"The"" Atlas""",1876-11-27,https://www.courtlistener.com/opinion/89379/th...


In [99]:
%%time

k=1000 # number of words to get

top_words = top_k_words(dict_top_n_clusters[2], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[2], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[2], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[2], tfidf_matrix, op_id_to_bow_id)

top_words = [x.encode('utf-8') for x in top_words]
top_words_from_mean = [x.encode('utf-8') for x in top_words_from_mean]
top_words_from_diff = [x.encode('utf-8') for x in top_words_from_diff]

#print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', top_words
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', top_words_from_mean
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', top_words_from_diff
#print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op

cluster_summary = pd.DataFrame()
cluster_summary['top_words'] = top_words
cluster_summary['top_words_from_mean'] = top_words_from_mean
cluster_summary['top_words_from_diff'] = top_words_from_diff
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_mod + "cluster_2_summary.csv")

print cluster_summary.shape

(1000, 4)
Wall time: 1min 36s


In [100]:
cluster_summary = pd.read_csv(csv_dir_mod + 'cluster_2_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,squier,court,plaintiff,89905
1,1,reinsur,state,bankruptci,89905
2,2,scophoni,v,court,89905
3,3,graeff,case,suit,89905
4,4,arbitr,compani,patent,89905


# Cluster 0 Summary (6870 opinions)

In [103]:
%%time

opinion_names = []
opinion_dates = []
opinion_links = []

for i in dict_top_n_clusters[0]:
    try:
        with open(clusters_dir + i + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        pass
        #name, date, link = case_info2(i)
        #opinion_names.append(name)
        #opinion_dates.append(date)
        #opinion_links.append(link)
        
    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')
    
    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)
    
cluster_info = pd.DataFrame()
cluster_info['names'] = opinion_names
cluster_info['dates'] = opinion_dates
cluster_info['url'] = opinion_links

cluster_info.to_csv(csv_dir_mod + "cluster_0.csv")

print cluster_info.shape

(6870, 3)
Wall time: 1.11 s


In [104]:
cluster_info = pd.read_csv(csv_dir_mod + 'cluster_0.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,New Jersey v. Anderson,1906-12-10,https://www.courtlistener.com/opinion/96546/ne...
1,1,Barkley v. Levee Commissioners,1876-12-18,https://www.courtlistener.com/opinion/89372/ba...
2,2,Broughton v. Pensacola,1876-12-18,https://www.courtlistener.com/opinion/89373/br...
3,3,Western Union Telegraph Co. v. Pennsylvania,1961-12-04,https://www.courtlistener.com/opinion/106303/w...
4,4,Reitz v. Mealey,1941-11-10,https://www.courtlistener.com/opinion/103548/r...


In [105]:
%%time

k=1000 # number of words to get

top_words = top_k_words(dict_top_n_clusters[0], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[0], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[0], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[0], tfidf_matrix, op_id_to_bow_id)

top_words = [x.encode('utf-8') for x in top_words]
top_words_from_mean = [x.encode('utf-8') for x in top_words_from_mean]
top_words_from_diff = [x.encode('utf-8') for x in top_words_from_diff]

#print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', top_words
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', top_words_from_mean
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', top_words_from_diff
#print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op

cluster_summary = pd.DataFrame()
cluster_summary['top_words'] = top_words
cluster_summary['top_words_from_mean'] = top_words_from_mean
cluster_summary['top_words_from_diff'] = top_words_from_diff
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_mod + "cluster_0_summary.csv")

print cluster_summary.shape

(1000, 4)
Wall time: 1min 10s


In [106]:
cluster_summary = pd.read_csv(csv_dir_mod + 'cluster_0_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,dispensari,state,tax,95354
1,1,pension,tax,land,95354
2,2,fslic,court,state,95354
3,3,wass,land,compani,95354
4,4,bicknel,v,commiss,95354


# Cluster 1 Summary (6234 opinions)

In [107]:
%%time

opinion_names = []
opinion_dates = []
opinion_links = []

for i in dict_top_n_clusters[1]:
    try:
        with open(clusters_dir + i + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        pass
        #name, date, link = case_info2(i)
        #opinion_names.append(name)
        #opinion_dates.append(date)
        #opinion_links.append(link)
        
    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')
    
    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)
    
cluster_info = pd.DataFrame()
cluster_info['names'] = opinion_names
cluster_info['dates'] = opinion_dates
cluster_info['url'] = opinion_links

cluster_info.to_csv(csv_dir_mod + "cluster_1.csv")

print cluster_info.shape

(6234, 3)
Wall time: 3.11 s


In [108]:
cluster_info = pd.read_csv(csv_dir_mod + 'cluster_1.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,CBS v. Federal Communications Commission,1981-07-01,https://www.courtlistener.com/opinion/110557/c...
1,1,Hamilton v. Alabama,1961-11-13,https://www.courtlistener.com/opinion/106300/h...
2,2,United States v. Sing Tuck,1904-04-25,https://www.courtlistener.com/opinion/96076/un...
3,3,Hoyt v. Florida,1961-11-20,https://www.courtlistener.com/opinion/106302/h...
4,4,California Medical Assn. v. Federal Election C...,1981-06-26,https://www.courtlistener.com/opinion/110551/c...


In [109]:
%%time

k=1000 # number of words to get

top_words = top_k_words(dict_top_n_clusters[1], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[1], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[1], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[1], tfidf_matrix, op_id_to_bow_id)

top_words = [x.encode('utf-8') for x in top_words]
top_words_from_mean = [x.encode('utf-8') for x in top_words_from_mean]
top_words_from_diff = [x.encode('utf-8') for x in top_words_from_diff]

#print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', top_words
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', top_words_from_mean
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', top_words_from_diff
#print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op

cluster_summary = pd.DataFrame()
cluster_summary['top_words'] = top_words
cluster_summary['top_words_from_mean'] = top_words_from_mean
cluster_summary['top_words_from_diff'] = top_words_from_diff
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_mod + "cluster_1_summary.csv")

print cluster_summary.shape

(1000, 4)
Wall time: 1min 17s


In [110]:
cluster_summary = pd.read_csv(csv_dir_mod + 'cluster_1_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,baal,court,led2d,104135
1,1,stumpf,state,sct,104135
2,2,lagrand,v,us,104135
3,3,ree,sct,petition,104135
4,4,lesag,us,v,104135


# Cluster 3 Summary (1458 opinions)

In [111]:
%%time

opinion_names = []
opinion_dates = []
opinion_links = []

for i in dict_top_n_clusters[3]:
    try:
        with open(clusters_dir + i + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        pass
        #name, date, link = case_info2(i)
        #opinion_names.append(name)
        #opinion_dates.append(date)
        #opinion_links.append(link)
        
    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')
    
    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)
    
cluster_info = pd.DataFrame()
cluster_info['names'] = opinion_names
cluster_info['dates'] = opinion_dates
cluster_info['url'] = opinion_links

cluster_info.to_csv(csv_dir_mod + "cluster_3.csv")

print cluster_info.shape

(1458, 3)
Wall time: 814 ms


In [112]:
cluster_info = pd.read_csv(csv_dir_mod + 'cluster_3.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,United States Ex Rel. French v. Weeks,1922-05-29,https://www.courtlistener.com/opinion/100016/u...
1,1,United States Ex Rel. Creary v. Weeks,1922-06-05,https://www.courtlistener.com/opinion/100017/u...
2,2,Ex Parte Harley-Davidson Motor Co.,1922-06-05,https://www.courtlistener.com/opinion/100019/e...
3,3,Cutler v. Kouns,1884-03-10,https://www.courtlistener.com/opinion/91071/cu...
4,4,Amado v. United States,1904-11-07,https://www.courtlistener.com/opinion/96135/am...


In [113]:
%%time

k=1000 # number of words to get

top_words = top_k_words(dict_top_n_clusters[3], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[3], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[3], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[3], tfidf_matrix, op_id_to_bow_id)

top_words = [x.encode('utf-8') for x in top_words]
top_words_from_mean = [x.encode('utf-8') for x in top_words_from_mean]
top_words_from_diff = [x.encode('utf-8') for x in top_words_from_diff]

#print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', top_words
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', top_words_from_mean
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', top_words_from_diff
#print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op

cluster_summary = pd.DataFrame()
cluster_summary['top_words'] = top_words
cluster_summary['top_words_from_mean'] = top_words_from_mean
cluster_summary['top_words_from_diff'] = top_words_from_diff
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_mod + "cluster_3_summary.csv")

print cluster_summary.shape

(1000, 4)
Wall time: 15.7 s


In [114]:
cluster_summary = pd.read_csv(csv_dir_mod + 'cluster_3_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,shaeffer,court,indict,86062
1,1,wool,state,offic,86062
2,2,carusi,act,collector,86062
3,3,toy,unit,duti,86062
4,4,seed,case,unit,86062


# Cluster 15 Summary (76 opinions)

In [115]:
%%time

opinion_names = []
opinion_dates = []
opinion_links = []

for i in dict_top_n_clusters[15]:
    try:
        with open(clusters_dir + i + ".json") as data_file:
            data = json.load(data_file)
    except IOError:
        pass
        #name, date, link = case_info2(i)
        #opinion_names.append(name)
        #opinion_dates.append(date)
        #opinion_links.append(link)
        
    name = data['case_name'].encode('utf-8')
    date = data['date_filed'].encode('utf-8')
    link = 'https://www.courtlistener.com' + data['absolute_url'].encode('utf-8')
    
    opinion_names.append(name)
    opinion_dates.append(date)
    opinion_links.append(link)
    
cluster_info = pd.DataFrame()
cluster_info['names'] = opinion_names
cluster_info['dates'] = opinion_dates
cluster_info['url'] = opinion_links

cluster_info.to_csv(csv_dir_mod + "cluster_15.csv")

print cluster_info.shape

(76, 3)
Wall time: 31 ms


In [116]:
cluster_info = pd.read_csv(csv_dir_mod + 'cluster_15.csv')
cluster_info.head()

Unnamed: 0.1,Unnamed: 0,names,dates,url
0,0,Watt v. Starke,1880-01-18,https://www.courtlistener.com/opinion/90120/wa...
1,1,Armstrong v. Morrill,1872-01-22,https://www.courtlistener.com/opinion/88525/ar...
2,2,Brandies v. Cochrane,1884-12-01,https://www.courtlistener.com/opinion/91214/br...
3,3,Union R. Co. v. Dull,1888-01-16,https://www.courtlistener.com/opinion/92109/un...
4,4,Cooper v. Dasher,1933-11-06,https://www.courtlistener.com/opinion/102138/c...


In [117]:
%%time

k=1000 # number of words to get

top_words = top_k_words(dict_top_n_clusters[15], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_mean = top_k_words_from_mean_vector(dict_top_n_clusters[15], k, tfidf_matrix, op_id_to_bow_id, vocab)
top_words_from_diff = top_k_words_from_difference(dict_top_n_clusters[15], all_the_opinions, k, tfidf_matrix, op_id_to_bow_id, vocab)
most_relev_op = document_closest_to_mean(dict_top_n_clusters[15], tfidf_matrix, op_id_to_bow_id)

top_words = [x.encode('utf-8') for x in top_words]
top_words_from_mean = [x.encode('utf-8') for x in top_words_from_mean]
top_words_from_diff = [x.encode('utf-8') for x in top_words_from_diff]

#print '\x1b[1;31m' + "Top K Words:" + '\x1b[0m', top_words
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster):" + '\x1b[0m', top_words_from_mean
#print '\x1b[1;31m' + "Top K Words (Mu_Cluster - Mu_Complement):" + '\x1b[0m', top_words_from_diff
#print '\x1b[1;31m' + "Most Relevent Opinion:" + '\x1b[0m', most_relev_op

cluster_summary = pd.DataFrame()
cluster_summary['top_words'] = top_words
cluster_summary['top_words_from_mean'] = top_words_from_mean
cluster_summary['top_words_from_diff'] = top_words_from_diff
cluster_summary['most_relev_op'] = most_relev_op

cluster_summary.to_csv(csv_dir_mod + "cluster_15_summary.csv")

print cluster_summary.shape

(1000, 4)
Wall time: 4.63 s


In [118]:
cluster_summary = pd.read_csv(csv_dir_mod + 'cluster_15_summary.csv')
cluster_summary.head()

Unnamed: 0.1,Unnamed: 0,top_words,top_words_from_mean,top_words_from_diff,most_relev_op
0,0,flaglor,deed,deed,87645
1,1,dippold,court,wife,87645
2,2,nailor,properti,convey,87645
3,3,goff,wife,husband,87645
4,4,turpin,convey,estat,87645
