# Summarize K-Means (K=1000) Clusters for NMF

In [2]:
# modify these for your own computer
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle  as pickle
from collections import OrderedDict
import json


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import mutual_info_score as mi
from sklearn.metrics import adjusted_rand_score as ar
from sklearn.metrics import calinski_harabaz_score as ch # (X, labels)
from sklearn.metrics import completeness_score as cs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import fowlkes_mallows_score as fm
from sklearn.metrics import homogeneity_completeness_v_measure as hcvm
from sklearn.metrics import homogeneity_score as hs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import silhouette_score as ss # (X, labels)
from sklearn.metrics import silhouette_samples as ss2 # (X, labels)
from sklearn.metrics import v_measure_score as vm

import scipy.sparse
import random
import itertools
from itertools import combinations


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *
from helpful_functions import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# Every location below is derived from the two roots set at the top of this cell
# (repo_directory and data_dir), so those are the only paths that need editing
# when the notebook is run on another machine.

# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' #tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' #bag-of-words matrix (and other info, i.e. vocab)
nlp_df_sub_dir = nlp_dir + 'bow_tfidf_df/'

# csv location (inside the repository)
csv_dir = repo_directory + 'csv/'
csv_dir_NMF_km1000 = csv_dir + 'summarize_NMF_km1000/'
csv_dir_NMF_km1000_info = csv_dir_NMF_km1000 + 'info/'        # one metadata csv per cluster
csv_dir_NMF_km1000_summary = csv_dir_NMF_km1000 + 'summary/'  # one word-summary csv per cluster

# all the file paths for .txt files
file_paths = glob.glob(text_dir + '*.txt')

# all opinions
all_the_opinions = all_opinions(file_paths)

# clusters directory: holds the per-opinion .json files read by cluster_infos_csv.
# The 'scotus' component is kept literal (not network_name) because the original
# location was fixed to the scotus raw data regardless of the selected network.
clusters_dir = raw_dir + 'scotus/clusters/'

# jupyter notebook settings
# autoreload (mode 2) re-imports modules before each cell executes, so edits to
# the project code imported above take effect without restarting the kernel
%load_ext autoreload
%autoreload 2
# draw matplotlib figures inside the notebook output
%matplotlib inline

## load tf-idf vectors
**tfidf_matrix** = (row_index, column_index): tf_idf value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in tfidf_matrix (correspond to column indices)

In [3]:
tfidf_matrix, op_id_to_bow_id, vocab = load_tf_idf(nlp_sub_dir)

In [4]:
tfidf_matrix

<27885x567570 sparse matrix of type '<type 'numpy.float64'>'
	with 20817470 stored elements in Compressed Sparse Row format>

In [5]:
# Cluster labels previously saved for the NMF representation: one row per opinion and
# one column per clustering run (km_*, gmm_*, hc_* at K = 10, 100, 1000 -- see the preview).
# NOTE(review): the two 'Unnamed' columns look like saved indexes; 'Unnamed: 0' presumably
# holds the opinion id -- confirm against the code that wrote clusters_NMF.csv.
clusters = pd.read_csv(csv_dir + 'clusters_NMF.csv')
clusters.head()

Unnamed: 0.1,Unnamed: 0,km_10,km_100,km_1000,gmm_10,gmm_100,gmm_1000,hc_10,hc_100,hc_1000
0,145658,5,71,356,0,37,807,1,86,74
1,89370,4,71,557,0,37,145,1,61,267
2,89371,4,73,546,7,11,560,1,85,144
3,89372,4,12,719,0,13,523,1,1,814
4,89373,9,10,602,6,74,681,1,47,79


In [6]:
# K-Means (K=1000) label of every opinion, in CSV row order
nlp_tfidf_clusters = clusters['km_1000'].tolist()

# Key the labels by opinion id.
# NOTE(review): labels and ids are paired purely by position, so this assumes the CSV rows
# are in the same order as op_id_to_bow_id. The previews agree (145658, 89370, ...), but
# nothing here verifies it.
nlp_clusters = pd.Series(nlp_tfidf_clusters, index=op_id_to_bow_id)
nlp_clusters.head()

145658    356
89370     557
89371     546
89372     719
89373     602
dtype: int64

## get the biggest clusters (here: all 1000 are requested)

In [7]:
'''
dict_top_n_clusters = dictionary of top K clusters 
                      (key=cluster #, value=opinions in cluster)
                      
biggest_n_clusters = list of top K clusters (int)
'''

# Prints one "cluster <label> : <size> opinions" line per returned cluster (see output).
# NOTE(review): both leading arguments are 1000 -- presumably the number of clusters to keep
# and the total number of clusters; confirm against the definition of get_top_n_clusters.
dict_top_n_clusters, biggest_n_clusters = get_top_n_clusters(1000, 1000, nlp_clusters)

cluster 162 : 807 opinions
cluster 28 : 337 opinions
cluster 729 : 245 opinions
cluster 823 : 207 opinions
cluster 50 : 200 opinions
cluster 426 : 193 opinions
cluster 174 : 186 opinions
cluster 836 : 178 opinions
cluster 182 : 176 opinions
cluster 871 : 175 opinions
cluster 353 : 169 opinions
cluster 9 : 157 opinions
cluster 945 : 157 opinions
cluster 308 : 156 opinions
cluster 962 : 148 opinions
cluster 198 : 143 opinions
cluster 558 : 138 opinions
cluster 659 : 137 opinions
cluster 151 : 135 opinions
cluster 369 : 134 opinions
cluster 520 : 133 opinions
cluster 123 : 128 opinions
cluster 370 : 128 opinions
cluster 476 : 128 opinions
cluster 113 : 126 opinions
cluster 933 : 125 opinions
cluster 973 : 125 opinions
cluster 354 : 123 opinions
cluster 581 : 121 opinions
cluster 625 : 119 opinions
cluster 149 : 116 opinions
cluster 971 : 116 opinions
cluster 244 : 114 opinions
cluster 258 : 114 opinions
cluster 278 : 112 opinions
cluster 529 : 112 opinions
cluster 227 : 109 opinions
clust

# Top K Words of Each Cluster
This function summarizes a set of opinions by returning the words that appear in these opinions with the highest tf-idf scores.

# Top K Words ($\mu_{cluster}$) of Each Cluster
compute the mean tf-idf vector of the cluster, return the top K words from this mean vector

# Top K Words ($\mu_{cluster} - \mu_{complement}$ ) of Each Cluster
compute the mean tf-idf vector of the cluster and also of the complement of the cluster,  
take the difference mu_cluster - mu_complement, return the top K words in this difference

# Most Relevant Opinion of Each Cluster
compute the mean tf-idf vector, return the document in the cluster closest to the mean  

In [8]:
def cluster_infos_csv(biggest_n_clusters, dict_top_n_clusters, clusters_dir, csv_dir_info):
    """Write one CSV of case metadata (name, filing date, URL) per cluster.

    Parameters
    ----------
    biggest_n_clusters : iterable
        Cluster labels to export; each must be a key of ``dict_top_n_clusters``.
    dict_top_n_clusters : dict
        Maps a cluster label to the ids of the opinions in that cluster.
    clusters_dir : str
        Directory (including the trailing separator) that holds one
        ``<opinion id>.json`` file per opinion, with the keys ``case_name``,
        ``date_filed`` and ``absolute_url``.
    csv_dir_info : str
        Output directory (including the trailing separator). The file
        ``cluster_<label>.csv`` with columns ``names``, ``dates``, ``url`` is
        written there for every label.

    Notes
    -----
    An opinion whose JSON file cannot be opened is left out of the CSV.
    Previously execution fell through after the failed read, which either
    raised ``NameError`` (first opinion) or silently repeated the metadata of
    the previously read opinion.
    """
    columns = ['names', 'dates', 'url']

    for cluster_label in biggest_n_clusters:
        rows = []

        for opinion_id in dict_top_n_clusters[cluster_label]:
            json_path = "%s%s.json" % (clusters_dir, opinion_id)
            try:
                with open(json_path) as data_file:
                    data = json.load(data_file)
            except IOError:
                # no metadata available for this opinion: skip it rather than
                # reuse whatever `data` held from the previous iteration
                continue

            rows.append((
                data['case_name'],
                data['date_filed'],
                'https://www.courtlistener.com' + data['absolute_url'],
            ))

        # Build the frame in one go; an empty cluster still yields the three columns.
        cluster_info = pd.DataFrame(rows, columns=columns)

        # Values stay text and are encoded once, on write (same bytes as encoding
        # every field by hand, but also valid where str and bytes do not mix).
        cluster_info.to_csv(csv_dir_info + "cluster_" + str(cluster_label) + ".csv",
                            encoding='utf-8')

In [9]:
%%time
cluster_infos_csv(biggest_n_clusters, dict_top_n_clusters, clusters_dir, csv_dir_NMF_km1000_info)

Wall time: 18.6 s


In [10]:
def cluster_summaries_csv(k, biggest_n_clusters, dict_top_n_clusters, tfidf_matrix, op_id_to_bow_id, vocab, csv_dir_summary, opinion_universe=None):
    """Write one word-summary CSV per cluster.

    For every cluster the file ``cluster_<label>_summary.csv`` gets four columns:

    * ``top_words``            -- words with the highest tf-idf scores in the cluster's opinions
    * ``top_words_from_mean``  -- top words of the cluster's mean tf-idf vector
    * ``top_words_from_diff``  -- top words of (cluster mean - complement mean)
    * ``most_relev_op``        -- the opinion closest to the cluster mean (repeated on every row)

    Parameters
    ----------
    k : int
        Number of words requested from each of the three word summaries.
    biggest_n_clusters : iterable
        Cluster labels to summarize; each must be a key of ``dict_top_n_clusters``.
    dict_top_n_clusters : dict
        Maps a cluster label to the opinions in that cluster.
    tfidf_matrix, op_id_to_bow_id, vocab
        tf-idf matrix with its row (opinion id) and column (word) lookups, passed
        through unchanged to the summary helpers.
    csv_dir_summary : str
        Output directory, including the trailing separator.
    opinion_universe : optional
        All opinions, used to form the complement of a cluster. Defaults to the
        notebook-level ``all_the_opinions`` so existing calls keep working.

    Notes
    -----
    The three word lists are padded with NaN to the length of the longest one, so
    a helper returning fewer than ``k`` words no longer breaks the column
    assignment. The progress line reports the lengths before padding.
    """
    if opinion_universe is None:
        # backward-compatible fallback to the list built in the setup cell
        opinion_universe = all_the_opinions

    word_columns = ['top_words', 'top_words_from_mean', 'top_words_from_diff']

    for cluster_label in biggest_n_clusters:
        members = dict_top_n_clusters[cluster_label]

        word_lists = [
            list(top_k_words(members, k, tfidf_matrix, op_id_to_bow_id, vocab)),
            list(top_k_words_from_mean_vector(members, k, tfidf_matrix, op_id_to_bow_id, vocab)),
            list(top_k_words_from_difference(members, opinion_universe, k, tfidf_matrix, op_id_to_bow_id, vocab)),
        ]
        most_relev_op = document_closest_to_mean(members, tfidf_matrix, op_id_to_bow_id)

        # lengths as returned by the helpers, reported in the progress line
        found_counts = [len(words) for words in word_lists]

        # equalize column lengths so the frame can always be built
        longest = max(found_counts)
        padded = [words + [np.nan] * (longest - len(words)) for words in word_lists]

        cluster_summary = pd.DataFrame(dict(zip(word_columns, padded)), columns=word_columns)
        cluster_summary['most_relev_op'] = most_relev_op  # scalar, broadcast to every row

        # words stay text and are encoded once, on write
        cluster_summary.to_csv(csv_dir_summary + "cluster_" + str(cluster_label) + "_summary.csv",
                               encoding='utf-8')

        # single pre-formatted string: prints identically with the print
        # statement and the print function
        print("cluster %s is done ( %s opinions ) %s %s %s" % (
            cluster_label, len(members), found_counts[0], found_counts[1], found_counts[2]))

In [11]:
%%time
cluster_summaries_csv(1000, biggest_n_clusters, dict_top_n_clusters, tfidf_matrix, op_id_to_bow_id, vocab, csv_dir_NMF_km1000_summary)

cluster 162 is done ( 807 opinions ) 1000 1000 1000
cluster 28 is done ( 337 opinions ) 1000 1000 1000
cluster 729 is done ( 245 opinions ) 1000 1000 1000
cluster 823 is done ( 207 opinions ) 1000 1000 1000
cluster 50 is done ( 200 opinions ) 1000 1000 1000
cluster 426 is done ( 193 opinions ) 1000 1000 1000
cluster 174 is done ( 186 opinions ) 1000 1000 1000
cluster 836 is done ( 178 opinions ) 1000 1000 1000
cluster 182 is done ( 176 opinions ) 1000 1000 1000
cluster 871 is done ( 175 opinions ) 1000 1000 1000
cluster 353 is done ( 169 opinions ) 1000 1000 1000
cluster 9 is done ( 157 opinions ) 1000 1000 1000
cluster 945 is done ( 157 opinions ) 1000 1000 1000
cluster 308 is done ( 156 opinions ) 1000 1000 1000
cluster 962 is done ( 148 opinions ) 1000 1000 1000
cluster 198 is done ( 143 opinions ) 1000 1000 1000
cluster 558 is done ( 138 opinions ) 1000 1000 1000
cluster 659 is done ( 137 opinions ) 1000 1000 1000
cluster 151 is done ( 135 opinions ) 1000 1000 1000
cluster 369 is d