In [20]:
# modify these for your own computer
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle  as pickle
from collections import OrderedDict


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import mutual_info_score as mi
from sklearn.metrics import adjusted_rand_score as ar
from sklearn.metrics import calinski_harabaz_score as ch # (X, labels)
from sklearn.metrics import completeness_score as cs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import fowlkes_mallows_score as fm
from sklearn.metrics import homogeneity_completeness_v_measure as hcvm
from sklearn.metrics import homogeneity_score as hs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import silhouette_score as ss # (X, labels)
from sklearn.metrics import silhouette_samples as ss2 # (X, labels)
from sklearn.metrics import v_measure_score as vm

import scipy.sparse
import random
import itertools
from itertools import combinations


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' #tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' #bag-of-words matrix (and other info, i.e. vocab)
nlp_df_sub_dir = nlp_dir + 'bow_tfidf_df/'

# csv location
csv_dir = "C:/Users/Michael/Documents/GitHub/law-net/csv/"

# all the file paths for .txt files
file_paths = glob.glob(text_dir + '*.txt')

# all opinions
all_the_opinions = all_opinions(file_paths)

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## load tf-idf vectors
**tfidf_matrix** = (row_index, column_index): tf_idf value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in tfidf_matrix (correspond to column indices)

In [2]:
tfidf_matrix, op_id_to_bow_id, vocab = load_tf_idf(nlp_sub_dir)

In [3]:
tfidf_matrix

<27885x567570 sparse matrix of type '<type 'numpy.float64'>'
	with 20817470 stored elements in Compressed Sparse Row format>

# Clustering Work
focus on largest connected component on **undirected scotus**

In [4]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

# limit ourselves to cases upto and including 2015 since we are missing some textfiles from 2016
G = G.subgraph(G.vs.select(year_le=2015))

# make graph undirected
Gud = G.copy()
Gud = Gud.as_undirected()

# get largest connected componenet
components = Gud.clusters(mode='STRONG')
g = components.subgraphs()[np.argmax(components.sizes())]

# CL ids of cases in largest connected component
CLids = g.vs['name']

## modularity on undirected scotus
"For a given division of the network's vertices into some modules, modularity reflects the concentration of edges within modules compared with random distribution of links between all nodes regardless of modules"--*Wikipedia*

In [5]:
%%time 

# modularity clustering
cd_modularity = g.community_fastgreedy() # .as_clustering().membership

mod_clust = cd_modularity.as_clustering()

print mod_clust.summary()

# save clusters in pandas
graph_clusters = pd.Series(mod_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 126 clusters
Wall time: 1min 44s


In [6]:
graph_clusters_mod_score = g.modularity(mod_clust.membership)

print graph_clusters_mod_score

0.465097751507


## walktrap on undirected scotus

In [7]:
%%time

# walktrap clustering
cd_walktrap = g.community_walktrap()

wt_clust = cd_walktrap.as_clustering()

print wt_clust.summary()

# save clusters in pandas
walktrap_clusters = pd.Series(wt_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 2264 clusters
Wall time: 3min 1s


In [8]:
walktrap_clusters_mod_score = g.modularity(wt_clust.membership)

print walktrap_clusters_mod_score

0.504445233426


# NLP Clustering
K-Means and Gaussian Mixture Models (GMM)

# K-Means Clustering on tf-idf

### K-means clustering with K = 10

In [4]:
%%time

# set number of clusters
num_clusters = 10

# run kmeans
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

nlp_tfidf_clusters = km.labels_.tolist()

Wall time: 2h 5s


In [11]:
nlp_tfidf_clusters = clusters['km_10'].tolist()

In [12]:
# modularity score of cluster
nlp_tfidf_clusters_mod_score = g.modularity(nlp_tfidf_clusters)
print nlp_tfidf_clusters_mod_score

0.000575918627669


### K-means clustering with K = 100

In [5]:
%%time

# set number of clusters
num_clusters = 100

# run kmeans
km2 = KMeans(n_clusters=num_clusters)
km2.fit(tfidf_matrix)

nlp_tfidf_clusters2 = km2.labels_.tolist()

Wall time: 6h 16min 12s


In [13]:
nlp_tfidf_clusters2 = clusters['km_100'].tolist()

In [14]:
# modularity score of cluster
nlp_tfidf_clusters_mod_score2 = g.modularity(nlp_tfidf_clusters2)
print nlp_tfidf_clusters_mod_score2

0.000287935036148


### K-means clustering with K = 1000

In [6]:
%%time

# set number of clusters
num_clusters = 1000

# run kmeans
km3 = KMeans(n_clusters=num_clusters)
km3.fit(tfidf_matrix)

nlp_tfidf_clusters3 = km3.labels_.tolist()

MemoryError: 

# Gaussian-Mixture-Model (GMM) Clustering on tf-idf

### GMM Clustering with K = 10

In [10]:
%%time

# set number of components
num_components = 10

tfidf_matrix_dense = tfidf_matrix.todense()

# run GMM
gmm = GaussianMixture(n_components=num_components)
gmm.fit(tfidf_matrix_dense)

gmm_clusters = gmm.predict(tfidf_matrix_dense).tolist()
gmm_clusters = map(int, gmm_clusters)

MemoryError: 

### GMM Clustering with K = 100 

In [62]:
%%time

# set number of components
num_components = 100

tfidf_matrix_dense = tfidf_matrix.todense()

# run GMM
gmm2 = GaussianMixture(n_components=num_components)
gmm2.fit(tfidf_matrix_dense)

gmm_clusters2 = gmm2.predict(tfidf_matrix_dense).tolist()
gmm_clusters2 = map(int, gmm_clusters2)

Wall time: 2.06 s


### GMM Clustering with K = 1000

In [63]:
%%time

# set number of components
num_components = 1000

tfidf_maatrix_dense = tfidf_matrix.todense()

# run GMM
gmm3 = GaussianMixture(n_components=num_components)
gmm3.fit(tfidf_matrix_dense)

gmm_clusters3 = gmm3.predict(tfidf_matrix_dense).tolist()
gmm_clusters3 = map(int, gmm_clusters3)

Wall time: 29.2 s


## Hierarchical Clustering on tf-idf

### Hierarchical Clustering with K = 10

In [5]:
%%time

# set number of components
num_clusters = 10

# run Hierarchical Clustering (Agglomerative with linkage = 'ward', 'complete', 'average')
'''
1. ward: minimizes the variance of the clusters being merged.
2. average: uses the average of the distances of each observation of the two sets.
3. complete or maximum: uses the maximum distances between all observations of the two sets

note:
manhattan/L1 distance is often good for sparse features, or sparse noise: 
    i.e. many of the features are zero, as in text mining using occurences of rare words.
'''

hc = AgglomerativeClustering(n_clusters = num_clusters, affinity='euclidean', linkage='ward')
hc_clusters = hc.fit_predict(tfidf_matrix).tolist()
hc_clusters = map(int, hc_clusters)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

### Hierarchical Clustering with K = 100

In [None]:
%%time

# set number of components
num_clusters = 100

# run Hierarchical Clustering (Agglomerative with linkage = 'ward', 'complete', 'average')
'''
1. ward: minimizes the variance of the clusters being merged.
2. average: uses the average of the distances of each observation of the two sets.
3. complete or maximum: uses the maximum distances between all observations of the two sets

note:
manhattan/L1 distance is often good for sparse features, or sparse noise: 
    i.e. many of the features are zero, as in text mining using occurences of rare words.
'''

hc2 = AgglomerativeClustering(n_clusters = num_clusters, affinity='euclidean', linkage='ward')
hc_clusters2 = hc2.fit_predict(tfidf_matrix).tolist()
hc_clusters2 = map(int, hc_clusters2)

### Hierarchical Clustering with K = 1000

In [None]:
%%time

# set number of components
num_clusters = 1000

# run Hierarchical Clustering (Agglomerative with linkage = 'ward', 'complete', 'average')
'''
1. ward: minimizes the variance of the clusters being merged.
2. average: uses the average of the distances of each observation of the two sets.
3. complete or maximum: uses the maximum distances between all observations of the two sets

note:
manhattan/L1 distance is often good for sparse features, or sparse noise: 
    i.e. many of the features are zero, as in text mining using occurences of rare words.
'''

hc3 = AgglomerativeClustering(n_clusters = num_clusters, affinity='euclidean', linkage='ward')
hc_clusters3 = hc3.fit_predict(tfidf_matrix).tolist()
hc_clusters3 = map(int, hc_clusters3)

## Compare NLP clustering (tfidf) vs graph clustering

In [11]:
#clusters = pd.DataFrame(index=normalized_text_dict.keys(), columns=['nlp', 'graph'])
clusters = pd.DataFrame(index=op_id_to_bow_id, columns=[])

# add in communities 
clusters['mod'] = graph_clusters

# consider nodes not considered in CD to be their own cluster
# i.e. nodes outside the largest connected component
clusters['mod'].fillna(max(graph_clusters) + 1, inplace=True)

# make formatting
clusters['mod'] = clusters['mod'].astype(np.int)

# add in walktrap clusters
clusters['wt'] = walktrap_clusters
clusters['wt'].fillna(max(walktrap_clusters) + 1, inplace=True)
clusters['wt'] = clusters['wt'].astype(np.int)

# add in NLP clusters
clusters['km_10'] = nlp_tfidf_clusters
clusters['km_100'] = nlp_tfidf_clusters2
#clusters['km_1000'] = nlp_tfidf_clusters3

#clusters['gmm_10'] = gmm_clusters
#clusters['gmm_100'] = gmm_clusters2
#clusters['gmm_1000'] = gmm_clusters3

#clusters['hc_10'] = hc_clusters
#clusters['hc_100'] = hc_clusters2
#clusters['hc_1000'] = hc_clusters3

clusters.to_csv(csv_dir + "clusters_full_tfidf.csv")

In [12]:
clusters

Unnamed: 0,mod,wt,km_10,km_100
145658,1,5,8,76
89370,3,294,8,76
89371,0,35,0,43
89372,0,3,8,59
89373,0,3,8,11
89374,2,4,4,31
89375,2,5,7,3
89376,2,6,9,19
89377,2,7,8,69
89378,2,7,1,30


### load cluster csv saved in current directory

In [10]:
clusters = pd.read_csv(csv_dir + 'clusters_full_tfidf.csv')
clusters

Unnamed: 0.1,Unnamed: 0,mod,wt,km_10,km_100
0,145658,1,5,8,76
1,89370,3,294,8,76
2,89371,0,35,0,43
3,89372,0,3,8,59
4,89373,0,3,8,11
5,89374,2,4,4,31
6,89375,2,5,7,3
7,89376,2,6,9,19
8,89377,2,7,8,69
9,89378,2,7,1,30


# Modularity Scores of Clusters

In [15]:
clusters_strings = ['mod', 'wt', 'km_10', 'km_100']
mod_scores = [graph_clusters_mod_score, walktrap_clusters_mod_score, nlp_tfidf_clusters_mod_score, nlp_tfidf_clusters_mod_score2
             ]

clusters = pd.DataFrame(index=clusters_strings)
clusters['modularity_score'] = mod_scores

clusters.to_csv(csv_dir + "clusters_full_tfidf_mod_score.csv")

In [16]:
clusters_mod_score = pd.read_csv(csv_dir + 'clusters_full_tfidf_mod_score.csv')
clusters_mod_score

Unnamed: 0.1,Unnamed: 0,modularity_score
0,mod,0.465098
1,wt,0.504445
2,km_10,0.000576
3,km_100,0.000288


# normalized_mutual_information score (nmi) of 
## 1. modularity vs. walktrap
## 2. every combination of nlp_clusters
note: can't do nmi(graph_cluster, nlp_cluster) because len(graph_cluster) != len(nlp_cluster)  
http://scikit-learn.org/stable/modules/classes.html#clustering-metrics

In [23]:
string_list = ["km_10", "km_100"]
clusters_list = [nlp_tfidf_clusters, nlp_tfidf_clusters2]

nums = range(0,2)
combs = list(combinations(nums,2))

'''
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import mutual_info_score as mi
from sklearn.metrics import adjusted_rand_score as ar
from sklearn.metrics import calinski_harabaz_score as ch # (X, labels)
from sklearn.metrics import completeness_score as cs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import fowlkes_mallows_score as fm
from sklearn.metrics import homogeneity_completeness_v_measure as hcvm
from sklearn.metrics import homogeneity_score as hs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import silhouette_score as ss # (X, labels)
from sklearn.metrics import silhouette_samples as ss2 # (X, labels)
from sklearn.metrics import v_measure_score as vm
'''

combinations_list = []
nmi_scores = []
ami_scores = []
mi_scores = []
ar_scores = []
#ch_scores = []
cs_scores = []
fm_scores = []
#hcvm_scores = []
hs_scores = []
#ss_scores = []
#ss2_scores = []
vm_scores = []

combinations_list.append("mod vs. wt")
nmi_scores.append(nmi(graph_clusters, walktrap_clusters))
ami_scores.append(ami(graph_clusters, walktrap_clusters))
mi_scores.append(mi(graph_clusters, walktrap_clusters))
ar_scores.append(ar(graph_clusters, walktrap_clusters))
#ch_scores.append(ch(graph_clusters, walktrap_clusters))
cs_scores.append(cs(graph_clusters, walktrap_clusters))
fm_scores.append(fm(graph_clusters, walktrap_clusters))
#hcvm_scores.append(hcvm(graph_clusters, walktrap_clusters))
hs_scores.append(hs(graph_clusters, walktrap_clusters))
#ss_scores.append(ss(graph_clusters, walktrap_clusters))
#ss2_scores.append(ss2(graph_clusters, walktrap_clusters))
vm_scores.append(vm(graph_clusters, walktrap_clusters))

for i in combs:
    combinations_list.append(string_list[i[0]] + " vs. " + string_list[i[1]])
    nmi_scores.append(nmi(clusters_list[i[0]], clusters_list[i[1]]))
    ami_scores.append(ami(clusters_list[i[0]], clusters_list[i[1]]))
    mi_scores.append(mi(clusters_list[i[0]], clusters_list[i[1]]))
    ar_scores.append(ar(clusters_list[i[0]], clusters_list[i[1]]))
    #ch_scores.append(ch(clusters_list[i[0]], clusters_list[i[1]]))
    cs_scores.append(cs(clusters_list[i[0]], clusters_list[i[1]]))
    fm_scores.append(fm(clusters_list[i[0]], clusters_list[i[1]]))
    #hcvm_scores.append(hcvm(clusters_list[i[0]], clusters_list[i[1]]))
    hs_scores.append(hs(clusters_list[i[0]], clusters_list[i[1]]))
    #ss_scores.append(ss(clusters_list[i[0]], clusters_list[i[1]]))
    #ss2_scores.append(ss2(clusters_list[i[0]], clusters_list[i[1]]))
    vm_scores.append(vm(clusters_list[i[0]], clusters_list[i[1]]))
    
clusters = pd.DataFrame(index=combinations_list)
clusters["nmi_score"] = nmi_scores
clusters["ami_score"] = ami_scores
clusters["mi_score"] = mi_scores
clusters["ar_score"] = ar_scores
clusters["cs_score"] = cs_scores
clusters["fm_score"] = fm_scores
clusters["hs_score"] = hs_scores
clusters["vm_score"] = vm_scores
clusters.to_csv(csv_dir + "clusters_full_tfidf_nmi_score.csv")

In [24]:
clusters_nmi_score = pd.read_csv(csv_dir + 'clusters_full_tfidf_nmi_score.csv')
clusters_nmi_score

Unnamed: 0.1,Unnamed: 0,nmi_score,ami_score,mi_score,ar_score,cs_score,fm_score,hs_score,vm_score
0,mod vs. wt,0.38496,0.184582,1.001603,0.088432,0.222527,0.233443,0.665962,0.333588
1,km_10 vs. km_100,0.433438,0.29767,1.301529,0.108179,0.300318,0.220979,0.625565,0.405815
