In [1]:
# modify these for your own computer
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle  as pickle
from collections import OrderedDict


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import mutual_info_score as mi
from sklearn.metrics import adjusted_rand_score as ar
from sklearn.metrics import calinski_harabaz_score as ch # (X, labels)
from sklearn.metrics import completeness_score as cs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import fowlkes_mallows_score as fm
from sklearn.metrics import homogeneity_completeness_v_measure as hcvm
from sklearn.metrics import homogeneity_score as hs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import silhouette_score as ss # (X, labels)
from sklearn.metrics import silhouette_samples as ss2 # (X, labels)
from sklearn.metrics import v_measure_score as vm

import scipy.sparse
import random
import itertools
from itertools import combinations


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' #tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' #bag-of-words matrix (and other info, i.e. vocab)
nlp_df_sub_dir = nlp_dir + 'bow_tfidf_df/'

# csv location
csv_dir = "C:/Users/Michael/Documents/GitHub/law-net/csv/"

# all the file paths for .txt files
file_paths = glob.glob(text_dir + '*.txt')

# all opinions
all_the_opinions = all_opinions(file_paths)

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

## load tf-idf vectors
**tfidf_matrix** = (row_index, column_index): tf_idf value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in tfidf_matrix (correspond to column indices)

In [2]:
tfidf_matrix, op_id_to_bow_id, vocab = load_tf_idf(nlp_sub_dir)

## TruncatedSVD of tf-idf matrix
Note that this rank reduction is essentially the same as doing Principal Component Analysis (PCA) on the matrix A, except that PCA mean-center the columns before performing SVD. PCA loses the sparseness of the A matrix, which can make it infeasible for large lexicons.

In [3]:
%%time

#svd = TruncatedSVD(n_components=500)
svd = TruncatedSVD(n_components=500, algorithm='arpack')
tfidf_matrix = svd.fit_transform(tfidf_matrix)

Wall time: 5min 8s


In [121]:
tfidf_matrix.shape

(27885L, 500L)

# save

In [122]:
#np.save('C:/Users/Michael/Desktop/network_data/scotus/nlp/svd_500/svd_500.npy',tfidf_matrix)

# load

In [3]:
tfidf_matrix = np.load('C:/Users/Michael/Desktop/network_data/scotus/nlp/svd_500/svd_500.npy')
print tfidf_matrix.shape

(27885L, 500L)


# Clustering Work
focus on largest connected component on **undirected scotus**

In [4]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

# limit ourselves to cases upto and including 2015 since we are missing some textfiles from 2016
G = G.subgraph(G.vs.select(year_le=2015))

# make graph undirected
Gud = G.copy()
Gud = Gud.as_undirected()

# get largest connected componenet
components = Gud.clusters(mode='STRONG')
g = components.subgraphs()[np.argmax(components.sizes())]

# CL ids of cases in largest connected component
CLids = g.vs['name']

## modularity on undirected scotus
"For a given division of the network's vertices into some modules, modularity reflects the concentration of edges within modules compared with random distribution of links between all nodes regardless of modules"--*Wikipedia*

In [5]:
%%time 

# modularity clustering
cd_modularity = g.community_fastgreedy() # .as_clustering().membership

mod_clust = cd_modularity.as_clustering()

print mod_clust.summary()

# save clusters in pandas
graph_clusters = pd.Series(mod_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 126 clusters
Wall time: 1min 51s


In [6]:
graph_clusters_mod_score = g.modularity(mod_clust.membership)

print graph_clusters_mod_score

0.465097751507


## walktrap on undirected scotus

In [7]:
%%time

# walktrap clustering
cd_walktrap = g.community_walktrap()

wt_clust = cd_walktrap.as_clustering()

print wt_clust.summary()

# save clusters in pandas
walktrap_clusters = pd.Series(wt_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 2264 clusters
Wall time: 2min 51s


In [8]:
walktrap_clusters_mod_score = g.modularity(wt_clust.membership)

print walktrap_clusters_mod_score

0.504445233426


# NLP Clustering
K-Means, Gaussian Mixture Models (GMM), Hierarchical Clustering

## K-Means Clustering on tf-idf (truncated SVD)

### K-means clustering with K = 10

In [12]:
%%time

# set number of clusters
num_clusters = 10

# run kmeans
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

nlp_tfidf_clusters = km.labels_.tolist()

Wall time: 35.6 s


In [10]:
nlp_tfidf_clusters = clusters['km_10'].tolist()

In [11]:
# modularity score of cluster
nlp_tfidf_clusters_mod_score = g.modularity(nlp_tfidf_clusters)
print nlp_tfidf_clusters_mod_score

0.000396743831757


### K-means clustering with K = 100

In [27]:
%%time

# set number of clusters
num_clusters = 100

# run kmeans
km2 = KMeans(n_clusters=num_clusters)
km2.fit(tfidf_matrix)

nlp_tfidf_clusters2 = km2.labels_.tolist()

Wall time: 2min 26s


In [12]:
nlp_tfidf_clusters2 = clusters['km_100'].tolist()

In [13]:
# modularity score of cluster
nlp_tfidf_clusters_mod_score2 = g.modularity(nlp_tfidf_clusters2)
print nlp_tfidf_clusters_mod_score2

-3.78886635613e-05


### K-means clustering with K = 1000

In [30]:
%%time

# set number of clusters
num_clusters = 1000

# run kmeans
km3 = KMeans(n_clusters=num_clusters)
km3.fit(tfidf_matrix)

nlp_tfidf_clusters3 = km3.labels_.tolist()

Wall time: 15min 8s


In [14]:
nlp_tfidf_clusters3 = clusters['km_1000'].tolist()

In [15]:
# modularity score of cluster
nlp_tfidf_clusters_mod_score3 = g.modularity(nlp_tfidf_clusters3)
print nlp_tfidf_clusters_mod_score3

2.26442474939e-05


## GMM Clustering on tf-idf (truncated SVD)

### GMM Clustering with K = 10

In [32]:
%%time

# set number of components
num_components = 10

# run GMM
gmm = GaussianMixture(n_components=num_components)
gmm.fit(tfidf_matrix)

gmm_clusters = gmm.predict(tfidf_matrix).tolist()
gmm_clusters = map(int, gmm_clusters)

Wall time: 8min 45s


In [16]:
gmm_clusters = clusters['gmm_10'].tolist()

In [17]:
# modularity score of cluster
gmm_clusters_mod_score = g.modularity(gmm_clusters)
print gmm_clusters_mod_score

0.000294572155876


### GMM Clustering with K = 100 

In [35]:
%%time

# set number of components
num_components = 100

# run GMM
gmm2 = GaussianMixture(n_components=num_components)
gmm2.fit(tfidf_matrix)

gmm_clusters2 = gmm2.predict(tfidf_matrix).tolist()
gmm_clusters2 = map(int, gmm_clusters2)

Wall time: 19min 9s


In [18]:
gmm_clusters2 = clusters['gmm_100'].tolist()

In [19]:
# modularity score of cluster
gmm_clusters_mod_score2 = g.modularity(gmm_clusters2)
print gmm_clusters_mod_score2

8.92461841005e-05


### GMM Clustering with K = 1000

In [37]:
%%time

# set number of components
num_components = 1000

# run GMM
gmm3 = GaussianMixture(n_components=num_components)
gmm3.fit(tfidf_matrix)

gmm_clusters3 = gmm3.predict(tfidf_matrix).tolist()
gmm_clusters3 = map(int, gmm_clusters3)

Wall time: 53min 10s


In [20]:
gmm_clusters3 = clusters['gmm_1000'].tolist()

In [21]:
# modularity score of cluster
gmm_clusters_mod_score3 = g.modularity(gmm_clusters3)
print gmm_clusters_mod_score3

-0.00010331872288


## Hierarchical Clustering on tf-idf (truncated SVD)

### Hierarchical Clustering with K = 10

In [39]:
%%time

# set number of components
num_clusters = 10

# run Hierarchical Clustering (Agglomerative with linkage = 'ward', 'complete', 'average')
'''
1. ward: minimizes the variance of the clusters being merged.
2. average: uses the average of the distances of each observation of the two sets.
3. complete or maximum: uses the maximum distances between all observations of the two sets

note:
manhattan/L1 distance is often good for sparse features, or sparse noise: 
    i.e. many of the features are zero, as in text mining using occurences of rare words.
'''

hc = AgglomerativeClustering(n_clusters = num_clusters, affinity='euclidean', linkage='ward')
hc_clusters = hc.fit_predict(tfidf_matrix).tolist()
hc_clusters = map(int, hc_clusters)

Wall time: 6min 6s


In [22]:
hc_clusters = clusters['hc_10'].tolist()

In [23]:
# modularity score of cluster
hc_clusters_mod_score = g.modularity(hc_clusters)
print hc_clusters_mod_score

3.36681130488e-05


### Hierarchical Clustering with K = 100

In [41]:
%%time

# set number of components
num_clusters = 100

# run Hierarchical Clustering (Agglomerative with linkage = 'ward', 'complete', 'average')
'''
1. ward: minimizes the variance of the clusters being merged.
2. average: uses the average of the distances of each observation of the two sets.
3. complete or maximum: uses the maximum distances between all observations of the two sets

note:
manhattan/L1 distance is often good for sparse features, or sparse noise: 
    i.e. many of the features are zero, as in text mining using occurences of rare words.
'''

hc2 = AgglomerativeClustering(n_clusters = num_clusters, affinity='euclidean', linkage='ward')
hc_clusters2 = hc2.fit_predict(tfidf_matrix).tolist()
hc_clusters2 = map(int, hc_clusters2)

Wall time: 4min 44s


In [24]:
hc_clusters2 = clusters['hc_100'].tolist()

In [25]:
# modularity score of cluster
hc_clusters_mod_score2 = g.modularity(hc_clusters2)
print hc_clusters_mod_score2

0.00028060785358


### Hierarchical Clustering with K = 1000

In [43]:
%%time

# set number of components
num_clusters = 1000

# run Hierarchical Clustering (Agglomerative with linkage = 'ward', 'complete', 'average')
'''
1. ward: minimizes the variance of the clusters being merged.
2. average: uses the average of the distances of each observation of the two sets.
3. complete or maximum: uses the maximum distances between all observations of the two sets

note:
manhattan/L1 distance is often good for sparse features, or sparse noise: 
    i.e. many of the features are zero, as in text mining using occurences of rare words.
'''

hc3 = AgglomerativeClustering(n_clusters = num_clusters, affinity='euclidean', linkage='ward')
hc_clusters3 = hc3.fit_predict(tfidf_matrix).tolist()
hc_clusters3 = map(int, hc_clusters3)

Wall time: 4min 48s


In [26]:
hc_clusters3 = clusters['hc_1000'].tolist()

In [27]:
# modularity score of cluster
hc_clusters_mod_score3 = g.modularity(hc_clusters3)
print hc_clusters_mod_score3

-0.000106095027357


## Compare NLP clustering (tfidf) vs graph clustering

In [50]:
#clusters = pd.DataFrame(index=normalized_text_dict.keys(), columns=['nlp', 'graph'])
clusters = pd.DataFrame(index=op_id_to_bow_id, columns=[])

# add in communities 
clusters['mod'] = graph_clusters

# consider nodes not considered in CD to be their own cluster
# i.e. nodes outside the largest connected component
clusters['mod'].fillna(max(graph_clusters) + 1, inplace=True)

# make formatting
clusters['mod'] = clusters['mod'].astype(np.int)

# add in walktrap clusters
clusters['wt'] = walktrap_clusters
clusters['wt'].fillna(max(walktrap_clusters) + 1, inplace=True)
clusters['wt'] = clusters['wt'].astype(np.int)

# add in NLP clusters
clusters['km_10'] = nlp_tfidf_clusters
clusters['km_100'] = nlp_tfidf_clusters2
clusters['km_1000'] = nlp_tfidf_clusters3

clusters['gmm_10'] = gmm_clusters
clusters['gmm_100'] = gmm_clusters2
clusters['gmm_1000'] = gmm_clusters3

clusters['hc_10'] = hc_clusters
clusters['hc_100'] = hc_clusters2
clusters['hc_1000'] = hc_clusters3

clusters.to_csv(csv_dir + "clusters_SVD.csv")

### load cluster csv saved in current directory

In [9]:
clusters = pd.read_csv(csv_dir + 'clusters_SVD.csv')
clusters

Unnamed: 0.1,Unnamed: 0,mod,wt,km_10,km_100,km_1000,gmm_10,gmm_100,gmm_1000,hc_10,hc_100,hc_1000
0,145658,1,5,0,1,224,3,54,392,1,50,157
1,89370,3,294,0,68,569,3,95,830,1,50,157
2,89371,0,35,9,85,886,8,47,711,2,94,697
3,89372,0,3,0,70,242,3,40,443,1,37,193
4,89373,0,3,0,22,897,0,64,213,1,65,84
5,89374,2,4,6,48,803,4,9,793,2,94,817
6,89375,2,5,5,17,260,9,56,260,3,48,999
7,89376,2,6,2,96,566,4,37,736,5,88,845
8,89377,2,7,0,60,677,8,71,375,1,20,146
9,89378,2,7,1,23,891,3,37,517,5,15,116


# Modularity Scores of Clusters

In [32]:
clusters_strings = ['mod', 'wt', 'km_10', 'km_100', 'km_1000', 'gmm_10', 'gmm_100', 'gmm_1000', 'hc_10', 'hc_100', 'hc_1000']
mod_scores = [graph_clusters_mod_score, walktrap_clusters_mod_score, nlp_tfidf_clusters_mod_score, nlp_tfidf_clusters_mod_score2,
             nlp_tfidf_clusters_mod_score3, gmm_clusters_mod_score, gmm_clusters_mod_score2, gmm_clusters_mod_score3,
             hc_clusters_mod_score, hc_clusters_mod_score2, hc_clusters_mod_score3]

clusters = pd.DataFrame(index=clusters_strings)
clusters['modularity_score'] = mod_scores

clusters.to_csv(csv_dir + "clusters_SVD_mod_score.csv")

In [33]:
clusters_mod_score = pd.read_csv(csv_dir + 'clusters_SVD_mod_score.csv')
clusters_mod_score

Unnamed: 0.1,Unnamed: 0,modularity_score
0,mod,0.465098
1,wt,0.504445
2,km_10,0.000397
3,km_100,-3.8e-05
4,km_1000,2.3e-05
5,gmm_10,0.000295
6,gmm_100,8.9e-05
7,gmm_1000,-0.000103
8,hc_10,3.4e-05
9,hc_100,0.000281


# normalized_mutual_information score (nmi) of 
## 1. modularity vs. walktrap
## 2. every combination of nlp_clusters
note: can't do nmi(graph_cluster, nlp_cluster) because len(graph_cluster) != len(nlp_cluster)  
http://scikit-learn.org/stable/modules/classes.html#clustering-metrics

In [30]:
string_list = ["km_10", "km_100", "km_1000", "gmm_10", "gmm_100", "gmm_1000", 
               "hc_10", "hc_100", "hc_1000"]
clusters_list = [nlp_tfidf_clusters, nlp_tfidf_clusters2, nlp_tfidf_clusters3,
                gmm_clusters, gmm_clusters2, gmm_clusters3, hc_clusters, hc_clusters2, hc_clusters3]

nums = range(0,9)
combs = list(combinations(nums,2))

'''
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import adjusted_mutual_info_score as ami
from sklearn.metrics import mutual_info_score as mi
from sklearn.metrics import adjusted_rand_score as ar
from sklearn.metrics import calinski_harabaz_score as ch # (X, labels)
from sklearn.metrics import completeness_score as cs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import fowlkes_mallows_score as fm
from sklearn.metrics import homogeneity_completeness_v_measure as hcvm
from sklearn.metrics import homogeneity_score as hs # metric isn't symmetric (labels_true, labels_predicted)
from sklearn.metrics import silhouette_score as ss # (X, labels)
from sklearn.metrics import silhouette_samples as ss2 # (X, labels)
from sklearn.metrics import v_measure_score as vm
'''

combinations_list = []
nmi_scores = []
ami_scores = []
mi_scores = []
ar_scores = []
#ch_scores = []
cs_scores = []
fm_scores = []
#hcvm_scores = []
hs_scores = []
#ss_scores = []
#ss2_scores = []
vm_scores = []

combinations_list.append("mod vs. wt")
nmi_scores.append(nmi(graph_clusters, walktrap_clusters))
ami_scores.append(ami(graph_clusters, walktrap_clusters))
mi_scores.append(mi(graph_clusters, walktrap_clusters))
ar_scores.append(ar(graph_clusters, walktrap_clusters))
#ch_scores.append(ch(graph_clusters, walktrap_clusters))
cs_scores.append(cs(graph_clusters, walktrap_clusters))
fm_scores.append(fm(graph_clusters, walktrap_clusters))
#hcvm_scores.append(hcvm(graph_clusters, walktrap_clusters))
hs_scores.append(hs(graph_clusters, walktrap_clusters))
#ss_scores.append(ss(graph_clusters, walktrap_clusters))
#ss2_scores.append(ss2(graph_clusters, walktrap_clusters))
vm_scores.append(vm(graph_clusters, walktrap_clusters))

for i in combs:
    combinations_list.append(string_list[i[0]] + " vs. " + string_list[i[1]])
    nmi_scores.append(nmi(clusters_list[i[0]], clusters_list[i[1]]))
    ami_scores.append(ami(clusters_list[i[0]], clusters_list[i[1]]))
    mi_scores.append(mi(clusters_list[i[0]], clusters_list[i[1]]))
    ar_scores.append(ar(clusters_list[i[0]], clusters_list[i[1]]))
    #ch_scores.append(ch(clusters_list[i[0]], clusters_list[i[1]]))
    cs_scores.append(cs(clusters_list[i[0]], clusters_list[i[1]]))
    fm_scores.append(fm(clusters_list[i[0]], clusters_list[i[1]]))
    #hcvm_scores.append(hcvm(clusters_list[i[0]], clusters_list[i[1]]))
    hs_scores.append(hs(clusters_list[i[0]], clusters_list[i[1]]))
    #ss_scores.append(ss(clusters_list[i[0]], clusters_list[i[1]]))
    #ss2_scores.append(ss2(clusters_list[i[0]], clusters_list[i[1]]))
    vm_scores.append(vm(clusters_list[i[0]], clusters_list[i[1]]))
    
clusters = pd.DataFrame(index=combinations_list)
clusters["nmi_score"] = nmi_scores
clusters["ami_score"] = ami_scores
clusters["mi_score"] = mi_scores
clusters["ar_score"] = ar_scores
clusters["cs_score"] = cs_scores
clusters["fm_score"] = fm_scores
clusters["hs_score"] = hs_scores
clusters["vm_score"] = vm_scores
clusters.to_csv(csv_dir + "clusters_SVD_nmi_score.csv")

In [31]:
clusters_nmi_score = pd.read_csv(csv_dir + 'clusters_SVD_nmi_score.csv')
clusters_nmi_score

Unnamed: 0.1,Unnamed: 0,nmi_score,ami_score,mi_score,ar_score,cs_score,fm_score,hs_score,vm_score
0,mod vs. wt,0.38496,0.184582,1.001603,0.088432,0.222527,0.233443,0.665962,0.333588
1,km_10 vs. km_100,0.454223,0.311113,1.376554,0.103153,0.313664,0.218541,0.657772,0.424772
2,km_10 vs. km_1000,0.404013,0.204428,1.513594,0.011007,0.225683,0.07026,0.723255,0.344019
3,km_10 vs. gmm_10,0.456354,0.445125,0.978323,0.311404,0.445492,0.404925,0.467481,0.456222
4,km_10 vs. gmm_100,0.449239,0.307338,1.362867,0.09695,0.309899,0.211639,0.651231,0.419955
5,km_10 vs. gmm_1000,0.403644,0.204352,1.511488,0.011024,0.225585,0.070054,0.722248,0.343791
6,km_10 vs. hc_10,0.384093,0.366202,0.767294,0.272394,0.402373,0.400728,0.366643,0.383678
7,km_10 vs. hc_100,0.362382,0.248962,1.091478,0.071048,0.251789,0.163128,0.521551,0.33962
8,km_10 vs. hc_1000,0.387812,0.194601,1.456224,0.010013,0.216139,0.066034,0.695841,0.329828
9,km_100 vs. km_1000,0.645515,0.416341,3.50208,0.10124,0.522174,0.184872,0.797989,0.631269
