In [3]:
# modify these for your own computer
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle  as pickle
from collections import OrderedDict


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import scipy.sparse
import random
import itertools


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' #tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' #bag-of-words matrix (and other info, i.e. vocab)

nlp_df_sub_dir = nlp_dir + 'bow_tfidf_df/'

# all the file paths for .txt files
file_paths = glob.glob(text_dir + '*.txt')

# all opinions
all_the_opinions = all_opinions(file_paths)

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

## load tf-idf vectors (min_df = 0.3, max_df = 0.7)
**tfidf_matrix** = (row_index, column_index): tf_idf value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in tfidf_matrix (correspond to column indices)

In [3]:
tfidf_matrix, op_id_to_bow_id, vocab = load_tf_idf(nlp_df_sub_dir)

# Graph Clustering
focus on largest connected component on **undirected scotus**

In [4]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

# limit ourselves to cases upto and including 2015 since we are missing some textfiles from 2016
G = G.subgraph(G.vs.select(year_le=2015))

# make graph undirected
Gud = G.copy()
Gud = Gud.as_undirected()

# get largest connected componenet
components = Gud.clusters(mode='STRONG')
g = components.subgraphs()[np.argmax(components.sizes())]

# CL ids of cases in largest connected component
CLids = g.vs['name']

## modularity on undirected scotus
"For a given division of the network's vertices into some modules, modularity reflects the concentration of edges within modules compared with random distribution of links between all nodes regardless of modules"--*Wikipedia*

In [9]:
%%time 

# modularity clustering
cd_modularity = g.community_fastgreedy() # .as_clustering().membership

mod_clust = cd_modularity.as_clustering()

print mod_clust.summary()

# save clusters in pandas
graph_clusters = pd.Series(mod_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 126 clusters
Wall time: 1min 57s


## walktrap on undirected scotus

In [11]:
%%time

# walktrap clustering
cd_walktrap = g.community_walktrap()

wt_clust = cd_walktrap.as_clustering()

print wt_clust.summary()

# save clusters in pandas
walktrap_clusters = pd.Series(wt_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 2264 clusters
Wall time: 2min 51s


# NLP Clustering
K-Means and Gaussian Mixture Models (GMM)

## K-Means Clustering on tf-idf (tweaked df)

### K-means clustering with K = 10

In [13]:
%%time

# set number of clusters
num_clusters = 10

# run kmeans
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

nlp_tfidf_clusters = km.labels_.tolist()

Wall time: 37min 43s


### K-means clustering with K = 100

In [38]:
%%time

# set number of clusters
num_clusters = 100

# run kmeans
km2 = KMeans(n_clusters=num_clusters)
km2.fit(tfidf_matrix)

nlp_tfidf_clusters2 = km2.labels_.tolist()

Wall time: 50min 7s


## GMM Clustering on tf-idf (tweaked df)

### GMM Clustering with K = 10

In [17]:
%%time

# set number of components
num_components = 10

tfidf_matrix_dense = tfidf_matrix.todense()

# run GMM
gmm = GaussianMixture(n_components=num_components)
gmm.fit(tfidf_matrix_dense)

gmm_clusters = gmm.predict(tfidf_matrix_dense).tolist()
gmm_clusters = map(int, gmm_clusters)

Wall time: 8min 12s


### GMM Clustering with K = 100 

In [46]:
%%time

# set number of components
num_components = 100

tfidf_matrix_dense = tfidf_matrix.todense()

# run GMM
gmm2 = GaussianMixture(n_components=num_components)
gmm2.fit(tfidf_matrix_dense)

gmm_clusters2 = gmm2.predict(tfidf_matrix_dense).tolist()
gmm_clusters2 = map(int, gmm_clusters2)

Wall time: 5min 49s


## Compare NLP clustering (tfidf) vs graph clustering

In [49]:
#clusters = pd.DataFrame(index=normalized_text_dict.keys(), columns=['nlp', 'graph'])
clusters = pd.DataFrame(index=op_id_to_bow_id, columns=[])

# add in communities 
clusters['mod'] = graph_clusters

# consider nodes not considered in CD to be their own cluster
# i.e. nodes outside the largest connected component
clusters['mod'].fillna(max(graph_clusters) + 1, inplace=True)

# make formatting
clusters['mod'] = clusters['mod'].astype(np.int)

# add in walktrap clusters
clusters['wt'] = walktrap_clusters
clusters['wt'].fillna(max(walktrap_clusters) + 1, inplace=True)
clusters['wt'] = clusters['wt'].astype(np.int)

# add in NLP clusters
clusters['km_10'] = nlp_tfidf_clusters
clusters['km_100'] = nlp_tfidf_clusters2

clusters['gmm_10'] = gmm_clusters
clusters['gmm_100'] = gmm_clusters2

clusters.to_csv("clusters_tweak_df.csv")

In [50]:
clusters

Unnamed: 0,mod,wt,km_10,km_100,gmm_10,gmm_100
145658,1,5,3,48,8,71
89370,3,294,3,61,8,86
89371,0,35,9,41,9,7
89372,0,3,4,44,1,8
89373,0,3,0,54,3,36
89374,2,4,0,3,3,69
89375,2,5,2,95,1,23
89376,2,6,2,36,1,2
89377,2,7,7,60,6,58
89378,2,7,2,76,1,97


### load cluster csv saved in current directory

In [4]:
clusters = pd.read_csv('clusters_tweak_df.csv')
clusters

Unnamed: 0.1,Unnamed: 0,mod,wt,km_10,km_100,gmm_10,gmm_100
0,145658,1,5,3,48,8,71
1,89370,3,294,3,61,8,86
2,89371,0,35,9,41,9,7
3,89372,0,3,4,44,1,8
4,89373,0,3,0,54,3,36
5,89374,2,4,0,3,3,69
6,89375,2,5,2,95,1,23
7,89376,2,6,2,36,1,2
8,89377,2,7,7,60,6,58
9,89378,2,7,2,76,1,97
