In [2]:
# modify these for your own computer
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle  as pickle
from collections import OrderedDict


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import scipy.sparse
import random
import itertools


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' #tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' #bag-of-words matrix (and other info, i.e. vocab)
nlp_df_sub_dir = nlp_dir + 'bow_tfidf_df/'

# csv location
csv_dir = "C:/Users/Michael/Documents/GitHub/law-net/csv/"

# all the file paths for .txt files
file_paths = glob.glob(text_dir + '*.txt')

# all opinions
all_the_opinions = all_opinions(file_paths)

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

## load bow vectors
**bow_matrix** = (row_index, column_index): token/word-count value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in bow_matrix (correspond to column indices)

In [3]:
def load_bow(nlp_dir):
    """
    bow_matrix, op_id_to_bow_id = load_bow(nlp_dir)
    """
    bow_matrix = load_sparse_csr(nlp_dir + 'bag_of_words_matrix.npz')

    with open(nlp_dir + 'op_id_to_bow_id.p', 'rb') as f:
        op_id_to_bow_id = pickle.load(f)

    with open(nlp_dir + 'vocab.p', 'rb') as f:
        vocab = pickle.load(f)

    return bow_matrix, op_id_to_bow_id, vocab

In [4]:
bow_matrix, op_id_to_bow_id, vocab = load_bow(nlp_bow_dir)

In [10]:
bow_matrix

<27885x567570 sparse matrix of type '<type 'numpy.int64'>'
	with 20817470 stored elements in Compressed Sparse Row format>

# Clustering Work
focus on largest connected component on **undirected scotus**

In [6]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

# limit ourselves to cases upto and including 2015 since we are missing some textfiles from 2016
G = G.subgraph(G.vs.select(year_le=2015))

# make graph undirected
Gud = G.copy()
Gud = Gud.as_undirected()

# get largest connected componenet
components = Gud.clusters(mode='STRONG')
g = components.subgraphs()[np.argmax(components.sizes())]

# CL ids of cases in largest connected component
CLids = g.vs['name']

## modularity on undirected scotus
"For a given division of the network's vertices into some modules, modularity reflects the concentration of edges within modules compared with random distribution of links between all nodes regardless of modules"--*Wikipedia*

In [7]:
%%time 

# modularity clustering
cd_modularity = g.community_fastgreedy() # .as_clustering().membership

mod_clust = cd_modularity.as_clustering()

print mod_clust.summary()

# save clusters in pandas
graph_clusters = pd.Series(mod_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 126 clusters
Wall time: 1min 45s


## walktrap on undirected scotus

In [8]:
%%time

# walktrap clustering
cd_walktrap = g.community_walktrap()

wt_clust = cd_walktrap.as_clustering()

print wt_clust.summary()

# save clusters in pandas
walktrap_clusters = pd.Series(wt_clust.membership, index=g.vs['name'])

Clustering with 24724 elements and 2264 clusters
Wall time: 2min 47s


# NLP Clustering
K-Means and Gaussian Mixture Models (GMM)

# K-Means Clustering on bow

### K-means clustering with K = 10

In [12]:
%%time

# set number of clusters
num_clusters = 10

# run kmeans
km = KMeans(n_clusters=num_clusters)
km.fit(bow_matrix)

nlp_bow_clusters = km.labels_.tolist()

Wall time: 1h 28min 29s


### K-means clustering with K = 100

In [13]:
%%time

# set number of clusters
num_clusters = 100

# run kmeans
km2 = KMeans(n_clusters=num_clusters)
km2.fit(bow_matrix)

nlp_bow_clusters2 = km2.labels_.tolist()

Wall time: 7h 46min 25s


### K-means clustering with K = 1000

In [None]:
%%time

# set number of clusters
num_clusters = 1000

# run kmeans
km3 = KMeans(n_clusters=num_clusters)
km3.fit(bow_matrix)

nlp_bow_clusters3 = km3.labels_.tolist()

# Gaussian-Mixture-Model (GMM) Clustering on bow

### GMM Clustering with K = 10

In [14]:
%%time

# set number of components
num_components = 10

bow_matrix_dense = bow_matrix.todense()

# run GMM
gmm = GaussianMixture(n_components=num_components)
gmm.fit(bow_matrix_dense)

gmm_clusters = gmm.predict(bow_matrix_dense).tolist()
gmm_clusters = map(int, gmm_clusters)

MemoryError: 

### GMM Clustering with K = 100 

In [62]:
%%time

# set number of components
num_components = 100

bow_matrix_dense = bow_matrix.todense()

# run GMM
gmm2 = GaussianMixture(n_components=num_components)
gmm2.fit(bow_matrix_dense)

gmm_clusters2 = gmm2.predict(bow_matrix_dense).tolist()
gmm_clusters2 = map(int, gmm_clusters2)

Wall time: 2.06 s


### GMM Clustering with K = 1000

In [63]:
%%time

# set number of components
num_components = 1000

bow_matrix_dense = bow_matrix.todense()

# run GMM
gmm3 = GaussianMixture(n_components=num_components)
gmm3.fit(bow_matrix_dense)

gmm_clusters3 = gmm3.predict(bow_matrix_dense).tolist()
gmm_clusters3 = map(int, gmm_clusters3)

Wall time: 29.2 s


## Compare NLP clustering (bow) vs graph clustering

In [15]:
#clusters = pd.DataFrame(index=normalized_text_dict.keys(), columns=['nlp', 'graph'])
clusters = pd.DataFrame(index=op_id_to_bow_id, columns=[])

# add in communities 
clusters['mod'] = graph_clusters

# consider nodes not considered in CD to be their own cluster
# i.e. nodes outside the largest connected component
clusters['mod'].fillna(max(graph_clusters) + 1, inplace=True)

# make formatting
clusters['mod'] = clusters['mod'].astype(np.int)

# add in walktrap clusters
clusters['wt'] = walktrap_clusters
clusters['wt'].fillna(max(walktrap_clusters) + 1, inplace=True)
clusters['wt'] = clusters['wt'].astype(np.int)

# add in NLP clusters
clusters['km_10'] = nlp_bow_clusters
clusters['km_100'] = nlp_bow_clusters2
#clusters['km_1000'] = nlp_bow_clusters3

#clusters['gmm_10'] = gmm_clusters
#clusters['gmm_100'] = gmm_clusters2
#clusters['gmm_1000'] = gmm_clusters3

#clusters.to_csv("clusters_NMF.csv")
clusters.to_csv(csv_dir + "clusters_full_bow.csv")

In [16]:
clusters

Unnamed: 0,mod,wt,km_10,km_100
145658,1,5,0,32
89370,3,294,0,32
89371,0,35,0,32
89372,0,3,0,32
89373,0,3,0,32
89374,2,4,8,53
89375,2,5,0,20
89376,2,6,0,32
89377,2,7,0,32
89378,2,7,0,20


### load cluster csv saved in current directory

In [17]:
clusters = pd.read_csv(csv_dir + 'clusters_full_bow.csv')
clusters

Unnamed: 0.1,Unnamed: 0,mod,wt,km_10,km_100
0,145658,1,5,0,32
1,89370,3,294,0,32
2,89371,0,35,0,32
3,89372,0,3,0,32
4,89373,0,3,0,32
5,89374,2,4,8,53
6,89375,2,5,0,20
7,89376,2,6,0,32
8,89377,2,7,0,32
9,89378,2,7,0,20
