In [1]:
# modify these for your own computer
repo_directory = '/Users/Michael/Documents/GitHub/law-net/'

data_dir = '/Users/Michael/Desktop/network_data/'

import os
import numpy as np
import re
import sys
import matplotlib.pyplot as plt
import glob
import cPickle  as pickle
from collections import OrderedDict


# graph package
import igraph as ig


# stat
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

import scipy.sparse
import random
import itertools


# our code
sys.path.append(repo_directory + 'code/')
from summarize_clusters import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from bag_of_words import * 

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
nlp_dir = subnet_dir + 'nlp/'
nlp_sub_dir = nlp_dir + 'bow_tfidf/' #tfidf matrix (and other info, i.e. vocab) computed from bag-of-words matrix
nlp_bow_dir = nlp_dir + 'bow/' #bag-of-words matrix (and other info, i.e. vocab)
nlp_df_sub_dir = nlp_dir + 'bow_tfidf_df/'

# csv location
csv_dir = "C:/Users/Michael/Documents/GitHub/law-net/csv/"

# all the file paths for .txt files
file_paths = glob.glob(text_dir + '*.txt')

# all opinions
all_the_opinions = all_opinions(file_paths)

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

## load tf-idf vectors
**tfidf_matrix** = (row_index, column_index): tf_idf value (**CSR FORMAT**)  
**op_id_to_bow_id** = opinion_id (corresponds to row indices)  
**vocab** = all the words in tfidf_matrix (correspond to column indices)

In [67]:
tfidf_matrix, op_id_to_bow_id, vocab = load_tf_idf(nlp_sub_dir)

In [68]:
tfidf_matrix

<27885x567570 sparse matrix of type '<type 'numpy.float64'>'
	with 20817470 stored elements in Compressed Sparse Row format>

### access small random subset of tfidf matrix (1000 x 100)

In [69]:
random_row_indices = [ i for i in sorted(random.sample(xrange(27885), 1000)) ]
random_column_indices =  [ i for i in sorted(random.sample(xrange(567570), 100)) ]

tfidf_matrix = tfidf_matrix[random_row_indices, :]
tfidf_matrix = tfidf_matrix[:, random_column_indices]
tfidf_matrix

<1000x100 sparse matrix of type '<type 'numpy.float64'>'
	with 31 stored elements in Compressed Sparse Row format>

### the random opinion id's

In [None]:
random_op_id = []
for op_id, bow_id in op_id_to_bow_id.iteritems():
    for rr_id in random_row_indices:
        if rr_id == bow_id:
            random_op_id.append(op_id)

# K-Means Clustering on tf-idf (random subset)

### K-means clustering with K = 10

In [56]:
%%time

# set number of clusters
num_clusters = 10

# run kmeans
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)

nlp_tfidf_clusters = km.labels_.tolist()

Wall time: 354 ms


### K-means clustering with K = 100

In [59]:
%%time

# set number of clusters
num_clusters = 100

# run kmeans
km2 = KMeans(n_clusters=num_clusters)
km2.fit(tfidf_matrix)

nlp_tfidf_clusters2 = km2.labels_.tolist()

Wall time: 1.01 s


### K-means clustering with K = 1000

In [60]:
%%time

# set number of clusters
num_clusters = 1000

# run kmeans
km3 = KMeans(n_clusters=num_clusters)
km3.fit(tfidf_matrix)

nlp_tfidf_clusters3 = km3.labels_.tolist()

Wall time: 9.39 s


# Gaussian-Mixture-Model (GMM) Clustering on tf-idf (random subset)

### GMM Clustering with K = 10

In [61]:
%%time

# set number of components
num_components = 10

tfidf_matrix_dense = tfidf_matrix.todense()

# run GMM
gmm = GaussianMixture(n_components=num_components)
gmm.fit(tfidf_matrix_dense)

gmm_clusters = gmm.predict(tfidf_matrix_dense).tolist()
gmm_clusters = map(int, gmm_clusters)

Wall time: 177 ms


### GMM Clustering with K = 100 

In [62]:
%%time

# set number of components
num_components = 100

tfidf_matrix_dense = tfidf_matrix.todense()

# run GMM
gmm2 = GaussianMixture(n_components=num_components)
gmm2.fit(tfidf_matrix_dense)

gmm_clusters2 = gmm2.predict(tfidf_matrix_dense).tolist()
gmm_clusters2 = map(int, gmm_clusters2)

Wall time: 2.06 s


### GMM Clustering with K = 1000

In [63]:
%%time

# set number of components
num_components = 1000

tfidf_maatrix_dense = tfidf_matrix.todense()

# run GMM
gmm3 = GaussianMixture(n_components=num_components)
gmm3.fit(tfidf_matrix_dense)

gmm_clusters3 = gmm3.predict(tfidf_matrix_dense).tolist()
gmm_clusters3 = map(int, gmm_clusters3)

Wall time: 29.2 s


## NLP Clustering (K-Means & GMM) on Random Subset

In [73]:
#clusters = pd.DataFrame(index=normalized_text_dict.keys(), columns=['nlp', 'graph'])
clusters = pd.DataFrame(index=random_op_id, columns=[])

# add in NLP clusters
clusters['km_10'] = nlp_tfidf_clusters
clusters['km_100'] = nlp_tfidf_clusters2
clusters['km_1000'] = nlp_tfidf_clusters3

clusters['gmm_10'] = gmm_clusters
clusters['gmm_100'] = gmm_clusters2
clusters['gmm_1000'] = gmm_clusters3

clusters.to_csv(csv_dir + "clusters_random_subset.csv")

In [74]:
clusters

Unnamed: 0,km_10,km_100,km_1000,gmm_10,gmm_100,gmm_1000
89375,0,0,0,0,0,4
97159,0,0,0,0,0,4
88022,0,0,0,0,0,4
100010,0,0,0,0,0,4
94541,0,0,0,0,0,4
97935,0,0,0,0,0,4
109857,0,0,0,0,0,4
103709,0,0,0,0,0,4
92749,0,0,0,0,0,4
85677,0,0,0,0,0,4


### load cluster csv saved in current directory

In [77]:
clusters = pd.read_csv(csv_dir + 'clusters_random_subset.csv')
clusters

Unnamed: 0.1,Unnamed: 0,km_10,km_100,km_1000,gmm_10,gmm_100,gmm_1000
0,89375,0,0,0,0,0,4
1,97159,0,0,0,0,0,4
2,88022,0,0,0,0,0,4
3,100010,0,0,0,0,0,4
4,94541,0,0,0,0,0,4
5,97935,0,0,0,0,0,4
6,109857,0,0,0,0,0,4
7,103709,0,0,0,0,0,4
8,92749,0,0,0,0,0,4
9,85677,0,0,0,0,0,4
