In [34]:
import faiss
import json
import os
import numpy as np
from sklearn import metrics

In [26]:
# Keep only names that really end in ``.json``: a substring test would also
# accept entries such as ``pages.json.bak``. ``os.listdir`` returns names in
# arbitrary order, so sort them to make the load order reproducible.
# NOTE(review): INDIR is assigned in a later cell of this notebook, so that
# cell has to run before this one on a fresh kernel.
files_to_process = sorted(x for x in os.listdir(INDIR) if x.endswith('.json'))

def read_json_files(file_list, indir=None):
    """Load several JSON files and concatenate their top-level arrays.

    Parameters
    ----------
    file_list : iterable of str
        File names, each joined onto ``indir`` to build the path to open.
    indir : str, optional
        Directory that contains the files. When omitted, the module-level
        ``INDIR`` is used, which is what this function always did before
        the parameter existed.

    Returns
    -------
    list
        The items of every file's top-level array, in ``file_list`` order.

    Raises
    ------
    ValueError
        If a file's top-level JSON value is not an array, or (as
        ``json.JSONDecodeError``) if a file is not valid JSON.
    OSError
        If a file cannot be opened.
    """
    # Resolve the global lazily so callers that pass ``indir`` do not need it.
    base_dir = INDIR if indir is None else indir

    observation_list = []
    for filename in file_list:
        filepath = os.path.join(base_dir, filename)
        # Read as UTF-8 explicitly instead of depending on the platform locale.
        with open(filepath, 'r', encoding='utf-8') as filestream:
            loaded = json.load(filestream)
        if not isinstance(loaded, list):
            # Extending a list with a dict would silently add its keys.
            raise ValueError(
                'expected a JSON array in {0}, got {1}'.format(
                    filepath, type(loaded).__name__))
        observation_list.extend(loaded)

    return observation_list

# Read every selected file into one flat list; the bare name on the last
# line makes the notebook display the loaded records.
observation_list = read_json_files(file_list=files_to_process)
observation_list

[{'title': 'AccessibleComputing',
  'page_id': 10,
  'rev_id': 854851586,
  'redirect': 'Computer accessibility',
  'feature_vector': [0.2328696008771658,
   0.18088792636990547,
   -0.07624258100986481,
   0.1971982903778553,
   -0.24810440093278885,
   0.267140191514045,
   -0.15286043286323547,
   -0.2716864123940468,
   -0.3021771050989628,
   0.8540531396865845,
   0.2778720580972731,
   0.37713751196861267,
   -0.14474809914827347,
   -0.16984820365905762,
   0.5719110434874892,
   -0.26097454130649567,
   -0.1256435215473175,
   -0.20538362860679626,
   0.39435283839702606,
   0.2675931006669998,
   0.3183209244161844,
   -0.260308435652405,
   0.1797540783882141,
   -0.6213371306657791,
   -0.32131440937519073,
   -0.2787502743303776,
   0.7543615400791168,
   -0.3266957104206085,
   -0.26793961971998215,
   0.1971108578145504,
   -0.38973698019981384,
   -0.1932934895157814,
   0.18150914460420609,
   -0.7743642032146454,
   0.09696187824010849,
   0.38916249480098486,
   -0.6

In [30]:
def get_feature_vectors(observation_list):
    """Collect each observation's ``'feature_vector'`` into a float32 array.

    Parameters
    ----------
    observation_list : iterable of mapping
        Every item must provide a ``'feature_vector'`` entry.

    Returns
    -------
    numpy.ndarray
        The vectors in input order, converted to ``np.float32``.
    """
    vectors = []
    for record in observation_list:
        vectors.append(record['feature_vector'])
    return np.asarray(vectors, dtype=np.float32)

# Stack the per-observation vectors into one float32 array and display it.
feature_vector_list = get_feature_vectors(observation_list=observation_list)
feature_vector_list

array([[ 2.32869595e-01,  1.80887923e-01, -7.62425810e-02,
         1.97198287e-01, -2.48104393e-01,  2.67140180e-01,
        -1.52860433e-01, -2.71686405e-01, -3.02177101e-01,
         8.54053140e-01,  2.77872056e-01,  3.77137512e-01,
        -1.44748092e-01, -1.69848204e-01,  5.71911037e-01,
        -2.60974526e-01, -1.25643522e-01, -2.05383629e-01,
         3.94352853e-01,  2.67593086e-01,  3.18320930e-01,
        -2.60308444e-01,  1.79754078e-01, -6.21337116e-01,
        -3.21314394e-01, -2.78750271e-01,  7.54361510e-01,
        -3.26695710e-01, -2.67939627e-01,  1.97110862e-01,
        -3.89736980e-01, -1.93293482e-01,  1.81509137e-01,
        -7.74364233e-01,  9.69618782e-02,  3.89162481e-01,
        -6.91426337e-01, -1.00057885e-01, -5.95175743e-01,
         2.91188776e-01, -4.54773247e-01,  9.32065696e-02,
         3.25850248e-02, -1.07049525e-01, -2.24017307e-01,
         4.56562340e-01, -2.49534935e-01,  4.97676820e-01,
         3.45646143e-02, -4.97875184e-01],
       [-1.06

In [32]:
def cluster(feature_vector_list, k):
    """Run faiss k-means on the vectors and return one label per row.

    Parameters
    ----------
    feature_vector_list : numpy.ndarray
        Two-dimensional array; its second dimension is passed to faiss as
        the vector dimensionality.
    k : int
        Number of clusters requested.

    Returns
    -------
    The second element of the pair returned by ``Kmeans.assign`` (the
    cluster assignment of each input row).
    """
    dimension = feature_vector_list.shape[1]
    kmeans = faiss.Kmeans(dimension, k, spherical=True)
    kmeans.train(feature_vector_list)

    # assign() hands back a pair; the first element is not used here.
    _, cluster_assignments = kmeans.assign(feature_vector_list)
    return cluster_assignments

# Try the clustering helper with two clusters and display the labels.
cluster(feature_vector_list, k=2)

In [45]:
def determine_k(feature_vector_list, k_to_try):
    """Cluster once per candidate ``k`` and keep the best silhouette score.

    Parameters
    ----------
    feature_vector_list : array-like
        Vectors handed both to ``cluster`` and to
        ``metrics.silhouette_score``.
    k_to_try : iterable of int
        Candidate cluster counts, evaluated in iteration order.

    Returns
    -------
    dict
        ``'scores'``: mapping of each ``k`` to its silhouette score.
        ``'max_score'``: the highest score seen (``0`` if ``k_to_try`` is
        empty).
        ``'best_k'``: the first ``k`` that reached ``max_score`` (``None``
        if ``k_to_try`` is empty).
        ``'best_cluster_assignment'``: labels produced for ``best_k``
        (``None`` if ``k_to_try`` is empty).

    Notes
    -----
    NOTE(review): ``cluster`` in this notebook requests spherical k-means
    from faiss while the score below uses the Euclidean metric; confirm
    that this combination is intended.
    """
    # Stays 0 only when nothing is evaluated, matching the previous result
    # for an empty ``k_to_try``.
    max_score = 0
    best_k = None
    best_cluster_assignment = None
    scores = {}

    for k in k_to_try:
        cluster_assignments = cluster(feature_vector_list, k)
        score = metrics.silhouette_score(feature_vector_list,
                                         cluster_assignments,
                                         metric='euclidean')
        scores[k] = score
        # Silhouette scores range from -1 to 1, so comparing against an
        # initial 0 would reject every candidate when all scores are
        # non-positive. Always accept the first candidate instead.
        if best_k is None or score > max_score:
            max_score = score
            best_k = k
            best_cluster_assignment = cluster_assignments

    return {
        'scores': scores,
        'max_score': max_score,
        'best_k': best_k,
        'best_cluster_assignment': best_cluster_assignment,
    }
        
        
# Candidate cluster counts 2, 3 and 4 (the stop value 5 is exclusive).
k_to_try = range(2, 5)
determine_k(feature_vector_list=feature_vector_list, k_to_try=k_to_try)

{'scores': {2: 0.3162572, 3: 0.19870472, 4: 0.3683277},
 'max_score': 0.3683277,
 'best_k': 4,
 'best_cluster_assignment': array([1, 2, 0, 0, 0, 0, 0, 0, 0, 3, 3])}

In [60]:
# Directory whose entries are listed (via os.listdir) to find the input files.
# NOTE(review): this is an absolute, machine-specific path. Cells earlier in
# this notebook (the file listing and read_json_files) already read INDIR, so
# this assignment has to be executed before them on a fresh kernel.
INDIR = '/Users/klogg/dev/wiki-ltt-cluster/datasets/vectorized/'
# File that main() opens in 'w' mode and fills with the clustered observations.
# NOTE(review): also an absolute, machine-specific path.
outfile_path = '/Users/klogg/dev/wiki-ltt-cluster/datasets/clustered/cluster_test.json'

def _build_cluster_records(observation_list, cluster_assignments):
    """Pair each observation's metadata with its cluster label."""
    if cluster_assignments is None:
        raise ValueError(
            'no cluster assignment available: determine_k selected no k')
    if len(cluster_assignments) != len(observation_list):
        raise ValueError(
            'got {0} cluster assignments for {1} observations'.format(
                len(cluster_assignments), len(observation_list)))

    return [
        {
            'title': obs['title'],
            'page_id': obs['page_id'],
            'rev_id': obs['rev_id'],
            'redirect': obs['redirect'],
            # int() makes the label a plain Python int so json can encode it.
            'cluster_assignment': int(label),
        }
        for obs, label in zip(observation_list, cluster_assignments)
    ]


def main():
    """Cluster the observations found in ``INDIR`` and save the labels.

    Lists the ``.json`` files in the module-level ``INDIR``, loads them
    with ``read_json_files``, builds the feature array with
    ``get_feature_vectors`` and lets ``determine_k`` choose among 2, 3 and
    4 clusters. The result is written to the module-level ``outfile_path``
    as one JSON array holding, per observation, its ``title``,
    ``page_id``, ``rev_id``, ``redirect`` and ``cluster_assignment``.

    Raises
    ------
    ValueError
        If ``determine_k`` returns no assignment or one whose length does
        not match the number of observations.
    """
    # Match the extension exactly (a substring test also accepts names like
    # ``x.json.bak``) and sort, because os.listdir order is arbitrary.
    files_to_process = sorted(
        x for x in os.listdir(INDIR) if x.endswith('.json'))
    observation_list = read_json_files(files_to_process)
    feature_vector_list = get_feature_vectors(observation_list)

    k_to_try = range(2, 5, 1)
    result = determine_k(feature_vector_list, k_to_try)

    # Build everything before opening the file so that a failure cannot
    # leave a truncated output behind.
    records = _build_cluster_records(
        observation_list, result['best_cluster_assignment'])

    # One json.dump call emits a well-formed array; dumping the objects one
    # after another between '[' and ']' left out the separating commas.
    with open(outfile_path, 'w', encoding='utf-8') as outfile:
        json.dump(records, outfile)

# Run the pipeline defined above when this cell executes.
main()