In [10]:
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import *
from sklearn.cluster import KMeans
from itertools import combinations
import numpy as np

In [9]:
dim = 88
npt = 66
pts = np.random.randint(low=0, high=2, size=(npt, dim))
# pts
norm_pts = normalize(pts.astype(np.float))
dist = norm_pts.dot(norm_pts.T)
print 'average similarity:', np.mean(dist[np.triu_indices(npt,k=1)])

average similarity: 0.493664888871


In [3]:
def get_query_clusters(points, k):
    '''
    points [n,m] - array for n points with dimention m - encoded query
    '''
    # normalize input
    points = normalize(points.astype(np.float))
    # get similarity matrix (cosine distance)
    dist = points.dot(points.T)
    # initialize variables
    n_pt = len(points)
    cluster_old, cluster_new = np.ones(n_pt), np.zeros(n_pt)
    # special case, no clustering
    if k==1 or n_pt==1:        
        return np.zeros(n_pt), 1 if n_pt==1 else np.mean(dist[np.triu_indices(n_pt,k=1)])
    # randomly choose k starting centroids
    centroids = points[np.random.permutation(n_pt)[:k]]
    while not np.array_equal(cluster_old, cluster_new):
        cluster_old = cluster_new
        # get cluster index for each point
        cluster_new = np.argmax(points.dot(centroids.T), axis=1)
        # get new centroids, and within class mean distance/similarity
        centroids, in_dist = [], []
        for c in np.unique(cluster_new):
            pid = cluster_new==c
            # set new centroid as the one who has minimum total distance to rest of the points in the cluster
            cid = np.argmax(np.sum(dist[np.ix_(pid, pid)], axis=1))
            centroids.append(points[pid][cid])
            in_dist.append(1 if sum(pid)==1 else np.mean(dist[np.ix_(pid,pid)][np.triu_indices(sum(pid),k=1)]))
        centroids = np.array(centroids)
        # traditional way to get new centroid, not working well for cosine distance
#         centroids = normalize([np.mean(points[cluster_new==c], axis=0) for c in np.unique(cluster_new)])

    return cluster_new, np.mean(in_dist)


In [17]:
def query_characterizer(queries, similarity_limit = 0.9):
    '''
    queries - list of string for queries
    return - list of integers to indicate cluster for each query
    '''
    # vectorize queries
#     characterizer = CountVectorizer()
#     encoded_query = characterizer.fit_transform(queries)
    # find the optimal clusters based on minimum within cluster distance
    avg_sim, k = 0, 0
    while avg_sim < similarity_limit:
        k += 1
        clusters, avg_sim = get_query_clusters(queries, k)
        
    return clusters, k, avg_sim
    # for each cluster, assemble training points from its queries
    # for c in np.unique(clusters):


In [19]:
query_characterizer(pts,0.8)

(array([ 8, 20, 10, 14,  1,  8, 14, 15, 14,  6, 14, 18, 10, 10, 14,  3,  7,
         7, 10, 20, 12, 11,  9, 10, 14, 14,  5, 10, 10, 14,  0, 13, 17, 14,
        19, 10, 10, 10,  8,  8, 18, 15,  8, 13, 14,  9,  8, 10,  7, 13, 13,
        10, 10, 10,  9, 16,  7, 10, 10, 18, 10,  2, 10,  4, 10,  7], dtype=int64),
 21,
 0.81634124511092665)

In [8]:
# normalize(pts.astype(np.float))
get_query_clusters(pts,1)

(array([ 0.]), 1)

In [112]:
km = KMeans(20)
cluster = km.fit_predict(norm_pts)
in_dist = []
for c in np.unique(cluster):
    pid = cluster==c    
    in_dist.append(1 if sum(pid)==1 else np.mean(dist[np.ix_(pid,pid)][np.triu_indices(sum(pid),k=1)]))
print np.mean(in_dist)

0.644579618166


In [10]:
import pickle
savedData = 'UT_74_19243.pickle'
with open(savedData) as f: 
    feature, plan = pickle.load(f)
plan=np.array(plan)

In [1]:

# np.where(sim_click[0][0]=='68781UT0020016')[0][0]
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [18]:
# import numpy as np
n_query = 8
n_plan, n_fea = feature.shape
sim_click = np.array([[plan[np.random.permutation(n_plan)], 
                       plan[np.random.permutation(n_plan)[0:np.random.randint(low=1, high=10)]]] 
                      for i in range(n_query)])
q_cluster = np.random.random_integers(0,2,n_query)

In [42]:
%reload_ext autoreload
%autoreload 2
from get_rank_for_state_plan import *
cluster_rank = get_rank_for_state_plan(q_cluster, sim_click)

load 74 plans from feature data with dimension 19243
training started for 3 query clusters

getting training data from cluster 0 with 3 queries
query has 6 clicks
extracting feature for clicked plan 68781UT0020001
extracting feature for clicked plan 68781UT0030008
extracting feature for clicked plan 68781UT0020009
extracting feature for clicked plan 68781UT0010016
extracting feature for clicked plan 68781UT0020002
extracting feature for clicked plan 68781UT0030015
query has 8 clicks
extracting feature for clicked plan 68781UT0030009
extracting feature for clicked plan 18167UT0010002
extracting feature for clicked plan 68781UT0010017
extracting feature for clicked plan 68781UT0020001
extracting feature for clicked plan 68781UT0140005
extracting feature for clicked plan 56764UT0010002
extracting feature for clicked plan 68781UT0010008
extracting feature for clicked plan 68781UT0140010
query has 8 clicks
extracting feature for clicked plan 68781UT0020019
extracting feature for clicked pla

In [43]:
cluster_rank

array([[ 0.22835128,  0.73983946,  0.30144769,  0.21298428,  0.        ,
         0.36515739,  0.0931118 ,  0.83875313,  1.        ,  0.37856997,
         0.59644845,  0.52429864,  0.73983946,  0.52429864,  0.09456797,
         0.0931118 ,  0.52429864,  0.52429864,  0.51873839,  0.52429864,
         0.52429864,  0.66193672,  0.52429864,  0.66990897,  0.52429864,
         0.66990897,  0.51019228,  0.50954509,  0.52235708,  0.82526049,
         0.7360281 ,  0.59644845,  0.5958176 ,  0.66387829,  0.7360281 ,
         0.58485751,  0.66387829,  0.59775916,  0.64977193,  0.66387829,
         0.59775916,  0.7360281 ,  0.66387829,  0.58356314,  0.17657996,
         0.52235708,  0.59775916,  0.5836528 ,  0.44527786,  0.51019228,
         0.66387829,  0.59775916,  0.66990897,  0.51873839,  0.5836528 ,
         0.64912474,  0.51814962,  0.59775916,  0.45203049,  0.59775916,
         0.59775916,  0.64977193,  0.58300561,  0.5958176 ,  0.44527786,
         0.66193672,  0.58485751,  0.59775916,  0.6