# INDIVIDUAL CLUSTERING

In [1]:
import numpy as np
import datetime
import json
import pylab
import pandas as pd
import matplotlib.pyplot as plt

### Detalles

### Lista de Clientes

In [2]:
# Extraemos la lista de clientes sin repetir

def leer_data(path='./data/data.csv'):
    """Read the transactions table from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. The default is the path that used to be
        hard-coded, so a bare ``leer_data()`` call behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv`` with default options.
    """
    return pd.read_csv(path)

# Load the table once and keep one entry per distinct client id: the index of
# a per-client group size is exactly the set of group keys.
data = leer_data()
clientes = data.groupby('client_id').size().index

## Temporal TXs footprint

In [3]:
# File locations used by the rest of the notebook. Each derived file name is
# the previous one plus a suffix, so all of them sit next to the raw JSON.

file = 'U'
raw_data = './data/{}.json'.format(file)
individual_footprint = raw_data + '.individual_footprint'
individual_clusters = individual_footprint + '.clusters'
individual_labels = individual_footprint + '.labels'

### Funciones

In [4]:
def process_footprint(data,tests,log=False):
    """Fit one MiniBatchKMeans model for every candidate number of clusters.

    Parameters
    ----------
    data : sequence
        The feature vectors to cluster.
    tests : iterable
        Candidate values of k. Candidates larger than ``len(data)`` are
        skipped.
    log : bool, optional
        When true, print a timestamped line before each fit.

    Returns
    -------
    dict
        Maps each k that was tried to the dict returned by ``bench_k_means``
        for that fit.
    """
    import datetime
    from sklearn.cluster import MiniBatchKMeans

    n_samples = len(data)
    results = {}
    for k in tests:
        # Cannot ask for more clusters than there are samples.
        if k > n_samples:
            continue
        if log:
            print("%s: processing %s"%(datetime.datetime.now(),k))
        # random_state is fixed so repeated runs give the same clustering.
        model = MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=100,
                                n_init=10, max_no_improvement=10, verbose=0,
                                random_state=0)
        results[k] = bench_k_means(model, name="k-means++", data=data)
    return results

In [5]:
def compute_best_k(x,y,occurrencies, plot=False,points=1000,sf=0.9):
    """Pick the number of clusters at the elbow of an inertia-vs-k curve.

    Parameters
    ----------
    x : sequence
        The k values that were tried.
    y : sequence
        The inertia obtained for each k in ``x``.
    occurrencies : int
        Number of samples that were clustered; only used by the fallback
        rule when fewer than five k values are available.
    plot : bool, optional
        When true, draw the smoothed curve and mark the selected point with
        pylab.
    points : int, optional
        Number of evenly spaced positions at which the smoothed curve is
        evaluated.
    sf : float, optional
        Smoothing factor applied to the spline.

    Returns
    -------
    int or (int, module)
        The chosen k as a plain Python ``int``. With ``plot=True`` a tuple
        ``(k, pylab)`` is returned instead.
    """
    import numpy as np

    if len(x)<5:
        # Too few candidates to smooth a curve: fall back to the
        # sqrt(n / 2) rule of thumb, never going below one cluster.
        # The explicit float()/int() keeps the return type a plain int,
        # consistent with the spline branch, whatever round() yields for
        # numpy scalars.
        return max(1, int(round(float(np.sqrt(occurrencies/2)))))

    from scipy.interpolate import UnivariateSpline
    spl = UnivariateSpline(x, y)
    spl.set_smoothing_factor(sf)
    xs = np.linspace(min(x), max(x), points)
    ys = spl(xs)
    idx_better_k=get_change_point(xs, ys)
    best_k = int(np.round(xs[idx_better_k]))
    if plot:
        import pylab
        pylab.plot(xs,ys)

        pylab.scatter(xs[idx_better_k],ys[idx_better_k],s=20, marker='o')
        pylab.text(xs[idx_better_k],ys[idx_better_k],"bestK %s" %(np.round(xs[idx_better_k])))
        return best_k,pylab
    return best_k

In [6]:
def bench_k_means(estimator, name, data,distance_function=None):
    """Fit ``estimator`` on ``data`` and report its inertia and fit time.

    Parameters
    ----------
    estimator : object
        Anything with a ``fit`` method that sets an ``inertia_`` attribute.
    name : str
        Label for the run. It is accepted for interface compatibility but
        not used.
    data : sequence
        Samples passed to ``estimator.fit``.
    distance_function : object, optional
        When not ``None`` it is forwarded as the second positional argument
        of ``fit``.
        NOTE(review): for scikit-learn clusterers that slot is ``y`` --
        confirm this is what callers intend.

    Returns
    -------
    dict
        ``{'inertia': ..., 'duration': seconds spent in fit, 'estimator':
        the fitted estimator}``. No silhouette score is computed.
    """
    import time

    t0 = time.time()
    # Compare against None explicitly: truth-testing an array-like argument
    # raises, and a falsy-but-valid value would be silently dropped.
    if distance_function is not None:
        estimator.fit(data,distance_function)
    else:
        estimator.fit(data)

    inertia=estimator.inertia_
    duration=time.time() - t0
    return {'inertia':inertia,'duration':duration, 'estimator':estimator}

def get_change_point(x, y):
    """
         Choose the best K.

         The first and last points of the curve are joined by a segment; the
         index returned is that of the point lying farthest from it (the
         first such point when several are equally far).

         :: param x: list of K values
         :: param y: list of SSE values
         :: return: index into x / y of the selected point, 0 when x is empty
    """
    import math

    n_points = len(x)
    if n_points == 0:
        return 0

    # The chord endpoints do not change while scanning the curve.
    first = [x[0], y[0]]
    last = [x[n_points-1], y[len(y)-1]]

    best_index = 0
    best_dist = -float('infinity')
    for i in range(n_points):
        nearest = closest_point_on_segment(a=first, b=last, p=[x[i], y[i]])
        dist = math.sqrt((nearest[0]-x[i])**2 + (nearest[1]-y[i])**2)
        # Strict comparison keeps the earliest point on ties.
        if dist > best_dist:
            best_dist = dist
            best_index = i

    return best_index

def closest_point_on_segment(a, b, p):
    """Return the point of the segment from ``a`` to ``b`` nearest to ``p``.

    All three arguments are indexable with the x coordinate at position 0
    and the y coordinate at position 1.

    Returns ``p`` itself when ``a`` and ``b`` coincide, ``a`` or ``b``
    themselves when the projection of ``p`` falls outside the segment, and
    a new ``[x, y]`` list otherwise.
    """
    dx = b[0] - a[0]
    dy = b[1] - a[1]

    # Zero-length segment: there is no direction to project onto.
    if dx == 0 and dy == 0:
        return p

    # Position of the projection of p along the segment: 0 at a, 1 at b.
    u = ((p[0] - a[0]) * dx + (p[1] - a[1]) * dy) / (dx * dx + dy *  dy)
    if u < 0:
        return a
    if u > 1:
        return b
    return [a[0] + u * dx, a[1] + u * dy]
	

### Individual Clustering

In [7]:
def _cluster_and_export(customer_tag, rows, centroids_file, labels_file):
    """Cluster one customer's footprints and append the result to both outputs.

    Parameters
    ----------
    customer_tag : str
        Identifier written in the first column of every centroid line.
    rows : list
        Non-empty list of ``((uid, year, week), profile)`` tuples belonging
        to the customer.
    centroids_file, labels_file : writable text files
        Receive, respectively, one line per centroid and one line per input
        row (with the label assigned to it). Both are flushed before
        returning.

    Returns
    -------
    (int, int)
        Number of centroid lines and number of label lines written.
    """
    to_cluster = [profile for _, profile in rows]
    K = process_footprint(to_cluster, np.arange(1, len(to_cluster) + 1))

    # Choose k from the inertia obtained for every candidate.
    x = list(K.keys())
    y = [K[k]['inertia'] for k in K]
    best_k = compute_best_k(x, y, len(to_cluster))

    if best_k == 1:
        # Single cluster: the centroid is the mean profile and every row
        # gets label 0.
        cluster_centers_ = [np.average(to_cluster, axis=0)]
        labels_ = [0] * len(to_cluster)
    else:
        estimator = K[best_k]['estimator']
        cluster_centers_ = estimator.cluster_centers_
        labels_ = estimator.labels_

    # Export the centroids: customer_tag;cluster_id;centroid values
    for cluster_id, center in enumerate(cluster_centers_):
        centroids_file.write("%s;%s;%s\n" % (
            customer_tag, cluster_id, ';'.join([str(el) for el in center])))
    centroids_file.flush()

    # Export the original rows with their label: uid;year;week;cluster_id;profile
    for (key, profile), label in zip(rows, labels_):
        labels_file.write("%s;%s;%s;%s;%s\n" % (
            key[0], key[1], key[2], label, ';'.join([str(el) for el in profile])))
    labels_file.flush()

    return len(cluster_centers_), len(rows)


# Column names of a profile: d<day>t<slot>, days 0-6 within each of slots 0-3.
profile_columns = ';'.join('d%dt%d' % (day, slot) for slot in range(4) for day in range(7))

footprints_clustered = 0  # label lines written
footprints_clusters = 0   # centroid lines written
n_cliente = 0             # customers completed before the last one
contador = 0              # data rows read
temporal = 0              # customer changes seen in the input

# Rows of the customer currently being read. Deliberately not called `data`,
# so the DataFrame loaded earlier in the notebook is not overwritten.
client_rows = []

# Input rows are comma separated: fields 0, 1 and 2 are customer id, year and
# week, field 5 is parsed as `size`, fields 6 onwards are the profile values.
# NOTE(review): the original layout comment listed only five leading fields
# (customer_id;year;week;profile_id;size) -- confirm against the writer of
# this file.
# Rows of one customer are assumed to be contiguous -- TODO confirm.
with open(individual_footprint) as f, \
        open(individual_clusters, 'w') as fw, \
        open(individual_labels, 'w') as fw2:
    fw.write('customer_tag;individual_cluster;' + profile_columns + '\n')
    fw2.write('customer_tag;year;week;individual_cluster;' + profile_columns + '\n')

    f.readline()  # skip the header line
    for row in f:
        row = row.strip()
        if not row:
            continue  # tolerate blank lines (e.g. a trailing newline)
        row = row.split(',')
        uid = row[0]
        year = row[1]
        week = row[2]
        # Not used by the clustering; still parsed so that a malformed
        # column keeps failing loudly, as it did before.
        size = float(row[5])
        profile = np.array([float(el) for el in row[6:]])

        # A different uid closes the previous customer. The boundary is
        # taken from the file itself, so it cannot drift out of step with an
        # externally built customer list.
        if client_rows and uid != client_rows[0][0][0]:
            n_centroids, n_labels = _cluster_and_export(client_rows[0][0][0], client_rows, fw, fw2)
            footprints_clusters += n_centroids
            footprints_clustered += n_labels
            client_rows = []
            temporal += 1
            n_cliente += 1

        client_rows.append(((uid, year, week), profile))
        contador += 1

    # Last customer: nothing follows it, so it is flushed after the loop
    # instead of counting the file's lines in a separate pass.
    if client_rows:
        n_centroids, n_labels = _cluster_and_export(client_rows[0][0][0], client_rows, fw, fw2)
        footprints_clusters += n_centroids
        footprints_clustered += n_labels
        print("final")

# Number of data rows in the footprint file.
num_rows = contador
    

temporal

The maximal number of iterations maxit (set to 20 by the program)
allowed for finding a smoothing spline with fp=s has been reached: s
too small.
There is an approximation returned but the corresponding weighted sum
of squared residuals does not satisfy the condition abs(fp-s)/s < tol.


final


227662