In [6]:
import numpy as np
import datetime
import json
import pylab
import pandas as pd
import matplotlib.pyplot as plt

# Preparacion de datos

## Cargando datos

In [7]:

def leer_data():
    outfile='./data2/consulta.csv'
    data = pd.read_csv(outfile)
    return data

data_original = leer_data()
data_original.head(3)

Unnamed: 0,client_id,date,año,mes,dia,hora,merchant_departement,merchant_province,mcc,mccg,quantity,amount_sol
0,+++g8j9k+5A=,2016-09-27 01:37:23,2016,9,27,1,LIMA,LIMA,San Juan De Miraflor,15,1,107.0
1,+++g8j9k+5A=,2016-06-24 03:35:00,2016,6,24,3,LIMA,LIMA,Magdalena Del Mar,15,1,58.0
2,+++g8j9k+5A=,2017-04-29 03:15:16,2017,4,29,3,LIMA,LIMA,Miraflores,17,1,153.0


## Preparacion de datos

In [5]:
data = data_original[['client_id','date','año','mes','dia','hora','quantity','amount_sol']]
data.head(3)   

Unnamed: 0,client_id,date,año,mes,dia,hora,quantity,amount_sol
0,+++g8j9k+5A=,2016-09-27 01:37:23,2016,9,27,1,1,107.0
1,+++g8j9k+5A=,2016-06-24 03:35:00,2016,6,24,3,1,58.0
2,+++g8j9k+5A=,2017-04-29 03:15:16,2017,4,29,3,1,153.0


# Unidad de TXs temporales (U)

In [6]:

file='DATA2'
raw_data='./articulo/%s.json' %(file)



## Funciones

In [7]:
# definimos los 4 time_windows que usaremos

def time_window(hora):
    tw = 9999
    if hora >=0:
        tw = 0      # Madrugada
    if hora >=6:
        tw = 1      # Mañana
    if hora >=12:
        tw = 2      # Tarde
    if hora >=18:
        tw = 3      # Noche
    return tw

In [8]:
# Definimos los U 

def procesar_u(user):    
    uid=list(user['client_id'])[0]
    
    # Lista los años en que tiene txs el usuario
    years = set(list(user['año']))
    anni = {year:{} for year in list(years)}
    
    # para cada fila (para cada fecha)
    for dat in  range(0,len(user)):
        año = user.iloc[dat]['año']
        week=datetime.datetime(año,user.iloc[dat]['mes'],user.iloc[dat]['dia']).isocalendar()[1]
        weekday=datetime.datetime(año,user.iloc[dat]['mes'],user.iloc[dat]['dia']).weekday()
        turn = time_window(data.iloc[dat]['hora'])
        
        # Si la semana no existe en el año
        if not(week in anni[año]):
            anni[año][week] = {}
        # Si el turno no existe en la semana y año
        if not (turn in anni[año][week]):
            anni[año][week][turn]=np.array([0]*7)
            
        anni[año][week][turn][weekday]+=user.iloc[dat]['quantity'] # suma cantidades "importancia por compras"
        #anni[año][week][turn][weekday]+=user.iloc[dat]['amount_sol'] # suma montos "importancia por gastos"
            
    return uid,anni

## Procesando U de cada cliente

In [9]:
# Extraemos la lista de clientes sin repetir
clientes =  data.groupby('client_id').client_id.count().index

profiles={}
# Para cada cliente
for cliente in clientes:
    cliente_i= data[data['client_id'] == cliente]
    ## ejecutamos para cada usuario
    results=procesar_u(cliente_i)
    profiles[results[0]]=results[1]

In [10]:
individual_footprint="%s.individual_footprint" %(raw_data)

#exporting individual footprint   <customer_id;year;week;profile_id;turn;size;t1... tn >
fw=open(individual_footprint,'w')
fw.write('customer_id,year,week,profile_id,turn,size,d0,d1,d2,d3,d4,d5,d6\n')
footprints=0
for uid in profiles:
    profile_id=0
    for year in profiles[uid]:
        for week in profiles[uid][year]:
            for turn in profiles[uid][year][week]:
                d=profiles[uid][year][week][turn]
                fw.write(''+str(uid)+','+str(year)+','+str(week)+','+str(profile_id)+','+str(turn)+','
                         +str(sum(d))+','+str(d[0])+','+str(d[1])+','+str(d[2])+','+str(d[3])+
                         ','+str(d[4])+','+str(d[5])+','+str(d[6])+'\n')

            profile_id = profile_id + 1
    footprints+=profile_id
    fw.flush()
fw.close()
print ("number of footprint: "+str(footprints))

number of footprint: 31689


# Temporal TXs footprint

In [11]:
# Definimos la ruta de los archivos 

individual_clusters="%s.clusters" %(individual_footprint)
individual_labels="%s.labels" %(individual_footprint)

## Funciones

In [12]:
def process_footprint(data,tests,log=False):
    from sklearn.cluster import MiniBatchKMeans
    #KMeans(init='k-means++', n_clusters=k, n_init=10)
    import datetime
    K={}
    for k in tests:
        if k<=len(data):
            if log:
                print("%s: processing %s"%(datetime.datetime.now(),k))
            K[k]=bench_k_means(MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=100,
                      n_init=10, max_no_improvement=10, verbose=0,
                      random_state=0),name="k-means++", data=data)
    return K

In [13]:
def compute_best_k(x,y,occurrencies, plot=False,points=1000,sf=0.9):
    import numpy as np
    
    if len(x)<5:
        return max(1, round(np.sqrt(occurrencies/2)))
    
    from scipy.interpolate import interp1d
    from scipy.interpolate import UnivariateSpline
    spl = UnivariateSpline(x, y)
    spl.set_smoothing_factor(sf)
    xs = np.linspace(min(x), max(x), points)
    ys = spl(xs)
    idx_better_k=get_change_point(xs, ys)
    if plot:
        import pylab
        pylab.plot(xs,ys)
        
        pylab.scatter(xs[idx_better_k],ys[idx_better_k],s=20, marker='o')
        pylab.text(xs[idx_better_k],ys[idx_better_k],"bestK %s" %(np.round(xs[idx_better_k])))
        return int(np.round(xs[idx_better_k])),pylab
    return int(np.round(xs[idx_better_k]))

In [14]:
def bench_k_means(estimator, name, data,distance_function=None):
    from sklearn import metrics
    from sklearn.metrics import silhouette_samples, silhouette_score
    import time
    t0 = time.time()
    if distance_function:
        estimator.fit(data,distance_function)
    else:
        estimator.fit(data)
    #cluster_labels = estimator.fit_predict(data)
    #silhouette_score_ = silhouette_score(data, cluster_labels)
    
    inertia=estimator.inertia_
    duration=time.time() - t0
    return {'inertia':inertia,'duration':duration, 'estimator':estimator}#,'silhouette':silhouette_score_}

def get_change_point(x, y):
    """
        Scelta del K migliore
        ::param x: lista di valori di K
        ::param y: lista di valori di SSE
    """
    import math
    max_d = -float('infinity')
    index = 0

    for i in range(0, len(x)):
        c = closest_point_on_segment(a=[x[0], y[0]], b=[x[len(x)-1], y[len(y)-1]], p=[x[i], y[i]])
        d = math.sqrt((c[0]-x[i])**2 + (c[1]-y[i])**2)
        if d > max_d:
            max_d = d
            index = i
    
    return index

def closest_point_on_segment(a, b, p):
    sx1 = a[0]
    sx2 = b[0]
    sy1 = a[1]
    sy2 = b[1]
    px = p[0]
    py = p[1]

    x_delta = sx2 - sx1
    y_delta = sy2 - sy1

    if x_delta == 0 and y_delta == 0:
        return p

    u = ((px - sx1) * x_delta + (py - sy1) * y_delta) / (x_delta * x_delta + y_delta *  y_delta)
    if u < 0:
        closest_point = a
    elif u > 1:
        closest_point = b
    else:
        cp_x = sx1 + u * x_delta
        cp_y = sy1 + u * y_delta
        closest_point = [cp_x, cp_y]

    return closest_point
	

## Individual Clustering

In [16]:
# Numero de filas del archivo
f=open(individual_footprint)
num_rows = len(f.readlines())-1
f.close()

#<customer_id;year;week;profile_id;size;t1... tn >
import datetime
f=open(individual_footprint)
fw=open(individual_clusters,'w')  #uid,cluster_id,centroid
fw2=open(individual_labels,'w') #uid,year,week,cluster_id,profile

f.readline()
data=[] #buffer

footprints_clustered=0
footprints_clusters=0
n_cliente=0
contador = 0
temporal= 0
for row in f: #reading individual footprint
    row=row.strip().split(',') # leemos cada elemento da linea parseada por ","
    uid=row[0]
    year=row[1]
    week=row[2]
    size=int(row[5])
    profile=np.array([float(el) for el in row[6:]])
    # Individual clustering
    if uid==clientes[n_cliente]: # Para cada fila donde los "uid" son iguales 
        data.append(((uid,year,week),profile))     
        contador+=1
    else: #final de cliente
        
        #---------------------------------------------------------------------
        # procesar data
        #---------------------------------------------------------------------
        to_cluster=[el[1] for el in data]
        K=process_footprint(to_cluster,np.arange(1,len(to_cluster)))

        # choose k
        x=list(K.keys())
        y=[K[k]['inertia'] for k in K]
        best_k=compute_best_k(x,y,len(to_cluster))
        print('clustering: '+str(clientes[n_cliente])+' len data: '+str(len(data))+" best k: "+str(best_k))
        
        # clustering
        if best_k==1:
            #to few records
            cluster_centers_=[np.average(to_cluster,axis=0)]
            labels_=[0]*len(to_cluster)  
        else:
            cluster_centers_=K[best_k]['estimator'].cluster_centers_
            labels_=K[best_k]['estimator'].labels_
        
        #export individual centroids
        for i in np.arange(len(cluster_centers_)):
            string="%s;%s;%s\n"%(clientes[n_cliente],i,';'.join([str(el) for el in cluster_centers_[i]])) #uid,cluster_id,centroid
            fw.write(string)
            footprints_clusters+=1
        fw.flush()

        #export original data and labels
        for i in np.arange(len(data)):
            uid2=data[i][0]
            profile2=data[i][1]
            label2=labels_[i]
            string="%s;%s;%s;%s;%s\n" %(uid2[0],uid2[1],uid2[2],label2
                                                    ,';'.join([str(el) for el in profile2]))#uid,year,week,cluster_id,profile
            fw2.write(string)
            footprints_clustered+=1
        fw2.flush()
        #---------------------------------------------------------------------
        #---------------------------------------------------------------------
        
        

        data=[] #buffer
        data.append(((uid,year,week),profile))
        
        contador+=1
        temporal+=1
        n_cliente+=1
        
    if contador == num_rows:        # Para el ultimo cliente y ultima fila
        #---------------------------------------------------------------------
        # procesar data
        #---------------------------------------------------------------------
        to_cluster=[el[1] for el in data]
        K=process_footprint(to_cluster,np.arange(1,len(to_cluster)))

        # choose k
        x=list(K.keys())
        y=[K[k]['inertia'] for k in K]
        best_k=compute_best_k(x,y,len(to_cluster))
        print('clustering: '+str(clientes[n_cliente])+' len data: '+str(len(data))+" best k: "+str(best_k))
        
        # clustering
        if best_k==1:
            #to few records
            cluster_centers_=[np.average(to_cluster,axis=0)]
            labels_=[0]*len(to_cluster)  
        else:
            cluster_centers_=K[best_k]['estimator'].cluster_centers_
            labels_=K[best_k]['estimator'].labels_
        
        #export individual centroids
        for i in np.arange(len(cluster_centers_)):
            string="%s;%s;%s\n"%(uid,i,';'.join([str(el) for el in cluster_centers_[i]])) #uid,cluster_id,centroid
            fw.write(string)
            footprints_clusters+=1
        fw.flush()

        #export original data and labels
        for i in np.arange(len(data)):
            uid2=data[i][0]
            profile2=data[i][1]
            label2=labels_[i]
            string="%s;%s;%s;%s;%s\n" %(uid2[0],uid2[1],uid2[2],label2
                                                    ,';'.join([str(el) for el in profile2]))#uid,year,week,cluster_id,profile
            fw2.write(string)
            footprints_clustered+=1
        fw2.flush()
        #---------------------------------------------------------------------
        #---------------------------------------------------------------------
        
        print("final")   
    
    

temporal

clustering: +++AhJk2QJM= len data: 9 best k: 4
clustering: +++el3L/5Wo= len data: 3 best k: 1
clustering: +++g8j9k+5A= len data: 28 best k: 7
clustering: +++yupGqYTo= len data: 7 best k: 3
clustering: ++/9crwdWSc= len data: 55 best k: 11
clustering: ++/NRCUAMIA= len data: 110 best k: 24
clustering: ++/bAm/9MuA= len data: 3 best k: 1
clustering: ++/cwUrPKGM= len data: 8 best k: 4
clustering: ++/nqADqHkE= len data: 111 best k: 22
clustering: ++/oQ9Lb9dI= len data: 1 best k: 1
clustering: ++0XFicYlHI= len data: 1 best k: 1
clustering: ++0i87zUaWQ= len data: 14 best k: 6
clustering: ++0tfqgWxOQ= len data: 2 best k: 1
clustering: ++0vL18mGDU= len data: 5 best k: 2.0
clustering: ++0xinBYSBE= len data: 59 best k: 13
clustering: ++0zBffGXGk= len data: 23 best k: 6
clustering: ++10Hq5daII= len data: 3 best k: 1
clustering: ++14g8obpj0= len data: 4 best k: 1
clustering: ++1QchhSBuo= len data: 1 best k: 1
clustering: ++1XMtcwMec= len data: 66 best k: 17
clustering: ++1udmfUnfU= len data: 14 best 

clustering: ++WDNVcbqkE= len data: 46 best k: 9
clustering: ++WU9amOi8A= len data: 1 best k: 1
clustering: ++XKjuQwXic= len data: 13 best k: 5
clustering: ++XMeyxaxac= len data: 15 best k: 7
clustering: ++XlDbl7lno= len data: 66 best k: 13
clustering: ++XsKel+znM= len data: 13 best k: 5
clustering: ++Y2WxO8j/o= len data: 33 best k: 9
clustering: ++Y6tP4m+Ro= len data: 8 best k: 4
clustering: ++YYEyOA0HU= len data: 5 best k: 2.0
clustering: ++YdKuawKpA= len data: 10 best k: 4
clustering: ++Ye+a8egxI= len data: 1 best k: 1
clustering: ++Yg9SnovE4= len data: 65 best k: 15
clustering: ++YxWitfLeM= len data: 12 best k: 4
clustering: ++Yz0jTWwEU= len data: 1 best k: 1
clustering: ++Z0cEzcGyY= len data: 21 best k: 7
clustering: ++ZIRyrwCRk= len data: 1 best k: 1
clustering: ++ZXdftEe10= len data: 10 best k: 4
clustering: ++ZZAXWCx9U= len data: 61 best k: 11
clustering: ++ZudOwsZnw= len data: 1 best k: 1
clustering: ++aBU5hNvP4= len data: 76 best k: 18
clustering: ++aHRvQdI2E= len data: 114 be

clustering: +/+uudWZgO0= len data: 13 best k: 5
clustering: +/+viq0t96Q= len data: 8 best k: 4
clustering: +//00EbGssY= len data: 1 best k: 1
clustering: +//24MSyrdg= len data: 1 best k: 1
clustering: +//5qbVGEE4= len data: 31 best k: 1
clustering: +//7/b0IcIY= len data: 35 best k: 9
clustering: +//KZkiJkbc= len data: 2 best k: 1
clustering: +//LEptykcY= len data: 25 best k: 8
clustering: +//NX/cU82Q= len data: 6 best k: 3
clustering: +//NbJMb1OY= len data: 23 best k: 6
clustering: +//SXQ1El20= len data: 101 best k: 20
clustering: +//mw+qc++c= len data: 2 best k: 1
clustering: +//xLEw+o0o= len data: 1 best k: 1
clustering: +/01jP48bkI= len data: 38 best k: 1
clustering: +/0GGaYfzdE= len data: 3 best k: 1
clustering: +/0SroLaRNg= len data: 34 best k: 9
clustering: +/0W1qTC4VA= len data: 17 best k: 6
clustering: +/0lC8BPkgo= len data: 3 best k: 1
clustering: +/12xggrkzo= len data: 5 best k: 2.0
clustering: +/199g1Uq70= len data: 4 best k: 1
clustering: +/1DK+68XAw= len data: 34 best k: 6

clustering: +/WtcHZMNW0= len data: 35 best k: 10
clustering: +/XLRBkrjwc= len data: 42 best k: 10
clustering: +/XNTF05dRY= len data: 8 best k: 3
clustering: +/XUABMzjDM= len data: 1 best k: 1
clustering: +/Xg2R/qnNc= len data: 40 best k: 9
clustering: +/Xr9E3TmyY= len data: 12 best k: 4
clustering: +/XuU+s+NtY= len data: 1 best k: 1
clustering: +/Y6s61bkA8= len data: 19 best k: 7
clustering: +/YUzQES66M= len data: 39 best k: 12
clustering: +/YXcbXCZaE= len data: 1 best k: 1
clustering: +/YY0mbzNgI= len data: 5 best k: 2.0
clustering: +/YfssdLhM8= len data: 3 best k: 1
clustering: +/YnqUV3fLc= len data: 17 best k: 7
clustering: +/Z0x15uBZU= len data: 7 best k: 3
clustering: +/Z43cKsFAA= len data: 3 best k: 1
clustering: +/Z4z/BhJf4= len data: 8 best k: 5
clustering: +/ZKKE7wEHI= len data: 106 best k: 22
clustering: +/ZMript83E= len data: 5 best k: 2.0
clustering: +/ZSfB6NYQE= len data: 4 best k: 1
clustering: +/Zg8+3Rq3M= len data: 33 best k: 9
clustering: +/ZjaBaGi5w= len data: 69 best

clustering: +/y4+pEbde8= len data: 31 best k: 8
clustering: +/y7QsXVM/g= len data: 20 best k: 6
clustering: +/y8xHEXLUg= len data: 55 best k: 13
clustering: +/yPWUe76yw= len data: 115 best k: 21
clustering: +/yWa2RSBlI= len data: 2 best k: 1
clustering: +/z3oL/3bRU= len data: 122 best k: 24
clustering: +/zHiN7y7dE= len data: 2 best k: 1
clustering: +/znNSjDxBw= len data: 19 best k: 6
clustering: +/zu5STBtHY= len data: 7 best k: 4
clustering: +/zyXchFT/E= len data: 1 best k: 1
clustering: +0+0K3XJA/o= len data: 24 best k: 8
clustering: +0+FlPkqniY= len data: 5 best k: 2.0
clustering: +0+J54Rbzjc= len data: 17 best k: 6
clustering: +0+XEX5+YFI= len data: 15 best k: 6
clustering: +0+XpYsXUPA= len data: 4 best k: 1
clustering: +0+tg2SCurA= len data: 57 best k: 10
clustering: +0+yyLKK4Yw= len data: 22 best k: 7
clustering: +0/0nMsK2lM= len data: 1 best k: 1
clustering: +0/IfwP/73w= len data: 2 best k: 1
clustering: +0/gu2oHIkE= len data: 3 best k: 1
clustering: +0/lkzs2+RU= len data: 19 bes

clustering: +0T0ltbIAe8= len data: 35 best k: 8
clustering: +0TNKkO45fs= len data: 5 best k: 2.0
clustering: +0TgDmDpJvc= len data: 100 best k: 21
clustering: +0UM+ftOOmo= len data: 4 best k: 1
clustering: +0Ult5EOMLw= len data: 15 best k: 4
clustering: +0V7SDRqgZI= len data: 1 best k: 1
clustering: +0VFdPbdS9A= len data: 1 best k: 1
clustering: +0VMLX//Xf0= len data: 9 best k: 4
clustering: +0VMbI5gEdk= len data: 1 best k: 1
clustering: +0VRbNMixWI= len data: 18 best k: 6
clustering: +0Vo8uhZmFs= len data: 23 best k: 7
clustering: +0VuLUpRWtA= len data: 23 best k: 8
clustering: +0W7PTTwyMg= len data: 34 best k: 8
clustering: +0WAOqlgjps= len data: 12 best k: 5
clustering: +0WE2FqeOTg= len data: 18 best k: 6
clustering: +0WJHBGbDHM= len data: 27 best k: 7
clustering: +0WVks70DtQ= len data: 1 best k: 1
clustering: +0WbRMXihqo= len data: 40 best k: 11
clustering: +0WhATi+y48= len data: 23 best k: 6
clustering: +0WjOQSeePE= len data: 40 best k: 12
clustering: +0Wjz7egBi4= len data: 9 best

clustering: +1/J/5bEqwo= len data: 17 best k: 6
clustering: +1/XUwoBJnQ= len data: 14 best k: 5
clustering: +1/g+klOfT8= len data: 2 best k: 1
clustering: +1/nvllj2qU= len data: 12 best k: 3
clustering: +101jrT4g0s= len data: 14 best k: 4
clustering: +1031hOvx78= len data: 10 best k: 5
clustering: +109HCpWehM= len data: 8 best k: 4
clustering: +10I26Q8A8c= len data: 4 best k: 1
clustering: +10aMryYerU= len data: 1 best k: 1
clustering: +10ha1hBGvo= len data: 6 best k: 2
clustering: +10mR7Nsqfk= len data: 31 best k: 9
clustering: +11OF0lXTKo= len data: 25 best k: 6
clustering: +11YWdzRJEg= len data: 122 best k: 23
clustering: +11ZvBoYvlU= len data: 1 best k: 1
clustering: +11iMzZJP80= len data: 144 best k: 30
clustering: +11j3TQTXJc= len data: 3 best k: 1
clustering: +11pYCpEZqI= len data: 6 best k: 4
clustering: +1232tTL3qA= len data: 25 best k: 8
clustering: +12k/EdJArQ= len data: 48 best k: 10
clustering: +12oVP3bPCU= len data: 4 best k: 1
clustering: +13/Y57RH00= len data: 77 best k

clustering: +1TukFDxH6M= len data: 24 best k: 9
clustering: +1UGnrJVMdc= len data: 2 best k: 1
clustering: +1UYcoi4GmY= len data: 1 best k: 1
clustering: +1Uf61iAmBk= len data: 47 best k: 10
clustering: +1VKN+pSVwQ= len data: 3 best k: 1
clustering: +1VN7Ev4/ko= len data: 62 best k: 11
clustering: +1VQaOVVG94= len data: 40 best k: 10
clustering: +1VXT+JeldA= len data: 7 best k: 3
clustering: +1Vl6qLra3c= len data: 3 best k: 1
clustering: +1VuzeX/k1E= len data: 2 best k: 1
clustering: +1Vy3qyirWg= len data: 7 best k: 3
clustering: +1Vzx6fhIoU= len data: 3 best k: 1
clustering: +1W/ss2Bios= len data: 1 best k: 1
clustering: +1W4X3Y9nrE= len data: 18 best k: 7
clustering: +1W9jUXJyxA= len data: 17 best k: 7
clustering: +1WQpUtiX5M= len data: 4 best k: 1
clustering: +1WTqdQ/RyY= len data: 24 best k: 6
clustering: +1We4BPZxp8= len data: 39 best k: 1
clustering: +1X5hHpbzcY= len data: 10 best k: 5
clustering: +1X8Od4uNd8= len data: 8 best k: 3
clustering: +1XBR6HISGs= len data: 18 best k: 6


clustering: +1ztKr9p0KY= len data: 17 best k: 4
clustering: +1zw5GoFC1U= len data: 8 best k: 3
clustering: +2+GvwygcvA= len data: 24 best k: 6
clustering: +2+KKky+0gs= len data: 155 best k: 1
clustering: +2+pSWe0mHg= len data: 69 best k: 11
clustering: +2+q7NYMolc= len data: 3 best k: 1
clustering: +2+uUBEtdJk= len data: 51 best k: 12
clustering: +2//dTkgxKk= len data: 1 best k: 1
clustering: +2/1KpDz1ZE= len data: 22 best k: 7
clustering: +2/70zExpEk= len data: 4 best k: 1
clustering: +2/77RLi8es= len data: 2 best k: 1
clustering: +2/DzlFK0ss= len data: 21 best k: 6
clustering: +2/S3vxJIOU= len data: 1 best k: 1
clustering: +2/UUNhfcq8= len data: 63 best k: 13
clustering: +2/nVrPoecE= len data: 3 best k: 1
clustering: +2/yKCMojno= len data: 9 best k: 4
clustering: +2/zlENAfM4= len data: 19 best k: 7
clustering: +20W2t5Ee9c= len data: 8 best k: 4
clustering: +20XMch3gVM= len data: 46 best k: 10
clustering: +20y314t7N4= len data: 10 best k: 5
clustering: +21bRp0n0Ko= len data: 13 best k

clustering: +2VD8vmnQ14= len data: 97 best k: 22
clustering: +2VEjKIbmM0= len data: 4 best k: 1
clustering: +2VNJli35wI= len data: 3 best k: 1
clustering: +2VTxFRLsr0= len data: 9 best k: 4
clustering: +2Vi9asjhi4= len data: 6 best k: 3
clustering: +2VuR/a0sT4= len data: 34 best k: 8
clustering: +2VygPBWo1k= len data: 153 best k: 24
clustering: +2WI9BcgZlA= len data: 2 best k: 1
clustering: +2WP4Jpab1k= len data: 39 best k: 12
clustering: +2WTkcFXHig= len data: 4 best k: 1
clustering: +2WWPoiGg/I= len data: 58 best k: 16
clustering: +2WbY7UsscQ= len data: 1 best k: 1
clustering: +2XLDc0+xq8= len data: 54 best k: 8
clustering: +2XMmsjp7Cc= len data: 11 best k: 5
clustering: +2XTaIPnqfA= len data: 1 best k: 1
clustering: +2XY+HBBh1o= len data: 3 best k: 1
clustering: +2XhWE3AcWk= len data: 13 best k: 4
clustering: +2XkigZLq1c= len data: 23 best k: 6
clustering: +2Y629T3tGE= len data: 11 best k: 5
clustering: +2Y8p0EDnB0= len data: 3 best k: 1
clustering: +2YGk2Hd6jE= len data: 2 best k: 

clustering: +2ynlTIjnU0= len data: 56 best k: 1
clustering: +2yvGKzIiJc= len data: 6 best k: 4
clustering: +2yvLxuFtjg= len data: 51 best k: 13
clustering: +2zB9b1tXNw= len data: 14 best k: 5
clustering: +2zrabOaa3M= len data: 14 best k: 5
clustering: +3+5B4tf+Rw= len data: 4 best k: 1
clustering: +3+CAIAJHv0= len data: 32 best k: 8
clustering: +3+HFcuRyBI= len data: 1 best k: 1
clustering: +3+J5PJNBCU= len data: 12 best k: 5
clustering: +3+N783vjnM= len data: 9 best k: 3
clustering: +3/3OaXU5/g= len data: 1 best k: 1
clustering: +3/9//at8OI= len data: 10 best k: 4
clustering: +3/FyByxM/s= len data: 11 best k: 5
clustering: +3/pk+Y16c0= len data: 72 best k: 13
clustering: +3/rAU/SNOk= len data: 39 best k: 13
clustering: +3059JVeSk8= len data: 43 best k: 10
clustering: +30BN8MAEPo= len data: 3 best k: 1
clustering: +30FSSfSqoI= len data: 3 best k: 1
clustering: +30Gq1sAeAc= len data: 27 best k: 6
clustering: +30KupfeGek= len data: 11 best k: 4
clustering: +30Q43KvPT4= len data: 77 best 

clustering: +3PMy1SJRzA= len data: 24 best k: 8
clustering: +3PWCB6EHVM= len data: 9 best k: 4
clustering: +3PX7WPqZjc= len data: 24 best k: 6
clustering: +3Pi3sB15Rc= len data: 6 best k: 3
clustering: +3PzJG2kVp8= len data: 1 best k: 1
clustering: +3Q+yQaSRiU= len data: 3 best k: 1
clustering: +3Q1rvOYjb8= len data: 5 best k: 2.0
clustering: +3QEPdWjYuc= len data: 2 best k: 1
clustering: +3QOa3k9ZA0= len data: 3 best k: 1
clustering: +3QRPNV73OI= len data: 12 best k: 6
clustering: +3Qeo8rrF0E= len data: 28 best k: 9
clustering: +3QiF2ZBpfs= len data: 1 best k: 1
clustering: +3QofCrol2A= len data: 1 best k: 1
clustering: +3QpcR6S2Po= len data: 14 best k: 6
clustering: +3QvI2h+hZs= len data: 49 best k: 10
clustering: +3Qxj5h+xrI= len data: 24 best k: 7
clustering: +3R4Bbdnqe0= len data: 11 best k: 4
clustering: +3RIHTn3XYQ= len data: 7 best k: 3
clustering: +3RPOKkKqbE= len data: 42 best k: 9
clustering: +3RU+5ZJABM= len data: 19 best k: 7
clustering: +3RyFOgUL5w= len data: 44 best k: 1

clustering: +3u1VkNiU6k= len data: 107 best k: 22
clustering: +3u2vTC1DLw= len data: 1 best k: 1
clustering: +3uL50BpO1U= len data: 1 best k: 1
clustering: +3uv/fD0utE= len data: 22 best k: 7
clustering: +3uxP+rhHLs= len data: 1 best k: 1
clustering: +3uzSRNporY= len data: 1 best k: 1
clustering: +3vZ8mA2J94= len data: 57 best k: 12
clustering: +3vh245T+r4= len data: 2 best k: 1
clustering: +3vi1fd3YuM= len data: 16 best k: 5
clustering: +3vp4H7WKPk= len data: 1 best k: 1
clustering: +3vqgknYlW8= len data: 2 best k: 1
clustering: +3vvC0TfzFA= len data: 4 best k: 1
clustering: +3vxQz9cC7s= len data: 8 best k: 4
clustering: +3wHP455zC4= len data: 6 best k: 2
clustering: +3wQWeOGIQU= len data: 28 best k: 6
clustering: +3wQpUCw+d0= len data: 5 best k: 2.0
clustering: +3wSGejR/3w= len data: 11 best k: 4
clustering: +3wcNC/P0Fg= len data: 1 best k: 1
clustering: +3wmcJ22jT4= len data: 36 best k: 10
clustering: +3wwxOUYlPw= len data: 7 best k: 3
clustering: +3x5UIFH1T0= len data: 8 best k: 3


clustering: +4LoVpdzJp0= len data: 18 best k: 5
clustering: +4LymZdP3JU= len data: 4 best k: 1
clustering: +4M++7kyZ2I= len data: 44 best k: 8
clustering: +4M/FkfPRQA= len data: 1 best k: 1
clustering: +4M4YyZKW8o= len data: 3 best k: 1
clustering: +4M8NMVT2t4= len data: 52 best k: 13
clustering: +4MHYTV5NhE= len data: 19 best k: 6
clustering: +4MYsaW+zZU= len data: 16 best k: 4
clustering: +4MjBWS/Fvo= len data: 81 best k: 17
clustering: +4MlonxB0/8= len data: 4 best k: 1
clustering: +4N02YBF7SU= len data: 2 best k: 1
clustering: +4N2234yaDY= len data: 3 best k: 1
clustering: +4NKCSetDtA= len data: 19 best k: 3
clustering: +4NZOCQHwAs= len data: 32 best k: 9
clustering: +4NbyZ7VIzg= len data: 14 best k: 6
clustering: +4NueNPreag= len data: 4 best k: 1
clustering: +4O2P5qTEak= len data: 2 best k: 1
clustering: +4O89J7zFuE= len data: 21 best k: 7
clustering: +4OOghKqbwY= len data: 1 best k: 1
clustering: +4OfL801ehc= len data: 12 best k: 3
clustering: +4OlNzlXuuw= len data: 53 best k: 1

clustering: +4qn10OAHGQ= len data: 71 best k: 14
clustering: +4rOFRVRZuU= len data: 23 best k: 5
clustering: +4rv9BfHFQA= len data: 4 best k: 1
clustering: +4sHGXyP5Tk= len data: 25 best k: 6
clustering: +4sa6Ecob6s= len data: 18 best k: 6
clustering: +4sfctEE0eM= len data: 132 best k: 30
clustering: +4shT9cQux0= len data: 2 best k: 1
clustering: +4szC23HzUI= len data: 6 best k: 3
clustering: +4tT5lATkHg= len data: 23 best k: 7
clustering: +4tUb7PHbNU= len data: 4 best k: 1
clustering: +4tislfr4P0= len data: 15 best k: 6
clustering: +4uMPsLe6tw= len data: 14 best k: 5
clustering: +4uVj6jbd/A= len data: 2 best k: 1
clustering: +4ucG7urERY= len data: 31 best k: 9
clustering: +4vNdqPWxTA= len data: 19 best k: 7
clustering: +4vfluAriaY= len data: 18 best k: 6
clustering: +4vuzuEZvK0= len data: 82 best k: 17
clustering: +4w3CS1mhGs= len data: 10 best k: 4
clustering: +4wIMKGPILA= len data: 27 best k: 8
clustering: +4wPWtuHmeQ= len data: 23 best k: 8
clustering: +4wd2NDS1yU= len data: 59 bes

clustering: +5K11q5TXE4= len data: 32 best k: 9
clustering: +5K9V5YYgNY= len data: 37 best k: 12
clustering: +5KA5vLbx+g= len data: 18 best k: 7
clustering: +5KHGzweYb4= len data: 1 best k: 1
clustering: +5KXGfmZzWw= len data: 2 best k: 1
clustering: +5Kg9LYJQtg= len data: 74 best k: 15
clustering: +5KpDLSbkwA= len data: 1 best k: 1
clustering: +5KxRRubkS4= len data: 7 best k: 3
clustering: +5LFcL1+4WM= len data: 37 best k: 10
clustering: +5LMKC1sfxw= len data: 48 best k: 11
clustering: +5LMKqun2Pk= len data: 8 best k: 4
clustering: +5LU5dhsB0I= len data: 30 best k: 8
clustering: +5LczJ5bKyk= len data: 8 best k: 4
clustering: +5LhGHefc7E= len data: 8 best k: 3
clustering: +5Lmc9edkHY= len data: 9 best k: 4
clustering: +5LztOfEVT4= len data: 3 best k: 1
clustering: +5M4qqScM4U= len data: 46 best k: 1
clustering: +5MNjBH5WEA= len data: 1 best k: 1
clustering: +5MZqzLesrQ= len data: 7 best k: 4
clustering: +5N/s88ov1U= len data: 8 best k: 3
clustering: +5O1A/Uzuhw= len data: 3 best k: 1
c

2727

In [None]:
clientes

# Lista de clientes

In [None]:
to_cluster=[el[1] for el in data]
K=process_footprint(to_cluster,np.arange(1,len(to_cluster)))

#choose k
x=K.keys()
x=list(x)
y=[K[k]['inertia'] for k in K]
best_k=compute_best_k(x,y,len(to_cluster))
print('clustering: '+str(uid)+' len data: '+str(len(data))+" best k: "+str(best_k))

In [None]:
 #clustering
if best_k==1:
    #to few records
    cluster_centers_=[np.average(to_cluster,axis=0)]
    labels_=[0]*len(to_cluster)  
else:
    cluster_centers=K[best_k]['estimator'].cluster_centers_
    labels=K[best_k]['estimator'].labels_
            

In [None]:
#export individual centroids
for i in np.arange(len(cluster_centers_)):
    string="%s;%s;%s\n"%(uid,i,';'.join([str(el) for el in cluster_centers_[i]])) #uid,cluster_id,centroid
    fw.write(string)
    footprints_clusters+=1
fw.flush()

#export original data and labels
for i in np.arange(len(data)):
    uid=data[i][0]
    profile=data[i][1]
    label=labels_[i]
    string="%s;%s;%s;%s;%s\n" %(uid[0],uid[1],uid[2],label
                                            ,';'.join([str(el) for el in profile]))#uid,year,week,cluster_id,profile
    fw2.write(string)
    footprints_clustered+=1
fw2.flush()

In [None]:
cluster_centers

In [None]:
y[11]

In [None]:
K=process_footprint(to_cluster,np.arange(1,len(to_cluster)))

In [None]:
len(to_cluster)

In [None]:
K

In [None]:
x=K.keys()
x=list(x)
len(x)

In [None]:
y=[K[k]['inertia'] for k in K]
len(y)

In [None]:
best_k=compute_best_k(x,y,len(to_cluster))
best_k

In [None]:
from scipy.interpolate import UnivariateSpline
w = np.isnan(y)
y[w] = 0.0
spl = UnivariateSpline(x, y)

In [None]:
# Plot the elbow
plt.plot(x, y, 'r-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()


In [None]:
list(x)[0:18]


In [None]:
y[0:18]

In [None]:
spl = UnivariateSpline(list(x)[0:18], y[0:18])
spl

# Pruebas


In [None]:
 # clustering dataset
# determine k using elbow method

from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

x1 = np.array([3, 1, 1, 2, 1, 6, 6, 6, 5, 6, 7, 8, 9, 8, 9, 9, 8])
x2 = np.array([5, 4, 5, 6, 5, 8, 6, 7, 6, 7, 1, 2, 1, 2, 3, 2, 3])

plt.plot()
plt.xlim([0, 10])
plt.ylim([0, 10])
plt.title('Dataset')
plt.scatter(x1, x2)
plt.show()

# create new plot and data
plt.plot()
X = np.array(list(zip(x1, x2))).reshape(len(x1), 2)
colors = ['b', 'g', 'red']
markers = ['o', 'v', 's']

# k means determine k
distortions = []
K = range(1,10)
for k in K:
    kmeanModel = KMeans(n_clusters=k).fit(X)
    kmeanModel.fit(X)
    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

 

In [None]:
from __future__ import print_function

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

print(__doc__)

# Generating the sample data from make_blobs
# This particular setting has one distinct cluster and 3 clusters placed close
# together.
X, y = make_blobs(n_samples=500,
                  n_features=2,
                  centers=4,
                  cluster_std=1,
                  center_box=(-10.0, 10.0),
                  shuffle=True,
                  random_state=1)  # For reproducibility

range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

    plt.show()