# Perfiles de comportamiento y minado de patrones

In [None]:
import numpy as np
import datetime
from datetime import date
import json
import pylab
import pandas as pd
import matplotlib.pyplot as plt
import os, sys, logging, warnings, time

In [None]:
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "behavior_patterns/src/"))
import profile_builder as profile_manager

## Contruccion de "Footprints"

En esta etapa contruiremos los "footprints" basicos (tensores dimesionales) de cada cliente a partir de sus datos transaccionales, la dimesiones del footprint son elegidas por el usuario, nosotros tenemos los siguientes casos de estudio:

**Datos de bancarios**:
 - Sesion de TX
   - s = {cliente, timestamp, Monto}  
 - Cada cliente tiene una secuencia de sesiones de TX (S)
    - S = { s1, s2 , s3, ..., sn}                n: # de sesiones de cada cliente

**Datos mobiles**:
 - Sesion de TX
   - s = {cliente, timestamp, Monto}  
 - Cada cliente tiene una secuencia de sesiones de TX (S)
    - S = { s1, s2 , s3, ..., sn}    

In [None]:
to_read = 'bank_trx' 

In [None]:
sessions = profile_manager.session_data(font=to_read)
sessions.head(3)

In [None]:
footprints = profile_manager.footprints(font=to_read,count_session=False)

In [None]:
footprints = profile_manager.footprints(font=to_read,count_session=False)

In [None]:
footprints.head()
#footprints

In [None]:
# footprints[['footprint_id','year','week','profile_id','category','turn','size','c10t4d0']]

## Clustering individual

En esta etapa contruiremos los "footprints" basicos (tensores dimesionales) de cada cliente a partir de sus datos transaccionales, la dimesiones del footprint son elegidas por el usuario, nosotros tenemos los siguientes casos de estudio:

In [None]:
# FUNCIONES DE APOYO (CLUSTERS)

def process_footprint(data,tests,log=False):
    from sklearn.cluster import MiniBatchKMeans
    #KMeans(init='k-means++', n_clusters=k, n_init=10)
    import datetime
    K={}  #  Creamos una lista vacia
    # Probamos para cada K
    for k in tests:
        if k<=len(data):
            if log:
                print("%s: processing %s"%(datetime.datetime.now(),k))
            
            # Cargamos en K(indice k) = los resultados de "MiniBatchKMeans"
            K[k]=bench_k_means(MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=100,
                      n_init=10, max_no_improvement=10, verbose=0,
                      random_state=0),name="k-means++", data=data)
            
    return K

def bench_k_means(estimator, name, data,distance_function=None):
    from sklearn import metrics
    from sklearn.metrics import silhouette_samples, silhouette_score
    import time
    t0 = time.time()
    if distance_function:
        estimator.fit(data,distance_function)
    else:
        estimator.fit(data)
    #cluster_labels = estimator.fit_predict(data)
    #silhouette_score_ = silhouette_score(data, cluster_labels)
    
    inertia=estimator.inertia_
    duration=time.time() - t0
    return {'inertia':inertia,'duration':duration, 'estimator':estimator}#,'silhouette':silhouette_score_}

def compute_best_k(x,y,occurrencies, plot=False,points=1000,sf=0.9):
    import numpy as np
    
    if len(x)<5:
        b_k = max(1, round(np.sqrt(occurrencies/2)))
        if plot:
            import pylab
            pylab.plot(x,y)
            pylab.scatter(x[b_k],y[b_k],s=20, marker='o')
            pylab.text(x[b_k],y[b_k],"bestK %s" %(b_k))
            pylab.show()
            return b_k

        return b_k
    
    import scipy.interpolate as inter
    from scipy.interpolate import interp1d
    from scipy.interpolate import UnivariateSpline
    # spl = UnivariateSpline(x, y)
    spl = inter.InterpolatedUnivariateSpline (x, y)
    spl.set_smoothing_factor(sf)
    xs = np.linspace(min(x), max(x), points)
    ys = spl(xs)
    idx_better_k=get_change_point(xs, ys)
    if plot:
        import pylab
        pylab.plot(xs,ys)
        
        pylab.scatter(xs[idx_better_k],ys[idx_better_k],s=20, marker='o')
        pylab.text(xs[idx_better_k],ys[idx_better_k],"bestK %s" %(np.round(xs[idx_better_k])))
        pylab.show()
        return int(np.round(xs[idx_better_k]))
    return int(np.round(xs[idx_better_k]))

def get_change_point(x, y):
    """
         Elección del mejor K
         :: param x: lista de valores de K
         :: param y: lista de valores de SSE
    """
    import math
    max_d = -float('infinity')
    index = 0

    for i in range(0, len(x)):
        c = closest_point_on_segment(a=[x[0], y[0]], b=[x[len(x)-1], y[len(y)-1]], p=[x[i], y[i]])
        d = math.sqrt((c[0]-x[i])**2 + (c[1]-y[i])**2)
        if d > max_d:
            max_d = d
            index = i
    
    return index

def closest_point_on_segment(a, b, p):
    sx1 = a[0]
    sx2 = b[0]
    sy1 = a[1]
    sy2 = b[1]
    px = p[0]
    py = p[1]

    x_delta = sx2 - sx1
    y_delta = sy2 - sy1

    if x_delta == 0 and y_delta == 0:
        return p

    u = ((px - sx1) * x_delta + (py - sy1) * y_delta) / (x_delta * x_delta + y_delta *  y_delta)
    if u < 0:
        closest_point = a
    elif u > 1:
        closest_point = b
    else:
        cp_x = sx1 + u * x_delta
        cp_y = sy1 + u * y_delta
        closest_point = [cp_x, cp_y]

    return closest_point
  

In [None]:
def process_data(to_cluster):
    K=process_footprint(to_cluster,np.arange(1,len(to_cluster)+1))
    
    # Choose k
    x=list(K.keys())
    y=[K[k]['inertia'] for k in K]
    best_k=compute_best_k(x,y,len(to_cluster))
    
    # print(str(contador)+' => clustering: '+str(clientes[n_cliente])+' len data: '+str(len(data))+" best k: "+str(best_k))

    # clustering
    if best_k==1:
        #to few records
        cluster_centers = [np.average(to_cluster,axis=0)]
        labels = [0]*len(to_cluster)  
    else:
        cluster_centers = K[best_k]['estimator'].cluster_centers_
        labels = K[best_k]['estimator'].labels_

    return cluster_centers,labels

In [None]:
title = list(footprints.columns)[8:]
users = np.unique(footprints['user_id'])

In [None]:
abc = footprints[['user_id','footprint_id']].groupby(by="user_id").count()
abcd = abc[abc['footprint_id']>20]
users2 = list(abcd.index)
abcd.head()

In [None]:
file_individual_cluster = 'footprint_individual.cluster'
file_individual_label = 'footprint_individual.label'

In [None]:
start = time.time()

individual_cluster = []
individual_labels = []

#for user in users2[0:10]:
for user in users[2:10]:
    user_footprints = footprints[footprints['user_id'] == user]
    #print(user)
    buffer = []
    for index, row in user_footprints.iterrows():
        buffer.append(([row['footprint_id'],user,row['year'],row['week']],row[title].values))

    # Clustering buffer
    to_cluster=[el[1] for el in buffer]
    temporal= process_data(to_cluster)
    
    cluster_centers_=temporal[0]
    cluster_labels_=temporal[1]

    for i in np.arange(len(cluster_centers_)):
        individual_cluster.append([user, i, *cluster_centers_[i]])

    cont = 0
    for row in buffer:
        individual_labels.append(row[0] + [cluster_labels_[cont]])
        cont+=1
        
end = time.time()
print(end - start)

In [None]:
sil

In [None]:
def compute_best_k(x,y,occurrencies, plot=True,points=30,sf=0.1):
    import numpy as np
    
    if len(x)<5:
        b_k = max(1, round(np.sqrt(occurrencies/2)))
        print(b_k)
        if plot:
            import pylab
            pylab.plot(x,y)
            pylab.scatter(x[b_k-1],y[b_k-1],s=20, marker='o')
            pylab.text(x[b_k-1],y[b_k-1],"bestK %s" %(b_k))
            pylab.show()
            return b_k
        return b_k
    
    import scipy.interpolate as inter
    from scipy.interpolate import UnivariateSpline
    
    spl = inter.InterpolatedUnivariateSpline (x, y)
    spl.set_smoothing_factor(sf)
    xs = np.linspace(min(x), max(x), points)
    ys = spl(xs)
    idx_better_k=get_change_point(xs, ys)
    if plot:
        import pylab
        pylab.plot(xs,ys)
        
        pylab.scatter(xs[idx_better_k],ys[idx_better_k],s=20, marker='o')
        pylab.text(xs[idx_better_k],ys[idx_better_k],"bestK %s" %(np.round(xs[idx_better_k])))
        pylab.show()
        return int(np.round(xs[idx_better_k]))
    return int(np.round(xs[idx_better_k]))


In [None]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Create dataset with 3 random cluster centers and 1000 datapoints
x, y = make_blobs(n_samples = 1000, centers = 3, n_features=2, shuffle=True, random_state=31)

sil = []
kmax = 10

# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
for k in range(2, kmax+1):
    kmeans = KMeans(n_clusters = k).fit(x)
    labels = kmeans.labels_
    sil.append(silhouette_score(x, labels, metric = 'euclidean'))

In [None]:
user_footprints = footprints[footprints['user_id'] == users2[7]]
print(user)
print(len(user_footprints))
buffer = []
for index, row in user_footprints.iterrows():
    buffer.append(([row['footprint_id'],user,row['year'],row['week']],row[title].values))
    
    
to_cluster=[el[1] for el in buffer]
temporal= process_data(to_cluster)

In [None]:
user_footprints = footprints[footprints['user_id'] == users2[4]]
print(user)
print(len(user_footprints))
buffer = []
for index, row in user_footprints.iterrows():
    buffer.append(([row['footprint_id'],user,row['year'],row['week']],row[title].values))
    
to_cluster=[el[1] for el in buffer]

temporal= process_data(to_cluster)

In [None]:
temporal[1]

In [None]:
import scipy.interpolate as inter
from scipy.interpolate import UnivariateSpline

In [None]:
K=process_footprint(to_cluster,np.arange(1,len(to_cluster)+1))

In [None]:
# Choose k
x=list(K.keys())
y=[K[k]['inertia'] for k in K]

In [None]:
spl = UnivariateSpline(x[0:20], y[0:20])

In [None]:
sf=0.9
spl = UnivariateSpline(x, y)
spl.set_smoothing_factor(sf)

In [None]:
spl = UnivariateSpline(x,y)

In [None]:
spl = inter.InterpolatedUnivariateSpline (x, y)
spl.set_smoothing_factor(sf)

In [None]:
spl.get_coeffs()

In [None]:
best_k=compute_best_k(x,y,s=0.1)

In [None]:
best_k=compute_best_k(x[0:18], y[0:18],18)

In [None]:
K=process_footprint(to_cluster,np.arange(1,len(to_cluster)+1))
    
    # Choose k
    x=list(K.keys())
    y=[K[k]['inertia'] for k in K]
    best_k=compute_best_k(x,y,len(to_cluster))
    
    # clustering
    if best_k==1:
        #to few records
        cluster_centers = [np.average(to_cluster,axis=0)]
        labels = [0]*len(to_cluster)  
    else:
        cluster_centers = K[best_k]['estimator'].cluster_centers_
        labels = K[best_k]['estimator'].labels_

    return cluster_centers,labels

In [None]:
def compute_best_k(x,y,occurrencies, plot=True,points=10,sf=0.9):
    import numpy as np
    
    if len(x)<5:
        b_k = max(1, round(np.sqrt(occurrencies/2)))
        if plot:
            import pylab
            pylab.plot(x,y)
            pylab.scatter(x[b_k],y[b_k],s=20, marker='o')
            pylab.text(x[b_k],y[b_k],"bestK %s" %(b_k))
            return b_k,pylab

        return b_k
    
    import scipy.interpolate as inter
    from scipy.interpolate import UnivariateSpline
    
    spl = inter.InterpolatedUnivariateSpline (x, y)
    spl.set_smoothing_factor(sf)
    xs = np.linspace(min(x), max(x), points)
    ys = spl(xs)
    idx_better_k=get_change_point(xs, ys)
    if plot:
        import pylab
        pylab.plot(xs,ys)
        
        pylab.scatter(xs[idx_better_k],ys[idx_better_k],s=20, marker='o')
        pylab.text(xs[idx_better_k],ys[idx_better_k],"bestK %s" %(np.round(xs[idx_better_k])))
        return int(np.round(xs[idx_better_k]))
    return int(np.round(xs[idx_better_k]))


# examen

In [101]:
def partition(arr, start, end):
    pivot = arr[end]
    i = start
    for j in range(start, end):
        if (arr[j] <= pivot): 
            arr[i],arr[j] = arr[j],arr[i] 
            i += 1            
    ### missing line ###
    #arr[i+1], arr[end] = arr[end], arr[i+1]
    #arr[i], arr[end-1] = arr[end-1], arr[i]
    #arr[i], arr[end] = arr[end], arr[i]
    arr[i+1], arr[end-1] = arr[end-1], arr[i+1]
    return i

In [110]:
abc = [3,6,2,5,8,1,0,7,32,65]
a = 1
b=6

In [111]:
partition(abc,a,b)

1

In [120]:
def sort(arr,start,end):
    if (start < end):
        ind = partition(arr,start,end)
        sort(arr, start, ind-1)
        sort(arr, ind, end) 

In [121]:
sort(abc,a,b)

RecursionError: maximum recursion depth exceeded in comparison