# FOOTPRINTS

In [1]:
import numpy as np
import datetime
from datetime import date
import json
import pylab
import pandas as pd
import matplotlib.pyplot as plt
import os, sys

## Preparacion de datos

### Cargando datos

In [2]:

def leer_data():
    outfile='./SOURCES/data.csv'
    data = pd.read_csv(outfile)
    return data

data_original = leer_data()
data_original.head(3)

Unnamed: 0,client_id,date,año,mes,dia,hora,merchant_departement,merchant_province,merchant_district,mcc,mccg,client_age,quantity,amount_sol,dia_semana,turno
0,NNeQwQy9MAQ=,2016-07-15 22:23:25,2016,7,15,22,LIMA,LIMA,San Juan De Luriganc,7994,2,33.0,1,100.0,Fri,4-Noche
1,qFRoPHPOc/I=,2016-06-21 21:30:55,2016,6,21,21,LIMA,LIMA,San Juan De Luriganc,7994,2,52.0,1,20.0,Tues,4-Noche
2,qFRoPHPOc/I=,2017-02-01 01:29:59,2017,2,1,1,LIMA,LIMA,San Juan De Luriganc,7994,2,52.0,1,100.0,Weds,1-Madrugada


### Preparacion de datos

In [97]:
data = data_original[['client_id','date','año','mes','dia','hora','mccg','quantity','amount_sol']]
data.head

<bound method NDFrame.head of           client_id                 date   año  mes  dia  hora  mccg  \
0      NNeQwQy9MAQ=  2016-07-15 22:23:25  2016    7   15    22     2   
1      qFRoPHPOc/I=  2016-06-21 21:30:55  2016    6   21    21     2   
2      qFRoPHPOc/I=  2017-02-01 01:29:59  2017    2    1     1     2   
3      qFRoPHPOc/I=  2017-03-09 02:26:05  2017    3    9     2     2   
4      he9pUcExpTY=  2016-10-01 06:45:48  2016   10    1     6     2   
5      he9pUcExpTY=  2016-10-01 05:13:21  2016   10    1     5     2   
6      he9pUcExpTY=  2016-10-16 08:14:18  2016   10   16     8     2   
7      he9pUcExpTY=  2016-09-02 23:45:21  2016    9    2    23     2   
8      he9pUcExpTY=  2016-10-01 06:01:49  2016   10    1     6     2   
9      he9pUcExpTY=  2017-03-15 22:06:43  2017    3   15    22     2   
10     he9pUcExpTY=  2016-10-16 08:03:33  2016   10   16     8     2   
11     he9pUcExpTY=  2016-09-30 04:49:39  2016    9   30     4     2   
12     qFRoPHPOc/I=  2016-12-17 01

## Definicion de variables

In [105]:
mccgs = ((data.groupby(['mccg'], as_index=False, sort=True)).count())['mccg']
mccgs,mccgs[3],len(mccgs)

(0    2
 1    3
 2    4
 3    5
 Name: mccg, dtype: int64, 5, 4)

### Clientes

In [68]:
clientes =  data.groupby('client_id').client_id.count().index
clientes

Index(['+9yJoBsAES4=', '+EUZmfQX5bA=', '+Ilmh+mr2WM=', '+JHjmW4a+e0=',
       '+JcXo4zlmNQ=', '+LuVUb5tUBg=', '+MD+jHUjjBs=', '+P5K869BYbE=',
       '+PAlAt5GZNg=', '+PJntcDJ/w0=',
       ...
       'zid9Qi0vD84=', 'ziqVTfxwWbk=', 'zj8o+tt7ags=', 'zjiFEuZAut0=',
       'zjrVzB1sNaw=', 'zk41R+DpP70=', 'zlQla+ah6U0=', 'zliw6zi3X+s=',
       'zs3qMAU6HN8=', 'zs9yIYFt4EI='],
      dtype='object', name='client_id', length=6521)

# FOOTPRINT PARA CADA MCCG

## Unidad de TXs temporales (U)

### Funciones

In [62]:
# definimos los 4 time_windows que usaremos

def time_window(hora):
    tw = 9999
    if hora >=0:
        tw = 0      # Madrugada
    if hora >=6:
        tw = 1      # Mañana
    if hora >=12:
        tw = 2      # Tarde
    if hora >=18:
        tw = 3      # Noche
    return tw

In [100]:
# Definimos los U 

def procesar_u(user):    
    uid=list(user['client_id'])[0]
    # Lista los años en que tiene txs el usuario
    years = set(list(user['año']))
    anni = {year:{} for year in list(years)}
    
    # para cada fila (para cada fecha)
    for dat in  range(0,len(user)):
        año = user.iloc[dat]['año']
        week=datetime.datetime(año,user.iloc[dat]['mes'],user.iloc[dat]['dia']).isocalendar()[1]
        weekday=datetime.datetime(año,user.iloc[dat]['mes'],user.iloc[dat]['dia']).weekday()
        turn = time_window(user.iloc[dat]['hora'])
        mccg = user.iloc[dat]['mccg']
        
        # Si la semana no existe en el año
        if not(week in anni[año]):
            anni[año][week] = {}
        # Si el mccg no existe en la semana y año
        if not (mccg in anni[año][week]):
            anni[año][week][mccg]={}  #NUMERO DE MCCGs VARIABLES
        # Si el turno no existe en el mccg,semana y año
        if not (turn in anni[año][week][mccg]):
            anni[año][week][mccg][turn]=np.array([0]*7)  #CUATRO TURNOS
            
        anni[año][week][mccg][turn][weekday]+=user.iloc[dat]['quantity'] # suma cantidades "importancia por compras"
        #anni[año][week][turn][weekday]+=user.iloc[dat]['amount_sol'] # suma montos "importancia por gastos"
            
    return uid,anni

### Procesando U de cada MCCG

In [135]:

file='./RESULTS/U'
    
##################################################
#        Procesando U de cada CLIENTE
##################################################
    
# Extraemos la lista de clientes sin repetir

profiles={}
#contador=0 
print("Number of rows "+str(len(data)))
# Para cada cliente
for cliente in clientes:
    cliente_i= data[data['client_id'] == cliente]
    ## ejecutamos para cada usuario
    results=procesar_u(cliente_i)
    profiles[results[0]]=results[1]
    #contador += 1
    #print(contador)


Number of rows 12566


In [136]:
title = 'customer_id,year,week,profile_id,turn,size'
for i in range(len(mccgs)):
    for j in range(4):            # numero de turnos
        for k in range(7):            # numero de dias
            title = title+','+'m'+str(mccgs[i])+'t'+str(j)+'d'+str(k)

title = title+'\n'
title

'customer_id,year,week,profile_id,turn,size,m2t0d0,m2t0d1,m2t0d2,m2t0d3,m2t0d4,m2t0d5,m2t0d6,m2t1d0,m2t1d1,m2t1d2,m2t1d3,m2t1d4,m2t1d5,m2t1d6,m2t2d0,m2t2d1,m2t2d2,m2t2d3,m2t2d4,m2t2d5,m2t2d6,m2t3d0,m2t3d1,m2t3d2,m2t3d3,m2t3d4,m2t3d5,m2t3d6,m3t0d0,m3t0d1,m3t0d2,m3t0d3,m3t0d4,m3t0d5,m3t0d6,m3t1d0,m3t1d1,m3t1d2,m3t1d3,m3t1d4,m3t1d5,m3t1d6,m3t2d0,m3t2d1,m3t2d2,m3t2d3,m3t2d4,m3t2d5,m3t2d6,m3t3d0,m3t3d1,m3t3d2,m3t3d3,m3t3d4,m3t3d5,m3t3d6,m4t0d0,m4t0d1,m4t0d2,m4t0d3,m4t0d4,m4t0d5,m4t0d6,m4t1d0,m4t1d1,m4t1d2,m4t1d3,m4t1d4,m4t1d5,m4t1d6,m4t2d0,m4t2d1,m4t2d2,m4t2d3,m4t2d4,m4t2d5,m4t2d6,m4t3d0,m4t3d1,m4t3d2,m4t3d3,m4t3d4,m4t3d5,m4t3d6,m5t0d0,m5t0d1,m5t0d2,m5t0d3,m5t0d4,m5t0d5,m5t0d6,m5t1d0,m5t1d1,m5t1d2,m5t1d3,m5t1d4,m5t1d5,m5t1d6,m5t2d0,m5t2d1,m5t2d2,m5t2d3,m5t2d4,m5t2d5,m5t2d6,m5t3d0,m5t3d1,m5t3d2,m5t3d3,m5t3d4,m5t3d5,m5t3d6\n'

In [137]:
individual_footprint="%s.individual_footprint" %(file)
fw=open(individual_footprint,'w')

fw.write(title)
footprints=0

In [139]:
  

for uid in profiles:
    profile_id=0
    for year in profiles[uid]:
        for week in profiles[uid][year]:
            temp=np.zeros(4*7*len(mccgs))
            for mccg in profiles[uid][year][week]:
                for turn in profiles[uid][year][week][mccg]:
                    d=profiles[uid][year][week][mccg][turn]
                    if(turn == 0):
                        for i in range(0,7):
                            temp[i] += d[i]
                    if(turn == 1):
                        for i in range(7,14):
                            temp[i] += d[i-7]
                    if(turn == 2):
                        for i in range(14,21):
                            temp[i] += d[i-14]
                    if(turn == 3):
                        for i in range(21,28):
                            temp[i] += d[i-21]


                fw.write(''+str(uid)+','+str(year)+','+str(week)+','+str(profile_id)+','+str(turn)+','+str(sum(temp))+','
                             +str(temp[0])+','+str(temp[1])+','+str(temp[2])+','+str(temp[3])+','+str(temp[4])+','+str(temp[5])+','+str(temp[6])+','
                             +str(temp[7])+','+str(temp[8])+','+str(temp[9])+','+str(temp[10])+','+str(temp[11])+','+str(temp[12])+','+str(temp[13])+','
                             +str(temp[14])+','+str(temp[15])+','+str(temp[16])+','+str(temp[17])+','+str(temp[18])+','+str(temp[19])+','+str(temp[20])+','
                             +str(temp[21])+','+str(temp[22])+','+str(temp[23])+','+str(temp[24])+','+str(temp[25])+','+str(temp[26])+','+str(temp[27])
                             +'\n')

            profile_id = profile_id + 1
    footprints+=profile_id
    fw.flush()
fw.close()
print ("number of footprint: "+str(footprints))

number of footprint: 10412


## Funciones de Apoyo (clusters)

In [None]:
def process_footprint(data,tests,log=False):
    from sklearn.cluster import MiniBatchKMeans
    #KMeans(init='k-means++', n_clusters=k, n_init=10)
    import datetime
    K={}
    for k in tests:
        if k<=len(data):
            if log:
                print("%s: processing %s"%(datetime.datetime.now(),k))
            K[k]=bench_k_means(MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=100,
                      n_init=10, max_no_improvement=10, verbose=0,
                      random_state=0),name="k-means++", data=data)
    return K

In [None]:
def compute_best_k(x,y,occurrencies, plot=False,points=1000,sf=0.9):
    import numpy as np
    
    if len(x)<5:
        b_k = max(1, round(np.sqrt(occurrencies/2)))
        if plot:
            import pylab
            pylab.plot(x,y)
            pylab.scatter(x[b_k],y[b_k],s=20, marker='o')
            pylab.text(x[b_k],y[b_k],"bestK %s" %(b_k))
            return b_k,pylab

        return b_k
    
    from scipy.interpolate import interp1d
    from scipy.interpolate import UnivariateSpline
    spl = UnivariateSpline(x, y)
    spl.set_smoothing_factor(sf)
    xs = np.linspace(min(x), max(x), points)
    ys = spl(xs)
    idx_better_k=get_change_point(xs, ys)
    if plot:
        import pylab
        pylab.plot(xs,ys)
        
        pylab.scatter(xs[idx_better_k],ys[idx_better_k],s=20, marker='o')
        pylab.text(xs[idx_better_k],ys[idx_better_k],"bestK %s" %(np.round(xs[idx_better_k])))
        return int(np.round(xs[idx_better_k])),pylab
    return int(np.round(xs[idx_better_k]))

In [None]:
def bench_k_means(estimator, name, data,distance_function=None):
    from sklearn import metrics
    from sklearn.metrics import silhouette_samples, silhouette_score
    import time
    t0 = time.time()
    if distance_function:
        estimator.fit(data,distance_function)
    else:
        estimator.fit(data)
    #cluster_labels = estimator.fit_predict(data)
    #silhouette_score_ = silhouette_score(data, cluster_labels)
    
    inertia=estimator.inertia_
    duration=time.time() - t0
    return {'inertia':inertia,'duration':duration, 'estimator':estimator}#,'silhouette':silhouette_score_}

def get_change_point(x, y):
    """
         Elección del mejor K
         :: param x: lista de valores de K
         :: param y: lista de valores de SSE
    """
    import math
    max_d = -float('infinity')
    index = 0

    for i in range(0, len(x)):
        c = closest_point_on_segment(a=[x[0], y[0]], b=[x[len(x)-1], y[len(y)-1]], p=[x[i], y[i]])
        d = math.sqrt((c[0]-x[i])**2 + (c[1]-y[i])**2)
        if d > max_d:
            max_d = d
            index = i
    
    return index

def closest_point_on_segment(a, b, p):
    sx1 = a[0]
    sx2 = b[0]
    sy1 = a[1]
    sy2 = b[1]
    px = p[0]
    py = p[1]

    x_delta = sx2 - sx1
    y_delta = sy2 - sy1

    if x_delta == 0 and y_delta == 0:
        return p

    u = ((px - sx1) * x_delta + (py - sy1) * y_delta) / (x_delta * x_delta + y_delta *  y_delta)
    if u < 0:
        closest_point = a
    elif u > 1:
        closest_point = b
    else:
        cp_x = sx1 + u * x_delta
        cp_y = sy1 + u * y_delta
        closest_point = [cp_x, cp_y]

    return closest_point
	