In [1]:
import pandas as pd
import numpy as np
import datetime
import time
import math

from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import MiniBatchKMeans
import pylab
from scipy.interpolate import interp1d
from scipy.interpolate import UnivariateSpline
    
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

import csv

from tqdm import tqdm, trange
import warnings

from dask.distributed import Client

## Footprints generation

#### Preparacion de datos

In [2]:
def leer_data():
    infile='data/datos_consumo.csv'
    data = pd.read_csv(infile, encoding = "latin-1", dtype={'BILLCYCLE': object}, low_memory=False)
    return data

datos = leer_data()
datos['YEAR'] = datos['F_TRAFICO'].apply(lambda fecha: int(fecha[6:]))

print('Done')

Done


In [3]:
datos.head()

Unnamed: 0,CO_ID,BILLCYCLE,NUMCODPLANTARIFARIO,F_ACT,F_TRAFICO,HORA,GB_TOTAL,NO_FREE,FREE,BANCOS,...,APPLEM_PI,SOUNDC_PI,FACE_PI,MAIL_PI,WHATS_RO_I,WHATS_RO_IF,WAZE_RO,EMAIL_RO,WHARS_RO2,YEAR
0,40045637,1,1982,12/12/2017,01/05/2018,Mañana,0.0013,0.0012,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,2018
1,40461321,5,1982,20/12/2017,01/05/2018,Madrugada,0.0029,0.0019,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,2018
2,44277329,1,1981,24/03/2018,01/05/2018,Madrugada,0.0004,0.0004,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,2018
3,32782858,4,1982,16/05/2017,01/05/2018,Noche,0.6216,0.5948,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,2018
4,38979920,3,1982,09/11/2017,01/05/2018,Mañana,0.0011,0.0009,0.0,0.0,...,0.0,0.0,0,0,0.0,0.0,0.0,0,0.0,2018


In [4]:
datos.columns

Index(['CO_ID', 'BILLCYCLE', 'NUMCODPLANTARIFARIO', 'F_ACT', 'F_TRAFICO',
       'HORA', 'GB_TOTAL', 'NO_FREE', 'FREE', 'BANCOS', 'YOUTUBE', 'NETFLIX',
       'WHATSAPP', 'WAZE', 'INSTAGRAM', 'SPOTIFY', 'APPLEM', 'SOUNDC',
       'FACEBOOK', 'O_WHATS_WAZE_MI', 'O_FACE_FLEX', 'O_INSTAGRAM_ACC',
       'O_APPLEM', 'O_INSTAGRAM_FULL', 'O_SOUNDC', 'O_SPOTIFY', 'O_EMAIL',
       'TDEFAULT_PI', 'YOUTUBE_PI', 'ND_PI', 'NETFLIX_PI', 'WHATSAPP_PI',
       'WAZE_PI', 'INSTAGRAM_PI', 'SPOTIFY_PI', 'APPLEM_PI', 'SOUNDC_PI',
       'FACE_PI', 'MAIL_PI', 'WHATS_RO_I', 'WHATS_RO_IF', 'WAZE_RO',
       'EMAIL_RO', 'WHARS_RO2', 'YEAR'],
      dtype='object')

In [5]:
len(datos), len(np.unique(datos['CO_ID']))

(5118984, 46994)

In [8]:
datos = datos.values

In [71]:
def time_windows(text):
    if (text == "Madrugada"):
        return 0
    if (text == "Mañana"):
        return 1
    if (text == "Tarde"):
        return 2
    if (text == "Noche"):
        return 3

In [74]:
events_dict = {}
set_turn = set()
set_weekday = set()

for tupla in tqdm(datos[:10000]):
    trx_user = tupla[0]
    trx_date = tupla[4] # F_TRAFICO
    trx_date = pd.to_datetime(trx_date, format='%d/%m/%Y', errors='coerce')
    trx_year = str(trx_date.year)
    trx_week = str(trx_date.isocalendar()[1])
    trx_weekday = trx_date.isocalendar()[2]-1
    trx_turn = time_windows(tupla[5]) # HORA

    set_turn.add(trx_turn)
    set_weekday.add(trx_weekday)
    
    if trx_user not in events_dict:
        events_dict[trx_user] = {}
    if trx_year not in events_dict[trx_user]:
        events_dict[trx_user][trx_year] = {}
    if trx_week not in events_dict[trx_user][trx_year]:
        events_dict[trx_user][trx_year][trx_week] = {}     
    if trx_weekday not in events_dict[trx_user][trx_year][trx_week]:
        events_dict[trx_user][trx_year][trx_week][trx_weekday] = {}   
    if trx_turn not in events_dict[trx_user][trx_year][trx_week][trx_weekday]:
        events_dict[trx_user][trx_year][trx_week][trx_weekday][trx_turn] = tupla[7:-1]

100%|██████████| 10000/10000 [00:01<00:00, 6221.38it/s]


In [75]:
set_app = ['NO_FREE', 'FREE', 'BANCOS', 'YOUTUBE', 'NETFLIX', 'WHATSAPP', 'WAZE', 'INSTAGRAM', 
           'SPOTIFY', 'APPLEM', 'SOUNDC', 'FACEBOOK', 'O_WHATS_WAZE_MI', 'O_FACE_FLEX', 'O_INSTAGRAM_ACC',
           'O_APPLEM', 'O_INSTAGRAM_FULL', 'O_SOUNDC', 'O_SPOTIFY', 'O_EMAIL', 'TDEFAULT_PI', 'YOUTUBE_PI',
           'ND_PI', 'NETFLIX_PI', 'WHATSAPP_PI', 'WAZE_PI', 'INSTAGRAM_PI', 'SPOTIFY_PI', 'APPLEM_PI',
           'SOUNDC_PI', 'FACE_PI', 'MAIL_PI', 'WHATS_RO_I', 'WHATS_RO_IF', 'WAZE_RO', 'EMAIL_RO', 'WHARS_RO2']
set_weekday = sorted(set_weekday)
set_turn = sorted(set_turn)

In [79]:
title = ['FOOTPRINT_ID,YEAR,WEEK,PROFILE_ID,SIZE']
titlec = []

for weekday in set_weekday:
    for turn in set_turn:
        for app in set_app:
            titlec.append('d'+str(weekday)+'-t'+str(turn)+'-'+str(app))
    
title = title + titlec

titlec

['d1-t0-NO_FREE',
 'd1-t0-FREE',
 'd1-t0-BANCOS',
 'd1-t0-YOUTUBE',
 'd1-t0-NETFLIX',
 'd1-t0-WHATSAPP',
 'd1-t0-WAZE',
 'd1-t0-INSTAGRAM',
 'd1-t0-SPOTIFY',
 'd1-t0-APPLEM',
 'd1-t0-SOUNDC',
 'd1-t0-FACEBOOK',
 'd1-t0-O_WHATS_WAZE_MI',
 'd1-t0-O_FACE_FLEX',
 'd1-t0-O_INSTAGRAM_ACC',
 'd1-t0-O_APPLEM',
 'd1-t0-O_INSTAGRAM_FULL',
 'd1-t0-O_SOUNDC',
 'd1-t0-O_SPOTIFY',
 'd1-t0-O_EMAIL',
 'd1-t0-TDEFAULT_PI',
 'd1-t0-YOUTUBE_PI',
 'd1-t0-ND_PI',
 'd1-t0-NETFLIX_PI',
 'd1-t0-WHATSAPP_PI',
 'd1-t0-WAZE_PI',
 'd1-t0-INSTAGRAM_PI',
 'd1-t0-SPOTIFY_PI',
 'd1-t0-APPLEM_PI',
 'd1-t0-SOUNDC_PI',
 'd1-t0-FACE_PI',
 'd1-t0-MAIL_PI',
 'd1-t0-WHATS_RO_I',
 'd1-t0-WHATS_RO_IF',
 'd1-t0-WAZE_RO',
 'd1-t0-EMAIL_RO',
 'd1-t0-WHARS_RO2',
 'd1-t1-NO_FREE',
 'd1-t1-FREE',
 'd1-t1-BANCOS',
 'd1-t1-YOUTUBE',
 'd1-t1-NETFLIX',
 'd1-t1-WHATSAPP',
 'd1-t1-WAZE',
 'd1-t1-INSTAGRAM',
 'd1-t1-SPOTIFY',
 'd1-t1-APPLEM',
 'd1-t1-SOUNDC',
 'd1-t1-FACEBOOK',
 'd1-t1-O_WHATS_WAZE_MI',
 'd1-t1-O_FACE_FLEX',
 'd1-t1-O_INS

In [78]:
# user, year, week, day, turn data_app
events_dict[32782858]

{'2018': {'18': {1: {3: array([0.5948, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0136, 0.0, 0.0, 0.0, 0.0, 0.0,
           0.0, 0.0132, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.0,
           0.0, 0.0, 0.0, 0.0, 0.0, 0, 0, 0.0, 0.0, 0.0, 0, 0.0], dtype=object)}}}}

In [109]:
footprint_file = "data/footprints.csv"
fw0=open(footprint_file,'w')
fw0.write(",".join(map(str, title))+"\n")

l_turn = len(set_turn)
l_weekday = len(set_weekday)
l_app= len(set_app)

vector_zeros_temp = np.zeros(l_weekday*l_turn*l_app)
vector_zeros_temp2 = np.zeros(l_turn*l_app) 


for customer in tqdm(events_dict):
    profile_id=0
    for year in events_dict[customer]:
        for week in events_dict[customer][year]:
            temp = vector_zeros_temp.copy()
            for weekday in events_dict[customer][year][week]:                             # cargamos los apps
                temp2 = vector_zeros_temp2.copy()   
                for turn in events_dict[customer][year][week][weekday]:                   # cargamos turnos
                    d = events_dict[customer][year][week][weekday][turn]
                    temp2[turn*l_app:(turn+1)*l_app] = d
                
                #temp[weekday*l_turn*l_app:(weekday+1)*l_turn*l_app] = temp2
                print(len(temp2))

#                 pos_app = set_app.index(weekday)                                          # posicion del app
#                 temp2 = vector_zeros_turn.copy()                                      # creamos vector temporal turnos por dias con 0s
#                     pos_turn = set_turn.index(turn)                                   # posicion del turno
#                     temp2[turno_help(turno)*38:(turno_help(turno)+1)*38] = profiles[uid][year][week][weekday][turno]
                
# for turno in profiles[uid][year][week][weekday]:                        
#                     #print(uid,year,week,weekday,turno,len(profiles[uid][year][week][weekday][turno]))
#                     temp2[turno_help(turno)*38:(turno_help(turno)+1)*38] = profiles[uid][year][week][weekday][turno]
#                 temp[weekday*len(temp2):(weekday+1)*len(temp2)] = temp2
          
                
fw0.close()  

  4%|▍         | 386/9282 [00:00<00:02, 3852.12it/s]

148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148


 12%|█▏        | 1133/9282 [00:00<00:02, 3416.35it/s]


148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148

 16%|█▌        | 1476/9282 [00:00<00:02, 3059.74it/s]

148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148


 23%|██▎       | 2136/9282 [00:00<00:02, 2885.30it/s]


148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148

 27%|██▋       | 2529/9282 [00:00<00:02, 3177.66it/s]

148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148


 36%|███▌      | 3314/9282 [00:01<00:01, 3362.30it/s]


148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148

 43%|████▎     | 3994/9282 [00:01<00:01, 3315.09it/s]

148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148


 53%|█████▎    | 4892/9282 [00:01<00:01, 3862.86it/s]

148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148


 63%|██████▎   | 5838/9282 [00:01<00:00, 4230.18it/s]

148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148


 72%|███████▏  | 6679/9282 [00:01<00:00, 4126.05it/s]


148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148

 77%|███████▋  | 7184/9282 [00:01<00:00, 4379.39it/s]


148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148

 87%|████████▋ | 8095/9282 [00:02<00:00, 4458.99it/s]


148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148

100%|██████████| 9282/9282 [00:02<00:00, 3860.11it/s]

148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148
148





In [100]:

l_turn = len(set_turn)
l_weekday = len(set_weekday)
l_app= len(set_app)

vector_zeros_temp = np.zeros(l_weekday*l_turn*l_app)
vector_zeros_temp2 = np.zeros(l_turn*l_app) 

In [101]:
l_turn, l_weekday, l_app

(4, 5, 37)

In [103]:
vector_zeros_temp2

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
title = ['client_id','year','week','profile_id','mccg','turn','size']
titlec = []
for mccg in set_mcc:
    for weekday in set_weekday:
        for turn in set_turn:
            titlec.append('m'+str(mccg)+'_d'+str(weekday)+'_t'+str(turn))
title = title + titlec


footprint_file = "footprints.csv"
fw0=open(footprint_file,'w')
fw0.write(",".join(map(str, title))+"\n")

#footprint_result = [title]
l_turn = len(set_turn)
l_weekday = len(set_weekday)
l_mcc = len(set_mcc)
vector_zeros_mcc = np.zeros(l_mcc*l_weekday*l_turn)
vector_zeros_turn = np.zeros(l_weekday*l_turn) 

for customer in tqdm(events_dict):
    profile_id=0
    for year in events_dict[customer]:
        for week in events_dict[customer][year]:
            temp = vector_zeros_mcc.copy() 
            for mcc in events_dict[customer][year][week]:                             # cargamos los mccgs
                pos_mcc = set_mcc.index(mcc)                                            # posicion del mccg
                temp2 = vector_zeros_turn.copy()                                        # creamos vector temporal turnos por dias con 0s
                for turn in events_dict[customer][year][week][mcc]:                     # cargamos turnos
                    pos_turn = set_turn.index(turn)                                       # posicion del turno
                    d = events_dict[customer][year][week][mcc][turn]                      # Dias como array
                    for k in range(pos_turn*l_weekday,(pos_turn+1)*l_weekday):            # cargamos dias
                        temp2[k] += d[k-(pos_turn*l_weekday)]                               # posicion de cada dia en el vector temporal
                for j in range(pos_mcc*l_weekday*l_turn,(pos_mcc+1)*l_weekday*l_turn):
                    temp[j] = temp2[j-(pos_mcc*l_weekday*l_turn)]                         # vector temporal es cargado en el tensor
            list_raw = [customer, year, week, profile_id, mcc, turn, sum(temp)] + list(temp)      # Escribimos los datos del primer comportamiento (Tensor)
            profile_id += 1                                                           # perfil cambia cada unidad de fecha diferente (eg. cada semana != es un perfil) de cada cliente
            #footprint_result.append(list_raw)
            
            # Salvando Footprints
            string = (",".join(map(str, list_raw)))+"\n"
            fw0.write(string)
    fw0.flush()
fw0.close()         
del events_dict


In [None]:
# Salvando Footprints
# print('Numero de registros:',len(footprint_result)-1)
# time.sleep(1)
# with open("footprints.csv", 'w') as f:   
#     writer = csv.writer(f, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
#     for item in tqdm(footprint_result):
#         writer.writerow(item)

### Footprints recovering

In [None]:
# Opcional Read footprints
footprint_df=pd.read_csv('footprints.csv', sep=',')
len(footprint_df)
footprint_df.head()

In [None]:
titlec = list(footprint_df.columns)[7:]

In [None]:
footprint_result = footprint_df.values
del footprint_df

footprint_dict = {}

for row in tqdm(footprint_result):
    client = row[0]
    if client not in footprint_dict:
        footprint_dict[client] = []
    footprint_dict[client].append(list(row[1:]))
    
del footprint_result

## Function to clustering

In [None]:
def process_footprint(data,log=False):
    # KMeans(init='k-means++', n_clusters=k, n_init=10)
    tests = np.arange(1,len(data)+1)
    K={}
    for k in tests:
        K[k]=bench_k_means(MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=30, #3000
                                           n_init=10, max_no_improvement=10, verbose=0,
                                           random_state=0),
                           name="k-means++", data=data)
    return K

def plot_best_k(x,y,b_k):
    pylab.plot(x,y)
    pylab.scatter(x[b_k],y[b_k],s=20, marker='o')
    pylab.text(x[b_k],y[b_k],"Better k = %s" %(b_k))
    pylab.show() 
    #return pylab

def compute_best_k(x, y, occurrencies, minimum_threshold = 5, plot=False, reason_points=10, sf=0.9):
    '''
    '''
    
    if occurrencies <= minimum_threshold:
        b_k = max(1, round(np.sqrt(occurrencies/2)))
        #b_k = occurrencies
        if plot:
            print('Better K is',b_k,'by definition')
            #plot_best_k(x,y,b_k)
    else:
        #warnings.simplefilter("ignore")
        spl = UnivariateSpline(x, y, k=3, s=0)
        spl.set_smoothing_factor(sf)
        #warnings.simplefilter("default")
        #spl = interp1d(x, y)
        points = reason_points*occurrencies
        xs = np.linspace(min(x), max(x), points)
        ys = spl(xs)
        idx_better_k = get_change_point(xs, ys)
        b_k = int(np.round(xs[idx_better_k]))
        if plot:
            plot_best_k(xs,ys,idx_better_k)
    return b_k

def bench_k_means(estimator, name, data,distance_function=None):

    
    t0 = time.time()
    if distance_function:
        estimator.fit(data,distance_function)
    else:
        estimator.fit(data)
    #cluster_labels = estimator.fit_predict(data)
    #silhouette_score_ = silhouette_score(data, cluster_labels)
    
    inertia=estimator.inertia_
    duration=time.time() - t0
    return {'inertia':inertia,'duration':duration, 'estimator':estimator}#,'silhouette':silhouette_score_}

def get_change_point(x, y):
    """
         Elección del mejor K
         :: param x: lista de valores de K
         :: param y: lista de valores de SSE
    """
    import math
    max_d = -float('infinity')
    index = 0

    for i in range(0, len(x)):
        c = closest_point_on_segment(a=[x[0], y[0]], b=[x[len(x)-1], y[len(y)-1]], p=[x[i], y[i]])
        d = math.sqrt((c[0]-x[i])**2 + (c[1]-y[i])**2)
        if d > max_d:
            max_d = d
            index = i
    
    return index

def closest_point_on_segment(a, b, p):
    sx1 = a[0]
    sx2 = b[0]
    sy1 = a[1]
    sy2 = b[1]
    px = p[0]
    py = p[1]

    x_delta = sx2 - sx1
    y_delta = sy2 - sy1

    if x_delta == 0 and y_delta == 0:
        return p

    u = ((px - sx1) * x_delta + (py - sy1) * y_delta) / (x_delta * x_delta + y_delta *  y_delta)
    if u < 0:
        closest_point = a
    elif u > 1:
        closest_point = b
    else:
        cp_x = sx1 + u * x_delta
        cp_y = sy1 + u * y_delta
        closest_point = [cp_x, cp_y]

    return closest_point

### Individual clustering

In [None]:
# client = Client(n_workers=8) # In this example I have 8 cores and processes (can also use threads if desired)

def my_function_parallel(i):
    key = i[0]
    value = i[1]
    
    to_cluster = []
    to_tag = []
    for v in value:
        to_cluster.append(v[6:])
        to_tag.append([key]+v[0:3])
    
    if len(value) > MIN_TO_CLUTER: 
        K=process_footprint(to_cluster)
        # Choose k
        x=list(K.keys())
        y=[K[k]['inertia'] for k in K]
        with warnings.catch_warnings():
#             warnings.simplefilter("ignore")
#             if len(x)>=20:
#                 x = x[0:20]
#                 y = y[0:20]
            best_k=compute_best_k(x,y,len(y),plot=False)
        # Clustering
        if best_k==1:
            cluster_centers=[np.average(to_cluster,axis=0)]
            labels=[0]*len(to_cluster)  
        else:
            cluster_centers=K[best_k]['estimator'].cluster_centers_
            labels=K[best_k]['estimator'].labels_
    else:
        cluster_centers=np.array(to_cluster)
        labels=np.array(range(0,len(to_cluster)))
    
    return [key,cluster_centers,labels]

In [None]:
test_list = list(footprint_dict.keys())

In [None]:
global MIN_TO_CLUTER
MIN_TO_CLUTER = 7

DELTA = 900
final_list = np.array_split(test_list,DELTA)

client = Client(n_workers=10)
client

In [None]:
title = ['client_id','year','week','profile_id']
titles = (",".join(map(str, title + titlec)))+",tag\n"
individual_labels = "individual_footprint_tagged.csv"
fw1=open(individual_labels,'w')
fw1.write(titles)

title = ['client_id','tag']
titles = (",".join(map(str, title + titlec)))+"\n"
individual_clusters = "individual_footprint_clusters.csv"
fw2=open(individual_clusters,'w')
fw2.write(titles)

print('Iteraciones':len(final_list))

In [None]:
counter = 0
for sublist in final_list:
    
    print('Iteracion',counter)
    
    futures = []
    for key in sublist:
        value = footprint_dict[key]
        i = [key, value]
        future = client.submit(my_function_parallel, i)
        futures.append(future)
        
    results = client.gather(futures)
    
    for row in results:
        key = row[0]
        value = footprint_dict[key]
        cluster_centers = row[1]
        labels = row[2]

        to_cluster = []
        to_tag = []
        for v in value:
            to_cluster.append(v[6:])
            to_tag.append([key]+v[0:3])

        # Save labels
        for index, item in enumerate(to_cluster):
            string = (",".join(map(str, to_tag[index])))+","+(",".join(map(str, item)))+","+str(labels[index])+"\n"
            fw1.write(string)
        fw1.flush()

        # Save centroids
        for index, item in enumerate(cluster_centers):
            string = key+","+str(index)+","+(",".join(map(str, item)))+"\n"
            fw2.write(string)
        fw2.flush()    
    
    time.sleep(2)
    counter+=1

In [None]:
client.close()
fw1.close()
fw2.close()

### Recovering individual footprint clusters

In [None]:
# Opcional Read footprints
individual_cluster_df=pd.read_csv('individual_footprint_clusters.csv', sep=',')
len(individual_cluster_df)
individual_cluster_df.head()

### Global clustering

In [None]:
def process_footprint_global(data, top_limit=None, batch_size=30):
    # KMeans(init='k-means++', n_clusters=k, n_init=10)
    if top_limit==None:
        top_limit=len(data)+1
    
    tests = np.arange(1,top_limit)
    K={}
    for k in tqdm(tests):
        with warnings.catch_warnings():
            K[k]=bench_k_means(MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=batch_size, #3000
                                               n_init=10, max_no_improvement=10, verbose=0,
                                               random_state=0),
                               name="k-means++", data=data)
    return K

def plot_best_k(x,y,b_k):
    pylab.plot(x,y)
    pylab.scatter(x[b_k],y[b_k],s=20, marker='o')
    pylab.text(x[b_k],y[b_k],"Better k = %s" %(b_k))
    pylab.show() 
    #return pylab

def compute_best_k(x, y, occurrencies, minimum_threshold = 5, plot=False, reason_points=10, sf=0.9):
    '''
    '''
    
    if occurrencies <= minimum_threshold:
        b_k = max(1, round(np.sqrt(occurrencies/2)))
        #b_k = occurrencies
        if plot:
            print('Better K is',b_k,'by definition')
            #plot_best_k(x,y,b_k)
    else:
        #warnings.simplefilter("ignore")
        spl = UnivariateSpline(x, y, k=3, s=0)
        spl.set_smoothing_factor(sf)
        #warnings.simplefilter("default")
        #spl = interp1d(x, y)
        points = reason_points*occurrencies
        xs = np.linspace(min(x), max(x), points)
        ys = spl(xs)
        idx_better_k = get_change_point(xs, ys)
        b_k = int(np.round(xs[idx_better_k]))
        if plot:
            plot_best_k(xs,ys,idx_better_k)
    return b_k

def bench_k_means(estimator, name, data,distance_function=None):

    
    t0 = time.time()
    if distance_function:
        estimator.fit(data,distance_function)
    else:
        estimator.fit(data)
    #cluster_labels = estimator.fit_predict(data)
    #silhouette_score_ = silhouette_score(data, cluster_labels)
    
    inertia=estimator.inertia_
    duration=time.time() - t0
    return {'inertia':inertia,'duration':duration, 'estimator':estimator}#,'silhouette':silhouette_score_}

def get_change_point(x, y):
    """
         Elección del mejor K
         :: param x: lista de valores de K
         :: param y: lista de valores de SSE
    """
    import math
    max_d = -float('infinity')
    index = 0

    for i in range(0, len(x)):
        c = closest_point_on_segment(a=[x[0], y[0]], b=[x[len(x)-1], y[len(y)-1]], p=[x[i], y[i]])
        d = math.sqrt((c[0]-x[i])**2 + (c[1]-y[i])**2)
        if d > max_d:
            max_d = d
            index = i
    
    return index

def closest_point_on_segment(a, b, p):
    sx1 = a[0]
    sx2 = b[0]
    sy1 = a[1]
    sy2 = b[1]
    px = p[0]
    py = p[1]

    x_delta = sx2 - sx1
    y_delta = sy2 - sy1

    if x_delta == 0 and y_delta == 0:
        return p

    u = ((px - sx1) * x_delta + (py - sy1) * y_delta) / (x_delta * x_delta + y_delta *  y_delta)
    if u < 0:
        closest_point = a
    elif u > 1:
        closest_point = b
    else:
        cp_x = sx1 + u * x_delta
        cp_y = sy1 + u * y_delta
        closest_point = [cp_x, cp_y]

    return closest_point

In [None]:
len(individual_cluster_df)

In [None]:
titlec = list(individual_cluster_df.columns)[2:]

In [None]:
to_cluster = individual_cluster_df[titlec].values

In [None]:
K=process_footprint_global(to_cluster, batch_size= 1000, top_limit = 300)


In [None]:
# Choose k
x=list(K.keys())
y=[K[k]['inertia'] for k in K]

In [None]:

with warnings.catch_warnings():
    best_k=compute_best_k(x,y,len(y),plot=True,reason_points=100)

In [None]:
# Clustering
if best_k==1:
    cluster_centers=[np.average(to_cluster,axis=0)]
    labels=[0]*len(to_cluster)  
else:
    cluster_centers=K[best_k]['estimator'].cluster_centers_
    labels=K[best_k]['estimator'].labels_