In [1]:
import pandas as pd
import numpy as np
import datetime
import time
import math

from sklearn import metrics
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import MiniBatchKMeans
import pylab
from scipy.interpolate import interp1d
from scipy.interpolate import UnivariateSpline
    
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

import csv

from tqdm import tqdm, trange
import warnings

In [2]:
path_trx = 'trx2016_lima20.csv'
path_mcc = 'mcc.parquet'

In [3]:
def look_mccg(mcc_code):
    try:
        mccg_code = mcc_dic[mcc_code]
    except:
        mccg_code = 0
    return mccg_code

mcc_df = pd.read_parquet(path_mcc, engine='pyarrow')
mcc_df.set_index('id')
mcc_dic = mcc_df['mccg'].to_dict()

In [4]:
fields = ['client_id', 'date', 'mcc_new', 'amount_sol']
datos = pd.read_csv(path_trx, skipinitialspace=True, usecols=fields)

In [5]:
# datos = datos.head(1000)
datos = datos.to_numpy()

In [6]:
def time_window(hora):
    '''Hour of day to range'''
    if hora <6: return 0      # Madrugada
    if hora <12: return 1      # Mañana
    if hora <18: return 2      # Tarde
    return 3 # Noche

events_dict = {}
set_mcc = set()
set_turn = set()
set_weekday = set()
for tupla in tqdm(datos):
    trx_id = tupla[0]
    trx_mcc = look_mccg(tupla[2])
    trx_amount = tupla[3]
    trx_date = datetime.datetime.strptime(tupla[1]+':00', '%Y-%m-%d %H:%M:%S%z')
    trx_year = str(trx_date.year)
    trx_week = str(trx_date.isocalendar()[1])
    trx_weekday = trx_date.isocalendar()[2]-1
    trx_turn = time_window(trx_date.hour)
    set_mcc.add(trx_mcc)
    set_turn.add(trx_turn)
    set_weekday.add(trx_weekday)

    if trx_id not in events_dict:
        events_dict[trx_id] = {}
    if trx_year not in events_dict[trx_id]:
        events_dict[trx_id][trx_year] = {}
    if trx_week not in events_dict[trx_id][trx_year]:
        events_dict[trx_id][trx_year][trx_week] = {}
    if trx_mcc not in events_dict[trx_id][trx_year][trx_week]:
        events_dict[trx_id][trx_year][trx_week][trx_mcc] = {}
    if trx_turn not in events_dict[trx_id][trx_year][trx_week][trx_mcc]:
        events_dict[trx_id][trx_year][trx_week][trx_mcc][trx_turn] = np.array([0]*7)

    events_dict[trx_id][trx_year][trx_week][trx_mcc][trx_turn][trx_weekday]+=trx_amount
    # events_dict[trx_id][trx_year][trx_week][trx_mcc][trx_turn][trx_weekday]+=1
del datos

100%|██████████| 45319900/45319900 [45:15<00:00, 16688.94it/s] 


In [9]:
set_mcc = sorted(set_mcc)
set_weekday = sorted(set_weekday)
set_turn = sorted(set_turn)

In [10]:
title = ['client_id','year','week','profile_id','mccg','turn','size']
titlec = []
for mccg in set_mcc:
    for weekday in set_weekday:
        for turn in set_turn:
            titlec.append('m'+str(mccg)+'_d'+str(weekday)+'_t'+str(turn))
title = title + titlec

footprint_result = [title]
l_turn = len(set_turn)
l_weekday = len(set_weekday)
l_mcc = len(set_mcc)
vector_zeros_mcc = np.zeros(l_mcc*l_weekday*l_turn)
vector_zeros_turn = np.zeros(l_weekday*l_turn) 

for customer in tqdm(events_dict):
    profile_id=0
    for year in events_dict[customer]:
        for week in events_dict[customer][year]:
            temp = vector_zeros_mcc.copy() 
            for mcc in events_dict[customer][year][week]:                             # cargamos los mccgs
                pos_mcc = set_mcc.index(mcc)                                            # posicion del mccg
                temp2 = vector_zeros_turn.copy()                                        # creamos vector temporal turnos por dias con 0s
                for turn in events_dict[customer][year][week][mcc]:                     # cargamos turnos
                    pos_turn = set_turn.index(turn)                                       # posicion del turno
                    d = events_dict[customer][year][week][mcc][turn]                      # Dias como array
                    for k in range(pos_turn*l_weekday,(pos_turn+1)*l_weekday):            # cargamos dias
                        temp2[k] += d[k-(pos_turn*l_weekday)]                               # posicion de cada dia en el vector temporal
                for j in range(pos_mcc*l_weekday*l_turn,(pos_mcc+1)*l_weekday*l_turn):
                    temp[j] = temp2[j-(pos_mcc*l_weekday*l_turn)]                         # vector temporal es cargado en el tensor
            list_raw = [customer, year, week, profile_id, mcc, turn, sum(temp)] + list(temp)      # Escribimos los datos del primer comportamiento (Tensor)
            profile_id += 1                                                           # perfil cambia cada unidad de fecha diferente (eg. cada semana != es un perfil) de cada cliente
            footprint_result.append(list_raw)
del events_dict


100%|██████████| 482280/482280 [1:33:18<00:00, 86.14it/s]    


In [13]:
# Salvando Footprints
print('Numero de registros:',len(footprint_result)-1)
time.sleep(1)
with open("footprints.csv", 'w') as f:   
    writer = csv.writer(f, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL, lineterminator='\n')
    for item in tqdm(footprint_result):
        writer.writerow(item)

Numero de registros: 15823353


100%|██████████| 15823354/15823354 [06:10<00:00, 42753.07it/s]


In [37]:
# Opcional Read footprints
footprint_df=pd.read_csv('footprints.csv', sep=',')
len(footprint_df)
footprint_df.head()

Unnamed: 0,client_id,year,week,profile_id,mccg,turn,size,m0_d0_t0,m0_d0_t1,m0_d0_t2,...,m0_d4_t2,m0_d4_t3,m0_d5_t0,m0_d5_t1,m0_d5_t2,m0_d5_t3,m0_d6_t0,m0_d6_t1,m0_d6_t2,m0_d6_t3
0,XxgXt1gGsxk=,2017,5,0,0,3,111.0,0.0,0.0,0.0,...,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0
1,XxgXt1gGsxk=,2017,9,1,0,2,582.0,0.0,0.0,35.0,...,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,69.0
2,XxgXt1gGsxk=,2017,2,2,0,3,351.0,0.0,0.0,0.0,...,0.0,0.0,89.0,45.0,0.0,0.0,45.0,0.0,0.0,0.0
3,XxgXt1gGsxk=,2017,1,3,0,3,325.0,49.0,0.0,0.0,...,0.0,65.0,100.0,111.0,0.0,0.0,0.0,0.0,0.0,0.0
4,XxgXt1gGsxk=,2017,11,4,0,0,27.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
titlec = list(footprint_df.columns)[7:]

In [None]:
footprint_result = footprint_df.values
del footprint_df

footprint_dict = {}

for row in tqdm(footprint_result):
    client = row[0]
    if client not in footprint_dict:
        footprint_dict[client] = []
    footprint_dict[client].append(list(row[1:]))
    
del footprint_result

 86%|████████▌ | 13598847/15823353 [05:54<00:16, 138061.31it/s]

### Function to clustering

In [None]:
def process_footprint(data,log=False):
    # KMeans(init='k-means++', n_clusters=k, n_init=10)
    tests = np.arange(1,len(data)+1)
    K={}
    for k in tests:
        K[k]=bench_k_means(MiniBatchKMeans(init='k-means++', n_clusters=k, batch_size=30, #3000
                                           n_init=10, max_no_improvement=10, verbose=0,
                                           random_state=0),
                           name="k-means++", data=data)
    return K

def plot_best_k(x,y,b_k):
    pylab.plot(x,y)
    pylab.scatter(x[b_k],y[b_k],s=20, marker='o')
    pylab.text(x[b_k],y[b_k],"Better k = %s" %(b_k))
    pylab.show() 
    #return pylab

def compute_best_k(x, y, occurrencies, minimum_threshold = 5, plot=False, reason_points=10, sf=0.9):
    '''
    '''
    
    if occurrencies <= minimum_threshold:
        b_k = max(1, round(np.sqrt(occurrencies/2)))
        #b_k = occurrencies
        if plot:
            print('Better K is',b_k,'by definition')
            #plot_best_k(x,y,b_k)
    else:
        #warnings.simplefilter("ignore")
        spl = UnivariateSpline(x, y, k=3, s=0)
        spl.set_smoothing_factor(sf)
        #warnings.simplefilter("default")
        #spl = interp1d(x, y)
        points = reason_points*occurrencies
        xs = np.linspace(min(x), max(x), points)
        ys = spl(xs)
        idx_better_k = get_change_point(xs, ys)
        b_k = int(np.round(xs[idx_better_k]))
        if plot:
            plot_best_k(xs,ys,idx_better_k)
    return b_k

def bench_k_means(estimator, name, data,distance_function=None):

    
    t0 = time.time()
    if distance_function:
        estimator.fit(data,distance_function)
    else:
        estimator.fit(data)
    #cluster_labels = estimator.fit_predict(data)
    #silhouette_score_ = silhouette_score(data, cluster_labels)
    
    inertia=estimator.inertia_
    duration=time.time() - t0
    return {'inertia':inertia,'duration':duration, 'estimator':estimator}#,'silhouette':silhouette_score_}

def get_change_point(x, y):
    """
         Elección del mejor K
         :: param x: lista de valores de K
         :: param y: lista de valores de SSE
    """
    import math
    max_d = -float('infinity')
    index = 0

    for i in range(0, len(x)):
        c = closest_point_on_segment(a=[x[0], y[0]], b=[x[len(x)-1], y[len(y)-1]], p=[x[i], y[i]])
        d = math.sqrt((c[0]-x[i])**2 + (c[1]-y[i])**2)
        if d > max_d:
            max_d = d
            index = i
    
    return index

def closest_point_on_segment(a, b, p):
    sx1 = a[0]
    sx2 = b[0]
    sy1 = a[1]
    sy2 = b[1]
    px = p[0]
    py = p[1]

    x_delta = sx2 - sx1
    y_delta = sy2 - sy1

    if x_delta == 0 and y_delta == 0:
        return p

    u = ((px - sx1) * x_delta + (py - sy1) * y_delta) / (x_delta * x_delta + y_delta *  y_delta)
    if u < 0:
        closest_point = a
    elif u > 1:
        closest_point = b
    else:
        cp_x = sx1 + u * x_delta
        cp_y = sy1 + u * y_delta
        closest_point = [cp_x, cp_y]

    return closest_point

In [None]:
global MIN_TO_CLUTER
MIN_TO_CLUTER = 7

In [None]:

title = ['client_id','year','week','profile_id']
titles = (",".join(map(str, title + titlec)))+",tag\n"
individual_labels = "individual_footprint_tagged.csv"
fw1=open(individual_labels,'w')
fw1.write(titles)


title = ['client_id','tag']
titles = (",".join(map(str, title + titlec)))+"\n"
individual_clusters = "individual_footprint_clusters.csv"
fw2=open(individual_clusters,'w')
fw2.write(titles)


counter=0
# For each client
barra_iter = tqdm(footprint_dict.items())
for key, value in barra_iter:
    
    to_cluster = []
    to_tag = []
    for v in value:
        #v = v.tolist()
        to_cluster.append(v[6:])
        to_tag.append([key]+v[0:3])
        #to_tag.append(np.concatenate(([key], v[0:3])))

    if len(value) > MIN_TO_CLUTER:
        
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            K=process_footprint(to_cluster)
        
        # Choose k
        x=list(K.keys())
        y=[K[k]['inertia'] for k in K]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            best_k=compute_best_k(x,y,len(to_cluster),plot=False)

        # Clustering
        if best_k==1:
            cluster_centers=[np.average(to_cluster,axis=0)]
            labels=[0]*len(to_cluster)  
        else:
            cluster_centers=K[best_k]['estimator'].cluster_centers_
            labels=K[best_k]['estimator'].labels_
    else:
        cluster_centers=np.array(value)
        labels=np.array(range(0,len(value)))
    
    # Save labels
    for index, item in enumerate(to_cluster):
        string = (",".join(map(str, to_tag[index])))+","+(",".join(map(str, item)))+","+str(labels[index])+"\n"
        fw1.write(string)
    fw1.flush()
    
    # Save centroids
    for index, item in enumerate(cluster_centers):
        string = key+","+str(index)+","+(",".join(map(str, item)))+"\n"
        fw2.write(string)
    fw2.flush()

    barra_iter.set_description("Procesando cliente '%s' ==> " % key)
    
    counter+=1 #
    if counter==100:  break;  
    pass

fw1.close()
fw2.close()
#del footprint_dict

In [32]:

title = ['client_id','year','week','profile_id']
titles = (",".join(map(str, title + titlec)))+",tag\n"
individual_labels = "individual_footprint_tagged.csv"
fw1=open(individual_labels,'w')
fw1.write(titles)


title = ['client_id','tag']
titles = (",".join(map(str, title + titlec)))+"\n"
individual_clusters = "individual_footprint_clusters.csv"
fw2=open(individual_clusters,'w')
fw2.write(titles)


counter=0
# For each client
barra_iter = tqdm(footprint_dict.items())
for key, value in barra_iter:
    
    to_cluster = []
    to_tag = []
    for v in value:
        #v = v.tolist()
        to_cluster.append(v[6:])
        to_tag.append([key]+v[0:3])
        #to_tag.append(np.concatenate(([key], v[0:3])))

    if len(value) > MIN_TO_CLUTER:
        
        K=process_footprint(to_cluster)
        
        # Choose k
        x=list(K.keys())
        y=[K[k]['inertia'] for k in K]

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            if len(x)>=20:
                x = x[0:20]
                y = y[0:20]
            best_k=compute_best_k(x,y,len(y),plot=False)

        # Clustering
        if best_k==1:
            cluster_centers=[np.average(to_cluster,axis=0)]
            labels=[0]*len(to_cluster)  
        else:
            cluster_centers=K[best_k]['estimator'].cluster_centers_
            labels=K[best_k]['estimator'].labels_
    else:
        cluster_centers=np.array(value)
        labels=np.array(range(0,len(value)))
    
#     # Save labels
#     for index, item in enumerate(to_cluster):
#         string = (",".join(map(str, to_tag[index])))+","+(",".join(map(str, item)))+","+str(labels[index])+"\n"
#         fw1.write(string)
#     fw1.flush()
    
#     # Save centroids
#     for index, item in enumerate(cluster_centers):
#         string = key+","+str(index)+","+(",".join(map(str, item)))+"\n"
#         fw2.write(string)
#     fw2.flush()

    barra_iter.set_description("Procesando cliente '%s' ==> " % key)
    
    counter+=1 #
    if counter==100:  break;  
    pass

fw1.close()
fw2.close()
#del footprint_dict

Procesando cliente 'uY+MrfQgDFU=' ==> :   0%|          | 3/482280 [00:55<2491:18:37, 18.60s/it]


KeyboardInterrupt: 

In [8]:
from dask.distributed import Client

# client = Client(n_workers=8) # In this example I have 8 cores and processes (can also use threads if desired)

def my_function(i):
    key = i[0]
    value = i[1]
    
    to_cluster = []
    to_tag = []
    for v in value:
        to_cluster.append(v[6:])
        to_tag.append([key]+v[0:3])
    
    if len(value) > MIN_TO_CLUTER: 
        K=process_footprint(to_cluster)
        # Choose k
        x=list(K.keys())
        y=[K[k]['inertia'] for k in K]
        with warnings.catch_warnings():
#             warnings.simplefilter("ignore")
#             if len(x)>=20:
#                 x = x[0:20]
#                 y = y[0:20]
            best_k=compute_best_k(x,y,len(y),plot=False)
        # Clustering
        if best_k==1:
            cluster_centers=[np.average(to_cluster,axis=0)]
            labels=[0]*len(to_cluster)  
        else:
            cluster_centers=K[best_k]['estimator'].cluster_centers_
            labels=K[best_k]['estimator'].labels_
    else:
        cluster_centers=np.array(to_cluster)
        labels=np.array(range(0,len(to_cluster)))
    
    return [key,cluster_centers,labels]

# futures = []

# for key, value in tqdm(footprint_dict.items()):
#     i = [key, value]
#     future = client.submit(my_function, i)
#     futures.append(future)

# results = client.gather(futures)
# client.close()

In [39]:
title = ['client_id','year','week','profile_id']
titles = (",".join(map(str, title + titlec)))+",tag\n"
individual_labels = "individual_footprint_taggedxd.csv"
fw1=open(individual_labels,'w')
fw1.write(titles)

title = ['client_id','tag']
titles = (",".join(map(str, title + titlec)))+"\n"
individual_clusters = "individual_footprint_clustersxd.csv"
fw2=open(individual_clusters,'w')
fw2.write(titles)


266

In [40]:
DELTA = 900
#test_list = list(footprint_dict.keys())

In [41]:
final_list = np.array_split(test_list,DELTA)[270:280]

In [42]:
len(final_list[0])

536

In [43]:
client = Client(n_workers=10)

In [44]:
counter = 270
for sublist in final_list:
    
    print('Iteracion',counter)
    
    futures = []
    for key in sublist:
        value = footprint_dict[key]
        i = [key, value]
        future = client.submit(my_function, i)
        futures.append(future)
        
    results = client.gather(futures)
    
    for row in results:
        key = row[0]
        value = footprint_dict[key]
        cluster_centers = row[1]
        labels = row[2]

        to_cluster = []
        to_tag = []
        for v in value:
            to_cluster.append(v[6:])
            to_tag.append([key]+v[0:3])

        # Save labels
        for index, item in enumerate(to_cluster):
            string = (",".join(map(str, to_tag[index])))+","+(",".join(map(str, item)))+","+str(labels[index])+"\n"
            fw1.write(string)
        fw1.flush()

        # Save centroids
        for index, item in enumerate(cluster_centers):
            string = key+","+str(index)+","+(",".join(map(str, item)))+"\n"
            fw2.write(string)
        fw2.flush()    
    
    time.sleep(2)
    counter+=1

Iteracion 270
Iteracion 271
Iteracion 272
Iteracion 273
Iteracion 274
Iteracion 275
Iteracion 276
Iteracion 277
Iteracion 278
Iteracion 279


In [45]:
client.close()
fw1.close()
fw2.close()



In [24]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:61487  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [77]:
final_list = np.array_split(test_list,DELTA)[280:850]

In [78]:
key_list =[]
for sub_list in final_list:
    key_list = [*key_list, *sub_list]

len(key_list)

305450

In [93]:
#individual_footprint_tagged_df = pd.read_csv('procesado/individual_footprint_clusters2.csv', sep=',')
individual_footprint_clusters_df = pd.read_csv('procesado/individual_footprint_clusters2.csv', sep=',')

ParserError: Error tokenizing data. C error: Expected 30 fields in line 3172, saw 36


In [80]:
len(individual_footprint_tagged_df)

9664305

In [83]:
len(np.unique(individual_footprint_tagged_df['client_id']))

332200

In [81]:
individual_footprint_tagged_df.head()

Unnamed: 0,client_id,year,week,profile_id,m0_d0_t0,m0_d0_t1,m0_d0_t2,m0_d0_t3,m0_d1_t0,m0_d1_t1,...,m0_d4_t3,m0_d5_t0,m0_d5_t1,m0_d5_t2,m0_d5_t3,m0_d6_t0,m0_d6_t1,m0_d6_t2,m0_d6_t3,tag
0,D54t9eGiLsA=,2016,39,0,0.0,0.0,0.0,0.0,0.0,0.0,...,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
1,D54t9eGiLsA=,2016,35,1,0.0,0.0,0.0,0.0,0.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,0.0,3
2,D54t9eGiLsA=,2016,34,2,0.0,0.0,0.0,0.0,163.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,176.0,0.0,5
3,D54t9eGiLsA=,2016,30,3,249.0,0.0,0.0,0.0,347.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,D54t9eGiLsA=,2016,22,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,0


In [84]:
filtrado_df= individual_footprint_tagged_df[individual_footprint_tagged_df.client_id.isin(key_list)]
filtrado_df.head()

Unnamed: 0,client_id,year,week,profile_id,m0_d0_t0,m0_d0_t1,m0_d0_t2,m0_d0_t3,m0_d1_t0,m0_d1_t1,...,m0_d4_t3,m0_d5_t0,m0_d5_t1,m0_d5_t2,m0_d5_t3,m0_d6_t0,m0_d6_t1,m0_d6_t2,m0_d6_t3,tag
0,D54t9eGiLsA=,2016,39,0,0.0,0.0,0.0,0.0,0.0,0.0,...,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
1,D54t9eGiLsA=,2016,35,1,0.0,0.0,0.0,0.0,0.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,0.0,3
2,D54t9eGiLsA=,2016,34,2,0.0,0.0,0.0,0.0,163.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,176.0,0.0,5
3,D54t9eGiLsA=,2016,30,3,249.0,0.0,0.0,0.0,347.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,D54t9eGiLsA=,2016,22,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,0


In [85]:
len(filtrado_df)

9086979

In [86]:
len(np.unique(filtrado_df['client_id']))

305450

In [65]:
individual_footprint_tagged_df1_5 = pd.read_csv('procesado/individual_footprint_tagged1_5.csv', sep=',')
len(individual_footprint_tagged_df1_5)

202203

In [66]:
5956845+202203

6159048

In [67]:
filtrado_df2 = filtrado_df.append(individual_footprint_tagged_df1_5)

In [68]:
len(filtrado_df2)

6159048

In [69]:
filtrado_df2.head()

Unnamed: 0,client_id,year,week,profile_id,m0_d0_t0,m0_d0_t1,m0_d0_t2,m0_d0_t3,m0_d1_t0,m0_d1_t1,...,m0_d4_t3,m0_d5_t0,m0_d5_t1,m0_d5_t2,m0_d5_t3,m0_d6_t0,m0_d6_t1,m0_d6_t2,m0_d6_t3,tag
0,XxgXt1gGsxk=,2017,5,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,100.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,1
1,XxgXt1gGsxk=,2017,9,1,0.0,0.0,35.0,0.0,180.0,0.0,...,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,69.0,3
2,XxgXt1gGsxk=,2017,2,2,0.0,0.0,0.0,72.0,100.0,0.0,...,0.0,89.0,45.0,0.0,0.0,45.0,0.0,0.0,0.0,1
3,XxgXt1gGsxk=,2017,1,3,49.0,0.0,0.0,0.0,0.0,0.0,...,65.0,100.0,111.0,0.0,0.0,0.0,0.0,0.0,0.0,6
4,XxgXt1gGsxk=,2017,11,4,0.0,0.0,0.0,27.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [87]:
filtrado_df.to_csv("individual_footprint_tagged_2.csv", encoding="utf-8",index=False) 

In [88]:
filtrado_df_read = pd.read_csv('individual_footprint_tagged_2.csv', sep=',')
len(filtrado_df_read)

9086979

In [90]:
filtrado_df_read.head()

Unnamed: 0,client_id,year,week,profile_id,m0_d0_t0,m0_d0_t1,m0_d0_t2,m0_d0_t3,m0_d1_t0,m0_d1_t1,...,m0_d4_t3,m0_d5_t0,m0_d5_t1,m0_d5_t2,m0_d5_t3,m0_d6_t0,m0_d6_t1,m0_d6_t2,m0_d6_t3,tag
0,D54t9eGiLsA=,2016,39,0,0.0,0.0,0.0,0.0,0.0,0.0,...,156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
1,D54t9eGiLsA=,2016,35,1,0.0,0.0,0.0,0.0,0.0,129.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,0.0,3
2,D54t9eGiLsA=,2016,34,2,0.0,0.0,0.0,0.0,163.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,176.0,0.0,5
3,D54t9eGiLsA=,2016,30,3,249.0,0.0,0.0,0.0,347.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,D54t9eGiLsA=,2016,22,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,0


In [89]:
len(np.unique(filtrado_df_read['client_id']))

305450

### correcion centers

In [28]:
individual_footprint_clusters_df = pd.read_csv('procesado/individual_footprint_clusters.csv', sep=',')
len(individual_footprint_clusters_df)

2572929

In [36]:
len(abc)

2572929

In [30]:
individual_footprint_clusters_df['flag'] = np.where(individual_footprint_clusters_df.a.notnull(), 'complete', 'incomplete')
individual_footprint_clusters_df.head()

Unnamed: 0,client_id,tag,m0_d0_t0,m0_d0_t1,m0_d0_t2,m0_d0_t3,m0_d1_t0,m0_d1_t1,m0_d1_t2,m0_d1_t3,...,m0_d6_t1,m0_d6_t2,m0_d6_t3,a,b,c,d,e,f,flag
0,XxgXt1gGsxk=,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,191.0,0.0,,,,,,,incomplete
1,XxgXt1gGsxk=,1,0.0,0.0,0.0,16.0,31.666667,0.0,0.0,0.0,...,8.111111,2.269841,0.0,,,,,,,incomplete
2,XxgXt1gGsxk=,2,0.0,1.617021,9.978723,12.654255,5.106383,3.510638,13.404255,0.0,...,0.0,6.888298,12.505319,,,,,,,incomplete
3,XxgXt1gGsxk=,3,0.0,0.0,35.0,0.0,180.0,0.0,0.0,0.0,...,0.0,0.0,69.0,,,,,,,incomplete
4,XxgXt1gGsxk=,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,100.0,0.0,,,,,,,incomplete


In [31]:
individual_footprint_clusters_df[individual_footprint_clusters_df['flag']=='complete']

Unnamed: 0,client_id,tag,m0_d0_t0,m0_d0_t1,m0_d0_t2,m0_d0_t3,m0_d1_t0,m0_d1_t1,m0_d1_t2,m0_d1_t3,...,m0_d6_t1,m0_d6_t2,m0_d6_t3,a,b,c,d,e,f,flag
14309,DBpkx3TN1qw=,0,2017.0,15.0,0.0,0.0,2.0,165.0,8.0,0.0,...,0.0,0.0,29.0,0.0,29.0,0.0,0.0,0.0,0.0,complete
14310,DBpkx3TN1qw=,1,2017.0,14.0,1.0,0.0,0.0,374.0,0.0,0.0,...,0.0,0.0,0.0,53.0,16.0,29.0,0.0,71.0,0.0,complete
14311,DBpkx3TN1qw=,2,2017.0,13.0,2.0,0.0,3.0,295.0,0.0,0.0,...,0.0,201.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,complete
22562,nwtEtzbZ4YY=,0,2016.0,35.0,0.0,0.0,1.0,4694.0,0.0,0.0,...,0.0,0.0,242.0,0.0,0.0,0.0,0.0,109.0,0.0,complete
22563,nwtEtzbZ4YY=,1,2016.0,34.0,1.0,0.0,3.0,5296.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5296.0,0.0,complete
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2566364,Q2O23OWVN/I=,2,2016.0,46.0,2.0,0.0,0.0,296.0,0.0,0.0,...,0.0,0.0,16.0,0.0,28.0,43.0,35.0,0.0,33.0,complete
2566365,Q2O23OWVN/I=,3,2016.0,52.0,3.0,0.0,0.0,49.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,complete
2566366,Q2O23OWVN/I=,4,2016.0,45.0,4.0,0.0,0.0,46.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,complete
2566367,Q2O23OWVN/I=,5,2016.0,44.0,5.0,0.0,0.0,106.0,0.0,13.0,...,0.0,0.0,42.0,0.0,32.0,0.0,0.0,8.0,0.0,complete


In [32]:
for index, row in tqdm(individual_footprint_clusters_df.iterrows()):
    if row['flag'] == 'complete':
        ll = list(row[0:3])+list(row[9:-1])+[0,0,0,0,0,0,'xd']
        individual_footprint_clusters_df.iloc[index] = ll

2572929it [53:15, 805.14it/s] 


In [33]:
individual_footprint_clusters_df[individual_footprint_clusters_df['flag']=='xd']

Unnamed: 0,client_id,tag,m0_d0_t0,m0_d0_t1,m0_d0_t2,m0_d0_t3,m0_d1_t0,m0_d1_t1,m0_d1_t2,m0_d1_t3,...,m0_d6_t1,m0_d6_t2,m0_d6_t3,a,b,c,d,e,f,flag
14309,DBpkx3TN1qw=,0,2017.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,xd
14310,DBpkx3TN1qw=,1,2017.0,0.0,7.0,100.0,0.0,8.0,90.0,0.0,...,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,xd
14311,DBpkx3TN1qw=,2,2017.0,0.0,0.0,0.0,0.0,0.0,79.0,0.0,...,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,xd
22562,nwtEtzbZ4YY=,0,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,109.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,xd
22563,nwtEtzbZ4YY=,1,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5296.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,xd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2566364,Q2O23OWVN/I=,2,2016.0,0.0,0.0,0.0,70.0,13.0,0.0,0.0,...,35.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,0.0,xd
2566365,Q2O23OWVN/I=,3,2016.0,0.0,33.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,xd
2566366,Q2O23OWVN/I=,4,2016.0,0.0,46.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,xd
2566367,Q2O23OWVN/I=,5,2016.0,13.0,0.0,0.0,0.0,11.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,xd


In [34]:
abc = individual_footprint_clusters_df.iloc[:, 0:30]
abc.head()

Unnamed: 0,client_id,tag,m0_d0_t0,m0_d0_t1,m0_d0_t2,m0_d0_t3,m0_d1_t0,m0_d1_t1,m0_d1_t2,m0_d1_t3,...,m0_d4_t2,m0_d4_t3,m0_d5_t0,m0_d5_t1,m0_d5_t2,m0_d5_t3,m0_d6_t0,m0_d6_t1,m0_d6_t2,m0_d6_t3
0,XxgXt1gGsxk=,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,191.0,0.0
1,XxgXt1gGsxk=,1,0.0,0.0,0.0,16.0,31.666667,0.0,0.0,0.0,...,0.0,43.380952,88.777778,15.730159,0.0,0.0,17.015873,8.111111,2.269841,0.0
2,XxgXt1gGsxk=,2,0.0,1.617021,9.978723,12.654255,5.106383,3.510638,13.404255,0.0,...,0.0,0.0,1.148936,7.079787,4.228723,5.984043,19.946809,0.0,6.888298,12.505319
3,XxgXt1gGsxk=,3,0.0,0.0,35.0,0.0,180.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,69.0
4,XxgXt1gGsxk=,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,61.0,0.0,0.0,0.0,100.0,0.0


In [35]:
individual_footprint_clusters_df.to_csv("clusters/individual_footprint_clusters1_xd.csv", encoding="utf-8",index=False) 