# FOOTPRINTS

In [1]:
import numpy as np
import datetime
from datetime import date
import json
import pylab
import pandas as pd
import matplotlib.pyplot as plt
import os, sys

## Preparacion de datos

### Cargando datos

In [2]:

def leer_data():
    outfile='./SOURCES/data.csv'
    data = pd.read_csv(outfile)
    return data

data_original = leer_data()
data_original.head(3)

Unnamed: 0,client_id,date,año,mes,dia,hora,merchant_departement,merchant_province,merchant_district,mcc,mccg,client_age,quantity,amount_sol,dia_semana,turno
0,NNeQwQy9MAQ=,2016-07-15 22:23:25,2016,7,15,22,LIMA,LIMA,San Juan De Luriganc,7994,2,33.0,1,100.0,Fri,4-Noche
1,qFRoPHPOc/I=,2016-06-21 21:30:55,2016,6,21,21,LIMA,LIMA,San Juan De Luriganc,7994,2,52.0,1,20.0,Tues,4-Noche
2,qFRoPHPOc/I=,2017-02-01 01:29:59,2017,2,1,1,LIMA,LIMA,San Juan De Luriganc,7994,2,52.0,1,100.0,Weds,1-Madrugada


### Preparacion de datos

In [3]:
data = data_original[['client_id','date','año','mes','dia','hora','mccg','quantity','amount_sol']]
data.head(2)

Unnamed: 0,client_id,date,año,mes,dia,hora,mccg,quantity,amount_sol
0,NNeQwQy9MAQ=,2016-07-15 22:23:25,2016,7,15,22,2,1,100.0
1,qFRoPHPOc/I=,2016-06-21 21:30:55,2016,6,21,21,2,1,20.0


## Definicion de variables

### mccgs

In [4]:
mccgs = ((data.groupby(['mccg'], as_index=False, sort=True)).count())['mccg']
mccgs,mccgs[3],len(mccgs)

(0      2
 1      3
 2      4
 3      5
 4      6
 5      8
 6      9
 7     10
 8     11
 9     12
 10    13
 11    14
 12    15
 13    16
 14    17
 15    18
 16    19
 17    21
 18    22
 19    23
 20    25
 21    26
 22    27
 23    28
 Name: mccg, dtype: int64, 5, 24)

### Clientes

In [5]:
clientes =  data.groupby('client_id').client_id.count().index
clientes

Index(['+++g8j9k+5A=', '++/oQ9Lb9dI=', '++14g8obpj0=', '++1XMtcwMec=',
       '++3gxZFOJCM=', '++438ugzEhg=', '++5u+heOZ8o=', '++70ByX0a3Q=',
       '++7i5fi6kBU=', '++834mPfd7g=',
       ...
       'zzvndpR1ntg=', 'zzvt24FsKYk=', 'zzw6JFGkUrQ=', 'zzwY0tP4zKM=',
       'zzxOoziChuk=', 'zzxTJkA/ah0=', 'zzxjK1vzG4E=', 'zzy5LVrTPmc=',
       'zzyhW1+NTLM=', 'zzzBu/yw4wE='],
      dtype='object', name='client_id', length=227663)

# FOOTPRINT PARA CADA MCCG

## Unidad de TXs temporales (U)

### Funciones

In [6]:
# definimos los 4 time_windows que usaremos

def time_window(hora):
    tw = 9999
    if hora >=0:
        tw = 0      # Madrugada
    if hora >=6:
        tw = 1      # Mañana
    if hora >=12:
        tw = 2      # Tarde
    if hora >=18:
        tw = 3      # Noche
    return tw

In [7]:
# Definimos los U 

def procesar_u(user):    
    uid=list(user['client_id'])[0]
    # Lista los años en que tiene txs el usuario
    years = set(list(user['año']))
    anni = {year:{} for year in list(years)}
    
    # para cada fila (para cada fecha)
    for dat in  range(0,len(user)):
        año = user.iloc[dat]['año']
        week=datetime.datetime(año,user.iloc[dat]['mes'],user.iloc[dat]['dia']).isocalendar()[1]
        weekday=datetime.datetime(año,user.iloc[dat]['mes'],user.iloc[dat]['dia']).weekday()
        turn = time_window(user.iloc[dat]['hora'])
        mccg = user.iloc[dat]['mccg']
        
        # Si la semana no existe en el año
        if not(week in anni[año]):
            anni[año][week] = {}
        # Si el mccg no existe en la semana y año
        if not (mccg in anni[año][week]):
            anni[año][week][mccg]={}  #NUMERO DE MCCGs VARIABLES
        # Si el turno no existe en el mccg,semana y año
        if not (turn in anni[año][week][mccg]):
            anni[año][week][mccg][turn]=np.array([0]*7)  #CUATRO TURNOS
            
        anni[año][week][mccg][turn][weekday]+=user.iloc[dat]['quantity'] # suma cantidades "importancia por compras"
        #anni[año][week][turn][weekday]+=user.iloc[dat]['amount_sol'] # suma montos "importancia por gastos"
            
    return uid,anni

### Procesando U

In [8]:

file='./RESULTS/U'
    
##################################################
#        Procesando U de cada CLIENTE
##################################################
    
# Extraemos la lista de clientes sin repetir

profiles={}
contador=0 
print("Number of rows "+str(len(data)))
# Para cada cliente
for cliente in clientes:
    cliente_i= data[data['client_id'] == cliente]
    ## ejecutamos para cada usuario
    results=procesar_u(cliente_i)
    profiles[results[0]]=results[1]
    
    #if contador%5000==0:
    #    print(contador)
    contador += 1


Number of rows 708962


In [9]:
title = 'customer_id,year,week,profile_id,mccg,turn,size'
for i in range(len(mccgs)):
    for j in range(4):            # numero de turnos
        for k in range(7):            # numero de dias
            title = title+','+'m'+str(mccgs[i])+'t'+str(j)+'d'+str(k)
title = title+'\n'

In [10]:
individual_footprint="%s.footprint" %(file)
fw=open(individual_footprint,'w')

fw.write(title)
footprints=0 

for uid in profiles:
    profile_id=0
    for year in profiles[uid]:                                    #######################
        for week in profiles[uid][year]:                          #######################
            temp=np.zeros(4*7*len(mccgs))
            for mccg in profiles[uid][year][week]:                #######################
                pos_mccg = list(mccgs).index(mccg) # posicion del mccg
                temp2 = np.zeros(4*7)
                for turn in profiles[uid][year][week][mccg]:      #######################
                    pos_turn = turn # posicion del turno
                    d=profiles[uid][year][week][mccg][turn]       #######################        
                    
                    for i in range(pos_turn*7,(pos_turn+1)*7):
                        temp2[i] += d[i-(pos_turn*7)]
                
                for i in range(pos_mccg*28,(pos_mccg+1)*28):
                    temp[i] = temp2[i-(pos_mccg*28)]
                
                txt = ''+str(uid)+','+str(year)+','+str(week)+','+str(profile_id)+','+str(mccg)+','+str(turn)+','+str(sum(temp))
                for i in range(len(temp)):
                    txt = txt +','+str(temp[i])
                
                fw.write(txt +'\n')

            profile_id = profile_id + 1
    footprints+=profile_id
    fw.flush()
fw.close()
print ("number of footprint: "+str(footprints))

number of footprint: 561753
