# Librerías

In [1]:
# Libraries and functions imported into the notebook to process the data.

import numpy as np
import openpyxl
import pandas as pd
import datetime
from math import floor
import os
import seaborn as sb
import matplotlib.pyplot as plt
import time
import re
import warnings
import ast

# Suppress warnings (e.g. notices raised by outdated libraries).
# NOTE(review): "ignore" with no category silences every warning, not only
# deprecation notices — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Show up to 90 columns when tables are displayed.
pd.options.display.max_columns = 90

# Paths

In [2]:
# All raw inputs live under one directory; change RAW_DATA_DIR to relocate them.
RAW_DATA_DIR = "../data/raw"

pathPrints = f"{RAW_DATA_DIR}/prints.json"  # prints events, read as JSON lines
pathTaps = f"{RAW_DATA_DIR}/taps.json"  # taps events, read as JSON lines
pathPayments = f"{RAW_DATA_DIR}/pays.csv"  # payments, read as CSV

# Carga fuentes de datos

## Prints

In [3]:
# Load the prints events; the file holds one JSON object per line.
df_prints = pd.read_json(path_or_buf=pathPrints, lines=True)

In [4]:
# Report the size of the prints dataset and preview its first rows.
# Message spelling/casing matches the Taps and Payments reports below.
print(f"Dimensiones del dataset Prints: {df_prints.shape}")
df_prints.head(5)

Dimensiones del datase prints: (508617, 3)


Unnamed: 0,day,event_data,user_id
0,2020-11-01,"{'position': 0, 'value_prop': 'cellphone_recha...",98702
1,2020-11-01,"{'position': 1, 'value_prop': 'prepaid'}",98702
2,2020-11-01,"{'position': 0, 'value_prop': 'prepaid'}",63252
3,2020-11-01,"{'position': 0, 'value_prop': 'cellphone_recha...",24728
4,2020-11-01,"{'position': 1, 'value_prop': 'link_cobro'}",24728


In [5]:
# Summarise columns, non-null counts and dtypes of the raw prints frame.
df_prints.info()

<class 'pandas.core.frame.DataFrame'>
Index: 508617 entries, 0 to 508616
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   day         508617 non-null  object
 1   event_data  508617 non-null  object
 2   user_id     508617 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 15.5+ MB


In [6]:
# Expand the event_data dicts into their own columns (position, value_prop).
# The expanded frame is given df_prints' own index so the join pairs rows
# one-to-one, and the join starts from the three raw columns so re-running this
# cell rebuilds the result instead of failing on overlapping column names.
prints_event = pd.json_normalize(df_prints["event_data"].tolist())
prints_event.index = df_prints.index
df_prints = df_prints[["day", "event_data", "user_id"]].join(prints_event)

In [7]:
# Preview the prints frame after event_data has been expanded into columns.
df_prints.head(5)

Unnamed: 0,day,event_data,user_id,position,value_prop
0,2020-11-01,"{'position': 0, 'value_prop': 'cellphone_recha...",98702,0,cellphone_recharge
1,2020-11-01,"{'position': 1, 'value_prop': 'prepaid'}",98702,1,prepaid
2,2020-11-01,"{'position': 0, 'value_prop': 'prepaid'}",63252,0,prepaid
3,2020-11-01,"{'position': 0, 'value_prop': 'cellphone_recha...",24728,0,cellphone_recharge
4,2020-11-01,"{'position': 1, 'value_prop': 'link_cobro'}",24728,1,link_cobro


## Taps

In [8]:
# Load the taps events; the file holds one JSON object per line.
df_taps = pd.read_json(path_or_buf=pathTaps, lines=True)

In [9]:
# Report the (rows, columns) of the taps dataset, then preview its first rows.
taps_shape = df_taps.shape
print(f"Dimensiones del dataset Taps: {taps_shape}")
df_taps.head(5)

Dimensiones del dataset Taps: (50859, 3)


Unnamed: 0,day,event_data,user_id
0,2020-11-01,"{'position': 0, 'value_prop': 'cellphone_recha...",98702
1,2020-11-01,"{'position': 2, 'value_prop': 'point'}",3708
2,2020-11-01,"{'position': 3, 'value_prop': 'send_money'}",3708
3,2020-11-01,"{'position': 0, 'value_prop': 'transport'}",93963
4,2020-11-01,"{'position': 1, 'value_prop': 'cellphone_recha...",93963


In [10]:
# Summarise columns, non-null counts and dtypes of the raw taps frame.
df_taps.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50859 entries, 0 to 50858
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   day         50859 non-null  object
 1   event_data  50859 non-null  object
 2   user_id     50859 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.6+ MB


In [11]:
# Expand the event_data dicts into their own columns (position, value_prop).
# The expanded frame is given df_taps' own index so the join pairs rows
# one-to-one, and the join starts from the three raw columns so re-running this
# cell rebuilds the result instead of failing on overlapping column names.
taps_event = pd.json_normalize(df_taps["event_data"].tolist())
taps_event.index = df_taps.index
df_taps = df_taps[["day", "event_data", "user_id"]].join(taps_event)

In [12]:
# Preview the taps frame after event_data has been expanded into columns.
df_taps.head(5)

Unnamed: 0,day,event_data,user_id,position,value_prop
0,2020-11-01,"{'position': 0, 'value_prop': 'cellphone_recha...",98702,0,cellphone_recharge
1,2020-11-01,"{'position': 2, 'value_prop': 'point'}",3708,2,point
2,2020-11-01,"{'position': 3, 'value_prop': 'send_money'}",3708,3,send_money
3,2020-11-01,"{'position': 0, 'value_prop': 'transport'}",93963,0,transport
4,2020-11-01,"{'position': 1, 'value_prop': 'cellphone_recha...",93963,1,cellphone_recharge


## Payments

In [13]:
# Load the payments table from its CSV file.
df_pays = pd.read_csv(filepath_or_buffer=pathPayments)

In [14]:
# Report the (rows, columns) of the payments dataset, then preview its first rows.
pays_shape = df_pays.shape
print(f"Dimensiones del dataset Payments: {pays_shape}")
df_pays.head(5)

Dimensiones del dataset Payments: (756483, 4)


Unnamed: 0,pay_date,total,user_id,value_prop
0,2020-11-01,7.04,35994,link_cobro
1,2020-11-01,37.36,79066,cellphone_recharge
2,2020-11-01,15.84,19321,cellphone_recharge
3,2020-11-01,26.26,19321,send_money
4,2020-11-01,35.35,38438,send_money


# Base 1: Modelo de clasificación

## Transformación Prints

In [15]:
# Number of prints per (user_id, value_prop) pair.
prints_grouped = df_prints.groupby(by=["user_id", "value_prop"]).agg(
    prints_count=("value_prop", "count")
)
# Turn the group keys back into regular columns.
prints_grouped = prints_grouped.reset_index()


In [16]:
# Preview the per-user, per-product print counts.
prints_grouped.head(5)

Unnamed: 0,user_id,value_prop,prints_count
0,1,cellphone_recharge,2
1,1,credits_consumer,2
2,1,link_cobro,4
3,1,point,2
4,1,prepaid,1


## Transformación Taps

In [17]:
# Number of taps per (user_id, value_prop) pair.
taps_grouped = df_taps.groupby(by=["user_id", "value_prop"]).agg(
    taps_count=("value_prop", "count")
)
# Turn the group keys back into regular columns.
taps_grouped = taps_grouped.reset_index()

In [18]:
# Preview the per-user, per-product tap counts.
taps_grouped.head(5)

Unnamed: 0,user_id,value_prop,taps_count
0,1,link_cobro,1
1,3,point,1
2,4,link_cobro,1
3,4,prepaid,1
4,7,send_money,1


## Transformación Payments

In [19]:
# Number of payments and summed amount per (user_id, value_prop) pair.
pays_grouped = df_pays.groupby(by=["user_id", "value_prop"]).agg(
    pays_count=("value_prop", "count"),
    pays_total=("total", "sum"),
)
# Turn the group keys back into regular columns.
pays_grouped = pays_grouped.reset_index()

In [20]:
# Preview the per-user, per-product payment counts and totals.
pays_grouped.head(5)

Unnamed: 0,user_id,value_prop,pays_count,pays_total
0,1,cellphone_recharge,1,15.47
1,1,credits_consumer,1,37.92
2,1,link_cobro,1,137.14
3,1,transport,1,100.89
4,2,credits_consumer,1,15.21


## Marca si estuvo en las primeras 2 posiciones de recomendación

In [21]:
# Count, per (user_id, value_prop), the prints shown in positions 0 or 1
# (the first two slots, as the section title describes).
top2_count = (
    df_prints.loc[df_prints["position"].le(1)]
    .groupby(["user_id", "value_prop"])
    .agg(top2_count=("value_prop", "count"))
    .reset_index()
)

In [22]:
# Preview the per-user, per-product counts of top-two-position prints.
top2_count.head(5)

Unnamed: 0,user_id,value_prop,top2_count
0,1,link_cobro,3
1,1,point,2
2,1,prepaid,1
3,1,send_money,2
4,1,transport,3


## Cantidad de productos por cliente

In [23]:
# Number of distinct products (value_prop) each user has paid for.
user_products = df_pays.groupby("user_id")["value_prop"].nunique()
user_products = user_products.rename("count_productos").reset_index()

In [24]:
# Preview the number of distinct paid products per user.
user_products.head(5)

Unnamed: 0,user_id,count_productos
0,1,4
1,2,6
2,3,5
3,4,4
4,5,5


## Estructura final para el modelo de clasificación

In [25]:
# Unique user ids seen in each source; drop_duplicates already returns a new
# frame, so each result is independent of its source table.
df_users_prints = df_prints[["user_id"]].drop_duplicates()

df_users_taps = df_taps[["user_id"]].drop_duplicates()

df_users_pays = df_pays[["user_id"]].drop_duplicates()

In [26]:
# Stack the per-source user lists and keep each user_id once.
df_users = pd.concat(
    [df_users_prints, df_users_taps, df_users_pays],
    ignore_index=True,
)
df_users = df_users.drop_duplicates()

In [27]:
# Report how many distinct users were found across the three sources.
print(f"Cantidad de usuarios {len(df_users)}")

Cantidad de usuarios 99943


In [28]:
# Unique products (value_prop) that appear in prints.
# NOTE(review): products present only in taps or payments are not included
# here — confirm that is intended.
recomend = df_prints[["value_prop"]].drop_duplicates()

In [29]:
# Display the full list of unique products.
recomend

Unnamed: 0,value_prop
0,cellphone_recharge
1,prepaid
4,link_cobro
5,credits_consumer
6,point
9,transport
15,send_money


In [30]:
# Build the user x product grid: one row for every (user_id, value_prop) pair.
df_mod1 = pd.merge(df_users, recomend, how="cross")

In [31]:
# Preview the user x product grid.
df_mod1.head(5)

Unnamed: 0,user_id,value_prop
0,98702,cellphone_recharge
1,98702,prepaid
2,98702,link_cobro
3,98702,credits_consumer
4,98702,point


In [32]:
# Attach the per-(user, product) aggregates from prints, taps, payments and
# top-two positions, then the per-user product count.
# The merge restarts from the two key columns so that re-running this cell
# rebuilds the table instead of accumulating suffixed duplicates
# (prints_count_x, prints_count_y, ...).
# validate="many_to_one" makes pandas raise if a feature table ever contains
# repeated keys, which would otherwise silently multiply rows.
pair_keys = ["user_id", "value_prop"]
df_mod1 = df_mod1[pair_keys]
for pair_features in (prints_grouped, taps_grouped, pays_grouped, top2_count):
    df_mod1 = df_mod1.merge(pair_features, on=pair_keys, how="left", validate="many_to_one")
df_mod1 = df_mod1.merge(user_products, on=["user_id"], how="left", validate="many_to_one")

In [33]:
# Pairs with no matching rows in a source come out of the left merges as NaN;
# treat them as zero and store every "count" column as an integer.
count_columns = [name for name in df_mod1.columns if "count" in name]
for count_column in count_columns:
    df_mod1[count_column] = df_mod1[count_column].fillna(0).astype(int)

In [36]:
# No payments for a pair means a total of 0.
df_mod1["pays_total"] = df_mod1["pays_total"].fillna(value=0)
# Target: 1 when the user paid for the product at least once, otherwise 0.
df_mod1["label"] = (df_mod1["pays_count"] > 0).astype(int)

In [37]:
# Preview the final classification table (features plus label).
df_mod1.head(5)

Unnamed: 0,user_id,value_prop,prints_count,taps_count,pays_count,pays_total,top2_count,count_productos,label
0,98702,cellphone_recharge,1,1,0,0.0,1,3,0
1,98702,prepaid,1,0,1,128.68,1,3,1
2,98702,link_cobro,0,0,1,123.78,0,3,1
3,98702,credits_consumer,0,0,0,0.0,0,3,0
4,98702,point,0,0,1,2.49,0,3,1
