## Objetivo


+ Este notebook tem como objetivo realizar uma Análise Exploratória dos Dados (EAD) de bases do Ifood. 
+ O objetivo é responder:
    + `O que fizeram os clientes darem Churn?`
    + `Qual a importância dos eventos (push, compra, acesso, entre outros) ao longo da vida do cliente?`

+ As bases são:
    + `ORDERS - Informações sobre os pedidos realizados.`
    + `MARKETING PUSH FULL - Notificações PUSH ao longo de 6 meses (Junho-Dezembro/2019).`
    + `CUSTOMER SEGMENTATION - Segmentação do cliente.`
    + `ORDERS WITH COST REVENUE - Informações sobre o pedido relacionado, onde verifica-se se o mesmo gerou custo ou receita.`
    + `SESSION VISITS - Comportamento de uso do app.`

    
Criado por Jaime Mishima e Ariel Vicente </br>

## Imports

In [3]:
from pyspark.sql.functions import udf, count, when, isnull, col, mean, sum, max, min, stddev, count, trim, lower, split, explode
from pyspark.sql.functions import *
from pyspark.mllib.stat import Statistics

# tratamento de datas
from pyspark.sql.functions import datediff, to_date, to_timestamp, from_utc_timestamp, round, dayofweek, month

# para o groupby e lag column
import pyspark.sql.functions as f
from pyspark.sql.window import Window
from pyspark.sql import SQLContext
from pyspark.sql.functions import lit

# para a remoção de missing:
from functools import reduce

# para ajuste de type de arrays
from pyspark.sql.types import ArrayType, StringType

# para correlacao
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

## Help Functions

In [5]:
from pyspark.sql.functions import count, when, isnull, col
def contar_missing(df):
  """Realiza a contagem da quantidade de missing que existe dentro de um dataframe.
  Args:
    df - Dataframe Spark
  Returns:
    Data Frame spark com apenas 1 linha com a contagem de missing para cada variável.
  """
  aux = []
  for c in df.columns:
    aux.append(count(when(isnull(c), c)).alias(c))
  return df.select(aux)

def percent_missing(df):
  total_linhas = df.count()
  df = contar_missing(df)
  colunas = df.columns
  total_missing = list(df.first().asDict().values())
  valores = zip(colunas, total_missing)
  
  df_aux = spark.createDataFrame(valores, ['variaveis', 'total_missing'])
  df_aux = df_aux.withColumn('perc_missing', col('total_missing') *100/total_linhas)
  return df_aux

In [6]:
def percentByCol(df, group_column):
  """Retorna o groupby de uma coluna `col` de um dataframe `df`
  Args:
    df - Dataframe Spark
    group_column - Nome da coluna do dataframe df
  Returns:
    Data Frame spark com o groupby da coluna `col` e uma coluna com o percentual
  """
  return df.groupby(group_column)\
           .count()\
           .withColumnRenamed('count', 'cnt_per_group')\
           .withColumn('percent', f.col('cnt_per_group')*100/f.sum('cnt_per_group').over(Window.partitionBy()))\
           .orderBy('percent', ascending=False)

In [7]:
# Disclaimer: baseado em https://stackoverflow.com/questions/46944493/removing-duplicate-columns-after-a-df-join-in-spark
def join_removing_repeated(df1, df2, cond, how='left'):
    """Retorna o dataframe resultado do join de `df1` e `df2`
    Args:
      df1 - dataframe 1
      df2 - dataframe 2
      cond - chaves para realizar o join
      how - tipo de join (default left)
    Returns:
      Data Frame resultado do join removendo as colunas repetidas
    """
    df = df1.join(df2, cond, how=how)
    repeated_columns = [c for c in df1.columns if c in df2.columns]
    for col in repeated_columns:
        df = df.drop(df2[col])
    return df

In [8]:
def limpeza(x):
  """Retorna a string x após eliminação de caracteres indesejados
    Args:
      x - string
    Returns:
      String tratada
    """
  return x.replace('"', '').replace('\\', '').replace('[', '').replace(']', '')

udf_limpeza = udf(limpeza, StringType()) # create an udf based on limpeza function

In [9]:
from pyspark.sql.functions import udf
push_range = udf(lambda pushes: '1- < 20' if pushes < 20 else 
                                 '2- 20-60' if (pushes >= 20 and pushes < 60) else
                                 '3- 60-100' if (pushes >= 60 and pushes < 100) else
                                 '4- 100-140' if (pushes >= 100 and pushes < 140) else
                                 '5- 140-180' if (pushes >= 140 and pushes < 180) else
                                 '6- 180-220' if (pushes >= 180 and pushes < 220) else
                                 '7- 220-260' if (pushes >= 220 and pushes < 260) else
                                 '8- 260-300' if (pushes >= 260 and pushes < 300) else
                                 '9- 300+'  if (pushes >= 300) else '')

In [10]:
from pyspark.sql.functions import udf
order_range = udf(lambda orders: '1- < 5' if orders < 5 else 
                                 '2- 5-10' if (orders >= 5 and orders < 10) else
                                 '3- 0-15' if (orders >= 10 and orders < 15) else
                                 '4- 15-20' if (orders >= 15 and orders < 20) else
                                 '5- 20-25' if (orders >= 20 and orders < 25) else
                                 '6- 25-30' if (orders >= 25 and orders < 30) else
                                 '7- 30-35' if (orders >= 30 and orders < 35) else
                                 '8- 35-40' if (orders >= 35 and orders < 40) else
                                 '9- 40+'  if (orders >= 40) else '')

In [11]:
import seaborn as sns
def plot_corr_matrix(correlations,attr,fig_no, figsize=[15,10]):
  """Retorna o heatmap de uma lista de listas com as correlacoes de variaveis
  Args:
    correlations - lista de lista com as correlacoes
    attr - lista com os nomes das variaveis
    fig_no - If not provided, a new figure will be created, and the figure number will be incremented (para o plt.figure)
    figsize 0- tamanho do heatmap
  Returns:
    Heatmap com a matriz de correlacao
    """
  fig=plt.figure(fig_no, figsize=figsize)
  ax=fig.add_subplot(111)
  ax.set_title("Correlacao Variaveis")
  ax = sns.heatmap(correlations, cmap="YlGnBu")
  indice = list(range(1, len(attr)+1))
  indice = [str(s) + ' - ' for s in indice]
  res = [i + j for i, j in zip(indice, attr)] 
  #ax.set_xticks(range(len(filter_colunas_order)))
  ax.set_yticklabels(res)
  plt.yticks(rotation=0) 
  plt.show()

## Leitura das Bases

In [13]:
root_dir = '/dbfs/FileStore/ifood'
dbutils.fs.ls(f'{root_dir}')
df_customer_segmentation = spark.read.parquet(f'{root_dir}/customer_segmentation')
df_orders = spark.read.parquet(f'{root_dir}/orders')
df_orders_with_cost_revenue = spark.read.parquet(f'{root_dir}/orders_with_cost_revenue')
df_sessions_visits = spark.read.parquet(f'{root_dir}/sessions_visits')
df_marketing_push_full = spark.read.parquet(f'{root_dir}/marketing_push_full')

print('Customer segmentation:     ', df_customer_segmentation.count(), '   linhas e ', len(df_customer_segmentation.columns), ' colunas')
print('Orders:                    ', df_orders.count(), '   linhas e ', len(df_orders.columns), ' colunas')
print('Orders with cost revenue:  ', df_orders_with_cost_revenue.count(), '   linhas e ', len(df_orders_with_cost_revenue.columns), ' colunas')
print('Visits:                    ', df_sessions_visits.count(), '   linhas e ', len(df_sessions_visits.columns), ' colunas')
print('Pushes:                    ', df_marketing_push_full.count(), ' linhas e ', len(df_marketing_push_full.columns), ' colunas')

## Análise Exploratória das Bases (EAD)

### 1. Pushes

#### Leitura e caching

In [17]:
display(df_marketing_push_full)

event_channel,event_name,brand,sample_type,user_id,external_user_id,event_time_utc3,platform,campaign_id,campaign_name,message_variation_channel,canvas_name,canvas_step_id,canvas_is_first_step,canvas_first_step_name,canvas_step_name,canvas_real_step_name,canvas_step_index,canvas_tags,send_id,event_date
push,received,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-08-29T18:31:10.000Z,android,23e2e9bf-61ae-48aa-8e24-85d8f2348b21,2019-08-29 / ANDROID / 20190829MIDIAKITKAS,android_push,,,,,,,,,,2019-08-29T00:00:00.000Z
push,send,iFood Brasil,sample,5ac3121c8e5b4b5696ac49dc,290070089ba4bc832a39ffdb7f3bc7e70cb4e7746bc51163946e0045d548277e,2019-11-13T12:38:00.000Z,ios,,,,,,0.0,,,,,,,2019-11-13T00:00:00.000Z
push,send,iFood Brasil,sample,5ac2c99c059e9ee7e5eed43b,0753a6e30f7b58265e00becd06423d17181aa22d43f51ec34ebe9a2af0be3ab3,2019-10-10T11:31:02.000Z,ios,c3858a3b-1eea-4ab8-99ca-5b0166a9fb14,2019-10-10 / IOS / Almoço / Cidades DFN Geral,ios_push,,,0.0,,,,,,,2019-10-10T00:00:00.000Z
push,received,iFood Brasil,sample,5ac36acd409f7734a97df5b9,8468bb60ea7888a33a3506e4f3d2b541afca6651bde242ed49a75da8cce67a68,2019-10-27T21:05:31.000Z,android,e5b7abaf-eafa-4a2e-bde5-62965fefa0f5,2019-10-27 / ANDROID / Ativos / Ceia - KA / CUPOM,android_push,,,,,,,,,,2019-10-28T00:00:00.000Z
push,send,iFood Brasil,sample,5ac3654ad75e5a3c29fb256f,6cbda064947aec380ac7c459de0902fc0182b23dcbaa59ee7317705927f52eee,2019-10-28T11:30:49.000Z,android,2398c143-4f12-4541-afd9-06bb94cb7b8a,2019-10-28 / ANDROID / Ativos / Almoço - Light / CUPOM,android_push,,,0.0,,,,,,,2019-10-28T00:00:00.000Z
push,send,iFood Brasil,sample,5ac311e58e5b4b411df4c647,28b2fe193b22c7a4026afb94d4bd3104a296606b51ea107da1ad4a22611d2f2b,2019-10-25T12:12:49.000Z,android,,,,,,0.0,,,,,,,2019-10-25T00:00:00.000Z
push,received,iFood Brasil,sample,5ac35d290b97ee7b6f067992,54fcbbfab48d41e79c96471d17a570f1fb2d65d1bcc1c81a8547fc1a953edb4e,2019-10-15T18:12:44.000Z,android,fe0beda0-401f-4783-9ac3-45faa723a4bc,2019-10-15 / ANDROID / Ativos / Jantar / CUPOM,android_push,,,,,,,,,,2019-10-15T00:00:00.000Z
push,send,iFood Brasil,sample,5b3b5459614b1ee0f8772035,bf3eea65d3cc69a040517cfbc9783ab5015500fb61afb015143aaf128fa78611,2019-07-15T18:27:53.000Z,ios,110ab169-e1af-47c1-860f-7beea100459e,2019-07-15 / iOS / Ativos / Jantar / CUPOM / R$8,ios_push,,,0.0,,,,,,,2019-07-15T00:00:00.000Z
push,received,iFood Brasil,sample,5c8ea2111668ab61c397680c,39ed72e5d00f25f46eefd5c566ef8ca352864a6f990800ad0dce219a1a150588,2019-08-18T18:19:45.000Z,ios,310dfc26-55e7-41b7-9b26-d492122bffa6,2019-08-18 / iOS / Ativos / Jantar / CUPOM,ios_push,,,,,,,,,,2019-08-18T00:00:00.000Z
push,send,iFood Brasil,sample,5af112133d477eece72791e2,71f0d523744e4211a3810b2d8530d68b0f75ede4e629b29a667055ab955573e0,2019-06-06T18:04:12.000Z,android,a38ea1b2-5073-4fb9-a895-35734b4e8f92,2019-06-06 / ANDROID / 20190606RECRAG,android_push,,,0.0,,,,,,,2019-06-06T00:00:00.000Z


In [18]:
# Adicionar no cache (100% em memoria)
df_marketing_push_full.cache()

In [19]:
df_marketing_push_full.dtypes

#### Tratamento de Missings

In [21]:
colunas_pushes_missing = percent_missing(df_marketing_push_full).orderBy('perc_missing', ascending=False)
display(colunas_pushes_missing)

variaveis,total_missing,perc_missing
send_id,28426642,100.0
canvas_tags,28405316,99.92497882796005
canvas_first_step_name,28405316,99.92497882796005
canvas_step_name,27543807,96.89433947210507
canvas_real_step_name,27543807,96.89433947210507
canvas_step_index,27543807,96.89433947210507
canvas_step_id,27543369,96.89279866401384
canvas_name,27542381,96.88932305124186
canvas_is_first_step,13672374,48.097042204281465
message_variation_channel,6042795,21.25750554708502


Removendo colunas com missing share acima de 22%

In [23]:
# Selecionar linhas de um dataframe > 22%
# Pega as linhas da coluna variaveis filtradas pelo threshold acima e converte para uma lista
# Remove colunas do dataframe passando essa lista

thresholdMissingPushes = 22
df_columns_to_drop = colunas_pushes_missing.filter(colunas_pushes_missing['perc_missing'] > thresholdMissingPushes).select('variaveis')
list_columns_to_drop = list([row[0] for row in df_columns_to_drop.collect()])
df_mpf = df_marketing_push_full.drop(*list_columns_to_drop)
display(df_mpf)

event_channel,event_name,brand,sample_type,user_id,external_user_id,event_time_utc3,platform,campaign_id,campaign_name,message_variation_channel,event_date
push,received,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-08-29T18:31:10.000Z,android,23e2e9bf-61ae-48aa-8e24-85d8f2348b21,2019-08-29 / ANDROID / 20190829MIDIAKITKAS,android_push,2019-08-29T00:00:00.000Z
push,send,iFood Brasil,sample,5ac3121c8e5b4b5696ac49dc,290070089ba4bc832a39ffdb7f3bc7e70cb4e7746bc51163946e0045d548277e,2019-11-13T12:38:00.000Z,ios,,,,2019-11-13T00:00:00.000Z
push,send,iFood Brasil,sample,5ac2c99c059e9ee7e5eed43b,0753a6e30f7b58265e00becd06423d17181aa22d43f51ec34ebe9a2af0be3ab3,2019-10-10T11:31:02.000Z,ios,c3858a3b-1eea-4ab8-99ca-5b0166a9fb14,2019-10-10 / IOS / Almoço / Cidades DFN Geral,ios_push,2019-10-10T00:00:00.000Z
push,received,iFood Brasil,sample,5ac36acd409f7734a97df5b9,8468bb60ea7888a33a3506e4f3d2b541afca6651bde242ed49a75da8cce67a68,2019-10-27T21:05:31.000Z,android,e5b7abaf-eafa-4a2e-bde5-62965fefa0f5,2019-10-27 / ANDROID / Ativos / Ceia - KA / CUPOM,android_push,2019-10-28T00:00:00.000Z
push,send,iFood Brasil,sample,5ac3654ad75e5a3c29fb256f,6cbda064947aec380ac7c459de0902fc0182b23dcbaa59ee7317705927f52eee,2019-10-28T11:30:49.000Z,android,2398c143-4f12-4541-afd9-06bb94cb7b8a,2019-10-28 / ANDROID / Ativos / Almoço - Light / CUPOM,android_push,2019-10-28T00:00:00.000Z
push,send,iFood Brasil,sample,5ac311e58e5b4b411df4c647,28b2fe193b22c7a4026afb94d4bd3104a296606b51ea107da1ad4a22611d2f2b,2019-10-25T12:12:49.000Z,android,,,,2019-10-25T00:00:00.000Z
push,received,iFood Brasil,sample,5ac35d290b97ee7b6f067992,54fcbbfab48d41e79c96471d17a570f1fb2d65d1bcc1c81a8547fc1a953edb4e,2019-10-15T18:12:44.000Z,android,fe0beda0-401f-4783-9ac3-45faa723a4bc,2019-10-15 / ANDROID / Ativos / Jantar / CUPOM,android_push,2019-10-15T00:00:00.000Z
push,send,iFood Brasil,sample,5b3b5459614b1ee0f8772035,bf3eea65d3cc69a040517cfbc9783ab5015500fb61afb015143aaf128fa78611,2019-07-15T18:27:53.000Z,ios,110ab169-e1af-47c1-860f-7beea100459e,2019-07-15 / iOS / Ativos / Jantar / CUPOM / R$8,ios_push,2019-07-15T00:00:00.000Z
push,received,iFood Brasil,sample,5c8ea2111668ab61c397680c,39ed72e5d00f25f46eefd5c566ef8ca352864a6f990800ad0dce219a1a150588,2019-08-18T18:19:45.000Z,ios,310dfc26-55e7-41b7-9b26-d492122bffa6,2019-08-18 / iOS / Ativos / Jantar / CUPOM,ios_push,2019-08-18T00:00:00.000Z
push,send,iFood Brasil,sample,5af112133d477eece72791e2,71f0d523744e4211a3810b2d8530d68b0f75ede4e629b29a667055ab955573e0,2019-06-06T18:04:12.000Z,android,a38ea1b2-5073-4fb9-a895-35734b4e8f92,2019-06-06 / ANDROID / 20190606RECRAG,android_push,2019-06-06T00:00:00.000Z


Ao remover algumas colunas, ficam linhas duplicadas. Optamos por remover linhas repetidas. Assim temos **20.87M de registros**.

In [25]:
df_mpf = df_mpf.dropDuplicates()
print(df_mpf.count())

Inputa unknown para colunas categóricas com missing abaixo do threshold (22%). E a média para colunas numéricas (nao se aplica nessa base)

In [27]:
# Para as colunas que nao remover, separar em categoricas e numericas
df_columns_missing = colunas_pushes_missing.filter(
                                                   (colunas_pushes_missing['perc_missing'] <= thresholdMissingPushes) &
                                                   (colunas_pushes_missing['perc_missing'] > 0)
                                                  ).select('variaveis')
list_df_columns_missing = list([row[0] for row in df_columns_missing.collect()])

filter_colunas_numericas = [x[0] for x in df_mpf[list_df_columns_missing].dtypes if x[1] in ('double', 'int', 'long')]
filter_colunas_categoricas = [x[0] for x in df_mpf[list_df_columns_missing].dtypes if x[1] not in ('double', 'int', 'long')]

# Inputar unknown para colunas categoricas
for coluna in filter_colunas_categoricas:
  df_mpf = df_mpf.fillna('unknown', subset=[coluna])

# Inputar media para colunas numericas
for coluna in filter_colunas_numericas:
  media = df_mpf.agg(mean(coluna)).collect()[0][0]
  df_mpf = df_mpf.fillna(media, subset=[coluna])

In [28]:
display(df_mpf)

event_channel,event_name,brand,sample_type,user_id,external_user_id,event_time_utc3,platform,campaign_id,campaign_name,message_variation_channel,event_date
push,send,iFood Brasil,sample,5ace75b521ee9f180b96da24,a7a46fa1dfabc8fd9c027a51aba18c9332d07652541b7438ce6e6ce642a8d52e,2019-10-02T18:19:25.000Z,android,cdbb7816-880d-4406-9381-efc610863be2,2019-10-02 / ANDROID / Ativos / Jantar / CUPOM,android_push,2019-10-02T00:00:00.000Z
push,received,iFood Brasil,sample,5ac311afbf062b4020f7ec08,2873048204304c8485e4fa6191a8e1b43f7b28abca24f8d823842b201a79b93e,2019-10-27T22:14:39.000Z,ios,unknown,unknown,unknown,2019-10-28T00:00:00.000Z
push,received,iFood Brasil,sample,5c0db994820f229401fa42e5,241e219b53e9cdfe733c654de5c496091713f0fee51b75ff43e103a66c15a05c,2019-11-27T11:29:26.000Z,android,e3474335-2525-4977-917f-66cee3a2577c,2019-11-27 / ANDROID / Ativos / Almoço - Light / CUPOM,android_push,2019-11-27T00:00:00.000Z
push,send,iFood Brasil,sample,5c107c378caa6b789e5c38b7,432f7c45ede3ffe2f2c949089757eaaeb718c236aac7cec56803cd8cfd7a2234,2019-06-08T19:49:02.000Z,android,bd3daf82-e117-40d1-9d5c-7656f1362b06,2019-06-08 / ANDROID / 20190608RECACAOPONTUALKA,android_push,2019-06-08T00:00:00.000Z
push,send,iFood Brasil,sample,5ac3120e832920cddd4eaf58,28f8af69956d26080ce13750e08a481a5379cb431517588cf77458f966219b7c,2019-06-11T11:53:35.000Z,android,unknown,unknown,unknown,2019-06-11T00:00:00.000Z
push,send,iFood Brasil,sample,5b4534172d15ef563929d170,6bc6e8a3cc7a785eb2397d928d2b937e57577aa4b99ec8e579be933a05324707,2019-11-01T10:15:13.000Z,android,unknown,unknown,unknown,2019-11-01T00:00:00.000Z
push,received,iFood Brasil,sample,5ac36f3c6ee329d3b8c09401,980a7fb7dfd8f5a9b67ae624c31cd53ec849c22587294e9157e6e73b1b7f17db,2019-10-04T11:19:44.000Z,ios,a9942a70-ed0b-4a8c-ac8b-1fe77cdc53ef,2019-10-04 / iOS / Ativos / Almoço - Light / CUPOM,ios_push,2019-10-04T00:00:00.000Z
push,received,iFood Brasil,sample,5cdd81c0cb756f6e6039acdd,a5a7937ffa49421f5c31069576c2576fce5e9459fa58fa74447db494dc0ffa75,2019-10-10T13:30:53.000Z,android,unknown,unknown,unknown,2019-10-10T00:00:00.000Z
push,received,iFood Brasil,sample,5ac364f65ca872eae7a61ada,6b4241b4a0f280284c25eaa2116f823e8bd524453c4eae369d3e047ac5af231e,2019-08-28T11:24:23.000Z,ios,06eb0625-bf5c-404e-9d0a-8ed869295262,2019-08-28 / iOS / Ativos / Almoço / CUPOM,unknown,2019-08-28T00:00:00.000Z
push,send,iFood Brasil,sample,5ac363668e504d030bb8da33,64b6c18a195a6305aac0274603498f113531bb740091fb9fddaa3d5eac6284c4,2019-06-19T20:24:54.000Z,android,unknown,unknown,unknown,2019-06-19T00:00:00.000Z


#### Tratamento de Datas

In [30]:
from pyspark.sql.functions import col
from pyspark.sql.functions import datediff, to_date, to_timestamp, from_utc_timestamp, round, dayofweek, month

# importante: transformar primeiro para timestamp, depois para date!!!
df_mpf = df_mpf.withColumn('event_time_utc3', from_utc_timestamp('event_time_utc3', 'UTC'))\
               .withColumn('event_date', from_utc_timestamp('event_date', 'UTC'))\
               .withColumn('event_date', to_date('event_date', 'YYYY-MM-DD'))\
               .withColumn('event_month', month('event_date'))\
               .withColumn('event_dayofweek', dayofweek('event_date'))
display(df_mpf)

event_channel,event_name,brand,sample_type,user_id,external_user_id,event_time_utc3,platform,campaign_id,campaign_name,message_variation_channel,event_date,event_month,event_dayofweek
push,send,iFood Brasil,sample,5ace75b521ee9f180b96da24,a7a46fa1dfabc8fd9c027a51aba18c9332d07652541b7438ce6e6ce642a8d52e,2019-10-02T18:19:25.000+0000,android,cdbb7816-880d-4406-9381-efc610863be2,2019-10-02 / ANDROID / Ativos / Jantar / CUPOM,android_push,2019-10-02,10,4
push,received,iFood Brasil,sample,5ac311afbf062b4020f7ec08,2873048204304c8485e4fa6191a8e1b43f7b28abca24f8d823842b201a79b93e,2019-10-27T22:14:39.000+0000,ios,unknown,unknown,unknown,2019-10-28,10,2
push,received,iFood Brasil,sample,5ac3715ce41e7ec8dde33d2c,a181461278b074497cf5d77e52df00a08c4d9b49e1df06e39f1dedc57adc8ef8,2019-07-11T09:05:57.000+0000,ios,26e91b0c-82eb-44f5-bf71-b26433d72f06,2019-07-11 / IOS / QuintaLoop,ios_push,2019-07-11,7,5
push,send,iFood Brasil,sample,5ac377a2c9f0895b04a64d57,b2d08ff98a22fe924d36ada7e978fee273e47b7aebf73be7239e451e3b5c3899,2019-10-21T11:22:17.000+0000,android,5af14d08-14de-4d95-94b9-679c8d6ab928,2019-10-21 / ANDROID / Ativos / Almoço / CUPOM,android_push,2019-10-21,10,2
push,send,iFood Brasil,sample,5ac3811471666591e0c8194e,de39115680e14a3a3911662d39eacfec58baf5d951b824cd0215253fc2fbf3d2,2019-07-23T16:06:18.000+0000,android,5aa11544-ea25-4994-a8ef-0372145b7185,2019-07-23 / ANDROID / To Go,android_push,2019-07-23,7,3
push,received,iFood Brasil,sample,5ac32e060dd604a8b6234369,4920aea87c9e21be10de324bed9859c1e2239e06baa991a32917bd3875e561b5,2019-10-23T11:35:31.000+0000,android,80e60538-075a-454a-91dc-154c126a58ee,2019-10-23 / ANDROID / Ativos / Almoço / CUPOM,android_push,2019-10-23,10,4
push,send,iFood Brasil,sample,5b8b53d454a5ee2b3f0a8c97,c59ab1024f99eec1c84e0584153d70bfd4dc16125bcc093e938238168c300870,2019-08-24T14:50:13.000+0000,android,unknown,unknown,unknown,2019-08-24,8,7
push,send,iFood Brasil,sample,5b26fba83f5ceac7b64e8726,f7261bddc98a298dc833a428befecc6adf6d97b8f7c3755236152e88bfefda69,2019-11-17T11:42:24.000+0000,android,d4a9ef4f-fa17-43e0-884c-986a208e1ef6,2019-11-17 / ANDROID / Ativos / Almoço - Light / CUPOM,android_push,2019-11-17,11,1
push,received,iFood Brasil,sample,5b5270f2376c97e2f20f0e55,36a6962fb680aa7109f7468a3894b754d887a59a1187cc5ab6860efe73290fa4,2019-07-30T18:21:32.000+0000,ios,f94965dc-d829-4e73-88f2-e2a6ecac2585,2019-07-30 / iOS / Ativos / Jantar / CUPOM,unknown,2019-07-30,7,3
push,send,iFood Brasil,sample,5c0bc1bb2760acf76c3df3d3,e0aff996db63f9fcf152311ec931abec2736c42a5dbc5e7137686d79845d2258,2019-09-30T15:57:31.000+0000,ios,unknown,unknown,unknown,2019-09-30,9,2


In [31]:
df_mpf_checkpoint = df_mpf # checkpoint para nao ter que rodar as celulas anteriores
df_mpf.cache()

In [32]:
df_mpf.dtypes

#### Análise Variáveis Qualitativas

In [34]:
variaveis_categoricas = [x[0] for x in df_mpf.dtypes if x[1] not in ('double', 'int', 'long')]
variaveis_categoricas

##### Canais

Somente temos um canal nessa base: pushes. Portanto essa variável pode ser removida no modelo

In [37]:
# Distribuicao por canais
display(df_mpf.groupby('event_channel').count())

event_channel,count
push,20875331


##### Brand

Somente temos uma marca nessa base: iFood Brasil. Portanto essa variável pode ser removida no modelo.

In [40]:
display(df_mpf.groupby('brand').count())

brand,count
iFood Brasil,20875331


##### Message Variation Channel

Talvez usar essa variável como proxy para quando a plataforma não é identificada.

In [43]:
display(df_mpf.groupby('platform','message_variation_channel').count())

platform,message_variation_channel,count
android,android_push,10154789
unknown,ios_push,47077
unknown,unknown,46104
unknown,android_push,63956
ios,ios_push,6332398
android,unknown,2539141
ios,unknown,1691866


##### Tipos de Eventos

+10.3M de pushes foram enviados. Destes 98.3% foram recebidos. Apenas 3.16% foram clicados e bounce da base foi de 4.4%.

In [46]:
# Tipos de eventos
import pyspark.sql.functions as f
from pyspark.sql.window import Window
from pyspark.sql import SQLContext

my_window = Window.partitionBy().orderBy(col('percent').desc())
aux = percentByCol(df_mpf, 'event_name')

df = aux.withColumn('prev_value', f.lag(aux.cnt_per_group).over(my_window))\
        .withColumn("Funnel", (f.col("cnt_per_group") * 100 / f.col("prev_value")))
display(df)

event_name,cnt_per_group,percent,prev_value,Funnel
send,10355984,49.608717581532,,
received,10182446,48.77741100248902,10355984.0,98.3242731931606
click,322629,1.545503637762678,10182446.0,3.168482307689134
bounce,14272,0.068367778216307,322629.0,4.423656893831614


##### Platforma

60.8% dos pushes foram para Android, 38.4% para ios. Unknowns representaram apenas 0.75%

In [49]:
display(percentByCol(df_mpf, 'platform'))

platform,cnt_per_group,percent
android,12693930,60.80828131539567
ios,8024264,38.43897852446028
unknown,157137,0.7527401601440475


##### Sample Type

Aparentemente 0.75% dos pushes representam um universal control group

In [52]:
display(percentByCol(df_mpf, 'sample_type'))

sample_type,cnt_per_group,percent
sample,20718194,99.24725983985596
control,157137,0.7527401601440475


#### Análise Variáveis Quantitativas

##### Event Month

Agosto foi o mês com menor volume de pushes recebidos (11.1% da base) e dezembro foi o mês com maior volume (17.5% da base)

In [56]:
display(percentByCol(df_mpf.filter(df_mpf['event_name'] == 'received'), 'event_month').orderBy('event_month'))

event_month,cnt_per_group,percent
6,1271095,12.483199027031423
7,1741702,17.10494708245936
8,1128464,11.082445219940276
9,1185351,11.641122378650474
10,1612874,15.839750095409295
11,1459939,14.337802527997694
12,1783021,17.510733668511474


##### Day of Week

Finais de semana tem menor volume de pushes (talvez a demanda é maior e não precisa de muitos incentivos). Quintas tiveram o maior volume de envios (16.4% da base)

In [59]:
display(percentByCol(df_mpf.filter(df_mpf['event_name'] == 'received'), 'event_dayofweek').orderBy('event_dayofweek'))

event_dayofweek,cnt_per_group,percent
1,1313607,12.90070185493741
2,1538769,15.111978006070448
3,1534154,15.066654907867914
4,1479915,14.53398328849473
5,1676006,16.459758293832348
6,1411231,13.859449880706462
7,1228764,12.067473768090691


##### User-id

In [61]:
# Histograma de frequencia de pushes recebidos por usuário
df_mpf_sample = df_mpf.sample(withReplacement=False, fraction=1.0, seed=42)
aux = df_mpf_sample\
            .filter(df_mpf_sample['event_name'] == 'received')\
            .groupby('user_id')\
            .count()\
            .withColumnRenamed('count', 'lifecycle_pushes')\
            .groupby('lifecycle_pushes')\
            .count()\
            .withColumnRenamed('count', 'count_users')\
            .withColumn('percent', f.col('count_users')*100/f.sum('count_users').over(Window.partitionBy()))\
            .orderBy('lifecycle_pushes', ascending=False)

In [62]:
display(aux)

lifecycle_pushes,count_users,percent
476,1,0.0035767937620716
374,1,0.0035767937620716
348,1,0.0035767937620716
346,1,0.0035767937620716
332,1,0.0035767937620716
330,2,0.0071535875241433
328,1,0.0035767937620716
324,1,0.0035767937620716
322,1,0.0035767937620716
318,2,0.0071535875241433


##### User-id: Medidas de posição

A base de push envolve `28.5k usuarios únicos` que receberam pushes. Em média cada um recebeu 732 pushes no período (~4/dia).

In [65]:
df_mpf.select(f.countDistinct('user_id')).show()

In [66]:
avg_pushes = df_mpf.groupBy('user_id')\
                           .count()\
                           .withColumnRenamed('count', 'number_pushes')\

chosen_column = 'number_pushes'
exprrs = [sum(chosen_column), max(chosen_column), min(chosen_column), stddev(chosen_column), mean(chosen_column)]
display(avg_pushes.agg(*exprrs))

sum(number_pushes),max(number_pushes),min(number_pushes),stddev_samp(number_pushes),avg(number_pushes)
20875331,4647,1,462.98739483835794,732.2622070997614


Um evento `received` vai ter um evento `send` corresponte? </br>
Sim, abaixo podemos ver que toda linha de um push com event_name `received` vai ter um correspondente `send`, caso ele tenha sido enviado com sucesso.

In [68]:
display(df_mpf.filter(df_mdf['user_id'] == '5ac365c96ee329b1951ac7e9'))

event_channel,event_name,brand,sample_type,user_id,external_user_id,event_time_utc3,platform,campaign_id,campaign_name,message_variation_channel,event_date,event_month,event_dayofweek
push,send,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-09-24T15:19:49.000+0000,android,e06d2a9a-106a-4dac-af0b-3afb84ff8353,2019-09-24 / ANDROID / Ativos / Lanche da Tarde / CUPOM,android_push,2019-09-24,9,3
push,send,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-09-16T11:16:57.000+0000,android,80df02dc-6a78-4588-9a8d-0b42e65ae0bd,2019-09-16 / ANDROID / Ativos / Almoço / CUPOM,android_push,2019-09-16,9,2
push,received,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-10-30T18:43:26.000+0000,android,91bfd4e5-aa6b-45f8-b9ca-f4f2cf8598de,2019-10-30 / ANDROID / Ativos / Jantar - Light / CUPOM,android_push,2019-10-30,10,4
push,send,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-11-10T11:47:39.000+0000,android,8709b71b-8659-4560-9897-32f48cb1b194,2019-11-10 / ANDROID / Ativos / Almoço - Light / CUPOM,android_push,2019-11-10,11,1
push,send,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-12-23T15:21:25.000+0000,android,ff031779-a252-4984-b371-3796ec79e36d,2019-12-23 / ANDROID / Ativos / Lanche da Tarde / CUPOM,android_push,2019-12-23,12,2
push,received,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-07-22T11:42:34.000+0000,android,230108da-6949-4a0c-acff-b2815ff11a25,2019-07-22 / ANDROID / Almoço / CRM_Restaurante_Expansao (TMA),unknown,2019-07-22,7,2
push,send,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-10-23T11:25:08.000+0000,android,87d04b9f-11f3-4dc9-bebe-b5909f6240a2,2019-10-23 / ANDROID / Ativos / Almoço - Light / CUPOM,android_push,2019-10-23,10,4
push,send,iFood Brasil,control,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-09-01T18:01:31.000+0000,unknown,38d5426c-2152-40e0-b9c8-11bc2654faae,2019-09-01 / ANDROID / 20190901ACAOPONTUALKAS,android_push,2019-09-01,9,1
push,received,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-12-19T18:44:18.000+0000,android,60e95be9-7449-41fe-8b7e-5fca8f4f02b6,2019-12-19 / ANDROID / Ativos / Jantar / CUPOM,android_push,2019-12-19,12,5
push,received,iFood Brasil,sample,5ac365c96ee329b1951ac7e9,6f040026ecb33d63875ca8340a55db433fe185d8e40a0111f7166063893944a8,2019-11-07T11:33:00.000+0000,android,6d26e61c-1a33-4a7b-87bd-37506bce0446,2019-11-07 / ANDROID / Ativos / Almoço - Light / CUPOM,android_push,2019-11-07,11,5


In [69]:
from pyspark.sql.functions import udf
push_range = udf(lambda pushes: '1- < 20' if pushes < 20 else 
                                 '2- 20-60' if (pushes >= 20 and pushes < 60) else
                                 '3- 60-100' if (pushes >= 60 and pushes < 100) else
                                 '4- 100-140' if (pushes >= 100 and pushes < 140) else
                                 '5- 140-180' if (pushes >= 140 and pushes < 180) else
                                 '6- 180-220' if (pushes >= 180 and pushes < 220) else
                                 '7- 220-260' if (pushes >= 220 and pushes < 260) else
                                 '8- 260-300' if (pushes >= 260 and pushes < 300) else
                                 '9- 300+'  if (pushes >= 300) else '')

Share de clientes por bins de pushes recebidos durante os meses:

In [71]:
aux = df_mpf_sample.groupby('user_id', 'event_month')\
            .count()\
            .withColumnRenamed('count', 'monthly_pushes')

aux = aux.withColumn('push_range', push_range(aux.monthly_pushes))\
         .groupby('event_month', 'push_range')\
         .count()\
         .withColumnRenamed('count', 'qtde_clientes')\
         .orderBy('event_month', 'push_range')
display(aux)

event_month,push_range,qtde_clientes
6,1- < 20,3326
6,2- 20-60,4366
6,3- 60-100,5745
6,4- 100-140,9574
6,5- 140-180,3011
6,6- 180-220,888
6,7- 220-260,419
6,8- 260-300,154
6,9- 300+,126
7,1- < 20,2505


A maior média mensal de pushes foi em dezembro com 152 pushes/cliente, a menor foi em agosto com 94 pushes/cliente. Para o modelo, podemos colocar uma variável que indica a média de pushes o cliente recebeu no mês.

In [73]:
monthly_mean = df_mpf_sample.groupby('user_id', 'event_month')\
                            .count()\
                            .withColumnRenamed('count', 'monthly_pushes')\
                            .groupby('event_month')\
                            .mean()\
                            .orderBy('event_month', asc=True)
display(monthly_mean)

event_month,avg(event_month),avg(monthly_pushes)
6,6.0,94.93944003766887
7,7.0,137.31427369387288
8,8.0,94.00453061224488
9,9.0,99.45214399803416
10,10.0,135.06339820973966
11,11.0,124.97182274247491
12,12.0,152.325634718806


### 2. Customer Segmentation

#### Tratamento de Duplicidades

In [77]:
df_customer_segmentation = df_customer_segmentation.distinct()
# Após a remoção de duplicados, as dimensões do dataframe passam a ser:
print('Customer segmentation:     ', df_customer_segmentation.count(), '   linhas e ', len(df_customer_segmentation.columns), ' colunas')

#### Tratamento de Missing

In [79]:
perc_miss = percent_missing(df_customer_segmentation) # Calcula o percentual de missings para todas as colunas
col_drop = perc_miss.filter((perc_miss['perc_missing'] <= 0.1) & (perc_miss['perc_missing'] > 0)).select('variaveis').rdd.flatMap(lambda x: x).collect() # Seleciona colunas com menos de 0.1% de missings
col_drop.append('customer_id') # Acrescenta a coluna de customer_id na lista de colunas com missing
aux1 = df_customer_segmentation.select(col_drop) # Criar datafram auxiliar apenas com as colunas em col_drop
# Cria lista com o customer_id de quem tem algum dado nulo para alguma das colunas em col_drop:
c_id_drop = aux1.where(reduce(lambda x, y: x | y, (f.col(x).isNull() for x in aux1.columns))).select('customer_id').distinct().rdd.flatMap(lambda x: x).collect() 
# Exclui de df_customer_segmentation todos os registros de quem teve informação nula identificada: (Decisão tomada por conta da base ser histórica e o impacto em número absoluto de resgistros ser pequeno)
df_customer_segmentation = df_customer_segmentation.filter(~df_customer_segmentation.customer_id.isin(c_id_drop))

display(percent_missing(df_customer_segmentation).orderBy(desc('perc_missing')))
# A existência de missings é informação relevante para as colunas que permaneceram na lista.

variaveis,total_missing,perc_missing
last_invalid_order_date,117137,55.91158164044601
preferred_shift_bucket_description,54968,26.23720788147243
days_to_reorder_at_concluded,18857,9.00078280128303
days_to_reorder_at_datasource,17879,8.53396593859783
merchant_variety_bucket,0,0.0
merchant_offer,0,0.0
merchant_offer_bucket,0,0.0
merchant_offer_bucket_description,0,0.0
top_dish_bucket,0,0.0
top_dish_bucket_description,0,0.0


#### Tratamento de Arrays

In [81]:
#Ajuste coluna preferred_dishes (tipo array):
df_customer_segmentation = df_customer_segmentation.withColumn('aux_1', udf_limpeza(col('preferred_dishes')))
df_customer_segmentation = df_customer_segmentation.withColumn('preferred_dishes_ar', split(col('aux_1'), ',').cast(ArrayType(StringType())))
#Ajuste coluna top_3_merchants_code (tipo array):
df_customer_segmentation = df_customer_segmentation.withColumn('aux_2', udf_limpeza(col('top_3_merchants_code')))
df_customer_segmentation = df_customer_segmentation.withColumn('top_3_merchants_code_ar', split(col('aux_2'), ',').cast(ArrayType(StringType())))

list_columns_to_drop = ['aux_1','aux_2']
df_customer_segmentation = df_customer_segmentation.drop(*list_columns_to_drop) # Exclusão de colunas auxiliares

df_customer_segmentation.columns

#### Tratamento de Datas

In [83]:
df_customer_segmentation = df_customer_segmentation.withColumn('registration_date', from_utc_timestamp('registration_date', 'UTC'))\
                                                   .withColumn('registration_date', to_date('registration_date', 'YYYY-MM-DD'))\
                                                   .withColumn('last_valid_order_date', from_utc_timestamp('last_valid_order_date', 'UTC'))\
                                                   .withColumn('last_valid_order_date', to_date('last_valid_order_date', 'YYYY-MM-DD'))\
                                                   .withColumn('last_invalid_order_date', from_utc_timestamp('last_invalid_order_date', 'UTC'))\
                                                   .withColumn('last_invalid_order_date', to_date('last_invalid_order_date', 'YYYY-MM-DD'))\
                                                   .withColumn('first_order_date', from_utc_timestamp('first_order_date', 'UTC'))\
                                                   .withColumn('first_order_date', to_date('first_order_date', 'YYYY-MM-DD'))\
                                                   .withColumn('last_order_date', from_utc_timestamp('last_order_date', 'UTC'))\
                                                   .withColumn('last_order_date', to_date('last_order_date', 'YYYY-MM-DD'))\
                                                   .withColumn('segmentation_month', from_utc_timestamp('segmentation_month', 'UTC'))\
                                                   .withColumn('segmentation_month', to_date('segmentation_month', 'YYYY-MM-DD'))\
                                                   .withColumn('registration_month', month('registration_date'))\
                                                   .withColumn('registration_dayofweek', dayofweek('registration_date'))\
                                                   .withColumn('first_order_month', month('first_order_date'))\
                                                   .withColumn('first_order_dayofweek', dayofweek('first_order_date'))\
                                                   .withColumn('segmentation_month_month', month('segmentation_month'))\
                                                   .withColumn('segmentation_month_dayofweek', dayofweek('segmentation_month'))\
                                                   .withColumn('last_order_month', month('last_order_date'))\
                                                   .withColumn('last_order_dayofweek', dayofweek('last_order_date'))

#### Análise Variáveis Qualitativas

Describe das variáveis numéricas que auxiliaram a explicar as variáveis Qualitativas

In [86]:
var_num_cust_seg = [x[0] for x in df_customer_segmentation.dtypes if x[1] in ('double', 'int', 'long')]
display(df_customer_segmentation.describe(var_num_cust_seg))

summary,orders_last_91d,recency_months,days_to_reorder_at_datasource,days_to_reorder_at_concluded,rfv_score,recency_days,recency_days_bucket,freq_last_91d,freq_last_91d_bucket,avg_aov_last_91d,maturity_orders,maturity_orders_bucket,benefits_sensitivity,merchant_variety,merchant_offer,registration_month,registration_dayofweek,first_order_month,first_order_dayofweek,segmentation_month_month,segmentation_month_dayofweek,last_order_month,last_order_dayofweek
count,209504.0,209504.0,191625.0,190647.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0,209504.0
mean,8.547378570337559,1.2802836620343765,34.40750885997784,35.56226937255755,2.7033708186955856,39.26262028409959,2.736907171223461,3.042965814495186,2.54704922101726,55.06199437719568,38.838552008553535,3.6059072857797454,0.6554377943448566,0.7475997084287131,1357.3949566595388,6.266863639835039,4.128723079272949,6.121019169085077,4.155787001680159,9.00339372995265,3.567550022911257,8.076919772414847,4.185309111043226
stddev,9.977829133633788,1.6360901047516403,74.83428254547731,75.88633798354759,0.6586515408969449,50.03528473568874,1.4838300778174165,3.2914642907782548,1.1215154732515034,22.986055395669755,62.337680435058985,1.1924221024110615,0.3351547210458754,0.2453606747899528,1338.513079953022,3.4003146241778524,2.2201720121999484,3.2071897317007814,2.226641198413645,1.998587334034061,2.257105194537576,2.0410546903375395,2.17692227337059
min,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.33,1.0,13.07,1.0,1.0,0.0,0.0114942528735632,1.0,1.0,1.0,1.0,1.0,6.0,1.0,1.0,1.0
max,213.0,40.03225806,2612.0,2612.0,5.0,1217.0,5.0,92.5,5.0,1428.0,1297.0,5.0,1.0,1.0,7031.0,12.0,7.0,12.0,7.0,12.0,7.0,12.0,7.0


Variáveis Qualitativas

In [88]:
var_cat_cust_seg = [x[0] for x in df_customer_segmentation.dtypes if x[1] not in ('double', 'int', 'long')]
var_cat_cust_seg

A chave da base é uma combinação entre customer_id e segmentation_month, isto é, a tabela é um recorte de uma base histórica de um grupo específico de clientes. Após a exclusão dos duplicados, tem-se aproximadamente 30k clientes e suas informações entre Junho e Dezembro.

In [90]:
customer_amount_seg_month = df_customer_segmentation.groupby('segmentation_month_month').count().orderBy('segmentation_month_month')
display(customer_amount_seg_month)

segmentation_month_month,count
6,29726
7,29963
8,29963
9,29963
10,29963
11,29963
12,29963


#### Ifood Status

Esses clientes são classificados em 5 status em um determido mês. Outubro é o mês com a maior proporção de clientes Inativos. Em compensação, a proporção de Churn também se mostra como a menor desde Junho.

In [93]:
# O status desses clientes ao longo dos meses é:
df_ifood_status = df_customer_segmentation.groupby('ifood_status','segmentation_month_month').count().orderBy('segmentation_month_month','ifood_status')
display(df_ifood_status)

ifood_status,segmentation_month_month,count
Active,6,17865
Churn,6,209
Inactive,6,235
New,6,5015
Resurrected,6,6402
Active,7,18253
Churn,7,11536
Inactive,7,174
Active,8,13825
Churn,8,4597


A tabela abaixo mostra a transição de ifood_status entre o mês anterior (eixo horizontal) e o mês atual (eixo vertical).

Highlights: A maioria dos inativos permanece inativo (33334). A maioria dos ativos, permanece ativo. Pessoas novas tem virado mais Churn do que Ativos (3713 contra 3113), mas a proporção é parecida. Maioria das pessoas que ressucitam se tornam ativas (12427). A maioria do Churn é de ativos (15802) e não de novos (3713).

Dúvidas: Pessoas que eram inativos virando Churn (346)? Pessoas que eram ativas virando Inativas direto (255)? Pessoas que eram novas virando Inativas direto (31)? Pessoas ressurretas virando Inativas direto (65)? Pessoas permanecendo como ressurretas (3308)?

In [95]:
df_status_trans = df_customer_segmentation.filter(df_customer_segmentation['ifood_status'] != 'New').groupby('ifood_status','ifood_status_last_month').count() # Df auxiliar com ifood_status e ifood_status_last_month
df_status_trans_pivot = df_status_trans.groupby('ifood_status').pivot('ifood_status_last_month').sum('count').orderBy('ifood_status') # Pivot do Df contruído no passo anterior colocando o ifood_status_last_month no eixo vertical
display(df_status_trans_pivot)

ifood_status,Active,Churn,Inactive,New,Resurrected
Churn,15802.0,,346.0,3713.0,7342
Resurrected,,3117.0,15931.0,,3308
Active,87675.0,,,3113.0,12427
Inactive,255.0,18030.0,33334.0,31.0,65


In [96]:
df_ifood_status = df_customer_segmentation.select('customer_id','segmentation_month_month','ifood_status')
df_ifood_status_pivot = df_ifood_status.groupby('customer_id','ifood_status').pivot('segmentation_month_month').count().fillna(0)
df_ifood_status_pivot = df_ifood_status_pivot.withColumn('ifood_status_freq', df_ifood_status_pivot['6'] + df_ifood_status_pivot['7'] + df_ifood_status_pivot['8'] + df_ifood_status_pivot['9'] + df_ifood_status_pivot['10'] + df_ifood_status_pivot['11'] + df_ifood_status_pivot['12'])

Vê-se que, em média, as pessoas que tiveram status Ativo na base, o tiveram por 4 meses. Mais ainda, vê-se que, em média, quem deu Churn, o fez apenas uma vez no período.

In [98]:
display(df_ifood_status_pivot.groupby('ifood_status').agg(f.round(avg('ifood_status_freq'),0)).orderBy('ifood_status'))

ifood_status,"round(avg(ifood_status_freq), 0)"
Active,4.0
Churn,1.0
Inactive,3.0
New,1.0
Resurrected,1.0


Vê-se que, aproximadamente 30% (6680 clientes) das pessoas que tiveram status Ativo ao menos uma vez no período, na verdade, permanceram ativas durante os 7 meses que se tem de informação disponível. Entre os que em algum momento tiveram status Inativo, 30% (4751 clientes) foram inativos por 5 meses.

In [100]:
display(df_ifood_status_pivot.groupby('ifood_status','ifood_status_freq').count().orderBy('ifood_status','ifood_status_freq'))

ifood_status,ifood_status_freq,count
Active,1,4227
Active,2,3241
Active,3,2766
Active,4,2728
Active,5,3086
Active,6,1851
Active,7,6680
Churn,1,15168
Churn,2,5196
Churn,3,537


#### Tipo de comida preferida

No período, Lanches aparece como sendo o tipo de prato preferido do clientes tendo sido classificado como uma preferência por cerca de X%. Pizza, Comida Brasileira e Comida Japonesa também se destacam como

In [103]:
df_preferred_dishes = df_customer_segmentation.select('customer_id','segmentation_month_month', explode(col('preferred_dishes_ar')).alias('pratos')) # Cria df auxiliar com pratos favoritos por mês
display(df_preferred_dishes.groupby('pratos').count()) # Ranking de pratos favoritos no período todo

pratos,count
Peixes,109
Cafeteria,471
Congelados Fit,3
Doces & Bolos,7061
Comida Chinesa,14856
Sopas & Caldos,78
Comida Brasileira,75081
Comida Árabe,9485
Açaí,16124
Comida Asiática,208


Vê-se que tipos de prato citados anteriormente permanecem entre os favoritos ao longo dos meses e que a partir de Agosto açaí ultrapassa comida chinesa como um dos 5 tipos de comidas favoritos entre os 30k clientes da base.

In [105]:
# Pratos favoritos mês a mês:

# Df auxiliar com preferência mensal:
df_preferred_dishes_aux = df_preferred_dishes.groupby('pratos','segmentation_month_month').count()
# Inclusão de coluna com ranking de prato preferido para cada mês:
df_preferred_dishes_ranked =  df_preferred_dishes_aux.withColumn(
  "rank", dense_rank().over(Window.partitionBy("segmentation_month_month").orderBy(desc("count"))))
# Inclusão de coluna que agrupa os pratos fora do TOP 5 de favoritos para cada mês:
df_preferred_dishes_ranked = df_preferred_dishes_ranked.withColumn(
  'pratos_group',f.when(df_preferred_dishes_ranked.rank > 5, 'other').otherwise(df_preferred_dishes_ranked.pratos))

display(df_preferred_dishes_ranked.groupby('segmentation_month_month','pratos_group').sum('count').orderBy("segmentation_month_month"))

segmentation_month_month,pratos_group,sum(count)
6,Comida Chinesa,2221
6,other,15656
6,Lanches,18248
6,Pizza,12909
6,Comida Brasileira,9970
6,Comida Japonesa,5274
7,Comida Japonesa,5367
7,Lanches,18938
7,Pizza,13621
7,Comida Brasileira,10593


##### Pedidos válidos

In [107]:
df_qtt_orders = df_customer_segmentation.select('customer_id','segmentation_month_month','ifood_status','orders_last_91d','qtt_orders_last_year','qtt_valid_orders').orderBy('customer_id','segmentation_month_month')
df_qtt_orders_lag = df_qtt_orders.withColumn('qtt_valid_orders_prev',
                                            f.lag(df_qtt_orders['qtt_valid_orders'])
                                             .over(Window.partitionBy("customer_id")
                                             .orderBy("segmentation_month_month")))

df_qtt_orders_final = df_qtt_orders_lag.withColumn('qtt_valid_orders_month', 
          (df_qtt_orders_lag['qtt_valid_orders'] - df_qtt_orders_lag['qtt_valid_orders_prev']) )

df_qtt_orders_result = df_qtt_orders_final.filter((df_qtt_orders_final['segmentation_month_month'] > 6) & (df_qtt_orders_final['qtt_valid_orders_month'] > 0)).groupby('segmentation_month_month').agg(sum('qtt_valid_orders_month'),count('customer_id'))

df_qtt_orders_result = df_qtt_orders_result.withColumn('qtt_valid_orders_avg',df_qtt_orders_result['sum(qtt_valid_orders_month)']/df_qtt_orders_result['count(customer_id)'])

Há um pico de mais de 70k pedidos no mês de Julho que não se repete até Dezembro. Outubro se destaca com o segundo maior número de pedidos no período, algo próximo de 68k. Os demais meses ficaram na casa dos 66k pedidos.

Obs.: Não existem informações consistentes para o mês de Junho.

In [109]:
display(df_qtt_orders_result.select('segmentation_month_month','sum(qtt_valid_orders_month)'))

segmentation_month_month,sum(qtt_valid_orders_month),count(customer_id),qtt_valid_orders_avg
12,66563,16438,4.04933690229955
9,66603,17002,3.917362663216092
8,66547,17174,3.874868988005124
7,71008,18705,3.796204223469661
10,68295,16710,4.087073608617594
11,66668,16527,4.0338839474798815


In [110]:
display(df_qtt_orders_result.agg(avg('sum(qtt_valid_orders_month)')))

avg(sum(qtt_valid_orders_month))
67614.0


Enquanto o número absoluto tem seu pico em Julho, o número médio de pedidos tem uma curva crescente com seu pico em Outubro com algo próximo a 4.1 pedidos por cliente que efetuou ao menos um pedido.

In [112]:
display(df_qtt_orders_result.select('segmentation_month_month','qtt_valid_orders_avg'))

segmentation_month_month,qtt_valid_orders_avg
12,4.04933690229955
9,3.917362663216092
8,3.874868988005124
7,3.796204223469661
10,4.087073608617594
11,4.0338839474798815


São Paulo é a cidade com maior número de clientes, com quase o dobro do que o Rio de Janeiro, segunda colocada no ranking.

In [114]:
display(df_customer_segmentation.groupby(lower(col('top_city'))).count().orderBy(desc('count')))

lower(top_city),count
sao paulo,32821
rio de janeiro,17071
belo horizonte,8161
brasilia,6773
fortaleza,6514
curitiba,6002
campinas,4587
recife,4151
porto alegre,3983
goiania,3268


O cenário se mantém com São Paulo como destaque quando olha-se apenas para clientes ativos.

In [116]:
display(df_customer_segmentation.filter(df_customer_segmentation['ifood_status'] == 'Active').groupby(lower(col('top_city'))).count().orderBy(desc('count')))

lower(top_city),count
sao paulo,17220
rio de janeiro,9395
belo horizonte,4082
brasilia,3667
fortaleza,3257
curitiba,3187
campinas,2536
recife,2154
porto alegre,2109
londrina,1661


Chegando ao nível dos bairros em São Paulo, Bela Vista, Vila Mariana e Pinheiros são os destaques.

In [118]:
display(df_customer_segmentation.filter((df_customer_segmentation['ifood_status'] == 'Active') & (df_customer_segmentation['top_city'] == 'SAO PAULO')).groupby(lower(col('top_district'))).count().orderBy(desc('count')))

lower(top_district),count
bela vista,496
vila mariana,466
pinheiros,296
perdizes,281
jardim paulista,260
consolacao,237
itaim bibi,236
santana,224
indianopolis,224
vila olimpia,212


Ser classificado como Marlin significa estar entre os melhores. Vê-se que a proporção de Marlins aumenta ao longo dos meses.

In [120]:
df_marlin_full = df_customer_segmentation.select('customer_id','marlin_tag','segmentation_month_month','ifood_status','ifood_status_last_month')

df_marlin_full_lag = df_marlin_full.withColumn('marlin_tag_prev',
                                            f.lag(df_marlin_full['marlin_tag'])
                                             .over(Window.partitionBy("customer_id")
                                             .orderBy("segmentation_month_month")))

df_marlin = df_marlin_full.groupby('marlin_tag','segmentation_month_month','ifood_status').count().orderBy('segmentation_month_month','marlin_tag')
display(df_marlin)

marlin_tag,segmentation_month_month,ifood_status,count
1. Marlin,6,Inactive,11
1. Marlin,6,New,76
1. Marlin,6,Resurrected,62
1. Marlin,6,Active,5041
1. Marlin,6,Churn,3
2. Tilapia,6,Inactive,20
2. Tilapia,6,New,1401
2. Tilapia,6,Churn,59
2. Tilapia,6,Active,5609
2. Tilapia,6,Resurrected,780


Assim como mais clientes passam a ser classificados como Marlin ao longo dos meses, os clientes classificados como Marlin mudam seu comportamento e alguns passam inclusive a ficar inativos. Em outubro 15% dos clientes classificados como Marlin estavam inativos.

In [122]:
display(df_marlin.filter(df_marlin['marlin_tag']=='1. Marlin').groupby('segmentation_month_month','ifood_status').sum('count').orderBy('segmentation_month_month',desc('ifood_status')))

segmentation_month_month,ifood_status,sum(count)
6,Resurrected,62
6,New,76
6,Inactive,11
6,Churn,3
6,Active,5041
7,Inactive,11
7,Churn,622
7,Active,5997
8,Resurrected,237
8,Inactive,309


A tabela abaixo mostra a tag_marlin do próximo mês (eixo vertical) para quem era Marlin no mês anterior para cada um dos ifood_status (eixo horizontal).

Nota-se então que a grande maioria dos Inativos classificados como Marlin permanecem como Marlin no mês seguinte (2127). E o mesmo se aplica para quem deu Churn e era Marlin (2196).

In [124]:
df_marlin_lag = df_marlin_full_lag.filter(df_marlin_full_lag['marlin_tag_prev'] == '1. Marlin').groupby('marlin_tag','ifood_status_last_month').count()
df_marlin_full_lag_pivot = df_marlin_lag.groupby('marlin_tag').pivot('ifood_status_last_month').sum('count')
display(df_marlin_full_lag_pivot.orderBy('marlin_tag'))
# Uma vez Marlin, sempre Marlin? NÃO! O que precisa acontecer para alguém deixar de ser Marlin?

marlin_tag,Active,Churn,Inactive,New,Resurrected
1. Marlin,38650,2196,2127,76.0,788
2. Tilapia,2621,689,921,,606
3. Subsidy Carp,57,6,21,,18
4. Retention Carp,7,1,103,,35


##### NPS

A proporção de Promotores permanece maior que 75% em todos os meses, mas a proporção de Neutros e Detratores cresce ao longo do tempo.

In [127]:
df_nps_full = df_customer_segmentation.filter(df_customer_segmentation['last_nps'] != 'Sem Avaliacoes').select('customer_id','segmentation_month_month','last_nps')
df_nps = df_nps_full.groupby('segmentation_month_month','last_nps').count().orderBy('segmentation_month_month','last_nps')
display(df_nps)

segmentation_month_month,last_nps,count
6,Detractor,1025
6,Neutral,2004
6,Promoter,10527
7,Detractor,1048
7,Neutral,1991
7,Promoter,10557
8,Detractor,1015
8,Neutral,1920
8,Promoter,10174
9,Detractor,1037


Consequência do crescimento na proporção de Detratores e Neutros (e diminuição na proporção de Promotores), o NPS apresenta curva decrescente com considerável diminuição na inclinação a partir de Agosto.

In [129]:
df_nps_tot = df_nps_full.groupby('segmentation_month_month').count().withColumnRenamed('count','count_tot').withColumnRenamed('segmentation_month_month','segmentation_month_month_aux')
df_nps_join = df_nps.join(df_nps_tot, df_nps.segmentation_month_month == df_nps_tot.segmentation_month_month_aux, how='left')
df_nps_join = df_nps_join.withColumn('percent',col('count')/col('count_tot'))

df_nps_pivot_aux = df_nps_join.filter(df_nps_join['last_nps'] != 'Neutral').select('segmentation_month_month','last_nps','percent')
df_nps_pivot = df_nps_pivot_aux.groupby('segmentation_month_month').pivot('last_nps').sum('percent')
df_nps_pivot = df_nps_pivot.withColumn('nps',(col('Promoter') - col('Detractor')) * 100.0)

display(df_nps_pivot)

segmentation_month_month,Detractor,Promoter,nps
12,0.0970244692642169,0.7339074089862734,63.68829397220564
6,0.0756122750073768,0.7765565063440543,70.09442313366775
9,0.0829268292682926,0.7629748100759696,68.0047980807677
8,0.0774277214127698,0.776108017392631,69.86802959798611
7,0.0770814945572227,0.7764783759929391,69.93968814357164
10,0.089563664200051,0.7501913753508548,66.06277111508038
11,0.0931866271688531,0.7426999576809141,64.95133305120609


#### Sensibilidade a Benefícios

O índice médio de sensibilidade a benefícios tem curva crescente para todos os clientes da base.

In [132]:
display(df_customer_segmentation.groupby('segmentation_month_month').agg(avg('benefits_sensitivity')).orderBy('segmentation_month_month'))

segmentation_month_month,avg(benefits_sensitivity)
6,0.6446650044406617
7,0.6470061006999089
8,0.6505910168027851
9,0.6572291486471352
10,0.6625707220513501
11,0.6630196065268191
12,0.662897751112593


O mesmo não se aplica para os clientes que são Churn, que tem comportamento bem mais instável.

In [134]:
display(df_customer_segmentation.filter(df_customer_segmentation['ifood_status'] == 'Churn').groupby('segmentation_month_month').agg(avg('benefits_sensitivity')).orderBy('segmentation_month_month'))

segmentation_month_month,avg(benefits_sensitivity)
6,0.6527047685716415
7,0.6584145816031152
8,0.6579176234649798
9,0.6389272073584532
10,0.6733618834268185
11,0.6621119792442269
12,0.6675081493721615


#### Correlação de Variáveis

In [136]:
col_corr_graf =['orders_last_91d',
               'recency_months',
               #'days_to_reorder_at_datasource',
               #'days_to_reorder_at_concluded',
               'rfv_score',
               'recency_days',
               'recency_days_bucket',
               'freq_last_91d',
               'freq_last_91d_bucket',
               'avg_aov_last_91d',
               'maturity_orders',
               'maturity_orders_bucket',
               'benefits_sensitivity',
               'merchant_variety',
               'merchant_offer',
               'registration_month',
               #'registration_dayofweek',
               'first_order_month',
               #'first_order_dayofweek',
               'segmentation_month_month',
               #'segmentation_month_dayofweek',
               'last_order_month',
               #'last_order_dayofweek'
               ]

In [137]:
# Reference: https://stackoverflow.com/questions/55546467/how-to-plot-correlation-heatmap-when-using-pysparkdatabricks
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=col_corr_graf,
                            outputCol=vector_col)
myGraph_vector = assembler.transform(df_customer_segmentation).select(vector_col)
matrix = Correlation.corr(myGraph_vector, vector_col)

# Gera uma lista de listas representando uma matriz de correlacao
matrix = Correlation.corr(myGraph_vector, vector_col).collect()[0][0]
corrmatrix_orders = matrix.toArray().tolist()
df_corr_orders = spark.createDataFrame(corrmatrix_orders,col_corr_graf)

In [138]:
corrmatrix_orders = matrix.toArray().tolist()
df_corr_orders = spark.createDataFrame(corrmatrix_orders,col_corr_graf)
plot_corr_matrix(corrmatrix_orders,col_corr_graf,fig_no=234, figsize=[15,10])

### 3 and 4. Orders and Orders with Cost/Revenue

#### Join bases de Pedidos e Pedidos com Custo/Receita

In [141]:
df_orders_total = join_removing_repeated(df_orders, df_orders_with_cost_revenue, df_orders.order_number == df_orders_with_cost_revenue.order_number, 'left')
df_orders_total.cache()

In [142]:
# Sanity check para verificar se o join está correto:
print(df_orders.count())
print(df_orders_with_cost_revenue.count())
print(df_orders_total.count())

In [143]:
print('Orders with cost/revenue:     ', df_orders_total.count(), '   linhas e ', len(df_orders_total.columns), ' colunas')

#### Tratamento de Missings

Vemos que `customer_has_plus` tem muito missing, como esperado desde que esse é um teste. A abordagem usada vai ser definir `unknown` para as variáveis categóricas com missing. Para as numéricas, iremos adotar a média. Já para as númericas inteiras (ex: com buckets) vamos usar a média arredondada para o integer mais próximo.

In [146]:
colunas_orders_missing = percent_missing(df_orders_total).orderBy('perc_missing', ascending=False)
display(colunas_orders_missing)

variaveis,total_missing,perc_missing
customer_has_plus,291126,59.15535886275801
customer_seg_preferred_shift,195687,39.7626275556856
customer_seg_marlin_tag,60587,12.310977815165664
order_lead_at_login,13431,2.729112565987589
customer_seg_frequency_bucket,8417,1.7102926414948652
customer_seg_recency_bucket,8417,1.7102926414948652
customer_seg_benefits_sensitivity_bucket,8417,1.7102926414948652
session_id,8346,1.69586579374078
customer_seg_gross_income_bucket,8301,1.686722016995233
customer_seg_status_last_month,8301,1.686722016995233


In [147]:
# Para as colunas com missing, separar em categoricas e numericas
df_columns_orders_missing = colunas_orders_missing.filter(colunas_orders_missing['perc_missing'] > 0
                                                  ).select('variaveis')
list_df_columns_orders_missing = list([row[0] for row in df_columns_orders_missing.collect()])

filter_colunas_orders_numericas = [x[0] for x in df_orders_total[list_df_columns_orders_missing].dtypes if x[1] in ('double', 'long')]
filter_colunas_orders_numericas_int = [x[0] for x in df_orders_total[list_df_columns_orders_missing].dtypes if x[1] in ('int')]
filter_colunas_orders_categoricas = [x[0] for x in df_orders_total[list_df_columns_orders_missing].dtypes if x[1] not in ('double', 'int', 'long', 'boolean')]
filter_colunas_orders_booleanas = [x[0] for x in df_orders_total[list_df_columns_orders_missing].dtypes if x[1] in ('boolean')]

# Inputar unknown para colunas categoricas
for coluna in filter_colunas_orders_categoricas:
  df_orders_total = df_orders_total.fillna('unknown', subset=[coluna])

# Transformar colunas booleanas para string, depois inputar unknown
for coluna in filter_colunas_orders_booleanas:
  df_orders_total = df_orders_total.withColumn(coluna,col(coluna).cast('string'))
  df_orders_total = df_orders_total.fillna('unknown', subset=[coluna])

# Inputar media para colunas numericas
for coluna in filter_colunas_orders_numericas:
  media = df_orders_total.agg(mean(coluna)).collect()[0][0]
  df_orders_total = df_orders_total.fillna(media, subset=[coluna])
  
# Inputar media para colunas numericas inteiras arredondando para o integer mais perto
for coluna in filter_colunas_orders_numericas:
  media = df_orders_total.agg(mean(coluna)).collect()[0][0]
  df_orders_total = df_orders_total.fillna(media, subset=[coluna])
  df_orders_total = df_orders_total.withColumn(coluna, f.round(df_orders_total[coluna], 0))

In [148]:
len(list_df_columns_orders_missing)

In [149]:
len(filter_colunas_orders_categoricas)

In [150]:
len(filter_colunas_orders_numericas)

In [151]:
len(filter_colunas_orders_booleanas)

In [152]:
display(df_orders_total)

order_id,order_number,order_timestamp_local,order_shift,last_status_date_local,order_total,credit,paid_amount,delivery_type,scheduled,scheduled_creation_date_local,device_app_version,device_type,device_platform,payment_method,customer_state_label,customer_city,customer_district,customer_centroid_id,customer_has_plus,customer_seg_status_last_month,customer_seg_recency_bucket,customer_seg_frequency_bucket,customer_seg_merchant_offer_bucket,customer_seg_benefits_sensitivity_bucket,customer_seg_marlin_tag,customer_seg_gross_income_bucket,customer_seg_preferred_shift,frn_id,merchant_city,merchant_district,merchant_centroid_id,merchant_dish_type,distance_merchant_customer,promo_is_promotion,normal_items_quantity,promo_items_quantity,order_lag_at_login,order_lead_at_login,order_date_local,valid_order,session_id,customer_id,cohort_month,first_order_date,months_after_first_purchase,general_net_profit
56e7da7d-1019-4176-95c7-18677cf88138,667770135,2019-06-01T00:51:27.395Z,weekend dawn,2019-06-01T06:33:20.137Z,29.0,4.0,25.0,DELIVERY,False,2019-06-01T00:51:27.395Z,Android_iFood_8.48.3,MOBILE,ANDROID,OTHERS_OFFLINE,CE,FORTALEZA,Itaoca,-3.77-38.55,unknown,Active,2.0,4.0,5.0,Alta,3. Subsidy Carp,6.0,unknown,177457,FORTALEZA,Democrito Rocha,-3.76-38.57,Pizza,2265.780724667483,0.0,2.0,0.0,9.0,83.0,2019-06-01T00:00:00.000Z,1,abc8cd16-4b84-415c-8109-3a90ccf7ebcf,abd722bdf397dbb6781354d0731b03bca2070ee4d60134feb0ff9ecae3617b4d,2019-01-01T00:00:00.000Z,2019-01-28T00:00:00.000Z,5.0,6.704
327a5c8f-9502-4671-9993-972accb291f0,667772160,2019-06-01T00:56:32.740Z,weekend dawn,2019-06-01T06:41:05.017Z,34.9,4.0,30.9,DELIVERY,False,2019-06-01T00:56:32.740Z,iOS_iFood_9.2.0,MOBILE,IOS,CC_ONLINE,MG,BELO HORIZONTE,Olaria (barreiro),-19.99-44.03,unknown,Inactive,4.0,2.0,5.0,Media,4. Retention Carp,6.0,5. Jantar,260680,BELO HORIZONTE,Flavio Marques Lisboa (barreiro),-19.99-44.0,Sorvetes,3071.509982380759,0.0,1.0,0.0,69.0,1.0,2019-06-01T00:00:00.000Z,1,d3abd2cc-93a2-44f5-a39d-1d684cac763e,83b3746d937cb40cd850794804284d8950845be290aedb0f3d3896e41df3f8d8,2018-06-01T00:00:00.000Z,2018-06-17T00:00:00.000Z,12.0,7.61841875
454fde87-b7e5-4c67-a5d2-356aeb339a44,669393616,2019-06-01T11:47:20.979Z,weekend lunch,2019-06-01T12:18:53.882Z,55.9,4.0,51.9,DELIVERY,False,2019-06-01T11:47:20.979Z,iOS_iFood_9.2.0,MOBILE,IOS,CC_OFFLINE,DF,BRASILIA,Asa Sul,-15.81-47.91,unknown,Active,3.0,5.0,5.0,Media,3. Subsidy Carp,6.0,unknown,211400,BRASILIA,Setor De Habitacoes Individuais Sul,-15.85-47.89,Marmita,4371.288354253842,1.0,0.0,2.0,18.0,2.0,2019-06-01T00:00:00.000Z,1,ac32568d-ebc6-4a6b-b801-87c50cb8da59,a63624c4d82738687bee190af3dd6e534007ee451ae681ee91c4984b4c95d905,2017-07-01T00:00:00.000Z,2017-07-18T00:00:00.000Z,23.0,1.565851111837798
5c007a8b-6b22-461f-9919-2b7754c4e72d,669406022,2019-06-01T11:59:24.212Z,weekend lunch,2019-06-01T14:02:49.540Z,30.5,4.0,26.5,DELIVERY,False,2019-06-01T11:59:24.212Z,Android_iFood_9.3.1,MOBILE,ANDROID,OTHERS_OFFLINE,SC,SAO JOSE,Rocado,-27.59-48.62,unknown,Active,1.0,3.0,5.0,Alta,2. Tilapia,6.0,unknown,184913,SAO JOSE,Kobrasol,-27.6-48.61,Comida Brasileira,903.747462995218,0.0,1.0,0.0,8.0,12.0,2019-06-01T00:00:00.000Z,1,9dbb825d-2a32-4d2d-9d67-b917253dd028,9bc258be21067f360696eb3aae36f1c6d902ad2f98c2e59ceeee11d5da830d17,2018-01-01T00:00:00.000Z,2018-01-12T00:00:00.000Z,17.0,6.86375
4b8c13a8-df68-40bb-a9a4-2ea60f7000b3,669484170,2019-06-01T13:00:25.188Z,weekend lunch,2019-06-01T13:40:27.776Z,67.9,4.0,63.9,DELIVERY,False,2019-06-01T13:00:25.188Z,iOS_iFood_9.2.0,MOBILE,IOS,CC_ONLINE,RJ,RIO DE JANEIRO,Copacabana,-22.99-43.19,unknown,Active,4.0,3.0,5.0,Alta,2. Tilapia,6.0,unknown,224519,RIO DE JANEIRO,Leblon,-22.98-43.22,Carnes,3316.477781154116,0.0,1.0,0.0,30.0,6.0,2019-06-01T00:00:00.000Z,1,606080a2-4523-4dbb-8066-a3db8f4dcf4f,6f5ae153404c42fbab723806b3f1a0bf80ba303ca62b72658b715c09406f94b4,2013-05-01T00:00:00.000Z,2013-05-14T00:00:00.000Z,73.0,7.033049999999999
285b824a-c7ff-4bd0-8919-ccdd72f14e8d,669651996,2019-06-01T16:25:09.268Z,weekend snack,2019-06-01T18:25:26.004Z,36.9,4.0,32.9,DELIVERY,False,2019-06-01T16:25:09.268Z,Android_iFood_9.3.1,MOBILE,ANDROID,OTHERS_OFFLINE,MG,CONTAGEM,Jardim Industrial,-19.97-44.02,unknown,Churn,4.0,2.0,5.0,Alta,4. Retention Carp,6.0,unknown,151095,CONTAGEM,Industrial,-19.97-44.03,Açaí,1483.9599361274786,0.0,1.0,0.0,49.0,91.0,2019-06-01T00:00:00.000Z,1,aab04da9-565b-4a2f-b258-c5c73c96b526,356c80f8d58fc48a17fe4c59631cdf51b93d48c1d05fb9ed5621e6a36aeb4992,2018-04-01T00:00:00.000Z,2018-04-23T00:00:00.000Z,14.0,7.54535
1acb0253-aa32-4b60-b4c4-c2c09ee02784,669790559,2019-06-01T18:57:09.604Z,weekend dinner,2019-06-01T21:02:17.272Z,91.0,4.0,87.0,DELIVERY,False,2019-06-01T18:57:09.604Z,Android_iFood_9.3.1,MOBILE,ANDROID,OTHERS_OFFLINE,SP,SAO PAULO,Santana,-23.49-46.63,unknown,Active,1.0,4.0,5.0,Alta,1. Marlin,6.0,5. Jantar,82540,SAO PAULO,Santana,-23.48-46.62,Pizza,1235.4832476031324,0.0,2.0,0.0,7.0,7.0,2019-06-01T00:00:00.000Z,1,29ec3d93-cb1e-4d97-b02f-6ec3fd334556,18632d6ff5223c77505d3fd0735ed6a5cda7741a90cd432380c7bf068cebc5a2,2017-06-01T00:00:00.000Z,2017-06-18T00:00:00.000Z,24.0,13.307
89d77682-a0fd-4ee2-aef2-ca6ddf98b511,669822779,2019-06-01T19:11:25.212Z,weekend dinner,2019-06-01T21:24:23.979Z,103.5,4.0,99.5,DELIVERY,False,2019-06-01T19:11:25.212Z,iOS_iFood_9.2.0,MOBILE,IOS,CC_ONLINE,GO,GOIANIA,Setor Oeste,-16.69-49.26,unknown,Active,1.0,3.0,5.0,Alta,2. Tilapia,6.0,5. Jantar,46254,GOIANIA,Setor Bueno,-16.69-49.27,Lanches,1510.6528258060157,0.0,5.0,0.0,6.0,11.0,2019-06-01T00:00:00.000Z,1,3d9b96c8-9f92-4511-8977-9eb14921c561,4e1a1da742c3e38f5bdf3ad0197f55ba955fe93a17cee91655e07c27bcb39cdf,2016-06-01T00:00:00.000Z,2016-06-26T00:00:00.000Z,36.0,15.82040625
ce7b6404-6f6b-49bd-a0bd-89182e48b0f8,669848348,2019-06-01T19:21:35.701Z,weekend dinner,2019-06-01T21:42:36.245Z,87.9,4.0,83.9,DELIVERY,False,2019-06-01T19:21:35.701Z,Android_iFood_9.3.1,MOBILE,ANDROID,CC_OFFLINE,SP,CAMPINAS,Parque Residencial Vila Uniao,-22.95-47.12,unknown,Churn,4.0,2.0,5.0,Baixa,4. Retention Carp,6.0,5. Jantar,28436,CAMPINAS,Parque Residencial Vila Uniao,-22.95-47.12,Pizza,480.7451817927033,0.0,1.0,0.0,56.0,28.0,2019-06-01T00:00:00.000Z,1,fead448c-679c-4620-a4a1-733e384e15e4,fade6a1f163d8dd1626d2b24ab4b482d42d20bcddfc4fd311a6791efc3b46d9f,2016-12-01T00:00:00.000Z,2016-12-07T00:00:00.000Z,30.0,12.97685
9dd560ff-eabe-4ed3-a676-7819b055cfb5,669894756,2019-06-01T19:39:18.514Z,weekend dinner,2019-06-01T20:14:07.204Z,63.0,4.0,59.0,DELIVERY,False,2019-06-01T19:39:18.514Z,iOS_iFood_9.2.0,MOBILE,IOS,CC_ONLINE,SP,SAO BERNARDO DO CAMPO,Jardim Do Mar,-23.69-46.56,unknown,Active,1.0,3.0,5.0,Alta,4. Retention Carp,6.0,5. Jantar,146725,SAO BERNARDO DO CAMPO,Jardim Do Mar,-23.69-46.56,Lanches,368.5018663540701,0.0,2.0,0.0,7.0,15.0,2019-06-01T00:00:00.000Z,1,5eaf1f75-dab6-48ea-bbae-d791339d09e0,4a90569892976fbc7845a84f9af03afd70a6194c0653ab836674cee3e06dc4a6,2016-05-01T00:00:00.000Z,2016-05-24T00:00:00.000Z,37.0,10.006950837929567


#### Tratamento de Datas

In [154]:
# importante: transformar primeiro para timestamp, depois para date!!!
df_orders_total = df_orders_total.withColumn('order_timestamp_local', from_utc_timestamp('order_timestamp_local', 'UTC'))\
                                 .withColumn('last_status_date_local', from_utc_timestamp('last_status_date_local', 'UTC'))\
                                 .withColumn('scheduled_creation_date_local', from_utc_timestamp('scheduled_creation_date_local', 'UTC'))\
                                 .withColumn('order_date_local', from_utc_timestamp('order_date_local', 'UTC'))\
                                 .withColumn('cohort_month', from_utc_timestamp('cohort_month', 'UTC'))\
                                 .withColumn('first_order_date', from_utc_timestamp('first_order_date', 'UTC'))\
                                 .withColumn('order_date_local', to_date('order_date_local', 'YYYY-MM-DD'))\
                                 .withColumn('cohort_month', to_date('cohort_month', 'YYYY-MM-DD'))\
                                 .withColumn('first_order_date', to_date('first_order_date', 'YYYY-MM-DD'))\
                                 .withColumn('order_date_local_month', month('order_date_local'))\
                                 .withColumn('order_date_local_dayofweek', dayofweek('order_date_local'))
display(df_orders_total)

order_id,order_number,order_timestamp_local,order_shift,last_status_date_local,order_total,credit,paid_amount,delivery_type,scheduled,scheduled_creation_date_local,device_app_version,device_type,device_platform,payment_method,customer_state_label,customer_city,customer_district,customer_centroid_id,customer_has_plus,customer_seg_status_last_month,customer_seg_recency_bucket,customer_seg_frequency_bucket,customer_seg_merchant_offer_bucket,customer_seg_benefits_sensitivity_bucket,customer_seg_marlin_tag,customer_seg_gross_income_bucket,customer_seg_preferred_shift,frn_id,merchant_city,merchant_district,merchant_centroid_id,merchant_dish_type,distance_merchant_customer,promo_is_promotion,normal_items_quantity,promo_items_quantity,order_lag_at_login,order_lead_at_login,order_date_local,valid_order,session_id,customer_id,cohort_month,first_order_date,months_after_first_purchase,general_net_profit,order_date_local_month,order_date_local_dayofweek
56e7da7d-1019-4176-95c7-18677cf88138,667770135,2019-06-01T00:51:27.395+0000,weekend dawn,2019-06-01T06:33:20.137+0000,29.0,4.0,25.0,DELIVERY,False,2019-06-01T00:51:27.395+0000,Android_iFood_8.48.3,MOBILE,ANDROID,OTHERS_OFFLINE,CE,FORTALEZA,Itaoca,-3.77-38.55,unknown,Active,2.0,4.0,5.0,Alta,3. Subsidy Carp,6.0,unknown,177457,FORTALEZA,Democrito Rocha,-3.76-38.57,Pizza,2265.780724667483,0.0,2.0,0.0,9.0,83.0,2019-06-01,1,abc8cd16-4b84-415c-8109-3a90ccf7ebcf,abd722bdf397dbb6781354d0731b03bca2070ee4d60134feb0ff9ecae3617b4d,2019-01-01,2019-01-28,5.0,6.704,6,7
327a5c8f-9502-4671-9993-972accb291f0,667772160,2019-06-01T00:56:32.740+0000,weekend dawn,2019-06-01T06:41:05.017+0000,34.9,4.0,30.9,DELIVERY,False,2019-06-01T00:56:32.740+0000,iOS_iFood_9.2.0,MOBILE,IOS,CC_ONLINE,MG,BELO HORIZONTE,Olaria (barreiro),-19.99-44.03,unknown,Inactive,4.0,2.0,5.0,Media,4. Retention Carp,6.0,5. Jantar,260680,BELO HORIZONTE,Flavio Marques Lisboa (barreiro),-19.99-44.0,Sorvetes,3071.509982380759,0.0,1.0,0.0,69.0,1.0,2019-06-01,1,d3abd2cc-93a2-44f5-a39d-1d684cac763e,83b3746d937cb40cd850794804284d8950845be290aedb0f3d3896e41df3f8d8,2018-06-01,2018-06-17,12.0,7.61841875,6,7
454fde87-b7e5-4c67-a5d2-356aeb339a44,669393616,2019-06-01T11:47:20.979+0000,weekend lunch,2019-06-01T12:18:53.882+0000,55.9,4.0,51.9,DELIVERY,False,2019-06-01T11:47:20.979+0000,iOS_iFood_9.2.0,MOBILE,IOS,CC_OFFLINE,DF,BRASILIA,Asa Sul,-15.81-47.91,unknown,Active,3.0,5.0,5.0,Media,3. Subsidy Carp,6.0,unknown,211400,BRASILIA,Setor De Habitacoes Individuais Sul,-15.85-47.89,Marmita,4371.288354253842,1.0,0.0,2.0,18.0,2.0,2019-06-01,1,ac32568d-ebc6-4a6b-b801-87c50cb8da59,a63624c4d82738687bee190af3dd6e534007ee451ae681ee91c4984b4c95d905,2017-07-01,2017-07-18,23.0,1.565851111837798,6,7
5c007a8b-6b22-461f-9919-2b7754c4e72d,669406022,2019-06-01T11:59:24.212+0000,weekend lunch,2019-06-01T14:02:49.540+0000,30.5,4.0,26.5,DELIVERY,False,2019-06-01T11:59:24.212+0000,Android_iFood_9.3.1,MOBILE,ANDROID,OTHERS_OFFLINE,SC,SAO JOSE,Rocado,-27.59-48.62,unknown,Active,1.0,3.0,5.0,Alta,2. Tilapia,6.0,unknown,184913,SAO JOSE,Kobrasol,-27.6-48.61,Comida Brasileira,903.747462995218,0.0,1.0,0.0,8.0,12.0,2019-06-01,1,9dbb825d-2a32-4d2d-9d67-b917253dd028,9bc258be21067f360696eb3aae36f1c6d902ad2f98c2e59ceeee11d5da830d17,2018-01-01,2018-01-12,17.0,6.86375,6,7
4b8c13a8-df68-40bb-a9a4-2ea60f7000b3,669484170,2019-06-01T13:00:25.188+0000,weekend lunch,2019-06-01T13:40:27.776+0000,67.9,4.0,63.9,DELIVERY,False,2019-06-01T13:00:25.188+0000,iOS_iFood_9.2.0,MOBILE,IOS,CC_ONLINE,RJ,RIO DE JANEIRO,Copacabana,-22.99-43.19,unknown,Active,4.0,3.0,5.0,Alta,2. Tilapia,6.0,unknown,224519,RIO DE JANEIRO,Leblon,-22.98-43.22,Carnes,3316.477781154116,0.0,1.0,0.0,30.0,6.0,2019-06-01,1,606080a2-4523-4dbb-8066-a3db8f4dcf4f,6f5ae153404c42fbab723806b3f1a0bf80ba303ca62b72658b715c09406f94b4,2013-05-01,2013-05-14,73.0,7.033049999999999,6,7
285b824a-c7ff-4bd0-8919-ccdd72f14e8d,669651996,2019-06-01T16:25:09.268+0000,weekend snack,2019-06-01T18:25:26.004+0000,36.9,4.0,32.9,DELIVERY,False,2019-06-01T16:25:09.268+0000,Android_iFood_9.3.1,MOBILE,ANDROID,OTHERS_OFFLINE,MG,CONTAGEM,Jardim Industrial,-19.97-44.02,unknown,Churn,4.0,2.0,5.0,Alta,4. Retention Carp,6.0,unknown,151095,CONTAGEM,Industrial,-19.97-44.03,Açaí,1483.9599361274786,0.0,1.0,0.0,49.0,91.0,2019-06-01,1,aab04da9-565b-4a2f-b258-c5c73c96b526,356c80f8d58fc48a17fe4c59631cdf51b93d48c1d05fb9ed5621e6a36aeb4992,2018-04-01,2018-04-23,14.0,7.54535,6,7
1acb0253-aa32-4b60-b4c4-c2c09ee02784,669790559,2019-06-01T18:57:09.604+0000,weekend dinner,2019-06-01T21:02:17.272+0000,91.0,4.0,87.0,DELIVERY,False,2019-06-01T18:57:09.604+0000,Android_iFood_9.3.1,MOBILE,ANDROID,OTHERS_OFFLINE,SP,SAO PAULO,Santana,-23.49-46.63,unknown,Active,1.0,4.0,5.0,Alta,1. Marlin,6.0,5. Jantar,82540,SAO PAULO,Santana,-23.48-46.62,Pizza,1235.4832476031324,0.0,2.0,0.0,7.0,7.0,2019-06-01,1,29ec3d93-cb1e-4d97-b02f-6ec3fd334556,18632d6ff5223c77505d3fd0735ed6a5cda7741a90cd432380c7bf068cebc5a2,2017-06-01,2017-06-18,24.0,13.307,6,7
89d77682-a0fd-4ee2-aef2-ca6ddf98b511,669822779,2019-06-01T19:11:25.212+0000,weekend dinner,2019-06-01T21:24:23.979+0000,103.5,4.0,99.5,DELIVERY,False,2019-06-01T19:11:25.212+0000,iOS_iFood_9.2.0,MOBILE,IOS,CC_ONLINE,GO,GOIANIA,Setor Oeste,-16.69-49.26,unknown,Active,1.0,3.0,5.0,Alta,2. Tilapia,6.0,5. Jantar,46254,GOIANIA,Setor Bueno,-16.69-49.27,Lanches,1510.6528258060157,0.0,5.0,0.0,6.0,11.0,2019-06-01,1,3d9b96c8-9f92-4511-8977-9eb14921c561,4e1a1da742c3e38f5bdf3ad0197f55ba955fe93a17cee91655e07c27bcb39cdf,2016-06-01,2016-06-26,36.0,15.82040625,6,7
ce7b6404-6f6b-49bd-a0bd-89182e48b0f8,669848348,2019-06-01T19:21:35.701+0000,weekend dinner,2019-06-01T21:42:36.245+0000,87.9,4.0,83.9,DELIVERY,False,2019-06-01T19:21:35.701+0000,Android_iFood_9.3.1,MOBILE,ANDROID,CC_OFFLINE,SP,CAMPINAS,Parque Residencial Vila Uniao,-22.95-47.12,unknown,Churn,4.0,2.0,5.0,Baixa,4. Retention Carp,6.0,5. Jantar,28436,CAMPINAS,Parque Residencial Vila Uniao,-22.95-47.12,Pizza,480.7451817927033,0.0,1.0,0.0,56.0,28.0,2019-06-01,1,fead448c-679c-4620-a4a1-733e384e15e4,fade6a1f163d8dd1626d2b24ab4b482d42d20bcddfc4fd311a6791efc3b46d9f,2016-12-01,2016-12-07,30.0,12.97685,6,7
9dd560ff-eabe-4ed3-a676-7819b055cfb5,669894756,2019-06-01T19:39:18.514+0000,weekend dinner,2019-06-01T20:14:07.204+0000,63.0,4.0,59.0,DELIVERY,False,2019-06-01T19:39:18.514+0000,iOS_iFood_9.2.0,MOBILE,IOS,CC_ONLINE,SP,SAO BERNARDO DO CAMPO,Jardim Do Mar,-23.69-46.56,unknown,Active,1.0,3.0,5.0,Alta,4. Retention Carp,6.0,5. Jantar,146725,SAO BERNARDO DO CAMPO,Jardim Do Mar,-23.69-46.56,Lanches,368.5018663540701,0.0,2.0,0.0,7.0,15.0,2019-06-01,1,5eaf1f75-dab6-48ea-bbae-d791339d09e0,4a90569892976fbc7845a84f9af03afd70a6194c0653ab836674cee3e06dc4a6,2016-05-01,2016-05-24,37.0,10.006950837929567,6,7


#### Exploração Ordens

Temos +30k clientes que fizeram pedidos no periodo analisado.

In [157]:
df_orders_total.select(f.countDistinct('customer_id')).show()

`74%` dos clientes fizeram entre 0 e 10 pedidos. `91%` fizeram até 20 pedidos.

In [159]:
display(df_orders_total.groupBy('customer_id')\
                           .count()\
                           .withColumnRenamed('count', 'number_orders'))

customer_id,number_orders
8e5f08475a5a81e15af97e2d9b2f49e0d795165f2d92dab2868c3b975ad075e8,32
e598f9ab70ce2aa561bf6f58adee027e17fe361722c3682caa2e15cac959f973,110
03f34adb449900100080d2defbf63b44e86e4632754e3d22720c99b95a318116,47
0087f7139bba1b717477767e6b0c4e08e0de5138c48a58bf34e3672fdb59c32a,18
a2332388f4a574f90b0bcf93cd4f37ec9902222217d02b7516f93e9b1ec2bab6,5
c093589580243a2b025f35cf3039a9952ea6be8ee2af4dc119d4f27da4578f08,14
e01a6c7a8339f7951bcf3d62dd5aaced75c48210489f15b6c32b6e8b52e40d49,68
f92562aaf4bfdfcd0576ea920d2d65a215dcabf4250739f26ffe8c32c3524f6b,8
39263b649334376ac6d6b9ec955a2f23e3f43f9986e5802a3da391469e49d739,26
8d2898750cf496417354a7da91095a7f1483da777e117df21d49e4d8fdf4f2f3,75


In [160]:
avg_order = df_orders_total.groupBy('customer_id')\
                           .count()\
                           .withColumnRenamed('count', 'number_orders')
avg_order = avg_order.withColumn('orders_range', order_range(avg_order.number_orders))\
                     .orderBy('orders_range', asc=True)

display(avg_order)

customer_id,number_orders,orders_range
1226f2861f1c3e97f12133da94fe3007c8654f2ecfd307d78ed539bb6a2fc129,2,1- < 5
cae6670d1e0b6d17366fbe8976045fffa8871558af1f0b525d9995ae8811b6c3,4,1- < 5
e27807191cabd1e818a151299aeb3353013956ae83887082551148ee6924f22c,1,1- < 5
5600ce927ea122946d8861a9397156d4a89fb2647d8bf82c2daec3238845ea4a,3,1- < 5
58fe654421ef13d2a1b9071f5f2b64fe90212e68441c62b1170de1bbe11e720c,1,1- < 5
e1af52e094fc45bcb0cbbb2a04a0a5bcd6244c39b85329bfe6b5d9d9ec56bc3c,1,1- < 5
3c8757f078ee76a46c0036ba1b7f3cb623120368033152253c49c0af9fad88ed,4,1- < 5
78e26a93b4badc9550d5a8d85a70e4d4b8b2105d491b54562146918cbf54b567,1,1- < 5
40fb865f9aea9dc52872c4fa68f24bd68786696540287b24c595de3f7347f37d,3,1- < 5
e702f47372b0db2abb5ddb06990d3d83e2fed434c08f69b745a5e4e1fdbc4755,3,1- < 5


Os `+30k` clientes fizeram `+492k` pedidos. A média de pedidos por cliente foi de `16.4`.

In [162]:
chosen_column = 'number_orders'
exprrs = [sum(chosen_column), max(chosen_column), min(chosen_column), stddev(chosen_column), mean(chosen_column)]
display(avg_order.agg(*exprrs))

sum(number_orders),max(number_orders),min(number_orders),stddev_samp(number_orders),avg(number_orders)
492138,421,1,21.06712970559962,16.36151467801456


#### Variáveis Qualitativas

In [164]:
display(percentByCol(df_orders_total, 'device_app_version'))

device_app_version,cnt_per_group,percent
9.30.0,21766,4.42274321430168
Android_iFood_9.4.1,18141,3.686161198688173
9.27.0,15499,3.14931990620517
iOS_iFood_9.5.0,15345,3.1180278702315203
Android_iFood_9.8.0,12161,2.471054866724374
Android_iFood_9.20.0,12148,2.468413331220105
iOS_iFood_9.17.1,11089,2.253229785141566
9.28.1,10916,2.2180770434309074
iOS_iFood_9.18.0,10897,2.214216337693899
iOS_iFood_9.8.1,10574,2.14858434016475


In [165]:
# 97% dos pedidos são realizados via celular.
display(percentByCol(df_orders_total, 'device_type'))

device_type,cnt_per_group,percent
MOBILE,479398,97.41129520581626
SITE,12739,2.588501599144956
unknown,1,0.0002031950387899329


In [166]:
# Share de pedidos retirados nos restaurantes ainda é bem baixo: 0.33%
display(percentByCol(df_orders_total, 'delivery_type'))

delivery_type,cnt_per_group,percent
DELIVERY,490513,99.66980806196636
TAKEOUT,1625,0.330191938033641


In [167]:
# 98% dos pedidos não são agendados
display(percentByCol(df_orders_total, 'scheduled'))

scheduled,cnt_per_group,percent
False,485424,98.6357485095644
True,6714,1.3642514904356096


In [168]:
# 55% dos pedidos foram feitos em celular android. 42% em IOS: A penetração de mercado em iphone é elevada.
display(percentByCol(df_orders_total, 'device_platform'))

device_platform,cnt_per_group,percent
ANDROID,274822,55.842466950326944
IOS,207432,42.14915328627336
DESKTOP,9883,2.008176568360907
OTHER,1,0.0002031950387899329


#### Variáveis quantitativas: medidas de posição, dispersão

A média do valor dos pedidos foi de R$55.8, com um desconto médio de R$6.25/pedido. O lucro médio por pedido foi de R$7.39 (14.7% do valor pago [7.39/50.32])

32% dos pedidos foram feitos com promoção.

In [172]:
# Estatistica basica variaveis quantitativas
aux = []
for nome, tipo in df_orders_total.dtypes:
  if tipo in ('double', 'int'):
    aux.append(nome)
    
display(df_orders_total[aux].describe())

summary,order_total,credit,paid_amount,customer_seg_recency_bucket,customer_seg_frequency_bucket,customer_seg_merchant_offer_bucket,customer_seg_gross_income_bucket,distance_merchant_customer,promo_is_promotion,normal_items_quantity,promo_items_quantity,order_lag_at_login,order_lead_at_login,months_after_first_purchase,general_net_profit,order_date_local_month,order_date_local_dayofweek
count,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0
mean,55.81699667572885,6.25805912162847,50.32678909167726,1.6588030186654965,3.440977937082688,4.730628400976962,5.96455872133426,2781.805529506255,0.3225416448232,1.5838321771535626,0.4712255505569576,10.915743145215366,9.73069748729015,24.335806216955408,7.395985463685502,8.851121027029004,4.093918778879095
stddev,31.489082445695136,4.44324696730593,32.23955190297728,1.0522834901845284,1.1481918207385826,0.6013609258606312,0.3816573759170446,50211.794754761584,0.4674494370275182,2.1554135665166045,1.3568022095589585,33.392796628430744,18.139347606284343,17.937811997715375,6.81242294289999,2.045058398783173,2.1072029289648144
min,13.0,4.0,9.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1994.034375,6.0,1.0
max,1641.0,84.0,1637.0,5.0,5.0,6.0,6.0,11974527.883558271,1.0,233.0,105.0,2614.0,247.0,95.0,178.382,12.0,7.0


In [173]:
display(percentByCol(df_orders_total, 'promo_is_promotion'))

promo_is_promotion,cnt_per_group,percent
0.0,333403,67.74583551768
1.0,158735,32.25416448232


In [174]:
display(df_orders_total[aux].summary())

summary,order_total,credit,paid_amount,customer_seg_recency_bucket,customer_seg_frequency_bucket,customer_seg_merchant_offer_bucket,customer_seg_gross_income_bucket,distance_merchant_customer,promo_is_promotion,normal_items_quantity,promo_items_quantity,order_lag_at_login,order_lead_at_login,months_after_first_purchase,general_net_profit,order_date_local_month,order_date_local_dayofweek
count,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0,492138.0
mean,55.81699667572885,6.25805912162847,50.32678909167726,1.6588030186654965,3.440977937082688,4.730628400976962,5.96455872133426,2781.805529506255,0.3225416448232,1.5838321771535626,0.4712255505569576,10.915743145215366,9.73069748729015,24.335806216955408,7.395985463685502,8.851121027029004,4.093918778879095
stddev,31.489082445695136,4.44324696730593,32.23955190297728,1.0522834901845284,1.1481918207385826,0.6013609258606312,0.3816573759170446,50211.794754761584,0.4674494370275182,2.1554135665166045,1.3568022095589585,33.392796628430744,18.139347606284343,17.937811997715375,6.81242294289999,2.045058398783173,2.1072029289648144
min,13.0,4.0,9.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1994.034375,6.0,1.0
25%,35.9,4.0,30.0,1.0,3.0,5.0,6.0,985.6013771070672,0.0,1.0,0.0,1.0,1.0,9.0,6.08315,7.0,2.0
50%,47.7,4.0,42.51,1.0,4.0,5.0,6.0,1893.466744055135,0.0,1.0,0.0,4.0,4.0,21.0,7.9758125,9.0,4.0
75%,66.0,4.0,61.6,2.0,4.0,5.0,6.0,3146.908372031764,1.0,2.0,1.0,10.0,10.0,36.0,10.16984375,11.0,6.0
max,1641.0,84.0,1637.0,5.0,5.0,6.0,6.0,11974527.883558271,1.0,233.0,105.0,2614.0,247.0,95.0,178.382,12.0,7.0


#### Correlação Variáveis

In [176]:
# recupera colunas numericas
filter_colunas_order = [x[0] for x in df_orders_total.dtypes if x[1] in ('double', 'int', 'long')]
filter_colunas_order =['order_total',
                       'credit',
                       'paid_amount',
                       'distance_merchant_customer',
                       'promo_is_promotion',
                       'normal_items_quantity',
                       'promo_items_quantity',
                       'order_lag_at_login',
                       'order_lead_at_login',
                       'months_after_first_purchase',
                       'general_net_profit']

In [177]:
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# Reference: https://stackoverflow.com/questions/55546467/how-to-plot-correlation-heatmap-when-using-pysparkdatabricks
vector_col = "corr_features"
assembler = VectorAssembler(inputCols=filter_colunas_order,
                            outputCol=vector_col)
myGraph_vector = assembler.transform(df_orders_total).select(vector_col)
matrix = Correlation.corr(myGraph_vector, vector_col)

# Gera uma lista de listas representando uma matriz de correlacao
matrix = Correlation.corr(myGraph_vector, vector_col).collect()[0][0]
corrmatrix_orders = matrix.toArray().tolist()
df_corr_orders = spark.createDataFrame(corrmatrix_orders,filter_colunas_order)

`order_total` é altamente correlacionada com o `paid_amount`, como esperado. Podemos ver também uma correlação negativa entre `general_net_profit` e `order_total`.

Interessante observar também a correlação entre `promo_is_promotion` com `normal_items_quantity`.

In [179]:
plot_corr_matrix(corrmatrix_orders,filter_colunas_order,fig_no=234, figsize=[15,10])

### 5. Visits

#### Tratamento de Missings

In [182]:
colunas_visits_missing = percent_missing(df_sessions_visits).orderBy('perc_missing', ascending=False)
display(colunas_visits_missing)

variaveis,total_missing,perc_missing
media_network,453445,97.80531469738148
user_account_uuid,24917,5.374444588240369
user_identifier,24916,5.374228894353134
sum_view_restaurant_screen,15264,3.2923514947586385
sum_click_add_item,7069,1.5247400888658815
sum_view_dish_screen,5730,1.235925973857901
first_order_origin_feature,2214,0.477546266338812
session_ended_at_utc0,2077,0.4479962037875846
session_duration_seconds,2077,0.4479962037875846
session_ended_at_amsp,2077,0.4479962037875846


In [183]:
# Para as colunas com missing, separar em categoricas e numericas
df_columns_visits_missing = colunas_visits_missing.filter(colunas_visits_missing['perc_missing'] > 0
                                                  ).select('variaveis')
list_df_columns_visits_missing = list([row[0] for row in df_columns_visits_missing.collect()])

filter_colunas_visits_numericas = [x[0] for x in df_sessions_visits[list_df_columns_visits_missing].dtypes if x[1] in ('double', 'long')]
filter_colunas_visits_numericas_int = [x[0] for x in df_sessions_visits[list_df_columns_visits_missing].dtypes if x[1] in ('int')]
filter_colunas_visits_categoricas = [x[0] for x in df_sessions_visits[list_df_columns_visits_missing].dtypes if x[1] not in ('double', 'int', 'long', 'boolean')]
filter_colunas_visits_booleanas = [x[0] for x in df_sessions_visits[list_df_columns_visits_missing].dtypes if x[1] in ('boolean')]

# Inputar unknown para colunas categoricas
for coluna in filter_colunas_visits_categoricas:
  df_sessions_visits = df_sessions_visits.fillna('unknown', subset=[coluna])

# Transformar colunas booleanas para string, depois inputar unknown
for coluna in filter_colunas_visits_booleanas:
  df_sessions_visits = df_sessions_visits.withColumn(coluna,col(coluna).cast('string'))
  df_sessions_visits = df_sessions_visits.fillna('unknown', subset=[coluna])

# Inputar media para colunas numericas
for coluna in filter_colunas_visits_numericas:
  media = df_sessions_visits.agg(mean(coluna)).collect()[0][0]
  df_sessions_visits = df_sessions_visits.fillna(media, subset=[coluna])
  
# Inputar media para colunas numericas inteiras arredondando para o integer mais perto
for coluna in filter_colunas_visits_numericas:
  media = df_orders_total.agg(mean(coluna)).collect()[0][0]
  df_sessions_visits = df_sessions_visits.fillna(media, subset=[coluna])
  df_sessions_visits = df_sessions_visits.withColumn(coluna, f.round(df_sessions_visits[coluna], 0))

In [184]:
len(list_df_columns_visits_missing)

In [185]:
len(filter_colunas_visits_categoricas)

In [186]:
len(filter_colunas_visits_numericas)

In [187]:
display(df_sessions_visits)

session_id,dau,platform,user_identifier,user_account_uuid,session_started_at_amsp,session_ended_at_amsp,session_started_at_utc0,session_ended_at_utc0,session_duration_seconds,device_model,device_manufacturer,sum_event_open,sum_view_restaurant_screen,sum_view_dish_screen,sum_click_add_item,sum_view_checkout,sum_callback_purchase,order_session_quantity,first_order_origin_feature,media_network
41162808-5335-4e4f-aa52-45754ec1d558,2019-11-04_661ead80-50a2-48b4-ae78-f872e2f08c11,IOS,ffa840ebe8b397e995245525ec98f0b31a882c4d3e47dc0e91d3115cd20d6d80,ca438b60-3d04-44c0-9883-c4f01fc512ad,2019-11-04T09:14:47.566Z,2019-11-04T09:15:50.323Z,2019-11-04T11:14:47.566Z,2019-11-04T11:15:50.323Z,63.0,"iPhone9,3",Apple,1.0,8.933784760324386,1.0,1.0,1.0,1.0,1.0,Loop,unknown
52f0a543-fcc7-4ff0-bed1-41bec8a51374,2019-12-07_17ae6c54-37e1-42e3-b82b-e562aaeb7730,IOS,a59ea42fbbaf0fa9a48b65f45842d9d061d23775ad2af61c28d1235b5510ae68,414f1e51-e872-42ca-ad5e-c30b085bd5e4,2019-12-07T13:28:37.764Z,2019-12-07T15:04:00.819Z,2019-12-07T15:28:37.764Z,2019-12-07T17:04:00.819Z,5723.0,"iPhone9,3",Apple,17.0,12.0,3.0,1.0,2.0,1.0,1.0,Last Restaurants,unknown
f06f2a6d-14cc-44e3-8ec6-66191659430b,2019-12-29_d8a7f51d-4394-3c9a-ae62-8522f5885d3f,ANDROID,e7dcbc11f83b22ea57ec8b385b9fa38d48532b180e3d19cbde6d4766687a48b9,5c697207-d8a4-460e-b171-3e94e42be5ab,2019-12-29T19:58:38.297Z,2019-12-29T20:17:45.199Z,2019-12-29T22:58:38.297Z,2019-12-29T23:17:45.199Z,1147.0,Redmi Note 8,Xiaomi,2.0,4.0,7.0,7.0,5.0,1.0,1.0,Ranking,unknown
716d0044-0d76-4199-86ca-62930a751d59,2019-12-03_9675dcd9-9dd5-3ec3-809e-9006b4fac160,ANDROID,2959b13e1caef84fa6cde74693ba2476f17eaa72b66dad051b74924762da14c7,615b2fb5-e76d-4fa7-b982-9564650c76c5,2019-12-03T14:54:14.621Z,2019-12-03T16:31:47.527Z,2019-12-03T16:54:14.621Z,2019-12-03T18:31:47.527Z,5853.0,SM-A520F,samsung,11.0,1.0,1.0,1.0,3.0,1.0,1.0,Taxa na faixa,unknown
96081494-8554-4c3f-92b6-94e2070d1660,2019-12-20_e953c804-ecd2-3f00-8465-0ba2d8ce5aa1,ANDROID,9c944018e8b8cd2ac2fec6edf1f46c3c452c4e70c46e39a89cc37430edf01ad7,ba8596c4-9fbf-4f67-9e2a-dbb65f4b7c40,2019-12-20T20:28:22.584Z,2019-12-20T20:54:25.376Z,2019-12-20T23:28:22.584Z,2019-12-20T23:54:25.376Z,1563.0,SM-J710MN,samsung,4.0,4.0,9.0,2.0,2.0,1.0,1.0,CRM Restaurant,unknown
85091579-c5e8-474c-9b77-7129564141d9,2019-08-09_37124edd-a771-41bc-95bd-e0d607863f6d,ANDROID,c447b075-5080-4ea9-99eb-1119973e9e08,c447b075-5080-4ea9-99eb-1119973e9e08,2019-08-09T20:25:25.025Z,2019-08-09T21:22:41.041Z,2019-08-09T23:25:25.025Z,2019-08-10T00:22:41.041Z,3436.0,Moto G Play,motorola,5.0,2.0,24.0,4.0,5.0,1.0,1.0,Lanches,unknown
2284b979-27c9-4ad2-8d12-557ad537200e,2019-12-14_aa7ca448-de7d-4f24-9003-030ad0142522,IOS,0b51610a7974ad8751637867fa168e3b112df6feda0e01ca493977128741eaea,76cb1602-6a45-431a-a2af-2a63bad6f038,2019-12-14T21:46:56.353Z,2019-12-14T21:54:19.136Z,2019-12-15T00:46:56.353Z,2019-12-15T00:54:19.136Z,443.0,"iPhone7,2",Apple,2.0,5.0,2.0,1.0,1.0,1.0,1.0,Ranking,unknown
e969590a-382e-41fc-b7c5-9980984d40f4,2019-12-06_3e0c8796-3451-4f93-954a-7562098407ea,IOS,cc37994f00b79a48ae57678a6d192c454c74b52f6839e7e6bb693ab1e8b67a03,b00996d7-3fa3-4c21-aaaf-3a52cc231e37,2019-12-06T13:07:41.512Z,2019-12-06T13:11:25.127Z,2019-12-06T15:07:41.512Z,2019-12-06T15:11:25.127Z,224.0,"iPhone10,6",Apple,3.0,4.0,2.0,1.0,1.0,1.0,1.0,Last Restaurants,unknown
83a6e98e-ecbc-440e-81b2-b03ab719e190,2019-06-02_6de65a8d-eead-477f-9542-7de7d82a149b,IOS,228ebd5d-1099-4c27-8304-382541861e81,228ebd5d-1099-4c27-8304-382541861e81,2019-06-02T22:48:40.566Z,2019-06-02T23:26:59.356Z,2019-06-03T01:48:40.566Z,2019-06-03T02:26:59.356Z,2299.0,"iPhone11,8",Apple,9.0,9.0,4.0,4.0,3.0,1.0,1.0,Featured Restaurant,unknown
0de892ef-e99e-406f-9959-8b12da2b3bbc,2019-10-01_11c0c641-8c7f-38ee-9c96-26499acdef34,ANDROID,2c64e0bbf845290318ac4816a9a4c547edb326a4e777e0c262d4c3c76c4d3d03,5bf6ae78-249e-4836-9727-3dba86dbc067,2019-10-01T23:19:27.027Z,2019-10-01T23:53:43.043Z,2019-10-02T02:19:27.027Z,2019-10-02T02:53:43.043Z,2056.0,Moto G (5S),motorola,9.0,13.0,20.0,6.0,12.0,1.0,1.0,Last Restaurants,unknown


#### Tratamento de Datas

In [189]:
# importante: transformar primeiro para timestamp, depois para date!!!
df_sessions_visits = df_sessions_visits.withColumn('session_started_at_amsp', from_utc_timestamp('session_started_at_amsp', 'UTC'))\
                                       .withColumn('session_ended_at_amsp', from_utc_timestamp('session_ended_at_amsp', 'UTC'))\
                                       .withColumn('session_started_at_utc0', from_utc_timestamp('session_started_at_utc0', 'UTC'))\
                                       .withColumn('session_ended_at_utc0', from_utc_timestamp('session_ended_at_utc0', 'UTC'))\
                                       .withColumn('session_started_at_utc0', from_utc_timestamp('session_started_at_utc0', 'UTC'))\
                                       .withColumn('session_started_date', to_date('session_started_at_amsp', 'YYYY-MM-DD'))\
                                       .withColumn('session_started_month', month('session_started_date'))\
                                       .withColumn('session_started_dayofweek', dayofweek('session_started_date'))
display(df_sessions_visits)

session_id,dau,platform,user_identifier,user_account_uuid,session_started_at_amsp,session_ended_at_amsp,session_started_at_utc0,session_ended_at_utc0,session_duration_seconds,device_model,device_manufacturer,sum_event_open,sum_view_restaurant_screen,sum_view_dish_screen,sum_click_add_item,sum_view_checkout,sum_callback_purchase,order_session_quantity,first_order_origin_feature,media_network,session_started_date,session_started_month,session_started_dayofweek
41162808-5335-4e4f-aa52-45754ec1d558,2019-11-04_661ead80-50a2-48b4-ae78-f872e2f08c11,IOS,ffa840ebe8b397e995245525ec98f0b31a882c4d3e47dc0e91d3115cd20d6d80,ca438b60-3d04-44c0-9883-c4f01fc512ad,2019-11-04T09:14:47.566+0000,2019-11-04T09:15:50.323+0000,2019-11-04T11:14:47.566+0000,2019-11-04T11:15:50.323+0000,63.0,"iPhone9,3",Apple,1.0,8.933784760324386,1.0,1.0,1.0,1.0,1.0,Loop,unknown,2019-11-04,11,2
52f0a543-fcc7-4ff0-bed1-41bec8a51374,2019-12-07_17ae6c54-37e1-42e3-b82b-e562aaeb7730,IOS,a59ea42fbbaf0fa9a48b65f45842d9d061d23775ad2af61c28d1235b5510ae68,414f1e51-e872-42ca-ad5e-c30b085bd5e4,2019-12-07T13:28:37.764+0000,2019-12-07T15:04:00.819+0000,2019-12-07T15:28:37.764+0000,2019-12-07T17:04:00.819+0000,5723.0,"iPhone9,3",Apple,17.0,12.0,3.0,1.0,2.0,1.0,1.0,Last Restaurants,unknown,2019-12-07,12,7
f06f2a6d-14cc-44e3-8ec6-66191659430b,2019-12-29_d8a7f51d-4394-3c9a-ae62-8522f5885d3f,ANDROID,e7dcbc11f83b22ea57ec8b385b9fa38d48532b180e3d19cbde6d4766687a48b9,5c697207-d8a4-460e-b171-3e94e42be5ab,2019-12-29T19:58:38.297+0000,2019-12-29T20:17:45.199+0000,2019-12-29T22:58:38.297+0000,2019-12-29T23:17:45.199+0000,1147.0,Redmi Note 8,Xiaomi,2.0,4.0,7.0,7.0,5.0,1.0,1.0,Ranking,unknown,2019-12-29,12,1
716d0044-0d76-4199-86ca-62930a751d59,2019-12-03_9675dcd9-9dd5-3ec3-809e-9006b4fac160,ANDROID,2959b13e1caef84fa6cde74693ba2476f17eaa72b66dad051b74924762da14c7,615b2fb5-e76d-4fa7-b982-9564650c76c5,2019-12-03T14:54:14.621+0000,2019-12-03T16:31:47.527+0000,2019-12-03T16:54:14.621+0000,2019-12-03T18:31:47.527+0000,5853.0,SM-A520F,samsung,11.0,1.0,1.0,1.0,3.0,1.0,1.0,Taxa na faixa,unknown,2019-12-03,12,3
96081494-8554-4c3f-92b6-94e2070d1660,2019-12-20_e953c804-ecd2-3f00-8465-0ba2d8ce5aa1,ANDROID,9c944018e8b8cd2ac2fec6edf1f46c3c452c4e70c46e39a89cc37430edf01ad7,ba8596c4-9fbf-4f67-9e2a-dbb65f4b7c40,2019-12-20T20:28:22.584+0000,2019-12-20T20:54:25.376+0000,2019-12-20T23:28:22.584+0000,2019-12-20T23:54:25.376+0000,1563.0,SM-J710MN,samsung,4.0,4.0,9.0,2.0,2.0,1.0,1.0,CRM Restaurant,unknown,2019-12-20,12,6
85091579-c5e8-474c-9b77-7129564141d9,2019-08-09_37124edd-a771-41bc-95bd-e0d607863f6d,ANDROID,c447b075-5080-4ea9-99eb-1119973e9e08,c447b075-5080-4ea9-99eb-1119973e9e08,2019-08-09T20:25:25.025+0000,2019-08-09T21:22:41.041+0000,2019-08-09T23:25:25.025+0000,2019-08-10T00:22:41.041+0000,3436.0,Moto G Play,motorola,5.0,2.0,24.0,4.0,5.0,1.0,1.0,Lanches,unknown,2019-08-09,8,6
2284b979-27c9-4ad2-8d12-557ad537200e,2019-12-14_aa7ca448-de7d-4f24-9003-030ad0142522,IOS,0b51610a7974ad8751637867fa168e3b112df6feda0e01ca493977128741eaea,76cb1602-6a45-431a-a2af-2a63bad6f038,2019-12-14T21:46:56.353+0000,2019-12-14T21:54:19.136+0000,2019-12-15T00:46:56.353+0000,2019-12-15T00:54:19.136+0000,443.0,"iPhone7,2",Apple,2.0,5.0,2.0,1.0,1.0,1.0,1.0,Ranking,unknown,2019-12-14,12,7
e969590a-382e-41fc-b7c5-9980984d40f4,2019-12-06_3e0c8796-3451-4f93-954a-7562098407ea,IOS,cc37994f00b79a48ae57678a6d192c454c74b52f6839e7e6bb693ab1e8b67a03,b00996d7-3fa3-4c21-aaaf-3a52cc231e37,2019-12-06T13:07:41.512+0000,2019-12-06T13:11:25.127+0000,2019-12-06T15:07:41.512+0000,2019-12-06T15:11:25.127+0000,224.0,"iPhone10,6",Apple,3.0,4.0,2.0,1.0,1.0,1.0,1.0,Last Restaurants,unknown,2019-12-06,12,6
83a6e98e-ecbc-440e-81b2-b03ab719e190,2019-06-02_6de65a8d-eead-477f-9542-7de7d82a149b,IOS,228ebd5d-1099-4c27-8304-382541861e81,228ebd5d-1099-4c27-8304-382541861e81,2019-06-02T22:48:40.566+0000,2019-06-02T23:26:59.356+0000,2019-06-03T01:48:40.566+0000,2019-06-03T02:26:59.356+0000,2299.0,"iPhone11,8",Apple,9.0,9.0,4.0,4.0,3.0,1.0,1.0,Featured Restaurant,unknown,2019-06-02,6,1
0de892ef-e99e-406f-9959-8b12da2b3bbc,2019-10-01_11c0c641-8c7f-38ee-9c96-26499acdef34,ANDROID,2c64e0bbf845290318ac4816a9a4c547edb326a4e777e0c262d4c3c76c4d3d03,5bf6ae78-249e-4836-9727-3dba86dbc067,2019-10-01T23:19:27.027+0000,2019-10-01T23:53:43.043+0000,2019-10-02T02:19:27.027+0000,2019-10-02T02:53:43.043+0000,2056.0,Moto G (5S),motorola,9.0,13.0,20.0,6.0,12.0,1.0,1.0,Last Restaurants,unknown,2019-10-01,10,3


In [190]:
df_sessions_visits.dtypes

#### Exploração Sessions

A base de eventos contém +463k sessões de +68k `user_identifiers`.

In [193]:
df_sessions_visits.select(f.countDistinct('session_id')).show()

In [194]:
df_sessions_visits.select(f.countDistinct('user_identifier')).show()

In [195]:
df_sessions_visits.select(f.countDistinct('dau')).show()

In [196]:
df_sessions_visits.select(f.countDistinct('device_model')).show()

In [197]:
display(percentByCol(df_sessions_visits, 'device_manufacturer'))

device_manufacturer,cnt_per_group,percent
Apple,200977,43.349510374875976
samsung,135191,29.15987230921876
motorola,73582,15.871187610543116
Xiaomi,22817,4.921487425046374
asus,13361,2.881886027349985
LGE,8958,1.9321858418532416
LENOVO,2037,0.4393684482981752
Sony,1426,0.3075794831974461
OnePlus,1167,0.2517147664035201
HUAWEI,949,0.2046934989862387


Sextas e finais de semana são os dias com mais sessões geradas, 16% e 34% respectivamente.

In [199]:
display(percentByCol(df_sessions_visits, 'session_started_dayofweek').orderBy('session_started_dayofweek'))

session_started_dayofweek,cnt_per_group,percent
1,81116,17.496225356973383
2,53279,11.491954618006126
3,54848,11.830378327078211
4,59557,12.846080842068936
5,61956,13.363530477546266
6,74527,16.075018333980417
7,78337,16.896812044346664


In [200]:
df_sessions_month = df_sessions_visits.groupby('session_started_dayofweek','session_started_month')\
                                    .count()\
                                    .orderBy('session_started_month','session_started_dayofweek')
display(df_sessions_month)

session_started_dayofweek,session_started_month,count
4,5,1
6,5,9
1,6,16400
2,6,8575
3,6,8872
4,6,10604
5,6,10643
6,6,12233
7,6,15370
1,7,9527


97.8% das sessões não são identificadas. Se forem visitas `orgânicas`, vale entender se é possível fazer uma atribuição do horário do disparo do evento de `push` (`event_time_utc3`) e uma janela de X minutos após em que a sessão é gerada.

In [202]:
display(percentByCol(df_sessions_visits, 'media_network'))

media_network,cnt_per_group,percent
unknown,453445,97.80531469738148
googleadwords_int,4127,0.8901686726198179
Email Avaliacao,3111,0.6710236831888184
Facebook Ads,1075,0.2318709287778784
twitter,795,0.1714766403520124
MGM,473,0.1020232086622665
criteo_int,103,0.0222164703852292
jampp_int,103,0.0222164703852292
doubleclick_int,99,0.0213536948362883
SMS_Prospect,68,0.014667184331996


In [203]:
df_sessions_visits.dtypes

#### Tipos de visualizações por sessão

A quantidade de views das telas de restaurantes por sessões é maior para users com aparelho da `Apple`: Q1 é 5, Mediana em 8.9 e Q3 em 16.

In [206]:
display(df_sessions_visits.select('device_manufacturer','sum_view_restaurant_screen'))

device_manufacturer,sum_view_restaurant_screen
Apple,8.933784760324386
Apple,12.0
Xiaomi,4.0
samsung,1.0
samsung,4.0
motorola,2.0
Apple,5.0
Apple,4.0
Apple,9.0
motorola,13.0


In [207]:
display(df_sessions_visits.select('device_manufacturer','sum_view_dish_screen'))

device_manufacturer,sum_view_dish_screen
Apple,1.0
Apple,3.0
Xiaomi,7.0
samsung,1.0
samsung,9.0
motorola,24.0
Apple,2.0
Apple,2.0
Apple,4.0
motorola,20.0


In [208]:
display(df_sessions_visits.select('device_manufacturer','sum_click_add_item'))

device_manufacturer,sum_click_add_item
Apple,1.0
Apple,1.0
Xiaomi,7.0
samsung,1.0
samsung,2.0
motorola,4.0
Apple,1.0
Apple,1.0
Apple,4.0
motorola,6.0


In [209]:
display(df_sessions_visits.select('device_manufacturer','sum_event_open'))

device_manufacturer,sum_event_open
Apple,1.0
Apple,17.0
Xiaomi,2.0
samsung,11.0
samsung,4.0
motorola,5.0
Apple,2.0
Apple,3.0
Apple,9.0
motorola,9.0


#### Estatísticas básicas

In [211]:
# Estatistica basica variaveis quantitativas
aux = []
for nome, tipo in df_sessions_visits.dtypes:
  if tipo in ('double', 'int'):
    aux.append(nome)
    
display(df_sessions_visits[aux].describe())

summary,session_duration_seconds,sum_event_open,sum_view_restaurant_screen,sum_view_dish_screen,sum_click_add_item,sum_view_checkout,sum_callback_purchase,order_session_quantity,session_started_month,session_started_dayofweek
count,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0
mean,2658.8750712286133,6.091563608433379,8.933784760327478,5.7181703029112825,2.9305466421051785,4.208141738593542,1.248587399517117,1.153414442445203,8.860057805961778,4.089010396445365
stddev,20539.09175762528,6.247750152218677,13.354595274020154,6.768130251305075,3.909645017613745,6.531655744351323,2.24089780822729,2.0707662087639336,2.043890271557854,2.1076238048712965
min,-93.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,1.0
max,2575751.0,631.0,1086.0,409.0,708.0,800.0,359.0,311.0,12.0,7.0


In [212]:
display(df_sessions_visits[aux].summary())

summary,session_duration_seconds,sum_event_open,sum_view_restaurant_screen,sum_view_dish_screen,sum_click_add_item,sum_view_checkout,sum_callback_purchase,order_session_quantity,session_started_month,session_started_dayofweek
count,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0,463620.0
mean,2658.8750712286133,6.091563608433379,8.933784760327478,5.7181703029112825,2.9305466421051785,4.208141738593542,1.248587399517117,1.153414442445203,8.860057805961778,4.089010396445365
stddev,20539.09175762528,6.247750152218677,13.354595274020154,6.768130251305075,3.909645017613745,6.531655744351323,2.24089780822729,2.0707662087639336,2.043890271557854,2.1076238048712965
min,-93.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,1.0
25%,672.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,7.0,2.0
50%,1800.0,4.0,5.0,4.0,2.0,3.0,1.0,1.0,9.0,4.0
75%,3144.0,8.0,10.0,7.0,3.0,5.0,1.0,1.0,11.0,6.0
max,2575751.0,631.0,1086.0,409.0,708.0,800.0,359.0,311.0,12.0,7.0
