In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [5]:
import matplotlib

In [6]:
# Record the library versions used for this run so results can be reproduced.
print(
    f'Pandas version: {pd.__version__}',
    f'Numpy version: {np.__version__}',
    f'Seaborn version: {sns.__version__}',
    f'Matplotlib version: {matplotlib.__version__}',
    sep='\n',
)

Pandas version: 1.4.4
Numpy version: 1.21.5
Seaborn version: 0.11.2
Matplotlib version: 3.5.2


# Initial Inspection

In [10]:
# Load the raw order-item records from the JSON file in the working directory.
ambev = pd.read_json(path_or_buf='dataset.json')

In [11]:
# Preview the first five rows to get a feel for the columns and values.
ambev.head(n=5)

Unnamed: 0,customer_code,branch_id,sales_channel,seller_code,register_date,total_price,order_id,quantity,item_code,item_total_price,unit_price,group_code,segment_code,is_churn
0,143,0,0,190,2017-11-10T00:00:00Z,1613.53,21804,10,854,292.91,25.04,0,0,0.0
1,433,0,1,153,2011-05-16T00:00:00Z,11163.69,5486,20,246,287.19,12.33,0,5,0.0
2,486,0,0,166,2018-01-24T00:00:00Z,6432.12,22662,12,1420,184.84,12.8,0,0,0.0
3,107,0,1,156,2010-07-28T00:00:00Z,831.82,3956,18,1963,189.18,10.51,0,0,0.0
4,768,0,1,218,2010-12-17T00:00:00Z,1736.48,4730,5,1786,66.87,11.82,0,0,0.0


In [6]:
ambev.info()
# Observations from the summary printed by this cell:
# - is_churn has 202,513 non-null values out of 204,428 rows (1,915 missing) -> look into it.
# - Numeric columns are already stored as int64/float64, so no casting is needed.
# - register_date is stored as object (strings), not as a datetime dtype; it must be converted.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204428 entries, 0 to 204427
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   customer_code     204428 non-null  int64  
 1   branch_id         204428 non-null  int64  
 2   sales_channel     204428 non-null  int64  
 3   seller_code       204428 non-null  int64  
 4   register_date     204428 non-null  object 
 5   total_price       204428 non-null  float64
 6   order_id          204428 non-null  int64  
 7   quantity          204428 non-null  int64  
 8   item_code         204428 non-null  int64  
 9   item_total_price  204428 non-null  float64
 10  unit_price        204428 non-null  float64
 11  group_code        204428 non-null  int64  
 12  segment_code      204428 non-null  int64  
 13  is_churn          202513 non-null  float64
dtypes: float64(4), int64(9), object(1)
memory usage: 21.8+ MB


### description of each attribute:

- customer_code: unique id of a customer;

- branch_id: the branch id where this order was made;

- sales_channel: the sales channel through which this order was made;

- seller_code: seller that made this order;

- register_date: date of the order;

- total_price: total price of the order (sum of all items);

- order_id: id of this order. An order is formed by a set of items;

- item_code: code of the item;

- quantity: quantity of the item (identified by item_code) that was bought;

- item_total_price: total price of the items, i.e., quantity * price;

- unit_price: unit price of this item;

- group_code: the group this customer belongs to;

- segment_code: the segment this client belongs to;

- is_churn: True if we believe the client will not come back. For a given customer_code this value is always the same; it means that, as of "today" (the day you are doing this test), this client has churned.

In [12]:
# Parse the ISO-8601 strings (e.g. '2017-11-10T00:00:00Z') into datetimes.
# `astype('datetime64')` without a unit is rejected by pandas >= 2.0, so
# `pd.to_datetime` is used instead. The trailing 'Z' marks the values as UTC;
# the timezone is dropped afterwards so the column stays tz-naive, matching
# the timestamps the later cells display. Re-running this cell is safe: an
# already-converted column is treated as UTC and made naive again.
ambev['register_date'] = (
    pd.to_datetime(ambev['register_date'], utc=True).dt.tz_localize(None)
)

In [15]:
# Earliest order date present in the dataset.
ambev.loc[:, 'register_date'].min()

Timestamp('2008-01-04 00:00:00')

In [16]:
# Latest order date present in the dataset.
ambev.loc[:, 'register_date'].max()

Timestamp('2018-07-08 00:00:00')

##### About 10.5 years of data. The series ranges from January 4th, 2008 to July 8th, 2018.

In [20]:
# Inspect every order line of a single customer (code 143) as a sanity check.
ambev.loc[ambev['customer_code'].eq(143)]

Unnamed: 0,customer_code,branch_id,sales_channel,seller_code,register_date,total_price,order_id,quantity,item_code,item_total_price,unit_price,group_code,segment_code,is_churn
0,143,0,0,190,2017-11-10,1613.53,21804,10,854,292.91,25.04,0,0,0.0
119,143,0,0,190,2017-06-21,1072.39,20144,6,2630,125.07,17.82,0,0,0.0
496,143,0,1,190,2010-10-18,4529.35,4386,2,1785,26.34,13.17,0,0,0.0
530,143,0,0,190,2013-05-29,426.38,10091,1,795,16.37,14.37,0,0,0.0
876,143,0,0,190,2016-05-20,1124.92,16905,2,2843,39.32,17.26,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201856,143,0,0,190,2017-06-21,1072.39,20144,3,1963,37.06,11.83,0,0,0.0
202438,143,0,0,190,2016-01-18,1002.47,16041,6,1119,166.26,27.71,0,0,0.0
202660,143,0,1,190,2009-01-17,1076.24,1407,3,1786,42.20,12.56,0,0,0.0
203800,143,0,0,190,2015-03-31,1100.71,14306,2,265,36.10,15.85,0,0,0.0
