### Import packages

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from matplotlib.backends.backend_pdf import PdfPages

%matplotlib inline

### Set-up

For this expercise, we will utilize two files (listed below) from the 'Brazilian E-Commerce Public Dataset' originally posted on [Kaggle]( https://www.kaggle.com/olistbr/brazilian-ecommerce).

In [2]:
# Full path of the 'orders' dataset
orders_file = 'https://www.dropbox.com/s/ej64qorm1uvmjlp/olist_orders_dataset.csv?dl=1'

# Full path of the 'customer' dataset
cust_file = 'https://www.dropbox.com/s/t082qlggu4wxiq5/olist_customers_dataset.csv?dl=1'

# File output location
out = r"C:\Users\scheerja\Downloads"

### Read data

In [3]:
def read_olist_data(file1, file2):
    
    # Read the orders data
    orders = pd.read_csv(file1)

    print (f'{len(orders):,d} read from the orders file.')

    # Drop unnecessary columns
    drop_vars = ['order_approved_at', 'order_delivered_carrier_date', 
                 'order_delivered_customer_date', 'order_estimated_delivery_date']

    orders = orders.drop(drop_vars, axis=1)

    # Date-time conversion
    orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])

    # Let's convert the order purchase timestamps into dates
    orders['order_purchase_date'] = orders['order_purchase_timestamp'].dt.date

    # Extract month from the order date
    orders['order_month'] = orders['order_purchase_timestamp'].dt.month

    # Read the file that contains the unique customer identifier
    cust = pd.read_csv(file2)

    print (f'{len(cust):,d} read from the customer file.')

    # Let's keep only the following two columns: customer_id, customer_unique_id
    cust = cust[['customer_id', 'customer_unique_id']]

    # Merge orders and cust dataframes
    orders = pd.merge(orders, cust, on='customer_id', how='inner')
    
    print (f'{len(orders):,d} records in the output  file.')
    
    return orders

orders = read_olist_data(orders_file, cust_file)

orders.head()

99,441 read from the orders file.
99,441 read from the customer file.
99,441 records in the output  file.


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_purchase_date,order_month,customer_unique_id
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02,10,7c396fd4830fd04220f754e42b4e5bff
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-24,7,af07308b275d755c9edb36a90c618231
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08,8,3a653a41f6f9fc3d2a113cf8398680e8
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18,11,7c142cf63193a1473d2e66489a9ae977
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13,2,72632f0f9dd73dfee390c9b22eb56dd6


## Binary Classification model

Let's build a model to predict whether a customer will make a purchase within the next month.

We will use *August 2018*, which is the most recent month, as the prediction window.

### Order recency

In [4]:
# Let's keep only those columns that we need (for this exercise)

keep_cols = ['customer_unique_id', 'order_id', 'order_purchase_timestamp']

orders = orders[keep_cols]

orders.head()

Unnamed: 0,customer_unique_id,order_id,order_purchase_timestamp
0,7c396fd4830fd04220f754e42b4e5bff,e481f51cbdc54678b7cc49136f2d6af7,2017-10-02 10:56:33
1,af07308b275d755c9edb36a90c618231,53cdb2fc8bc7dce0b6741e2150273451,2018-07-24 20:41:37
2,3a653a41f6f9fc3d2a113cf8398680e8,47770eb9100c2d0c44946d9cf07ec65d,2018-08-08 08:38:49
3,7c142cf63193a1473d2e66489a9ae977,949d5b44dbf5de918fe9c16f97b45f8a,2017-11-18 19:28:06
4,72632f0f9dd73dfee390c9b22eb56dd6,ad21c59c0840e6cb83a9ceb5573f8159,2018-02-13 21:18:39


For creating model attributes (features) we will have to restrict our data to the timeframe prior to the prediction window

In [5]:
print (len(orders))

# Select data prior to the prediction window

attr_raw = orders[(orders['order_purchase_timestamp'].dt.month <= 7) & (orders['order_purchase_timestamp'].dt.year <= 2018)]

len(attr_raw)

99441


66116

In [10]:
# Check to make sure that the max date is 31-JUL-2018
attr_raw.order_purchase_timestamp.max()


Timestamp('2018-07-31 23:54:20')

In [11]:
# Another way to make such filter is by using a "mask"

mask = (orders['order_purchase_timestamp'].dt.month <= 7) & (orders['order_purchase_timestamp'].dt.year <= 2018)

attr_raw = orders[mask]

len(attr_raw)

66116

To calculate recency (days since the most recent order), we will have to extract the most recent date for each customer.

In [12]:
# Get the max date for each customer

cust_recency = attr_raw.groupby('customer_unique_id')['order_purchase_timestamp'].max().reset_index()

cust_recency.head()

Unnamed: 0,customer_unique_id,order_purchase_timestamp
0,0000366f3b9a7992bf8c76cfdf3221e2,2018-05-10 10:56:27
1,0000b849f77a49e4a4ce2b2a4ca5be3f,2018-05-07 11:11:27
2,0000f46a3911fa3c0805444483337064,2017-03-10 21:05:03
3,0004bd2a26a76fe21f786e4fbd80607f,2018-04-05 19:33:16
4,00050ab1314c0e55a6ca13cf7181fecf,2018-04-20 12:57:23


In [14]:
# Calculate recency

snapshot_date = date(2018, 7, 31)

delta = snapshot_date - cust_recency.order_purchase_timestamp.dt.date

In [15]:
delta

0        82 days
1        85 days
2       508 days
3       117 days
4       102 days
5       153 days
6       514 days
7       141 days
8       378 days
9       129 days
10       99 days
11      205 days
12       36 days
13       72 days
14        5 days
15      385 days
16      148 days
17      556 days
18      463 days
19      398 days
20      194 days
21        3 days
22      468 days
23        3 days
24      159 days
25      170 days
26       91 days
27      397 days
28       20 days
29       71 days
          ...   
64279   372 days
64280   503 days
64281   432 days
64282   146 days
64283    52 days
64284   188 days
64285    97 days
64286   413 days
64287   417 days
64288    15 days
64289     1 days
64290   170 days
64291    34 days
64292   561 days
64293   117 days
64294    97 days
64295    79 days
64296   488 days
64297     4 days
64298   428 days
64299   157 days
64300    35 days
64301   102 days
64302    11 days
64303    29 days
64304   111 days
64305   418 days
64306   539 da

In [16]:
# Get the number of days from the calculated deltas 

cust_recency['order_recency'] = delta

cust_recency.head()

Unnamed: 0,customer_unique_id,order_purchase_timestamp,order_recency
0,0000366f3b9a7992bf8c76cfdf3221e2,2018-05-10 10:56:27,82 days
1,0000b849f77a49e4a4ce2b2a4ca5be3f,2018-05-07 11:11:27,85 days
2,0000f46a3911fa3c0805444483337064,2017-03-10 21:05:03,508 days
3,0004bd2a26a76fe21f786e4fbd80607f,2018-04-05 19:33:16,117 days
4,00050ab1314c0e55a6ca13cf7181fecf,2018-04-20 12:57:23,102 days


In [18]:
# Drop the date (we don't need it any more for this exercise)

## -- INSERT CODE HERE -- ##
cust_recency = cust_recency.drop('order_purchase_timestamp', axis=1)

In [22]:
# How many records (unique customers) do we have for this timeframe?

len(cust_recency)
cust_recency.head()

Unnamed: 0,customer_unique_id,order_recency
0,0000366f3b9a7992bf8c76cfdf3221e2,82 days
1,0000b849f77a49e4a4ce2b2a4ca5be3f,85 days
2,0000f46a3911fa3c0805444483337064,508 days
3,0004bd2a26a76fe21f786e4fbd80607f,117 days
4,00050ab1314c0e55a6ca13cf7181fecf,102 days


In [26]:
orders.head()

Unnamed: 0,customer_unique_id,order_id,order_purchase_timestamp
0,7c396fd4830fd04220f754e42b4e5bff,e481f51cbdc54678b7cc49136f2d6af7,2017-10-02 10:56:33
1,af07308b275d755c9edb36a90c618231,53cdb2fc8bc7dce0b6741e2150273451,2018-07-24 20:41:37
2,3a653a41f6f9fc3d2a113cf8398680e8,47770eb9100c2d0c44946d9cf07ec65d,2018-08-08 08:38:49
3,7c142cf63193a1473d2e66489a9ae977,949d5b44dbf5de918fe9c16f97b45f8a,2017-11-18 19:28:06
4,72632f0f9dd73dfee390c9b22eb56dd6,ad21c59c0840e6cb83a9ceb5573f8159,2018-02-13 21:18:39


Let's add the number of orders per customer to the `attr` dataframe.

In [33]:
# Count total records (i.e., orders) per customer

## -- INSERT CODE HERE -- ##

cust_orders = orders.groupby('customer_unique_id')['order_id'].count()

cust_orders.unique()

array([ 1,  2,  3,  4,  6,  7,  5,  9, 17], dtype=int64)

In [34]:
# Rename the new column

cust_orders = cust_orders.rename('num_of_orders')

In [35]:
cust_orders

customer_unique_id
0000366f3b9a7992bf8c76cfdf3221e2    1
0000b849f77a49e4a4ce2b2a4ca5be3f    1
0000f46a3911fa3c0805444483337064    1
0000f6ccb0745a6a4b88665a16c9f078    1
0004aac84e0df4da2b147fca70cf8255    1
0004bd2a26a76fe21f786e4fbd80607f    1
00050ab1314c0e55a6ca13cf7181fecf    1
00053a61a98854899e70ed204dd4bafe    1
0005e1862207bf6ccc02e4228effd9a0    1
0005ef4cd20d2893f0d9fbd94d3c0d97    1
0006fdc98a402fceb4eb0ee528f6a8d4    1
00082cbe03e478190aadbea78542e933    1
00090324bbad0e9342388303bb71ba0a    1
000949456b182f53c18b68d6babc79c1    1
000a5ad9c4601d2bbdd9ed765d5213b3    1
000bfa1d2f1a41876493be685390d6d3    1
000c8bdb58a29e7115cfc257230fb21b    1
000d460961d6dbfa3ec6c9f5805769e1    1
000de6019bb59f34c099a907c151d855    1
000e309254ab1fc5ba99dd469d36bdb4    1
000ec5bff359e1c0ad76a81a45cb598f    1
000ed48ceeb6f4bf8ad021a10a3c7b43    1
000fbf0473c10fc1ab6f8d2d286ce20c    1
0010a452c6d13139e50b57f19f52e04e    1
0010fb34b966d44409382af9e8fd5b77    1
001147e649a7b1afd577e873841632d

In [39]:
# Combine (merge) `cust_recency` with `cust_orders` so that we have both attributes in one dataset

## -- INSERT CODE HERE -- ##

attr = pd.merge(cust_recency, cust_orders, on='customer_unique_id')

attr.head()



Unnamed: 0,customer_unique_id,order_recency,num_of_orders
0,0000366f3b9a7992bf8c76cfdf3221e2,82 days,1
1,0000b849f77a49e4a4ce2b2a4ca5be3f,85 days,1
2,0000f46a3911fa3c0805444483337064,508 days,1
3,0004bd2a26a76fe21f786e4fbd80607f,117 days,1
4,00050ab1314c0e55a6ca13cf7181fecf,102 days,1


### Assign the Target variable (aka the Dependent variable)

For this exercise, we will assume that the objective of the model is to predict whether a customer will make *at least one purchase* in the future (i.e., within the target window of the model).

**Step 1:** Isolate all orders that were placed within the prediction window.

In [79]:
orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])

In [85]:
orders.order_purchase_timestamp.dt.year

0        2017
1        2018
2        2018
3        2017
4        2018
5        2017
6        2017
7        2017
8        2017
9        2017
10       2017
11       2017
12       2018
13       2018
14       2018
15       2018
16       2018
17       2017
18       2017
19       2017
20       2017
21       2018
22       2018
23       2018
24       2018
25       2018
26       2018
27       2018
28       2018
29       2018
         ... 
99411    2018
99412    2017
99413    2018
99414    2017
99415    2017
99416    2018
99417    2018
99418    2017
99419    2017
99420    2017
99421    2017
99422    2018
99423    2017
99424    2017
99425    2018
99426    2017
99427    2018
99428    2018
99429    2017
99430    2018
99431    2017
99432    2017
99433    2017
99434    2017
99435    2017
99436    2017
99437    2018
99438    2017
99439    2018
99440    2018
Name: order_purchase_timestamp, Length: 99441, dtype: int64

In [87]:
# Select orders that were placed in August 2018

## -- INSERT CODE BELOW -- ##

# 1. Create a mask
mask = (orders.order_purchase_timestamp.dt.year == 2018) & (orders.order_purchase_timestamp.dt.month == 8)

# 2. Apply the mask to filter records in the orders dataframe
#    and create a new data frame
target_events_raw = orders[mask]

len(target_events_raw)

6512

In [88]:
target_events_raw.head()

Unnamed: 0,customer_unique_id,order_id,order_purchase_timestamp
2,3a653a41f6f9fc3d2a113cf8398680e8,47770eb9100c2d0c44946d9cf07ec65d,2018-08-08 08:38:49
24,9c9242ad7f1b52d926ea76778e1c0c57,f3e7c359154d965827355f39d6b1fdac,2018-08-09 11:44:40
43,394b2ce444baae9ae609f5d32000de0f,d22e9fa5731b9e30e8b27afcdc2f8563,2018-08-04 23:25:30
61,da45a9a1df408c39f013b9b0b505042c,f346ad4ee8f630e5e4ddaf862a34e6dd,2018-08-05 13:09:48
78,a71cac9f356cfeb9db35061020806212,6d25592267349b322799e2beb687871e,2018-08-26 22:04:34


**Step 2:** Summarize data to get one record per customer.

In [121]:
# Count the number of orders (we will convert this into a binary flag later)

target_events = target_events_raw.groupby('customer_unique_id').count().reset_index()

target_events.head()
#len(target_events)

Unnamed: 0,customer_unique_id,order_id,order_purchase_timestamp
0,000ec5bff359e1c0ad76a81a45cb598f,1,1
1,0015752e079902b12cd00b9b7596276b,1,1
2,00172711b30d52eea8b313a7f2cced02,1,1
3,001928b561575b2821c92254a2327d06,1,1
4,002471155ecd08d208d1376720e2a907,1,1


**Step 3:** Merge this dataframe with the `attr` dataframe to create the modeling dataset.

In [123]:
attr.shape

(64309, 3)

In [135]:
# Merge the new dataframe (created above) with the dataframe that contains customer attributes

df = pd.merge(target_events, attr, how='inner', on='customer_unique_id')

df.num_of_orders.describe()

count    159.000000
mean       2.301887
std        1.306046
min        2.000000
25%        2.000000
50%        2.000000
75%        2.000000
max       17.000000
Name: num_of_orders, dtype: float64

In [141]:
# Rename the target column

df['purch'] = df['num_of_orders']

df.head()

Unnamed: 0,customer_unique_id,order_id,order_purchase_timestamp,order_recency,num_of_orders,purch
0,00172711b30d52eea8b313a7f2cced02,1,1,3 days,2,2
1,083ca1aa470c280236380973a48f77c6,1,1,368 days,4,4
2,083ce299ea3fc1eba79bd0b17f64555b,1,1,190 days,2,2
3,08f334e84bc7ab7b588cb7fc77dd7f4a,1,1,17 days,2,2
4,0c1201c04330536234aaa49b396745d4,1,1,188 days,2,2


**Step 4:** Set the target variable; 1 if at least one purchase was made, 0 otherwise.

In [None]:
# Create the binary target flag

df['purch'] = 

df.head()

In [None]:
# Check the distribution of the target flag



In [None]:
# % distribution of the target flag



In [None]:
# Check the correlations



### Model building

In [None]:
# Set-up

preds = 

X = 
y = 

Tri-fold partitioning is recommended as long as there's sufficient sample size avaiable. In this example, since the number of target events is very small (only 159), we will perform a two-fold paritioning of the modeling sample.

#### Two-fold partition

In [None]:
# Split the dataframe into train(50%) and test(50%)

#from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = 

len(X_train), len(X_test)

In [None]:
# Target distribution in the trainig sample

y_train.value_counts()

In [None]:
# Target distribution in the test sample

y_test.value_counts()

In [None]:
#from sklearn.linear_model import LogisticRegression

# Define the model object

clf = 


# Train (fit) the model using the training sample



# Make predictions on the test sample

preds_test = 

### Model accuracy

In [None]:
#from sklearn.metrics import accuracy_score



Why does the model appear to be very strong in spite of only two predictors that didn't look correlated with the target?

In [None]:
# Check model's predictions against actual value of the target event



Area under the ROC surve (AUC)

In [None]:
# Calculate the probabilities on the test sample

scores_test = 


# Calcualte AUC



## Export results

#### 1. Scored dataset (with all model attributes)

In [None]:
scores_all = 

df['prob_to_order'] = scores_all

df.head()

In [None]:
csv_file_zipped = out + '\olist_purch_model_scores.csv.gz'

df.to_csv(csv_file_zipped, compression='gzip')

#### 2. Export the model

In [None]:
from sklearn.externals import joblib

model_pkl_file = out + '\olist_purch_model.joblib'



##### Load the model

In [None]:
clf_loaded = 

clf_loaded

In [None]:
# Use the model to score a "new" data frame

s