# Starbucks Capstone Project
## Feature Engineering

In [1]:
## Import all the necessary libraries
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import quantile_transform, scale, robust_scale

# Portfolio

In [2]:
portfolio_df = pd.read_json('data/portfolio.json', orient='records', lines=True)
display(portfolio_df)

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7
5,3,"[web, email, mobile, social]",7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2
6,2,"[web, email, mobile, social]",10,10,discount,fafdcd668e3743c1bb461111dcafc2a4
7,0,"[email, mobile, social]",0,3,informational,5a8bc65990b245e5a138643cd4eb9837
8,5,"[web, email, mobile, social]",5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d
9,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5


**offer type** is a categorical feature that could be mapped as one hot encoding. \
**Channels** are categorical feature as well, but values can assume more than one category. Its values may be converted into individual features.  

In [3]:
## Set id as index
portfolio_df.set_index(keys='id', verify_integrity=True, inplace=True)

## Make offer_type one hot encoded
portfolio_df = portfolio_df.join(
    pd.get_dummies(portfolio_df.pop('offer_type')))

## Transform channels in distinct features
channels_df = pd.DataFrame(portfolio_df.pop('channels'))
channels_df = channels_df.explode('channels')
channels_df = channels_df.assign(value=lambda x: 1)
channels_df = channels_df.pivot(columns='channels', values='value')
channels_df.fillna(value=0, inplace=True)
portfolio_df = portfolio_df.join(channels_df)
channels_df = None

## print the result
display(portfolio_df)

Unnamed: 0_level_0,reward,difficulty,duration,bogo,discount,informational,email,mobile,social,web
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ae264e3637204a6fb9bb56bc8210ddfd,10,10,7,True,False,False,1.0,1.0,1.0,0.0
4d5c57ea9a6940dd891ad53e9dbe8da0,10,10,5,True,False,False,1.0,1.0,1.0,1.0
3f207df678b143eea3cee63160fa8bed,0,0,4,False,False,True,1.0,1.0,0.0,1.0
9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,7,True,False,False,1.0,1.0,0.0,1.0
0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,10,False,True,False,1.0,0.0,0.0,1.0
2298d6c36e964ae4a3e7e9706d1fb8c2,3,7,7,False,True,False,1.0,1.0,1.0,1.0
fafdcd668e3743c1bb461111dcafc2a4,2,10,10,False,True,False,1.0,1.0,1.0,1.0
5a8bc65990b245e5a138643cd4eb9837,0,0,3,False,False,True,1.0,1.0,1.0,0.0
f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,True,False,False,1.0,1.0,1.0,1.0
2906b810c7d4411798c6938adc9daaa5,2,10,7,False,True,False,1.0,1.0,0.0,1.0


In [4]:
## Filter out email column, since it is not an informative feature
# once any offer uses this channel.
portfolio_df.drop(columns='email', inplace=True)

display(pd.DataFrame(portfolio_df.describe())
        .style.set_caption('Dataset description'))

display(portfolio_df.corr().abs()
        .style.set_caption('Pairwise correlation'))

Unnamed: 0,reward,difficulty,duration,mobile,social,web
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,4.2,7.7,6.5,0.9,0.6,0.8
std,3.583915,5.831905,2.321398,0.316228,0.516398,0.421637
min,0.0,0.0,3.0,0.0,0.0,0.0
25%,2.0,5.0,5.0,1.0,0.0,1.0
50%,4.0,8.5,7.0,1.0,1.0,1.0
75%,5.0,10.0,7.0,1.0,1.0,1.0
max,10.0,20.0,10.0,1.0,1.0,1.0


Unnamed: 0,reward,difficulty,duration,bogo,discount,informational,mobile,social,web
reward,1.0,0.465686,0.160262,0.792482,0.288175,0.617647,0.078431,0.288175,0.117647
difficulty,0.465686,1.0,0.808414,0.029516,0.597692,0.695872,0.741058,0.154957,0.244007
duration,0.160262,0.808414,1.0,0.185376,0.741504,0.681115,0.529756,0.185376,0.340557
bogo,0.792482,0.029516,0.185376,1.0,0.666667,0.408248,0.272166,0.25,0.102062
discount,0.288175,0.597692,0.741504,0.666667,1.0,0.408248,0.408248,0.166667,0.408248
informational,0.617647,0.695872,0.681115,0.408248,0.408248,1.0,0.166667,0.102062,0.375
mobile,0.078431,0.741058,0.529756,0.272166,0.408248,0.166667,1.0,0.408248,0.166667
social,0.288175,0.154957,0.185376,0.25,0.166667,0.102062,0.408248,1.0,0.408248
web,0.117647,0.244007,0.340557,0.102062,0.408248,0.375,0.166667,0.408248,1.0


In [5]:
display(portfolio_df)

Unnamed: 0_level_0,reward,difficulty,duration,bogo,discount,informational,mobile,social,web
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ae264e3637204a6fb9bb56bc8210ddfd,10,10,7,True,False,False,1.0,1.0,0.0
4d5c57ea9a6940dd891ad53e9dbe8da0,10,10,5,True,False,False,1.0,1.0,1.0
3f207df678b143eea3cee63160fa8bed,0,0,4,False,False,True,1.0,0.0,1.0
9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,7,True,False,False,1.0,0.0,1.0
0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,10,False,True,False,0.0,0.0,1.0
2298d6c36e964ae4a3e7e9706d1fb8c2,3,7,7,False,True,False,1.0,1.0,1.0
fafdcd668e3743c1bb461111dcafc2a4,2,10,10,False,True,False,1.0,1.0,1.0
5a8bc65990b245e5a138643cd4eb9837,0,0,3,False,False,True,1.0,1.0,0.0
f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,True,False,False,1.0,1.0,1.0
2906b810c7d4411798c6938adc9daaa5,2,10,7,False,True,False,1.0,0.0,1.0


# Profile

In [44]:
profile_df = pd.read_json('data/profile.json', orient='records', lines=True)
display(profile_df.head())

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


* create new feature to indicate missing values
* since gender is a discrete feature, create another gender category
* as age and income are continuous features, fill them with the respective mean values

In [45]:
# create new feature to indicate missing values
missing_data = profile_df.gender.isna()
profile_df = profile_df.assign(missing_data=missing_data.astype(int))

# fill gender if a new 'U' (unknown) category
profile_df['gender'].mask(missing_data, 'U', inplace=True)


# Make gender one-hot encoded
profile_df = profile_df.join(
    pd.get_dummies(profile_df.pop('gender')))

# Set id as index
profile_df.set_index(keys='id', verify_integrity=True, inplace=True)

## Convert data to datetime format
profile_df.became_member_on = pd.to_datetime(profile_df.became_member_on, format='%Y%m%d').astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  profile_df['gender'].mask(missing_data, 'U', inplace=True)


In [28]:
print('Dataset description')
display(pd.DataFrame(profile_df.describe()).round(2))

Dataset description


Unnamed: 0,age,became_member_on,income,missing_data
count,17000.0,17000.0,14825.0,17000.0
mean,62.53,1.487856e+18,65404.99,0.13
std,26.74,3.552975e+16,21598.3,0.33
min,18.0,1.375056e+18,30000.0,0.0
25%,45.0,1.464221e+18,49000.0,0.0
50%,58.0,1.501632e+18,64000.0,0.0
75%,73.0,1.514592e+18,80000.0,0.0
max,118.0,1.532563e+18,120000.0,1.0


In [29]:
print('Pairwise correlation')
display(profile_df.corr().abs())

Pairwise correlation


Unnamed: 0,age,became_member_on,income,missing_data,F,M,O,U
age,1.0,0.018262,0.306703,0.79461,0.140032,0.388781,0.034171,0.79461
became_member_on,0.018262,1.0,0.025769,0.03193,0.007842,0.030697,0.008271,0.03193
income,0.306703,0.025769,1.0,,0.229396,0.225496,0.011808,
missing_data,0.79461,0.03193,,1.0,0.287602,0.382309,0.043043,1.0
F,0.140032,0.007842,0.229396,0.287602,1.0,0.74945,0.084378,0.287602
M,0.388781,0.030697,0.225496,0.382309,0.74945,1.0,0.112163,0.382309
O,0.034171,0.008271,0.011808,0.043043,0.084378,0.112163,1.0,0.043043
U,0.79461,0.03193,,1.0,0.287602,0.382309,0.043043,1.0


In [30]:
display(profile_df.head(10))

Unnamed: 0_level_0,age,became_member_on,income,missing_data,F,M,O,U
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
68be06ca386d4c31939f3a4f0e3dd783,118,1486857600000000000,,1,False,False,False,True
0610b486422d4921ae7d2bf64640c50b,55,1500076800000000000,112000.0,0,True,False,False,False
38fe809add3b4fcf9315a9694bb96ff5,118,1531353600000000000,,1,False,False,False,True
78afa995795e4d85b5d9ceeca43f5fef,75,1494288000000000000,100000.0,0,True,False,False,False
a03223e636434f42ac4c3df47e8bac43,118,1501804800000000000,,1,False,False,False,True
e2127556f4f64592b11af22de27a7932,68,1524700800000000000,70000.0,0,False,True,False,False
8ec6ce2a7e7949b1bf142def7d0e0586,118,1506297600000000000,,1,False,False,False,True
68617ca6246f4fbc85e91a2a49552598,118,1506902400000000000,,1,False,False,False,True
389bc3fa690240e798340f5a15918d5c,65,1518134400000000000,53000.0,0,False,True,False,False
8974fc5686fe429db53ddde067b88302,118,1479772800000000000,,1,False,False,False,True


# Transcript

In [89]:
transcript_df = pd.read_json('data/transcript.json', orient='records', lines=True)
display(transcript_df.head())

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


In [90]:
transcript_df = transcript_df.join(
    pd.DataFrame.from_records(transcript_df.pop('value')))
transcript_df.update({'offer_id': transcript_df.pop('offer id')})

display(transcript_df)

Unnamed: 0,person,event,time,amount,offer_id,reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,,9b98b8c7a33c4b65b9aebfe6a799e6d9,
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,,0b1e1539f2cc45b7b9fa7c272da2e1d7,
2,e2127556f4f64592b11af22de27a7932,offer received,0,,2906b810c7d4411798c6938adc9daaa5,
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,,fafdcd668e3743c1bb461111dcafc2a4,
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,,4d5c57ea9a6940dd891ad53e9dbe8da0,
...,...,...,...,...,...,...
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,714,1.59,,
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,714,9.53,,
306531,a00058cf10334a308c68e7631c529907,transaction,714,3.61,,
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,714,3.53,,


In [91]:
users_to_discard = np.append(
    transcript_df.query('amount >= 50').person.unique(),
    transcript_df.groupby('person').count().query('amount == 0').index.values)

profile_df = profile_df.query('id not in @users_to_discard')
transcript_df = transcript_df.query('person not in @users_to_discard')

In [92]:
data = profile_df.filter(['became_member_on'])
data_transf = quantile_transform(data, output_distribution='normal', copy=True)
data = pd.DataFrame(data_transf, columns=data.columns, index=data.index).astype(int)
profile_df.update(data)

data = profile_df.query('missing_data == 0').filter(['age','income'])
# data_transf = quantile_transform(data, output_distribution='normal', copy=False)
data_transf = robust_scale(data, copy=False)
data = pd.DataFrame(data_transf, columns=data.columns, index=data.index).astype(int)
profile_df.update(data)

missing_data = profile_df.income.isna()
income_mean = profile_df[~(missing_data.values)].income.mean()
age_mean = profile_df[~(missing_data.values)].age.mean()

# # # fill age and income with its respective mean values
profile_df.loc[missing_data.values, 'age'] = age_mean
profile_df.loc[missing_data.values, 'income'] = income_mean

In [93]:
## Split register according to event
offer_received_df = transcript_df.query('event == "offer received"')
offer_viewed_df = transcript_df.query('event == "offer viewed"')
offer_completed_df = transcript_df.query('event == "offer completed"')
transaction_df = transcript_df.query('event == "transaction"')

In [94]:
## Remove unnecessary columns
offer_received_df = offer_received_df.drop(columns=['amount', 'reward'])

## Create registers when customers did not receive offer
offer_sending_time = offer_received_df.time.unique()
new_index = pd.MultiIndex.from_product(
    (profile_df.index, offer_sending_time),
    names=['person', 'time'])

offer_received_df = offer_received_df.set_index(['person', 'time'])
offer_received_df = offer_received_df.sort_index()
offer_received_df = offer_received_df.reindex(new_index)
offer_received_df = offer_received_df.reset_index()

## Create column to indicate when the offer ends
offer_received_df = offer_received_df.join(portfolio_df.duration*24, on='offer_id')
offer_received_df = offer_received_df.assign(
    offer_ends_on=offer_received_df.time + offer_received_df.duration)

## Create column to indicate when the offer is informational
offer_received_df = offer_received_df.join(
    portfolio_df.informational, on='offer_id')

## Create column to indicate when the offer is viewed or completed
offer_received_df = offer_received_df.assign(viewed_on=np.nan, completed_on=np.nan)

## Create a column to hold the label for that offer sending
offer_received_df = offer_received_df.assign(label=np.nan)

In [95]:
%%time

next_sending_time = {0: 168, 168: 336, 336: 408,
                     408: 504, 504: 576, 576: 720}

def classify_offers(row):
    if row.offer_id is np.nan:
        row.offer_ends_on = next_sending_time[row.time]
        try:
            # In this case, there is no offer to be viewed or completed.
            # So we look for a simple transaction.
            row.completed_on = transaction_df.query(
                'person == @row.person ' \
                'and time >= @row.time ' \
                'and time <= @row.offer_ends_on').time.values[0]
        except:
            # If there is no transaction in this period
            row.label = 0
        else:
            row.label = 1
        finally:
            return row

    try:
        row.viewed_on = offer_viewed_df.query(
            'person == @row.person ' \
            'and offer_id == @row.offer_id ' \
            'and time >= @row.time ' \
            'and time <= @row.offer_ends_on').time.values[0]
    except:
        # Offer was not viewed
        row.label = 0
        return row

    
    if row.informational == 1:
        try:
            # In this case, there is no offer to be completed.
            # So we look for a simple transaction after offer viewed
            row.completed_on = transaction_df.query(
                'person == @row.person ' \
                'and time >= @row.viewed_on ' \
                'and time <= @row.offer_ends_on').time.values[0]
        except:
            # If there is no transaction in this period
            row.label = 0
        else:
            row.label = 1
        finally:
            return row


    try:
        # In the other cases, we need an offer completion
        row.completed_on = offer_completed_df.query(
            'person == @row.person ' \
            'and offer_id == @row.offer_id ' \
            'and time >= @row.viewed_on ' \
            'and time <= @row.offer_ends_on').time.values[0]
    except:
        # If the offer was not completed
        row.label = 0
    else:
        row.label = 1
    finally:
        return row


offer_received_df = offer_received_df.apply(classify_offers, axis=1)

CPU times: user 29min 4s, sys: 29.7 s, total: 29min 34s
Wall time: 28min 46s


In [96]:
## Remove auxiliary columns
offer_received_df.drop(
    inplace=True,
    columns=['duration', 'offer_ends_on', 'informational',
             'viewed_on', 'completed_on'])

In [97]:
## Transform PORTFOLIO
data = portfolio_df[['reward','difficulty','duration']]
data = scale(data)
portfolio_df[['reward','difficulty','duration']] = data
portfolio_df

Unnamed: 0_level_0,reward,difficulty,duration,bogo,discount,informational,mobile,social,web
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ae264e3637204a6fb9bb56bc8210ddfd,1.705882,0.415715,0.227038,True,False,False,1.0,1.0,0.0
4d5c57ea9a6940dd891ad53e9dbe8da0,1.705882,0.415715,-0.681115,True,False,False,1.0,1.0,1.0
3f207df678b143eea3cee63160fa8bed,-1.235294,-1.391743,-1.135192,False,False,True,1.0,0.0,1.0
9b98b8c7a33c4b65b9aebfe6a799e6d9,0.235294,-0.488014,0.227038,True,False,False,1.0,0.0,1.0
0b1e1539f2cc45b7b9fa7c272da2e1d7,0.235294,2.223174,1.589268,False,True,False,0.0,0.0,1.0
2298d6c36e964ae4a3e7e9706d1fb8c2,-0.352941,-0.126522,0.227038,False,True,False,1.0,1.0,1.0
fafdcd668e3743c1bb461111dcafc2a4,-0.647059,0.415715,1.589268,False,True,False,1.0,1.0,1.0
5a8bc65990b245e5a138643cd4eb9837,-1.235294,-1.391743,-1.589268,False,False,True,1.0,1.0,0.0
f19421c1d4aa40978ebb69ca19b0e20d,0.235294,-0.488014,-0.681115,True,False,False,1.0,1.0,1.0
2906b810c7d4411798c6938adc9daaa5,-0.647059,0.415715,0.227038,False,True,False,1.0,0.0,1.0


## Create Features

In [99]:
## Set index as person and time
offer_received_df = offer_received_df.set_index(['person', 'time'])
offer_received_df = offer_received_df.sort_index()
## Join offer data with portfolio and profile data
offer_received_df = offer_received_df.join(portfolio_df, on='offer_id')
offer_received_df = offer_received_df.join(profile_df, on='person')

## Fill NA
offer_received_df.fillna(value=0, inplace=True)

display(offer_received_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,event,offer_id,label,reward,difficulty,duration,bogo,discount,informational,mobile,social,web,age,became_member_on,income,missing_data,F,M,O,U
person,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0009655768c64bdeb2e877511632db8f,0,0,0,0,0.000000,0.000000,0.000000,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,False,True,False,False
0009655768c64bdeb2e877511632db8f,168,offer received,5a8bc65990b245e5a138643cd4eb9837,0,-1.235294,-1.391743,-1.589268,False,False,True,1.0,1.0,0.0,0.0,0.0,0.0,0,False,True,False,False
0009655768c64bdeb2e877511632db8f,336,offer received,3f207df678b143eea3cee63160fa8bed,0,-1.235294,-1.391743,-1.135192,False,False,True,1.0,0.0,1.0,0.0,0.0,0.0,0,False,True,False,False
0009655768c64bdeb2e877511632db8f,408,offer received,f19421c1d4aa40978ebb69ca19b0e20d,0,0.235294,-0.488014,-0.681115,True,False,False,1.0,1.0,1.0,0.0,0.0,0.0,0,False,True,False,False
0009655768c64bdeb2e877511632db8f,504,offer received,fafdcd668e3743c1bb461111dcafc2a4,0,-0.647059,0.415715,1.589268,False,True,False,1.0,1.0,1.0,0.0,0.0,0.0,0,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffff82501cea40309d5fdd7edcca4a07,168,offer received,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0.235294,2.223174,1.589268,False,True,False,0.0,0.0,1.0,0.0,0.0,0.0,0,True,False,False,False
ffff82501cea40309d5fdd7edcca4a07,336,offer received,2906b810c7d4411798c6938adc9daaa5,0,-0.647059,0.415715,0.227038,False,True,False,1.0,0.0,1.0,0.0,0.0,0.0,0,True,False,False,False
ffff82501cea40309d5fdd7edcca4a07,408,offer received,2906b810c7d4411798c6938adc9daaa5,0,-0.647059,0.415715,0.227038,False,True,False,1.0,0.0,1.0,0.0,0.0,0.0,0,True,False,False,False
ffff82501cea40309d5fdd7edcca4a07,504,offer received,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,0.235294,-0.488014,0.227038,True,False,False,1.0,0.0,1.0,0.0,0.0,0.0,0,True,False,False,False


### Organize customer data

In [100]:
%%time

## Certify that dataframe is ordered
offer_received_df = offer_received_df.sort_index()

## Choose features to feed the networks
features_cols = [
    'reward', 'difficulty', 'duration',                 # offer characteristics
    'bogo', 'discount', 'informational',                # offer type
    'mobile', 'social', 'web',                          # channels
    'age','became_member_on', 'income', 'missing_data', # customer data
    'F', 'M', 'O'                                       # customer gender
]

## Create feature and target lists where each position
# holds the data related to one customer
features = []
targets = []
for index, user_data in offer_received_df.groupby('person'):
    targets.append(user_data['label'].values)
    features.append(user_data[features_cols].values)

CPU times: user 12.4 s, sys: 12.3 ms, total: 12.4 s
Wall time: 12.4 s


### Create the data loaders

In [101]:
## Convert features and targets to tensors
features = torch.as_tensor(features, dtype=torch.float)
targets = torch.as_tensor(targets, dtype=torch.long)


## Split data into three random datasets 

# Generate randomic indices
len_dataset = len(features)
random_idx = np.random.choice(len_dataset, len_dataset, replace=False)

# Use the proportions: train: 80%, valid: 10%, test: 10%
train_idx = random_idx[:int(len_dataset*0.8)]
valid_idx = random_idx[int(len_dataset*0.8):-int(len_dataset*0.1)]
test_idx = random_idx[-int(len_dataset*0.1):]

# Create datasets
train_dataset = TensorDataset(features[train_idx], targets[train_idx])
valid_dataset = TensorDataset(features[valid_idx], targets[valid_idx])
test_dataset = TensorDataset(features[test_idx], targets[test_idx])

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

  features = torch.as_tensor(features, dtype=torch.float)


### Count targets within each dataloader

In [102]:
# TARGETS FOR TRAINING
targ_neg = (targets[train_idx] == 0).sum()
targ_pos = (targets[train_idx] == 1).sum()
targ_total = targ_neg + targ_pos
print('Train dataset:\tCN: {:5d} / {:5.2f}%\tCP: {:5d} / {:5.2f}%' \
      .format(targ_neg, targ_neg*100./targ_total,
              targ_pos, targ_pos*100./targ_total))

# TARGETS FOR TRAINING
targ_neg = (targets[valid_idx] == 0).sum()
targ_pos = (targets[valid_idx] == 1).sum()
targ_total = targ_neg + targ_pos
print('Valid dataset:\tCN: {:5d} / {:5.2f}%\tCP: {:5d} / {:5.2f}%' \
      .format(targ_neg, targ_neg*100./targ_total,
              targ_pos, targ_pos*100./targ_total))

# TARGETS FOR TRAINING
targ_neg = (targets[test_idx] == 0).sum()
targ_pos = (targets[test_idx] == 1).sum()
targ_total = targ_neg + targ_pos
print('Test dataset:\tCN: {:5d} / {:5.2f}%\tCP: {:5d} / {:5.2f}%' \
      .format(targ_neg, targ_neg*100./targ_total,
              targ_pos, targ_pos*100./targ_total))

Train dataset:	CN: 63184 / 82.84%	CP: 13088 / 17.16%
Valid dataset:	CN:  7946 / 83.29%	CP:  1594 / 16.71%
Test dataset:	CN:  7881 / 82.66%	CP:  1653 / 17.34%


### Save the dataloaders into a zip file

In [103]:
torch.save((train_dataloader, valid_dataloader, test_dataloader), 'dataloaders.pt')