# Data Preprocessing
We want to build a model that predicts whether or not someone will respond to an offer. Many variables need to be changed to be numeric types for a machine learning model to process it. We also need to combine our data. Please see the steps required below for each of our datasets.

In [283]:
import pandas as pd
import numpy as np
import math
import json
# % matplotlib inline
import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

## Portfolio

In [284]:
portfolio.dtypes

reward         int64
channels      object
difficulty     int64
duration       int64
offer_type    object
id            object
dtype: object

In [285]:
portfolio.head(5)

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [286]:
# machine learning algorithms can only handle numerical features, change the channels column to dummy variables
# use offer_type column to create dummy variables
portfolio = pd.concat([portfolio, pd.get_dummies(portfolio['offer_type'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# drop the offer_type column
portfolio.drop('offer_type', axis=1, inplace=True)
# use channel column to create dummy variables
portfolio = pd.concat([portfolio, pd.get_dummies(portfolio['channels'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# drop channel column
portfolio.drop('channels', axis=1, inplace=True)

In [287]:
portfolio= portfolio.rename(columns={'id':'offer_id'})

In [288]:
portfolio.head()

Unnamed: 0,reward,difficulty,duration,offer_id,bogo,discount,informational,email,mobile,social,web
0,10,10,7,ae264e3637204a6fb9bb56bc8210ddfd,1,0,0,1,1,1,0
1,10,10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,1,0,0,1,1,1,1
2,0,0,4,3f207df678b143eea3cee63160fa8bed,0,0,1,1,1,0,1
3,5,5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,0,0,1,1,0,1
4,5,20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,1,0,1,0,0,1


## Profile

In [289]:
profile.dtypes

gender               object
age                   int64
id                   object
became_member_on      int64
income              float64
dtype: object

In [290]:
# change the became_member_on column to datetime
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'], format='%Y%m%d')

In [291]:
# machine learning algorithms cannot consume dates
# change became_member_on to number of days as a member 
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days

# drop became_member_on column
profile.drop('became_member_on', axis=1, inplace=True)

In [292]:
# filter for all the customers who have income data and no gender data
profile[profile['gender'].isnull() & profile['income'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [293]:
profile[profile['income'].isnull() & profile['gender'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


NOTE: because there are only 4 features and all the rows with missing genders also have missing income values, we can drop these rows

In [294]:
profile[profile['age'] >= 118]

Unnamed: 0,gender,age,id,income,days_as_member
0,,118,68be06ca386d4c31939f3a4f0e3dd783,,2326
2,,118,38fe809add3b4fcf9315a9694bb96ff5,,1811
4,,118,a03223e636434f42ac4c3df47e8bac43,,2153
6,,118,8ec6ce2a7e7949b1bf142def7d0e0586,,2101
7,,118,68617ca6246f4fbc85e91a2a49552598,,2094
...,...,...,...,...,...
16980,,118,5c686d09ca4d475a8f750f2ba07e0440,,2490
16982,,118,d9ca82f550ac4ee58b6299cf1e5c824a,,2629
16989,,118,ca45ee1883624304bac1e4c8a114f045,,1940
16991,,118,a9a20fa8b5504360beb4e7c8712f8306,,2719


NOTE: it is also the same records with invalid age values, we will drop these columns.

In [295]:
profile.shape

(17000, 5)

In [296]:
# drop nan values
profile.dropna(inplace=True)

In [297]:
profile.shape

(14825, 5)

In [298]:
print(f"{17000-14825} records dropped")

2175 records dropped


In [299]:
profile[profile['income'].isnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [300]:
profile.head()

Unnamed: 0,gender,age,id,income,days_as_member
1,F,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2173
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2240
5,M,68,e2127556f4f64592b11af22de27a7932,70000.0,1888
8,M,65,389bc3fa690240e798340f5a15918d5c,53000.0,1964
12,M,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2054


In [301]:
profile.gender.value_counts()

M    8484
F    6129
O     212
Name: gender, dtype: int64

In [302]:
# machine learning algorithms can only handle numerical features, change the gender column to dummy variables
# create gender dummy variables
profile = pd.concat([profile, pd.get_dummies(profile['gender'].apply(pd.Series).stack()).sum(level=0)], axis=1)

In [303]:
# drop channel column
profile.drop('gender', axis=1, inplace=True)

In [304]:
profile.head()

Unnamed: 0,age,id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2173,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2240,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1888,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1964,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2054,0,1,0


In [305]:
profile= profile.rename(columns={'id':'user_id'})

In [306]:
profile.head()

Unnamed: 0,age,user_id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2173,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2240,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1888,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1964,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2054,0,1,0


## Transcript

In [307]:
transcript.dtypes

person    object
event     object
value     object
time       int64
dtype: object

In [308]:
transcript.tail()

Unnamed: 0,person,event,value,time
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714
306533,c02b10e8752c4d8e9b73f918558531f7,transaction,{'amount': 4.05},714


In [309]:
transcript.sample(n=5)

Unnamed: 0,person,event,value,time
60229,af3fb8f091f44fd58d0d1f31a6d74f98,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},168
49615,23b724c1f63849088971c3f94cc839af,transaction,{'amount': 8.73},144
99095,067ee299952e49328b06da73e2ba1965,transaction,{'amount': 5.4},270
224676,399136d18bb5403aa52729510325412f,transaction,{'amount': 33.79},516
116315,e9d95a32f93e4fa4b00990812acde9c7,offer received,{'offer id': 'ae264e3637204a6fb9bb56bc8210ddfd'},336


In [310]:
# convert the json column to usable columns

In [311]:
# transcript['value_type'] = transcript['value'].apply(lambda x: list(x.keys())[0])
# transcript['value_details'] = transcript['value'].apply(lambda x: list(x.values())[0])
# transcript['value_details'] = transcript['value_details'].astype(str)
# this doesnt work for joining later

In [312]:
# unpack the value column into columns
transcript['offer_id'] = transcript['value'].apply(lambda x: x.get('offer_id') or x.get('offer id'))
transcript['amount'] = transcript['value'].apply(lambda x: x.get('amount'))

In [313]:
# drop the value column
transcript.drop(columns=['value'], inplace=True)

In [314]:
# drop all the duplicate records
transcript.drop_duplicates(inplace=True)

## Combining data

In [315]:
portfolio.head(2)

Unnamed: 0,reward,difficulty,duration,offer_id,bogo,discount,informational,email,mobile,social,web
0,10,10,7,ae264e3637204a6fb9bb56bc8210ddfd,1,0,0,1,1,1,0
1,10,10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,1,0,0,1,1,1,1


In [316]:
profile.head(2)

Unnamed: 0,age,user_id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2173,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2240,1,0,0


In [317]:
transcript.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount
153544,1dbb8d4481aa4b0c8c9b8912fb340c76,offer received,408,0b1e1539f2cc45b7b9fa7c272da2e1d7,
166775,d61124b6a76847b593f8fce78484fcce,offer viewed,408,ae264e3637204a6fb9bb56bc8210ddfd,
63682,371c782cc0b64bc5a7e473715bac0253,offer received,168,2906b810c7d4411798c6938adc9daaa5,
17961,6da8765ed47745959fd67865d3799206,offer viewed,6,4d5c57ea9a6940dd891ad53e9dbe8da0,
163341,82212ba0e8a74861bf1e20cd6f2e41ba,offer received,408,2906b810c7d4411798c6938adc9daaa5,


In [318]:
transcript.shape

(306137, 5)

In [319]:
# combine datasets on corresponding ids
combined = pd.merge(transcript, profile, left_on= 'person', right_on='user_id', how='left')
combined = pd.merge(combined, portfolio, on = 'offer_id', how='left')
combined.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,...,reward,difficulty,duration,bogo,discount,informational,email,mobile,social,web
52435,c00b91a565324027b72b928446e6a6f8,transaction,162,,0.46,33.0,c00b91a565324027b72b928446e6a6f8,73000.0,1877.0,1.0,...,,,,,,,,,,
54126,04ec49d81d154a70b3f0d5f280da3756,offer received,168,5a8bc65990b245e5a138643cd4eb9837,,65.0,04ec49d81d154a70b3f0d5f280da3756,42000.0,1935.0,0.0,...,0.0,0.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
280311,5b6c387277ba43e99c8f27dc92916cfb,offer viewed,618,f19421c1d4aa40978ebb69ca19b0e20d,,51.0,5b6c387277ba43e99c8f27dc92916cfb,105000.0,2836.0,1.0,...,5.0,5.0,5.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
70310,0ba8b605e9ab4114b55d26c267fdff7f,offer viewed,174,3f207df678b143eea3cee63160fa8bed,,56.0,0ba8b605e9ab4114b55d26c267fdff7f,59000.0,1991.0,1.0,...,0.0,0.0,4.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
296987,ef4bfeba0ad14fa79952653319943a5c,transaction,672,,4.53,28.0,ef4bfeba0ad14fa79952653319943a5c,42000.0,3455.0,0.0,...,,,,,,,,,,


In [320]:
# no records gained, correct
combined.shape

(306137, 22)

In [321]:
# find all the nan values
combined.isnull().sum()

person                 0
event                  0
time                   0
offer_id          138953
amount            167184
age                33749
user_id            33749
income             33749
days_as_member     33749
F                  33749
M                  33749
O                  33749
reward            138953
difficulty        138953
duration          138953
bogo              138953
discount          138953
informational     138953
email             138953
mobile            138953
social            138953
web               138953
dtype: int64

In [322]:
# there are no users associated with these records
combined[combined['user_id'].isnull()].sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,...,reward,difficulty,duration,bogo,discount,informational,email,mobile,social,web
253635,22e48d2f671b4bcd97de9a51363ac680,offer received,576,ae264e3637204a6fb9bb56bc8210ddfd,,,,,,,...,10.0,10.0,7.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
91847,ac76ca1ff69c456dad050ae58ad82647,transaction,234,,1.46,,,,,,...,,,,,,,,,,
245066,5ceb90c4e738427f90cd7b16d51c59bc,offer received,576,5a8bc65990b245e5a138643cd4eb9837,,,,,,,...,0.0,0.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
116808,c2a9b079816a46e79664ae0fe5858167,offer received,336,4d5c57ea9a6940dd891ad53e9dbe8da0,,,,,,,...,10.0,10.0,5.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
301803,f660a1217750485a9f2f36fd929b5575,offer viewed,696,9b98b8c7a33c4b65b9aebfe6a799e6d9,,,,,,,...,5.0,5.0,7.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0


In [323]:
profile[profile.user_id == '5ae36f912be1492199ec2da838cc6dda']

Unnamed: 0,age,user_id,income,days_as_member,F,M,O


In [324]:
# since we are interested in how users respond to offers, we can drop all the records where there is no user associated
combined.dropna(subset=['user_id'], inplace=True)

In [325]:
# find all the values where amount is null
combined[combined['amount'].isnull()].event.value_counts()
# this is expected as amount null for offer records

offer received     66501
offer viewed       49860
offer completed    32070
Name: event, dtype: int64

In [326]:
# find all the values where offer_id is null
combined[combined['offer_id'].isnull()].event.value_counts()
# this is expected as offer_id null for transaction records

transaction    123957
Name: event, dtype: int64

In [327]:
# all transaction records dont correlate with any offers necessarily (no direct link) do we need to keep them?
combined[combined['reward'].isnull()].sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,...,reward,difficulty,duration,bogo,discount,informational,email,mobile,social,web
91145,dafc626bbdac41a8ae6b4f7a9478b2b2,transaction,234,,32.21,67.0,dafc626bbdac41a8ae6b4f7a9478b2b2,103000.0,2669.0,1.0,...,,,,,,,,,,
100842,4dc0c5a72d7344b2beddf539149fa29d,transaction,276,,3.3,27.0,4dc0c5a72d7344b2beddf539149fa29d,33000.0,2114.0,0.0,...,,,,,,,,,,
264125,80914d5e89db497d9a9bfafaea38ce3a,transaction,582,,6.19,70.0,80914d5e89db497d9a9bfafaea38ce3a,33000.0,2123.0,1.0,...,,,,,,,,,,
228055,2db7b4739e3a4fc0964fce535ec05d96,transaction,528,,13.13,65.0,2db7b4739e3a4fc0964fce535ec05d96,50000.0,2063.0,1.0,...,,,,,,,,,,
69182,ae7f166083dd4a5eabca0d526fb246b2,transaction,168,,4.02,35.0,ae7f166083dd4a5eabca0d526fb246b2,70000.0,3461.0,1.0,...,,,,,,,,,,


### Split transaction and offer records for seperate analysis
As we are interested specifically in how certain users respond to offers and so tracking if a viewed offer results in a completed offer, we will split all transaction events out of the dataset for this analysis.

In [328]:
# find all the nan values
combined.isnull().sum()

person                 0
event                  0
time                   0
offer_id          123957
amount            148431
age                    0
user_id                0
income                 0
days_as_member         0
F                      0
M                      0
O                      0
reward            123957
difficulty        123957
duration          123957
bogo              123957
discount          123957
informational     123957
email             123957
mobile            123957
social            123957
web               123957
dtype: int64

In [329]:
# split the data into offer and transaction records
offer_df = combined[combined['offer_id'].notnull()]
transaction_df = combined[combined['offer_id'].isnull()]

In [330]:
print(combined.shape)
print(offer_df.shape)
print(transaction_df.shape)

(272388, 22)
(148431, 22)
(123957, 22)


#### Offer records
The offer records are our main dataset for modeling. We need to transform our data into a single line that indicated if a viewed record was completed or not. We also need to ensure that our final dataset contains feautures and target and numeric values only for modeling.

In [331]:
offer_df.sample(n=1)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,...,reward,difficulty,duration,bogo,discount,informational,email,mobile,social,web
89005,eab78e2cf26f4af494d9535ac3835f00,offer viewed,228,2906b810c7d4411798c6938adc9daaa5,,67.0,eab78e2cf26f4af494d9535ac3835f00,110000.0,2890.0,1.0,...,2.0,10.0,7.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0


In [332]:
# drop all the transactions related columns not related to offers (nan values) and other unecessary columns
offer_df.drop(columns=['amount','person'], inplace=True)
offer_df.sample(n=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_df.drop(columns=['amount','person'], inplace=True)


Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,bogo,discount,informational,email,mobile,social,web
72161,offer viewed,180,5a8bc65990b245e5a138643cd4eb9837,64.0,f35b342b24734da18a830bb9329a4fbd,66000.0,2258.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0


In [333]:
# portfolio

In [334]:
offer_df[offer_df['informational'] == 1].event.value_counts()

offer received    13300
offer viewed       9360
Name: event, dtype: int64

In [335]:
# informational offers are never completed, so we can drop them 
offer_df = offer_df[offer_df['informational'] != 1]

In [336]:
offer_df.sample(n=5)

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,bogo,discount,informational,email,mobile,social,web
119557,offer received,336,0b1e1539f2cc45b7b9fa7c272da2e1d7,25.0,9f28a2d7e8874cc39ef8ca04d4d53ff8,52000.0,2663.0,1.0,0.0,0.0,5.0,20.0,10.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
219874,offer viewed,510,4d5c57ea9a6940dd891ad53e9dbe8da0,57.0,34f4506ab17044b19e562b8d3381e19b,91000.0,2042.0,1.0,0.0,0.0,10.0,10.0,5.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
71281,offer completed,174,f19421c1d4aa40978ebb69ca19b0e20d,53.0,c89d7d9465644288b43bd4d0cce1622e,109000.0,2691.0,1.0,0.0,0.0,5.0,5.0,5.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
64411,offer received,168,fafdcd668e3743c1bb461111dcafc2a4,100.0,3b34370727654cfca5322bca2aba9ffd,96000.0,1915.0,1.0,0.0,0.0,2.0,10.0,10.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
175731,offer viewed,426,2906b810c7d4411798c6938adc9daaa5,45.0,7de67c6eb7ca46d7af5b9072727337ca,98000.0,2103.0,1.0,0.0,0.0,2.0,10.0,7.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0


In [337]:
offer_df[(offer_df.user_id == 'bea062a97557458a97f3e2df8d87755a') & (offer_df.offer_id == '4d5c57ea9a6940dd891ad53e9dbe8da0')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,bogo,discount,informational,email,mobile,social,web
206535,offer received,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2329.0,1.0,0.0,0.0,10.0,10.0,5.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
215894,offer viewed,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2329.0,1.0,0.0,0.0,10.0,10.0,5.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
231216,offer completed,534,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2329.0,1.0,0.0,0.0,10.0,10.0,5.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0


In [338]:
# use the event column to create dummy variables
offer_complete_df = pd.concat([offer_df, pd.get_dummies(offer_df['event'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# rename offer completed column to offer_completed, offer received to offer_received, offer viewed to offer_viewed
offer_complete_df.rename(columns={'offer completed':'offer_completed', 'offer received':'offer_received', 'offer viewed':'offer_viewed'}, inplace=True)


In [339]:
offer_complete_df[(offer_complete_df.user_id == 'bea062a97557458a97f3e2df8d87755a') & (offer_complete_df.offer_id == '4d5c57ea9a6940dd891ad53e9dbe8da0')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,...,bogo,discount,informational,email,mobile,social,web,offer_completed,offer_received,offer_viewed
206535,offer received,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2329.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0,1,0
215894,offer viewed,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2329.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0,0,1
231216,offer completed,534,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2329.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1,0,0


In [340]:
# create a combined id to group the data by
# we will use these groups and logic to see if an offer that was viewed was also completed
offer_complete_df['combined_id'] = offer_complete_df.apply(lambda x: x['user_id'] + x['offer_id'], axis=1)
combined_id_map = offer_complete_df.groupby('combined_id')

In [341]:
def offer_viewed_and_completed( row, combined_id_map):

    if row['event'] != 'offer viewed':
        return 0
    
    combined_id = row['user_id'] + row['offer_id']
    filtered_rows = combined_id_map.get_group(combined_id)

    # find the previous offer received event
    previous_offer_received = filtered_rows[(filtered_rows['time'] <= row['time']) & (filtered_rows['event'] == 'offer received')].tail(1)
    # find the next offer completed event
    next_offer_completed = filtered_rows[(filtered_rows['time'] >= row['time']) & (filtered_rows['event'] == 'offer completed')].head(1)

    if previous_offer_received.shape[0] == 0 or next_offer_completed.shape[0] == 0:
        return 0

    if (next_offer_completed['time'].iloc[0] - previous_offer_received['time'].iloc[0] < 24 * row['duration']):
        return 1
    
    return 0

offer_complete_df['offer_viewed_and_completed'] = offer_complete_df.apply(lambda x: offer_viewed_and_completed(x, combined_id_map), axis=1)

In [342]:
# test the logic
offer_complete_df[(offer_complete_df.user_id == '0020c2b971eb4e9188eac86d93036a77') & (offer_complete_df.offer_id == 'fafdcd668e3743c1bb461111dcafc2a4')][['event', 'offer_completed','offer_received', 'offer_viewed', 'combined_id','offer_viewed_and_completed']]

Unnamed: 0,event,offer_completed,offer_received,offer_viewed,combined_id,offer_viewed_and_completed
1889,offer received,0,1,0,0020c2b971eb4e9188eac86d93036a77fafdcd668e3743...,0
18431,offer viewed,0,0,1,0020c2b971eb4e9188eac86d93036a77fafdcd668e3743...,1
31327,offer completed,1,0,0,0020c2b971eb4e9188eac86d93036a77fafdcd668e3743...,0
112684,offer received,0,1,0,0020c2b971eb4e9188eac86d93036a77fafdcd668e3743...,0
218771,offer completed,1,0,0,0020c2b971eb4e9188eac86d93036a77fafdcd668e3743...,0


In [343]:
# test the logic
offer_complete_df[(offer_complete_df.user_id == '018a49ffb8cf4812903e7c1f56fbb0b0') & (offer_complete_df.offer_id == 'f19421c1d4aa40978ebb69ca19b0e20d')][['event', 'offer_completed','offer_received', 'offer_viewed', 'combined_id','offer_viewed_and_completed']]

Unnamed: 0,event,offer_completed,offer_received,offer_viewed,combined_id,offer_viewed_and_completed
4254,offer received,0,1,0,018a49ffb8cf4812903e7c1f56fbb0b0f19421c1d4aa40...,0
13653,offer viewed,0,0,1,018a49ffb8cf4812903e7c1f56fbb0b0f19421c1d4aa40...,1
34599,offer completed,1,0,0,018a49ffb8cf4812903e7c1f56fbb0b0f19421c1d4aa40...,0
205689,offer received,0,1,0,018a49ffb8cf4812903e7c1f56fbb0b0f19421c1d4aa40...,0
228530,offer viewed,0,0,1,018a49ffb8cf4812903e7c1f56fbb0b0f19421c1d4aa40...,1
239790,offer completed,1,0,0,018a49ffb8cf4812903e7c1f56fbb0b0f19421c1d4aa40...,0
249162,offer received,0,1,0,018a49ffb8cf4812903e7c1f56fbb0b0f19421c1d4aa40...,0
266409,offer viewed,0,0,1,018a49ffb8cf4812903e7c1f56fbb0b0f19421c1d4aa40...,1
269576,offer completed,1,0,0,018a49ffb8cf4812903e7c1f56fbb0b0f19421c1d4aa40...,0


In [344]:
# the logic works as expected

In [345]:
offer_complete_df.sample(n= 5)

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,...,informational,email,mobile,social,web,offer_completed,offer_received,offer_viewed,combined_id,offer_viewed_and_completed
123927,offer viewed,336,2298d6c36e964ae4a3e7e9706d1fb8c2,89.0,03cc530b458e4fb180c222777916e9b5,36000.0,2879.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0,0,1,03cc530b458e4fb180c222777916e9b52298d6c36e964a...,1
80294,offer viewed,198,2298d6c36e964ae4a3e7e9706d1fb8c2,22.0,492e3e3adea34fdf907cabaa222cf27c,71000.0,1820.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0,0,1,492e3e3adea34fdf907cabaa222cf27c2298d6c36e964a...,0
161188,offer received,408,2298d6c36e964ae4a3e7e9706d1fb8c2,43.0,1ea7547159ae462192c3576d258c2bb7,69000.0,2131.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0,1,0,1ea7547159ae462192c3576d258c2bb72298d6c36e964a...,0
2131,offer received,0,2906b810c7d4411798c6938adc9daaa5,57.0,b08f1eafc66e44e6aa486d7d33351964,93000.0,2772.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0,1,0,b08f1eafc66e44e6aa486d7d333519642906b810c7d441...,0
276851,offer completed,612,f19421c1d4aa40978ebb69ca19b0e20d,38.0,e296bfe8daed4c889800d3765c2d5d75,31000.0,2967.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1,0,0,e296bfe8daed4c889800d3765c2d5d75f19421c1d4aa40...,0


In [346]:
# as we are only interested in offers that were viewed and then completed, we can drop all the non viewed records
offer_complete_df = offer_complete_df[offer_complete_df['offer_viewed'] != 0]

In [347]:
# drop any further columns that will not be use for modeling
offer_complete_df.drop(columns=['event','offer_received','time','offer_id','user_id','combined_id','informational', 'offer_completed', 'offer_viewed'], inplace=True)
offer_complete_df.sample(n= 5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_complete_df.drop(columns=['event','offer_received','time','offer_id','user_id','combined_id','informational', 'offer_completed', 'offer_viewed'], inplace=True)


Unnamed: 0,age,income,days_as_member,F,M,O,reward,difficulty,duration,bogo,discount,email,mobile,social,web,offer_viewed_and_completed
238556,31.0,38000.0,2792.0,1.0,0.0,0.0,10.0,10.0,5.0,1.0,0.0,1.0,1.0,1.0,1.0,0
22218,71.0,71000.0,1889.0,0.0,1.0,0.0,2.0,10.0,7.0,0.0,1.0,1.0,1.0,0.0,1.0,0
66930,27.0,31000.0,1932.0,0.0,1.0,0.0,2.0,10.0,10.0,0.0,1.0,1.0,1.0,1.0,1.0,1
261681,82.0,114000.0,2150.0,0.0,1.0,0.0,2.0,10.0,10.0,0.0,1.0,1.0,1.0,1.0,1.0,1
75480,60.0,34000.0,2694.0,0.0,1.0,0.0,3.0,7.0,7.0,0.0,1.0,1.0,1.0,1.0,1.0,1


#### Transaction records
Clean transaction records for future transaction analysis.

In [348]:
transaction_df.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,...,reward,difficulty,duration,bogo,discount,informational,email,mobile,social,web
129602,13fb2c20d4b3441f958f4b9a77823ffe,transaction,342,,22.41,69.0,13fb2c20d4b3441f958f4b9a77823ffe,84000.0,2300.0,0.0,...,,,,,,,,,,
290707,7594bc509b914aa19f35ed3fbf7b0c81,transaction,648,,22.0,57.0,7594bc509b914aa19f35ed3fbf7b0c81,66000.0,1998.0,0.0,...,,,,,,,,,,
148616,4cf3fba7ec604ceab2dfbf74a8d435db,transaction,396,,5.42,60.0,4cf3fba7ec604ceab2dfbf74a8d435db,35000.0,2246.0,0.0,...,,,,,,,,,,
226500,281d18c6603f43beb05270eb41d8c2f0,transaction,522,,0.76,56.0,281d18c6603f43beb05270eb41d8c2f0,67000.0,1882.0,0.0,...,,,,,,,,,,
302157,6a71281072f7470785e859df21d3c2c5,transaction,696,,1.82,51.0,6a71281072f7470785e859df21d3c2c5,42000.0,1984.0,1.0,...,,,,,,,,,,


In [349]:
# drop all the id columns as we do not need them for analysis
transaction_df.drop(columns=['person', 'offer_id'], inplace=True)
# drop all the offer related columns not related to transactions (nan values)
transaction_df.drop(columns=['reward', 'difficulty', 'duration', 'email', 'mobile', 'social', 'web'], inplace=True)
transaction_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transaction_df.drop(columns=['person', 'offer_id'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transaction_df.drop(columns=['reward', 'difficulty', 'duration', 'email', 'mobile', 'social', 'web'], inplace=True)


Unnamed: 0,event,time,amount,age,user_id,income,days_as_member,F,M,O,bogo,discount,informational
12654,transaction,0,0.83,20.0,02c083884c7d45b39cc68e1314fec56c,30000.0,2542.0,1.0,0.0,0.0,,,
12657,transaction,0,34.56,42.0,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,96000.0,2718.0,0.0,1.0,0.0,,,
12659,transaction,0,13.23,36.0,54890f68699049c2a04d415abc25e717,56000.0,2007.0,0.0,1.0,0.0,,,
12670,transaction,0,19.51,55.0,b2f1cd155b864803ad8334cdf13c4bd2,94000.0,2080.0,1.0,0.0,0.0,,,
12671,transaction,0,18.97,39.0,fe97aa22dd3e48c8b143116a8403dd52,67000.0,2018.0,1.0,0.0,0.0,,,


#### Write data to parquet files

In [350]:
# write output data to parquet files
# combined.to_parquet('data/combined.parquet', engine='pyarrow')
# offer_df.to_parquet('data/offer_df.parquet', engine='pyarrow')
offer_complete_df.to_parquet('data/offer_complete_df.parquet', engine='pyarrow')
transaction_df.to_parquet('data/transaction_df.parquet', engine='pyarrow')