# Data Preprocessing
We want to build a model that predicts whether or not someone will respond to an offer. Many variables need to be changed to be numeric types for a machine learning model to process it. We also need to combine our data. Please see the steps required below for each of our datasets.

In [181]:
import pandas as pd
import numpy as np
import math
import json
# % matplotlib inline
import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

## Portfolio

In [182]:
portfolio.dtypes

reward         int64
channels      object
difficulty     int64
duration       int64
offer_type    object
id            object
dtype: object

In [183]:
portfolio.head(5)

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [184]:
# machine learning algorithms can only handle numerical features, change the channels column to dummy variables
# use channel column to create dummy variables
portfolio = pd.concat([portfolio, pd.get_dummies(portfolio['channels'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# drop channel column
portfolio.drop('channels', axis=1, inplace=True)

In [185]:
portfolio= portfolio.rename(columns={'id':'offer_id'})

In [186]:
portfolio.head()

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1


## Profile

In [187]:
profile.dtypes

gender               object
age                   int64
id                   object
became_member_on      int64
income              float64
dtype: object

In [188]:
# change the became_member_on column to datetime
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'], format='%Y%m%d')

In [189]:
# machine learning algorithms cannot consume dates
# change became_member_on to number of days as a member 
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days

# drop became_member_on column
profile.drop('became_member_on', axis=1, inplace=True)

In [190]:
# filter for all the customers who have income data and no gender data
profile[profile['gender'].isnull() & profile['income'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [191]:
profile[profile['income'].isnull() & profile['gender'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


NOTE: because there are only 4 features and all the rows with missing genders also have missing income values, we can drop these rows

In [192]:
profile[profile['age'] >= 118]

Unnamed: 0,gender,age,id,income,days_as_member
0,,118,68be06ca386d4c31939f3a4f0e3dd783,,2325
2,,118,38fe809add3b4fcf9315a9694bb96ff5,,1810
4,,118,a03223e636434f42ac4c3df47e8bac43,,2152
6,,118,8ec6ce2a7e7949b1bf142def7d0e0586,,2100
7,,118,68617ca6246f4fbc85e91a2a49552598,,2093
...,...,...,...,...,...
16980,,118,5c686d09ca4d475a8f750f2ba07e0440,,2489
16982,,118,d9ca82f550ac4ee58b6299cf1e5c824a,,2628
16989,,118,ca45ee1883624304bac1e4c8a114f045,,1939
16991,,118,a9a20fa8b5504360beb4e7c8712f8306,,2718


NOTE: it is also the same records with invalid age values, we will drop these columns.

In [193]:
profile.shape

(17000, 5)

In [194]:
# drop nan values
profile.dropna(inplace=True)

In [195]:
profile.shape

(14825, 5)

In [196]:
print(f"{17000-14825} records dropped")

2175 records dropped


In [197]:
profile[profile['income'].isnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [198]:
profile.head()

Unnamed: 0,gender,age,id,income,days_as_member
1,F,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2172
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2239
5,M,68,e2127556f4f64592b11af22de27a7932,70000.0,1887
8,M,65,389bc3fa690240e798340f5a15918d5c,53000.0,1963
12,M,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2053


In [199]:
profile.gender.value_counts()

M    8484
F    6129
O     212
Name: gender, dtype: int64

In [200]:
# machine learning algorithms can only handle numerical features, change the gender column to dummy variables
# create gender dummy variables
profile = pd.concat([profile, pd.get_dummies(profile['gender'].apply(pd.Series).stack()).sum(level=0)], axis=1)

In [201]:
# drop channel column
profile.drop('gender', axis=1, inplace=True)

In [202]:
profile.head()

Unnamed: 0,age,id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2172,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2239,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1887,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1963,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2053,0,1,0


In [203]:
profile= profile.rename(columns={'id':'user_id'})

In [204]:
profile.head()

Unnamed: 0,age,user_id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2172,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2239,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1887,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1963,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2053,0,1,0


## Transcript

In [205]:
transcript.dtypes

person    object
event     object
value     object
time       int64
dtype: object

In [206]:
transcript.tail()

Unnamed: 0,person,event,value,time
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714
306533,c02b10e8752c4d8e9b73f918558531f7,transaction,{'amount': 4.05},714


In [207]:
transcript.sample(n=5)

Unnamed: 0,person,event,value,time
160732,c2390da2832148358ec29931c0487885,offer received,{'offer id': '2298d6c36e964ae4a3e7e9706d1fb8c2'},408
179538,370c5f0ff5764c9898e22c51748be7fe,transaction,{'amount': 0.09},432
142937,49dbb5995ab543968648ffc5500c609c,transaction,{'amount': 0.63},378
38292,eaf82df1ddce4a1e87dbd946ffbf4a39,transaction,{'amount': 3.66},84
85971,6c5bbcee7e4e4e60b093c7b4ecee8309,transaction,{'amount': 1.98},216


In [208]:
# convert the json column to usable columns

In [209]:
# transcript['value_type'] = transcript['value'].apply(lambda x: list(x.keys())[0])
# transcript['value_details'] = transcript['value'].apply(lambda x: list(x.values())[0])
# transcript['value_details'] = transcript['value_details'].astype(str)
# this doesnt work for joining later

In [210]:
# unpack the value column
transcript['offer_id'] = transcript['value'].apply(lambda x: x.get('offer_id') or x.get('offer id'))
transcript['amount'] = transcript['value'].apply(lambda x: x.get('amount'))

In [211]:
# drop the value column
transcript.drop(columns=['value'], inplace=True)

In [212]:
# drop all the duplicate records
transcript.drop_duplicates(inplace=True)

## Combining data

In [213]:
portfolio.head(2)

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1


In [214]:
profile.head(2)

Unnamed: 0,age,user_id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2172,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2239,1,0,0


In [215]:
transcript.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount
274604,2956c29de4bb442cb1959389dfc541ca,offer completed,606,fafdcd668e3743c1bb461111dcafc2a4,
9185,bc8244e584fb4f06a55201e47b9c4664,offer received,0,f19421c1d4aa40978ebb69ca19b0e20d,
110703,057f6741e0e0438e827cc0b18c601160,transaction,330,,22.35
218587,0199681637524988a14245632b8376af,offer viewed,510,ae264e3637204a6fb9bb56bc8210ddfd,
102638,8b70ca1f5f5a45bdac2ac2f990b71526,transaction,288,,47.26


In [216]:
transcript.shape

(306137, 5)

In [217]:
# combine datasets on corresponding ids
combined = pd.merge(transcript, profile, left_on= 'person', right_on='user_id', how='left')
combined = pd.merge(combined, portfolio, on = 'offer_id', how='left')
combined.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
216529,974426519c1f4e6e876a7bffeec78254,transaction,504,,1.27,,,,,,,,,,,,,,,
134700,bca5a5ccd80d4c438ee767d2da84ebaa,offer viewed,354,f19421c1d4aa40978ebb69ca19b0e20d,,49.0,bca5a5ccd80d4c438ee767d2da84ebaa,36000.0,1868.0,0.0,1.0,0.0,5.0,5.0,5.0,bogo,1.0,1.0,1.0,1.0
241241,d0de5df4a59845fc865659b10adfb80e,offer viewed,564,f19421c1d4aa40978ebb69ca19b0e20d,,35.0,d0de5df4a59845fc865659b10adfb80e,74000.0,2074.0,1.0,0.0,0.0,5.0,5.0,5.0,bogo,1.0,1.0,1.0,1.0
225260,bcfbadce76f34099a68a10adc7f86c3b,offer completed,522,2906b810c7d4411798c6938adc9daaa5,,62.0,bcfbadce76f34099a68a10adc7f86c3b,84000.0,2409.0,1.0,0.0,0.0,2.0,10.0,7.0,discount,1.0,1.0,0.0,1.0
106264,0fc637442b954432b13c360af61d4256,transaction,306,,12.98,43.0,0fc637442b954432b13c360af61d4256,57000.0,2884.0,1.0,0.0,0.0,,,,,,,,


In [218]:
# no records gained, correct
combined.shape

(306137, 20)

In [219]:
# find all the nan values
combined.isnull().sum()

person                 0
event                  0
time                   0
offer_id          138953
amount            167184
age                33749
user_id            33749
income             33749
days_as_member     33749
F                  33749
M                  33749
O                  33749
reward            138953
difficulty        138953
duration          138953
offer_type        138953
email             138953
mobile            138953
social            138953
web               138953
dtype: int64

In [220]:
# there are no users associated with these records
combined[combined['user_id'].isnull()].sample(n=15)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
296685,31d6a951d92b407fa8fa127aa0bf0e7e,offer completed,672,2906b810c7d4411798c6938adc9daaa5,,,,,,,,,2.0,10.0,7.0,discount,1.0,1.0,0.0,1.0
243347,8fdb780cf1844206bd605313965cacf9,transaction,570,,1.01,,,,,,,,,,,,,,,
289722,9f3a4c78181f4a138f0130ea027a2f3d,offer viewed,648,0b1e1539f2cc45b7b9fa7c272da2e1d7,,,,,,,,,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
4039,b9d07d53d8944f4b865990b52e532710,offer received,0,3f207df678b143eea3cee63160fa8bed,,,,,,,,,0.0,0.0,4.0,informational,1.0,1.0,0.0,1.0
231168,f092d429217c4a18a2c326971bbc4b7e,transaction,534,,0.36,,,,,,,,,,,,,,,
216164,5135913f61fa4f15b35c45c01ab9fab3,offer viewed,504,fafdcd668e3743c1bb461111dcafc2a4,,,,,,,,,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
122021,97a94283e9a84bb2907d7fb3402c5e26,offer received,336,0b1e1539f2cc45b7b9fa7c272da2e1d7,,,,,,,,,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
160660,29b4ba3741f040de95cba6ac2e436fa9,offer received,408,2298d6c36e964ae4a3e7e9706d1fb8c2,,,,,,,,,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
210417,143d05b3029c483a80c3ebb4fa3aa620,offer received,504,5a8bc65990b245e5a138643cd4eb9837,,,,,,,,,0.0,0.0,3.0,informational,1.0,1.0,1.0,0.0
44056,75c1b4d22fd24b35b2f0a52fd8ff1735,transaction,108,,2.48,,,,,,,,,,,,,,,


In [221]:
profile[profile.user_id == '5ae36f912be1492199ec2da838cc6dda']

Unnamed: 0,age,user_id,income,days_as_member,F,M,O


In [222]:
# since we are interested in how users respond to offers, we can drop all the records where there is no user associated
combined.dropna(subset=['user_id'], inplace=True)

In [223]:
# all transaction records dont correlate with any offers necessarily (no direct link) do we need to keep them?
combined[combined['reward'].isnull()].sample(n=15)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
304722,c1d6f806a92c46fdaacaca6b9c8d2e18,transaction,708,,29.42,46.0,c1d6f806a92c46fdaacaca6b9c8d2e18,78000.0,1852.0,0.0,1.0,0.0,,,,,,,,
302656,35dd2c597c7e43d8b8c53ade81e8350f,transaction,696,,4.11,22.0,35dd2c597c7e43d8b8c53ade81e8350f,31000.0,1943.0,1.0,0.0,0.0,,,,,,,,
147142,8bdec7a2b2264f52b7d3e168a291f429,transaction,390,,21.48,45.0,8bdec7a2b2264f52b7d3e168a291f429,66000.0,2088.0,1.0,0.0,0.0,,,,,,,,
41982,04362cece3104f4ebcb3d49ef0bc7e9c,transaction,96,,2.32,47.0,04362cece3104f4ebcb3d49ef0bc7e9c,38000.0,2808.0,0.0,1.0,0.0,,,,,,,,
194505,5a8427053973447e9f888e58b2ced8b2,transaction,474,,2.2,38.0,5a8427053973447e9f888e58b2ced8b2,73000.0,1934.0,0.0,1.0,0.0,,,,,,,,
265512,e242bb9c734c4c6093feadfd95cf5e87,transaction,588,,38.66,90.0,e242bb9c734c4c6093feadfd95cf5e87,71000.0,1852.0,1.0,0.0,0.0,,,,,,,,
25622,48c1f1b492d3451b804b81877bf957f5,transaction,30,,20.64,67.0,48c1f1b492d3451b804b81877bf957f5,77000.0,1946.0,0.0,1.0,0.0,,,,,,,,
26226,de8ea15f5f0545b6b45df1a7ae2dca11,transaction,36,,6.17,46.0,de8ea15f5f0545b6b45df1a7ae2dca11,32000.0,2086.0,0.0,1.0,0.0,,,,,,,,
233313,61767761f4f44c368b5b069170b28bd6,transaction,540,,34.78,70.0,61767761f4f44c368b5b069170b28bd6,114000.0,2813.0,0.0,1.0,0.0,,,,,,,,
173244,c79b5783fe67458d92e8559aa5e36da0,transaction,420,,1.41,88.0,c79b5783fe67458d92e8559aa5e36da0,32000.0,2403.0,1.0,0.0,0.0,,,,,,,,


In [224]:
# find all the values where amount is null
combined[combined['amount'].isnull()].event.value_counts()
# this is expected as amount null for offer records

offer received     66501
offer viewed       49860
offer completed    32070
Name: event, dtype: int64

In [225]:
# find all the values where offer_id is null
combined[combined['offer_id'].isnull()].event.value_counts()
# this is expected as offer_id null for transaction records

transaction    123957
Name: event, dtype: int64

### Split transaction and offer records for analysis


In [226]:
# find all the nan values
combined.isnull().sum()

person                 0
event                  0
time                   0
offer_id          123957
amount            148431
age                    0
user_id                0
income                 0
days_as_member         0
F                      0
M                      0
O                      0
reward            123957
difficulty        123957
duration          123957
offer_type        123957
email             123957
mobile            123957
social            123957
web               123957
dtype: int64

In [227]:
# split the data into offer and transaction records
offer_df = combined[combined['offer_id'].notnull()]
transaction_df = combined[combined['offer_id'].isnull()]

In [228]:
print(combined.shape)
print(offer_df.shape)
print(transaction_df.shape)

(272388, 20)
(148431, 20)
(123957, 20)


#### Transaction records

In [229]:
transaction_df.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
185813,cc35fd7b202a43d19eddbec7b76e01ca,transaction,450,,17.09,41.0,cc35fd7b202a43d19eddbec7b76e01ca,76000.0,2299.0,0.0,1.0,0.0,,,,,,,,
136187,4bdbd489583245e6a3a39456dccfc1d2,transaction,360,,4.08,21.0,4bdbd489583245e6a3a39456dccfc1d2,71000.0,2784.0,0.0,1.0,0.0,,,,,,,,
35571,bf60d102f38c4f2aacb09c2df487b7fc,transaction,72,,0.34,53.0,bf60d102f38c4f2aacb09c2df487b7fc,60000.0,1903.0,0.0,1.0,0.0,,,,,,,,
72296,c8c44530553d43e4aae1edf97153cf38,transaction,180,,15.03,92.0,c8c44530553d43e4aae1edf97153cf38,113000.0,1851.0,0.0,1.0,0.0,,,,,,,,
295884,fd375e03ad394a7bb85b62eb15263f14,transaction,666,,16.79,27.0,fd375e03ad394a7bb85b62eb15263f14,53000.0,2721.0,1.0,0.0,0.0,,,,,,,,


In [230]:
# drop all the id columns as we do not need them for analysis
transaction_df.drop(columns=['person', 'offer_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transaction_df.drop(columns=['person', 'offer_id'], inplace=True)


In [231]:
# drop all the offer related columns not related to transactions (nan values)
transaction_df.drop(columns=['reward', 'difficulty', 'duration', 'offer_type', 'email', 'mobile', 'social', 'web'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transaction_df.drop(columns=['reward', 'difficulty', 'duration', 'offer_type', 'email', 'mobile', 'social', 'web'], inplace=True)


In [232]:
transaction_df.head()

Unnamed: 0,event,time,amount,age,user_id,income,days_as_member,F,M,O
12654,transaction,0,0.83,20.0,02c083884c7d45b39cc68e1314fec56c,30000.0,2541.0,1.0,0.0,0.0
12657,transaction,0,34.56,42.0,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,96000.0,2717.0,0.0,1.0,0.0
12659,transaction,0,13.23,36.0,54890f68699049c2a04d415abc25e717,56000.0,2006.0,0.0,1.0,0.0
12670,transaction,0,19.51,55.0,b2f1cd155b864803ad8334cdf13c4bd2,94000.0,2079.0,1.0,0.0,0.0
12671,transaction,0,18.97,39.0,fe97aa22dd3e48c8b143116a8403dd52,67000.0,2017.0,1.0,0.0,0.0


#### Offer records

In [233]:
offer_df.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
71597,e31bac27c3554312ba7b3db96d81e70f,offer viewed,174,4d5c57ea9a6940dd891ad53e9dbe8da0,,31.0,e31bac27c3554312ba7b3db96d81e70f,73000.0,2157.0,1.0,0.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
61866,06d34b0fb66e4feea9b2c765755a8e1f,offer received,168,9b98b8c7a33c4b65b9aebfe6a799e6d9,,65.0,06d34b0fb66e4feea9b2c765755a8e1f,82000.0,2101.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
163530,e25ba3813b224ea894e9d5bdd9f1bd64,offer viewed,408,2298d6c36e964ae4a3e7e9706d1fb8c2,,49.0,e25ba3813b224ea894e9d5bdd9f1bd64,65000.0,1980.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
211143,7c26e974fad34c159fbd3804e82e701e,offer received,504,2298d6c36e964ae4a3e7e9706d1fb8c2,,41.0,7c26e974fad34c159fbd3804e82e701e,59000.0,2157.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
268446,c9b534cdac834e5d9853828816300e57,offer completed,588,f19421c1d4aa40978ebb69ca19b0e20d,,55.0,c9b534cdac834e5d9853828816300e57,53000.0,2836.0,1.0,0.0,0.0,5.0,5.0,5.0,bogo,1.0,1.0,1.0,1.0


In [235]:
# drop all the transactions related columns not related to offers (nan values)
offer_df.drop(columns=['amount'], inplace=True)
offer_df.drop(columns=['person'], inplace=True)

# the time column is no of interesting to us, we can drop it
# offer_df.drop(columns=['time'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_df.drop(columns=['amount'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_df.drop(columns=['person'], inplace=True)


In [236]:
portfolio

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1


In [237]:
offer_df[offer_df['offer_type'] == 'informational'].event.value_counts()

offer received    13300
offer viewed       9360
Name: event, dtype: int64

In [238]:
# informational offers are never completed, so we can drop them 
offer_df = offer_df[offer_df['offer_type'] != 'informational']

In [239]:
offer_df.sample(n=5)

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
251424,offer received,576,0b1e1539f2cc45b7b9fa7c272da2e1d7,63.0,61426a58378543e0b56b2836dc94967d,88000.0,1887.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
2901,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,65.0,7378619969094009af10e2097b4a5f76,53000.0,3368.0,0.0,1.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
217672,offer viewed,504,fafdcd668e3743c1bb461111dcafc2a4,44.0,234f3b42e14245349935d57b937300dd,39000.0,2039.0,0.0,1.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
257606,offer received,576,2298d6c36e964ae4a3e7e9706d1fb8c2,64.0,efba45d36ae340838b34c74b9ebb1dc2,57000.0,2805.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
15324,offer viewed,0,4d5c57ea9a6940dd891ad53e9dbe8da0,43.0,f9269e4540b84e6da9ff8c351ec3d463,69000.0,2631.0,1.0,0.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0


In [240]:
offer_df[(offer_df.user_id == 'bea062a97557458a97f3e2df8d87755a') & (offer_df.offer_id == '4d5c57ea9a6940dd891ad53e9dbe8da0')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
206535,offer received,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2328.0,1.0,0.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
215894,offer viewed,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2328.0,1.0,0.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
231216,offer completed,534,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2328.0,1.0,0.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0


In [None]:
# use the event column to create dummy variables
offer_complete_df = pd.concat([offer_df, pd.get_dummies(offer_df['event'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# rename offer completed column to offer_completed, offer received to offer_received, offer viewed to offer_viewed
offer_complete_df.rename(columns={'offer completed':'offer_completed', 'offer received':'offer_received', 'offer viewed':'offer_viewed'}, inplace=True)


In [None]:
offer_complete_df[(offer_complete_df.user_id == 'bea062a97557458a97f3e2df8d87755a') & (offer_complete_df.offer_id == '4d5c57ea9a6940dd891ad53e9dbe8da0')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,...,difficulty,duration,offer_type,email,mobile,social,web,offer_completed,offer_received,offer_viewed
206535,offer received,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2328.0,1.0,0.0,0.0,...,10.0,5.0,bogo,1.0,1.0,1.0,1.0,0,1,0
215894,offer viewed,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2328.0,1.0,0.0,0.0,...,10.0,5.0,bogo,1.0,1.0,1.0,1.0,0,0,1
231216,offer completed,534,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2328.0,1.0,0.0,0.0,...,10.0,5.0,bogo,1.0,1.0,1.0,1.0,1,0,0


In [None]:
# group by user_id and offer_id and sum the dummy variables
# offer_complete_df = offer_complete_df.groupby(['user_id', 'offer_id']).sum().reset_index()
# offer_df['combined_id'] = offer_df.apply(lambda x: x['user_id'] + x['offer_id'], axis=1)
# grp = offer_df.groupby('combined_id')
# grp.get_group('d3355ffcdb58449f9497fd76a879fbf2f19421c1d4aa40978ebb69ca19b0e20d')

In [241]:
# create a combined id to group the data by
# we will use these groups and logic to see if an offer that was viewed was also completed
offer_complete_df['combined_id'] = offer_complete_df.apply(lambda x: x['user_id'] + x['offer_id'], axis=1)
combined_id_map = offer_complete_df.groupby('combined_id')

In [242]:
def offer_viewed_and_completed( row, combined_id_map):

    if row['event'] != 'offer viewed':
        return 0
    
    combined_id = row['user_id'] + row['offer_id']
    filtered_rows = combined_id_map.get_group(combined_id)

    # find the previous offer received event
    previous_offer_received = filtered_rows[(filtered_rows['time'] <= row['time']) & (filtered_rows['event'] == 'offer received')].tail(1)
    # find the next offer completed event
    next_offer_completed = filtered_rows[(filtered_rows['time'] >= row['time']) & (filtered_rows['event'] == 'offer completed')].head(1)

    if previous_offer_received.shape[0] == 0 or next_offer_completed.shape[0] == 0:
        return 0

    if (next_offer_completed['time'].iloc[0] - previous_offer_received['time'].iloc[0] < 24 * row['duration']):
        return 1
    
    return 0

offer_complete_df['offer_viewed_and_completed'] = offer_complete_df.apply(lambda x: offer_viewed_and_completed(x, combined_id_map), axis=1)

In [243]:
offer_complete_df[offer_complete_df['event'] == 'offer viewed'].sample(n=5)

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,...,offer_type,email,mobile,social,web,offer_completed,offer_received,offer_viewed,combined_id,offer_viewed_and_completed
261301,offer viewed,576,4d5c57ea9a6940dd891ad53e9dbe8da0,52.0,5532539d40724979849cc7b6e8cda9a9,58000.0,2520.0,1.0,0.0,0.0,...,bogo,1.0,1.0,1.0,1.0,0,0,1,5532539d40724979849cc7b6e8cda9a94d5c57ea9a6940...,1
188534,offer viewed,456,f19421c1d4aa40978ebb69ca19b0e20d,93.0,bfc1e0be52a141b5a89f3410d5f9eb37,73000.0,2111.0,1.0,0.0,0.0,...,bogo,1.0,1.0,1.0,1.0,0,0,1,bfc1e0be52a141b5a89f3410d5f9eb37f19421c1d4aa40...,0
304198,offer viewed,708,9b98b8c7a33c4b65b9aebfe6a799e6d9,41.0,69f965c1424d4b479a66639ea4e92285,68000.0,2625.0,0.0,1.0,0.0,...,bogo,1.0,1.0,0.0,1.0,0,0,1,69f965c1424d4b479a66639ea4e922859b98b8c7a33c4b...,0
260007,offer viewed,576,2298d6c36e964ae4a3e7e9706d1fb8c2,71.0,3381f8550e8944c89cdadf97a3461823,52000.0,2278.0,0.0,1.0,0.0,...,discount,1.0,1.0,1.0,1.0,0,0,1,3381f8550e8944c89cdadf97a34618232298d6c36e964a...,1
273244,offer viewed,600,f19421c1d4aa40978ebb69ca19b0e20d,72.0,3706f0e6065f445d8cf6ee400e5ecfb8,97000.0,2343.0,1.0,0.0,0.0,...,bogo,1.0,1.0,1.0,1.0,0,0,1,3706f0e6065f445d8cf6ee400e5ecfb8f19421c1d4aa40...,0


In [244]:
offer_complete_df[(offer_complete_df.user_id == '5532539d40724979849cc7b6e8cda9a9') & (offer_complete_df.offer_id == '4d5c57ea9a6940dd891ad53e9dbe8da0')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,...,offer_type,email,mobile,social,web,offer_completed,offer_received,offer_viewed,combined_id,offer_viewed_and_completed
255864,offer received,576,4d5c57ea9a6940dd891ad53e9dbe8da0,52.0,5532539d40724979849cc7b6e8cda9a9,58000.0,2520.0,1.0,0.0,0.0,...,bogo,1.0,1.0,1.0,1.0,0,1,0,5532539d40724979849cc7b6e8cda9a94d5c57ea9a6940...,0
261301,offer viewed,576,4d5c57ea9a6940dd891ad53e9dbe8da0,52.0,5532539d40724979849cc7b6e8cda9a9,58000.0,2520.0,1.0,0.0,0.0,...,bogo,1.0,1.0,1.0,1.0,0,0,1,5532539d40724979849cc7b6e8cda9a94d5c57ea9a6940...,1
268148,offer completed,588,4d5c57ea9a6940dd891ad53e9dbe8da0,52.0,5532539d40724979849cc7b6e8cda9a9,58000.0,2520.0,1.0,0.0,0.0,...,bogo,1.0,1.0,1.0,1.0,1,0,0,5532539d40724979849cc7b6e8cda9a94d5c57ea9a6940...,0


In [None]:
# drop all the events that are offer received
# offer_df = offer_df[offer_df['event'] != 'offer received']
# drop event column
# offer_complete_df.drop('event', axis=1, inplace=True)

In [None]:
# we not have a single record for each user and offer which indicates if an offer was received, viewed and completed or not

In [None]:
# as we are only interested in offers that were viewed and then completed, we can drop all the records where viewed is 0
offer_complete_df = offer_complete_df[offer_complete_df['offer_viewed'] != 0]

In [None]:
# the offer received column is no longer needed, they are all 1
offer_complete_df.drop(columns=['offer_received'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_complete_df.drop(columns=['offer_received'], inplace=True)


#### Multiple user ID/offers investigation

In [246]:
offer_complete_df.offer_received.value_counts()

0    72570
1    53201
Name: offer_received, dtype: int64

In [None]:
offer_complete_df[offer_complete_df.offer_completed>1]

Unnamed: 0,user_id,offer_id,time,age,income,days_as_member,F,M,O,reward,difficulty,duration,email,mobile,social,web,offer_completed,offer_viewed
8,0020c2b971eb4e9188eac86d93036a77,fafdcd668e3743c1bb461111dcafc2a4,912,295.0,450000.0,13345.0,5.0,0.0,0.0,10.0,50.0,50.0,5.0,5.0,5.0,5.0,2,1
13,003d66b6608740288d6cc97a6903f4f0,fafdcd668e3743c1bb461111dcafc2a4,2184,156.0,438000.0,13170.0,6.0,0.0,0.0,12.0,60.0,60.0,6.0,6.0,6.0,6.0,2,2
20,004c5799adbf42868b9cff0396190900,f19421c1d4aa40978ebb69ca19b0e20d,2826,324.0,594000.0,15852.0,0.0,6.0,0.0,30.0,30.0,30.0,6.0,6.0,6.0,6.0,2,2
21,004c5799adbf42868b9cff0396190900,fafdcd668e3743c1bb461111dcafc2a4,2364,324.0,594000.0,15852.0,0.0,6.0,0.0,12.0,60.0,60.0,6.0,6.0,6.0,6.0,2,2
28,00715b6e55c3431cb56ff7307eb19675,0b1e1539f2cc45b7b9fa7c272da2e1d7,1848,290.0,595000.0,10130.0,5.0,0.0,0.0,25.0,100.0,50.0,5.0,0.0,0.0,5.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44162,fff29fb549084123bd046dbc5ceb4faa,4d5c57ea9a6940dd891ad53e9dbe8da0,2640,354.0,558000.0,12744.0,6.0,0.0,0.0,60.0,60.0,30.0,6.0,6.0,6.0,6.0,2,2
44163,fff29fb549084123bd046dbc5ceb4faa,ae264e3637204a6fb9bb56bc8210ddfd,2070,295.0,465000.0,10620.0,5.0,0.0,0.0,50.0,50.0,35.0,5.0,5.0,5.0,0.0,2,1
44172,fff7576017104bcc8677a8d63322b5e1,fafdcd668e3743c1bb461111dcafc2a4,1836,426.0,438000.0,12378.0,0.0,6.0,0.0,12.0,60.0,60.0,6.0,6.0,6.0,6.0,2,2
44176,fffad4f4828548d1b5583907f2e9906b,f19421c1d4aa40978ebb69ca19b0e20d,1476,204.0,204000.0,14064.0,0.0,6.0,0.0,30.0,30.0,30.0,6.0,6.0,6.0,6.0,2,2


In [None]:
offer_complete_df[offer_complete_df.offer_completed >2]

Unnamed: 0,user_id,offer_id,time,age,income,days_as_member,F,M,O,reward,difficulty,duration,email,mobile,social,web,offer_completed,offer_viewed
248,018a49ffb8cf4812903e7c1f56fbb0b0,f19421c1d4aa40978ebb69ca19b0e20d,3414,549.0,306000.0,21564.0,0.0,9.0,0.0,45.0,45.0,45.0,9.0,9.0,9.0,9.0,3,3
511,0335d274249f4eb6b3c51527f02a3216,4d5c57ea9a6940dd891ad53e9dbe8da0,3144,189.0,666000.0,21636.0,9.0,0.0,0.0,90.0,90.0,45.0,9.0,9.0,9.0,9.0,3,3
660,040704e99ab84cd08977858fab9b9276,f19421c1d4aa40978ebb69ca19b0e20d,4476,585.0,792000.0,18144.0,0.0,9.0,0.0,45.0,45.0,45.0,9.0,9.0,9.0,9.0,3,3
694,043bcfeacb874bbc837300701ce25870,ae264e3637204a6fb9bb56bc8210ddfd,3642,630.0,648000.0,18306.0,9.0,0.0,0.0,90.0,90.0,63.0,9.0,9.0,9.0,0.0,3,3
737,0494aa6671414fab9837fa3cd45e72bc,0b1e1539f2cc45b7b9fa7c272da2e1d7,2226,497.0,252000.0,18844.0,7.0,0.0,0.0,35.0,140.0,70.0,7.0,0.0,0.0,7.0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43681,fcdc9e86b659499d882a4714ea53e974,ae264e3637204a6fb9bb56bc8210ddfd,2028,162.0,405000.0,21168.0,0.0,9.0,0.0,90.0,90.0,63.0,9.0,9.0,9.0,0.0,3,3
43780,fd90af4b9b784b268efa9d349a762491,fafdcd668e3743c1bb461111dcafc2a4,2790,270.0,576000.0,16650.0,0.0,9.0,0.0,18.0,90.0,90.0,9.0,9.0,9.0,9.0,3,3
43985,fee2d799672d4f81bfa3237207290f79,fafdcd668e3743c1bb461111dcafc2a4,1896,459.0,666000.0,23292.0,0.0,9.0,0.0,18.0,90.0,90.0,9.0,9.0,9.0,9.0,3,3
44098,ff932c6f8bb641bd816955337d153676,f19421c1d4aa40978ebb69ca19b0e20d,1650,585.0,684000.0,25434.0,0.0,9.0,0.0,45.0,45.0,45.0,9.0,9.0,9.0,9.0,3,3


In [None]:
offer_df[(offer_df.user_id == '0494aa6671414fab9837fa3cd45e72bc') & (offer_df.offer_id == '0b1e1539f2cc45b7b9fa7c272da2e1d7')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
10413,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
37992,offer viewed,78,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
51199,offer completed,150,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
161076,offer received,408,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
181994,offer completed,438,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
211840,offer received,504,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
290819,offer completed,648,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0


In [None]:
offer_df[(offer_df.user_id == '0020c2b971eb4e9188eac86d93036a77') & (offer_df.offer_id == 'fafdcd668e3743c1bb461111dcafc2a4')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
1889,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
18431,offer viewed,12,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
31327,offer completed,54,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
112684,offer received,336,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
218771,offer completed,510,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0


In [None]:
offer_df[(offer_df.user_id == 'edc7b04392144da9979f3077095f268a') & (offer_df.offer_id == 'fafdcd668e3743c1bb461111dcafc2a4')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
64630,offer received,168,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
74568,offer viewed,180,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
91966,offer completed,234,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
122267,offer received,336,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
141346,offer viewed,372,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
158736,offer received,408,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
190920,offer viewed,462,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
194743,offer completed,474,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
201278,offer completed,498,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
209549,offer received,504,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0


In [None]:
offer_df[(offer_df.user_id == '1c8cf4af93464dcaa971cfcffc2cc1e5') & (offer_df.offer_id == '2298d6c36e964ae4a3e7e9706d1fb8c2')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
9430,offer received,0,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
17478,offer completed,6,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
25652,offer viewed,30,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
62636,offer received,168,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
74147,offer viewed,180,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
80916,offer completed,198,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
210863,offer received,504,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
220863,offer viewed,510,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
229574,offer completed,528,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
254325,offer received,576,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0


In [None]:
offer_df[(offer_df.user_id == '073fce5708884b30a28b65b3cb15a919') & (offer_df.offer_id == '9b98b8c7a33c4b65b9aebfe6a799e6d9')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
2268,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
13189,offer viewed,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
38457,offer completed,84,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
55466,offer received,168,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
66499,offer viewed,168,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
87438,offer completed,222,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
152857,offer received,408,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
168232,offer completed,414,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
171687,offer viewed,420,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
203692,offer received,504,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0


#### Write data to parquet files

In [None]:
# write output data to parquet files
# combined.to_parquet('data/combined.parquet', engine='pyarrow')
# offer_df.to_parquet('data/offer_df.parquet', engine='pyarrow')
offer_complete_df.to_parquet('data/offer_complete_df.parquet', engine='pyarrow')
transaction_df.to_parquet('data/transaction_df.parquet', engine='pyarrow')

In [None]:
# drop all the id columns as we do not need them for modelling
# offer_complete_df.drop(columns=['user_id', 'offer_id'], inplace=True)