# Data Preprocessing
We want to build a model that predicts whether or not someone will respond to an offer. Many variables need to be changed to be numeric types for a machine learning model to process it. We also need to combine our data. Please see the steps required below for each of our datasets.

In [77]:
import pandas as pd
import numpy as np
import math
import json
# % matplotlib inline
import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

## Portfolio

In [78]:
portfolio.dtypes

reward         int64
channels      object
difficulty     int64
duration       int64
offer_type    object
id            object
dtype: object

In [79]:
portfolio.head(5)

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [80]:
# machine learning algorithms can only handle numerical features, change the channels column to dummy variables
# use channel column to create dummy variables
portfolio = pd.concat([portfolio, pd.get_dummies(portfolio['channels'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# drop channel column
portfolio.drop('channels', axis=1, inplace=True)

In [81]:
portfolio= portfolio.rename(columns={'id':'offer_id'})

In [82]:
portfolio.head()

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1


## Profile

In [83]:
profile.dtypes

gender               object
age                   int64
id                   object
became_member_on      int64
income              float64
dtype: object

In [84]:
# change the became_member_on column to datetime
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'], format='%Y%m%d')

In [85]:
# machine learning algorithms cannot consume dates
# change became_member_on to number of days as a member 
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days

# drop became_member_on column
profile.drop('became_member_on', axis=1, inplace=True)

In [86]:
# filter for all the customers who have income data and no gender data
profile[profile['gender'].isnull() & profile['income'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [87]:
profile[profile['income'].isnull() & profile['gender'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


NOTE: because there are only 4 features and all the rows with missing genders also have missing income values, we can drop these rows

In [88]:
profile[profile['age'] >= 118]

Unnamed: 0,gender,age,id,income,days_as_member
0,,118,68be06ca386d4c31939f3a4f0e3dd783,,2324
2,,118,38fe809add3b4fcf9315a9694bb96ff5,,1809
4,,118,a03223e636434f42ac4c3df47e8bac43,,2151
6,,118,8ec6ce2a7e7949b1bf142def7d0e0586,,2099
7,,118,68617ca6246f4fbc85e91a2a49552598,,2092
...,...,...,...,...,...
16980,,118,5c686d09ca4d475a8f750f2ba07e0440,,2488
16982,,118,d9ca82f550ac4ee58b6299cf1e5c824a,,2627
16989,,118,ca45ee1883624304bac1e4c8a114f045,,1938
16991,,118,a9a20fa8b5504360beb4e7c8712f8306,,2717


NOTE: it is also the same records with invalid age values, we will drop these columns.

In [89]:
profile.shape

(17000, 5)

In [90]:
# drop nan values
profile.dropna(inplace=True)

In [91]:
profile.shape

(14825, 5)

In [92]:
print(f"{17000-14825} records dropped")

2175 records dropped


In [93]:
profile[profile['income'].isnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [94]:
profile.head()

Unnamed: 0,gender,age,id,income,days_as_member
1,F,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2171
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2238
5,M,68,e2127556f4f64592b11af22de27a7932,70000.0,1886
8,M,65,389bc3fa690240e798340f5a15918d5c,53000.0,1962
12,M,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2052


In [95]:
profile.gender.value_counts()

M    8484
F    6129
O     212
Name: gender, dtype: int64

In [96]:
# machine learning algorithms can only handle numerical features, change the gender column to dummy variables
# create gender dummy variables
profile = pd.concat([profile, pd.get_dummies(profile['gender'].apply(pd.Series).stack()).sum(level=0)], axis=1)

In [97]:
# drop channel column
profile.drop('gender', axis=1, inplace=True)

In [98]:
profile.head()

Unnamed: 0,age,id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2171,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2238,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1886,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1962,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2052,0,1,0


In [99]:
profile= profile.rename(columns={'id':'user_id'})

In [100]:
profile.head()

Unnamed: 0,age,user_id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2171,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2238,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1886,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1962,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2052,0,1,0


## Transcript

In [101]:
transcript.dtypes

person    object
event     object
value     object
time       int64
dtype: object

In [102]:
transcript.tail()

Unnamed: 0,person,event,value,time
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714
306533,c02b10e8752c4d8e9b73f918558531f7,transaction,{'amount': 4.05},714


In [103]:
transcript[transcript.event == 'transaction']

Unnamed: 0,person,event,value,time
12654,02c083884c7d45b39cc68e1314fec56c,transaction,{'amount': 0.8300000000000001},0
12657,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction,{'amount': 34.56},0
12659,54890f68699049c2a04d415abc25e717,transaction,{'amount': 13.23},0
12670,b2f1cd155b864803ad8334cdf13c4bd2,transaction,{'amount': 19.51},0
12671,fe97aa22dd3e48c8b143116a8403dd52,transaction,{'amount': 18.97},0
...,...,...,...,...
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714


In [104]:
transcript.sample(n=5)

Unnamed: 0,person,event,value,time
80759,10b64048925443fd8ec2792ea63de78c,offer completed,{'offer_id': '9b98b8c7a33c4b65b9aebfe6a799e6d9...,198
60566,37e4b2119d754558bebf92f3a06b9650,offer received,{'offer id': '5a8bc65990b245e5a138643cd4eb9837'},168
279829,4438fece00ed4f1a8b108c3cd85b19c1,offer completed,{'offer_id': '0b1e1539f2cc45b7b9fa7c272da2e1d7...,618
263687,d087fd0166404163b7d1e1e7cf2a9ac7,transaction,{'amount': 5.57},582
68409,306f60b7ff784c2eaa236a24498c8507,transaction,{'amount': 0.8200000000000001},168


In [105]:
# convert the json column to usable columns

In [106]:
# transcript['value_type'] = transcript['value'].apply(lambda x: list(x.keys())[0])
# transcript['value_details'] = transcript['value'].apply(lambda x: list(x.values())[0])
# transcript['value_details'] = transcript['value_details'].astype(str)
# this doesnt work for joining later

In [107]:
# unpack the value column
transcript['offer_id'] = transcript['value'].apply(lambda x: x.get('offer_id') or x.get('offer id'))
transcript['amount'] = transcript['value'].apply(lambda x: x.get('amount'))

In [108]:
# drop the value column
transcript.drop(columns=['value'], inplace=True)

In [109]:
# drop all the duplicate records
transcript.drop_duplicates(inplace=True)

In [110]:
# transcript= transcript.rename(columns={'person':'user_id'})

## Combining data

In [111]:
portfolio.head(2)

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1


In [112]:
profile.head(2)

Unnamed: 0,age,user_id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2171,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2238,1,0,0


In [113]:
transcript.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount
82300,2d417224c49c4d1eb3ad45275ae43662,offer viewed,204,2298d6c36e964ae4a3e7e9706d1fb8c2,
114930,e7ebd61ead2d4958a32dc214939a03f9,offer received,336,fafdcd668e3743c1bb461111dcafc2a4,
18951,d4a7ab8bfb7940479867e911e984505b,offer viewed,12,2298d6c36e964ae4a3e7e9706d1fb8c2,
253432,3b65578368194a428d38390dd27cd447,offer received,576,5a8bc65990b245e5a138643cd4eb9837,
293213,20570cb0c03947699a75887bb6a65145,transaction,660,,12.52


In [114]:
transcript.shape

(306137, 5)

In [115]:
combined = pd.merge(transcript, profile, left_on= 'person', right_on='user_id', how='left')
combined = pd.merge(combined, portfolio, on = 'offer_id', how='left')
combined.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
96318,e110e63527c24ad1b482f76acde24a42,offer completed,252,f19421c1d4aa40978ebb69ca19b0e20d,,54.0,e110e63527c24ad1b482f76acde24a42,94000.0,2090.0,0.0,1.0,0.0,5.0,5.0,5.0,bogo,1.0,1.0,1.0,1.0
284950,b56e747d7f6641c88cacc52c5eaebf57,offer viewed,630,fafdcd668e3743c1bb461111dcafc2a4,,31.0,b56e747d7f6641c88cacc52c5eaebf57,58000.0,2643.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
143801,390d897a08e94263bc968889972bb65c,transaction,384,,11.16,,,,,,,,,,,,,,,
128952,c706c573a9114bd48e87cdae246b1afb,transaction,342,,23.81,64.0,c706c573a9114bd48e87cdae246b1afb,72000.0,2618.0,1.0,0.0,0.0,,,,,,,,
78120,8e6a1fd8b5fb499ba8e765db85803e7d,transaction,192,,22.0,58.0,8e6a1fd8b5fb499ba8e765db85803e7d,82000.0,2754.0,1.0,0.0,0.0,,,,,,,,


In [116]:
# no records gained, correct
combined.shape

(306137, 20)

In [117]:
# find all the nan values
combined.isnull().sum()

person                 0
event                  0
time                   0
offer_id          138953
amount            167184
age                33749
user_id            33749
income             33749
days_as_member     33749
F                  33749
M                  33749
O                  33749
reward            138953
difficulty        138953
duration          138953
offer_type        138953
email             138953
mobile            138953
social            138953
web               138953
dtype: int64

In [118]:
# there are no users associated with these records
combined[combined['user_id'].isnull()].sample(n=15)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
251744,c006941260674255b5b74fe2d6bdb217,offer received,576,9b98b8c7a33c4b65b9aebfe6a799e6d9,,,,,,,,,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
99540,3bc207e9e6094bac989a24489c370765,transaction,270,,7.17,,,,,,,,,,,,,,,
202119,d5693531c45e49bda3af20db740101af,offer received,504,2906b810c7d4411798c6938adc9daaa5,,,,,,,,,2.0,10.0,7.0,discount,1.0,1.0,0.0,1.0
214021,4342cb3414534aaca96ac523edc82654,offer received,504,0b1e1539f2cc45b7b9fa7c272da2e1d7,,,,,,,,,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
13152,b685bf9de0794b44ba299464a95aa679,offer viewed,0,2298d6c36e964ae4a3e7e9706d1fb8c2,,,,,,,,,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
138744,125562e434cd48fd9297a5f67111d5ce,transaction,366,,1.92,,,,,,,,,,,,,,,
68392,62d6bfcd9c554637b92dcd556003a2e2,transaction,168,,5.78,,,,,,,,,,,,,,,
269545,1bdb72b74a1c45dfaa55fde628c23580,offer viewed,594,ae264e3637204a6fb9bb56bc8210ddfd,,,,,,,,,10.0,10.0,7.0,bogo,1.0,1.0,1.0,0.0
120472,e15d54f8aaab498fa20d77365b91061f,offer received,336,4d5c57ea9a6940dd891ad53e9dbe8da0,,,,,,,,,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
93909,2bac7b1a8ffe4fb8bb3be7f06d8f0db4,transaction,246,,0.61,,,,,,,,,,,,,,,


In [119]:
profile[profile.user_id == '5ae36f912be1492199ec2da838cc6dda']

Unnamed: 0,age,user_id,income,days_as_member,F,M,O


In [120]:
# since we are interested in how users respond to offers, we can drop all the records where there is no user associated
combined.dropna(subset=['user_id'], inplace=True)

In [121]:
# all transaction records dont correlate with any offers necessarily (no direct link) do we need to keep them?
combined[combined['reward'].isnull()].sample(n=15)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
141736,fff8957ea8b240a6b5e634b6ee8eafcf,transaction,378,,3.42,71.0,fff8957ea8b240a6b5e634b6ee8eafcf,56000.0,1953.0,0.0,1.0,0.0,,,,,,,,
96912,7eef6a95040a49e2aa8863beb943d381,transaction,258,,19.28,77.0,7eef6a95040a49e2aa8863beb943d381,66000.0,2253.0,1.0,0.0,0.0,,,,,,,,
24128,babea5d61c914da09740cd3bf4322782,transaction,24,,14.65,75.0,babea5d61c914da09740cd3bf4322782,94000.0,2296.0,0.0,1.0,0.0,,,,,,,,
268523,86d03d35d7e0434b935e7743e83be3a0,transaction,588,,11.42,57.0,86d03d35d7e0434b935e7743e83be3a0,53000.0,2372.0,1.0,0.0,0.0,,,,,,,,
106911,d178bcf02f2640d0abfd6a720d8619ca,transaction,306,,16.23,50.0,d178bcf02f2640d0abfd6a720d8619ca,62000.0,1868.0,0.0,1.0,0.0,,,,,,,,
229629,43fbc1418ee14268a5d3797006cc69be,transaction,528,,15.23,55.0,43fbc1418ee14268a5d3797006cc69be,81000.0,2025.0,1.0,0.0,0.0,,,,,,,,
49958,8286ac169418406d8845180f46634ade,transaction,144,,14.57,56.0,8286ac169418406d8845180f46634ade,57000.0,1902.0,1.0,0.0,0.0,,,,,,,,
258848,3a9a78a58d3548798dc4b1b518ea64cf,transaction,576,,17.5,71.0,3a9a78a58d3548798dc4b1b518ea64cf,94000.0,2285.0,0.0,1.0,0.0,,,,,,,,
301388,7ee8608d770e4c469c1589c1fe5869f9,transaction,690,,2.69,58.0,7ee8608d770e4c469c1589c1fe5869f9,34000.0,3024.0,1.0,0.0,0.0,,,,,,,,
180757,a9fbb81cea3a41eab9cdf1d2952c9cd4,transaction,438,,3.07,47.0,a9fbb81cea3a41eab9cdf1d2952c9cd4,44000.0,3439.0,0.0,1.0,0.0,,,,,,,,


In [122]:
# find all the values where amount is null
combined[combined['amount'].isnull()].event.value_counts()
# this is expected as amount null for offer records

offer received     66501
offer viewed       49860
offer completed    32070
Name: event, dtype: int64

In [123]:
# find all the values where offer_id is null
combined[combined['offer_id'].isnull()].event.value_counts()
# this is expected as offer_id null for transaction records

transaction    123957
Name: event, dtype: int64

### Split transaction and offer records for analysis


In [124]:
# find all the nan values
combined.isnull().sum()

person                 0
event                  0
time                   0
offer_id          123957
amount            148431
age                    0
user_id                0
income                 0
days_as_member         0
F                      0
M                      0
O                      0
reward            123957
difficulty        123957
duration          123957
offer_type        123957
email             123957
mobile            123957
social            123957
web               123957
dtype: int64

In [125]:
# split the data into offer and transaction records
offer_df = combined[combined['offer_id'].notnull()]
transaction_df = combined[combined['offer_id'].isnull()]

In [126]:
print(combined.shape)
print(offer_df.shape)
print(transaction_df.shape)

(272388, 20)
(148431, 20)
(123957, 20)


#### Transaction records

In [127]:
transaction_df.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
230881,325401ecec2e4570833f25eafccf16da,transaction,534,,2.04,28.0,325401ecec2e4570833f25eafccf16da,31000.0,1930.0,0.0,1.0,0.0,,,,,,,,
290122,1d2bac27fdf84d6aac5f19d3f8f6d3a0,transaction,648,,1.83,45.0,1d2bac27fdf84d6aac5f19d3f8f6d3a0,60000.0,2774.0,0.0,1.0,0.0,,,,,,,,
104958,8df3bb97a75a47b08c87d2cc3187eeb7,transaction,300,,3.9,74.0,8df3bb97a75a47b08c87d2cc3187eeb7,73000.0,1917.0,0.0,1.0,0.0,,,,,,,,
292275,5af71c1246834a02b6d671e3b93f8695,transaction,654,,9.49,19.0,5af71c1246834a02b6d671e3b93f8695,60000.0,2480.0,1.0,0.0,0.0,,,,,,,,
293344,23e9ff13012844d2b66bed7c0195469b,transaction,660,,6.05,60.0,23e9ff13012844d2b66bed7c0195469b,56000.0,2533.0,0.0,1.0,0.0,,,,,,,,


In [128]:
# drop all the id columns as we do not need them for modelling
transaction_df.drop(columns=['person', 'offer_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transaction_df.drop(columns=['person', 'offer_id'], inplace=True)


In [129]:
# drop all the offer related columns not related to transactions (nan values)
transaction_df.drop(columns=['reward', 'difficulty', 'duration', 'offer_type', 'email', 'mobile', 'social', 'web'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transaction_df.drop(columns=['reward', 'difficulty', 'duration', 'offer_type', 'email', 'mobile', 'social', 'web'], inplace=True)


In [130]:
transaction_df.head()

Unnamed: 0,event,time,amount,age,user_id,income,days_as_member,F,M,O
12654,transaction,0,0.83,20.0,02c083884c7d45b39cc68e1314fec56c,30000.0,2540.0,1.0,0.0,0.0
12657,transaction,0,34.56,42.0,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,96000.0,2716.0,0.0,1.0,0.0
12659,transaction,0,13.23,36.0,54890f68699049c2a04d415abc25e717,56000.0,2005.0,0.0,1.0,0.0
12670,transaction,0,19.51,55.0,b2f1cd155b864803ad8334cdf13c4bd2,94000.0,2078.0,1.0,0.0,0.0
12671,transaction,0,18.97,39.0,fe97aa22dd3e48c8b143116a8403dd52,67000.0,2016.0,1.0,0.0,0.0


#### Offer records

In [131]:
offer_df.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
183366,e685472e5137400db362c80bc216a915,offer completed,444,fafdcd668e3743c1bb461111dcafc2a4,,90.0,e685472e5137400db362c80bc216a915,99000.0,2175.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
6078,3da3248fc0c2453fbc0246e3df717cf2,offer received,0,3f207df678b143eea3cee63160fa8bed,,21.0,3da3248fc0c2453fbc0246e3df717cf2,62000.0,2405.0,0.0,1.0,0.0,0.0,0.0,4.0,informational,1.0,1.0,0.0,1.0
201878,c224c3eb8ca34ea0b5c5f2981cb4085f,offer received,504,fafdcd668e3743c1bb461111dcafc2a4,,50.0,c224c3eb8ca34ea0b5c5f2981cb4085f,63000.0,1993.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
247464,caad2d7ff50c4ccebbfe61ccb6ef9998,offer received,576,4d5c57ea9a6940dd891ad53e9dbe8da0,,56.0,caad2d7ff50c4ccebbfe61ccb6ef9998,72000.0,2724.0,0.0,1.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
71229,ef17a961d61f4ad0852a77af25b8756d,offer viewed,174,2298d6c36e964ae4a3e7e9706d1fb8c2,,27.0,ef17a961d61f4ad0852a77af25b8756d,72000.0,1814.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0


In [132]:
# drop all the transaction related columns not related to offers (nan values)
offer_df.drop(columns=['amount'], inplace=True)
offer_df.drop(columns=['person'], inplace=True)

# the time column is no of interesting to us, we can drop it
# offer_df.drop(columns=['time'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_df.drop(columns=['amount'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_df.drop(columns=['person'], inplace=True)


In [133]:
portfolio

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1


In [134]:
offer_df[offer_df['offer_type'] == 'informational'].event.value_counts()

offer received    13300
offer viewed       9360
Name: event, dtype: int64

In [135]:
# informational offers are never completed, so we can drop them 
offer_df = offer_df[offer_df['offer_type'] != 'informational']

In [136]:
# drop all the events that are offer received
# offer_df = offer_df[offer_df['event'] != 'offer received']

In [137]:
offer_df.sample(n=5)

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
292031,offer viewed,654,4d5c57ea9a6940dd891ad53e9dbe8da0,92.0,2b45d1b9f77440538af83122e3d14a6c,65000.0,2661.0,1.0,0.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
207942,offer received,504,ae264e3637204a6fb9bb56bc8210ddfd,24.0,48fe136d44a84d59a84c56a975672368,32000.0,3040.0,0.0,1.0,0.0,10.0,10.0,7.0,bogo,1.0,1.0,1.0,0.0
61219,offer received,168,ae264e3637204a6fb9bb56bc8210ddfd,57.0,5fc8181b4b6646df8f4c9f22f0cf0f63,49000.0,2227.0,1.0,0.0,0.0,10.0,10.0,7.0,bogo,1.0,1.0,1.0,0.0
247249,offer received,576,0b1e1539f2cc45b7b9fa7c272da2e1d7,51.0,138ed93630ec4e59bda8aed2d92f42eb,57000.0,2083.0,0.0,1.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
161974,offer received,408,2298d6c36e964ae4a3e7e9706d1fb8c2,35.0,3d3bad0437e3459d873772f8023653ac,64000.0,2488.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0


In [138]:
offer_df[(offer_df.user_id == 'bea062a97557458a97f3e2df8d87755a') & (offer_df.offer_id == '4d5c57ea9a6940dd891ad53e9dbe8da0')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
206535,offer received,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2327.0,1.0,0.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
215894,offer viewed,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2327.0,1.0,0.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
231216,offer completed,534,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2327.0,1.0,0.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0


In [139]:
# use the event column to create dummy variables
offer_complete_df = pd.concat([offer_df, pd.get_dummies(offer_df['event'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# rename offer completed column to offer_completed, offer received to offer_received, offer viewed to offer_viewed
offer_complete_df.rename(columns={'offer completed':'offer_completed', 'offer received':'offer_received', 'offer viewed':'offer_viewed'}, inplace=True)
# drop event column
# offer_complete_df.drop('event', axis=1, inplace=True)

In [140]:
offer_complete_df[(offer_complete_df.user_id == 'bea062a97557458a97f3e2df8d87755a') & (offer_complete_df.offer_id == '4d5c57ea9a6940dd891ad53e9dbe8da0')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,...,difficulty,duration,offer_type,email,mobile,social,web,offer_completed,offer_received,offer_viewed
206535,offer received,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2327.0,1.0,0.0,0.0,...,10.0,5.0,bogo,1.0,1.0,1.0,1.0,0,1,0
215894,offer viewed,504,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2327.0,1.0,0.0,0.0,...,10.0,5.0,bogo,1.0,1.0,1.0,1.0,0,0,1
231216,offer completed,534,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,bea062a97557458a97f3e2df8d87755a,99000.0,2327.0,1.0,0.0,0.0,...,10.0,5.0,bogo,1.0,1.0,1.0,1.0,1,0,0


In [141]:
# group by user_id and offer_id and sum the dummy variables
# offer_complete_df = offer_complete_df.groupby(['user_id', 'offer_id']).sum().reset_index()

In [170]:
offer_complete_df['combined_id'] = offer_complete_df.apply(lambda x: x['user_id'] + x['offer_id'], axis=1)
combined_id_map = offer_complete_df.groupby('combined_id')

In [171]:

def offer_viewed_and_completed( row, combined_id_map):

    if row['event'] != 'offer viewed':
        return 0
    
    combined_id = row['user_id'] + row['offer_id']
    filtered_rows = combined_id_map.get_group(combined_id)

    # find the previous offer received event
    previous_offer_received = filtered_rows[(filtered_rows['time'] <= row['time']) & (filtered_rows['event'] == 'offer received')].tail(1)
    # find the next offer completed event
    next_offer_completed = filtered_rows[(filtered_rows['time'] >= row['time']) & (filtered_rows['event'] == 'offer completed')].head(1)

    if previous_offer_received.shape[0] == 0 or next_offer_completed.shape[0] == 0:
        return 0

    if (next_offer_completed['time'].iloc[0] - previous_offer_received['time'].iloc[0] < 24 * row['duration']):
        return 1
    
    return 0

offer_complete_df['offer_viewed_and_completed'] = offer_complete_df.apply(lambda x: offer_viewed_and_completed(x, combined_id_map), axis=1)

UnboundLocalError: local variable 'count' referenced before assignment

In [None]:
offer_complete_df

In [None]:
fgd

In [None]:
offer_complete_df

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,...,duration,offer_type,email,mobile,social,web,offer_completed,offer_received,offer_viewed,offer_viewed_and_completed
0,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,75.0,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2238.0,1.0,0.0,0.0,...,7.0,bogo,1.0,1.0,0.0,1.0,0,1,0,0
2,offer received,0,2906b810c7d4411798c6938adc9daaa5,68.0,e2127556f4f64592b11af22de27a7932,70000.0,1886.0,0.0,1.0,0.0,...,7.0,discount,1.0,1.0,0.0,1.0,0,1,0,0
5,offer received,0,f19421c1d4aa40978ebb69ca19b0e20d,65.0,389bc3fa690240e798340f5a15918d5c,53000.0,1962.0,0.0,1.0,0.0,...,5.0,bogo,1.0,1.0,1.0,1.0,0,1,0,0
8,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,61.0,aa4862eba776480b8bb9c68455b8c2e1,57000.0,2113.0,1.0,0.0,0.0,...,10.0,discount,1.0,0.0,0.0,1.0,0,1,0,0
9,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,62.0,31dda685af34476cad5bc968bdb01c53,71000.0,2691.0,1.0,0.0,0.0,...,10.0,discount,1.0,0.0,0.0,1.0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306073,offer completed,714,fafdcd668e3743c1bb461111dcafc2a4,58.0,452cdae2c68e4732a4177d3929562690,86000.0,1826.0,0.0,1.0,0.0,...,10.0,discount,1.0,1.0,1.0,1.0,1,0,0,0
306078,offer completed,714,2298d6c36e964ae4a3e7e9706d1fb8c2,56.0,0c027f5f34dd4b9eba0a25785c611273,61000.0,2070.0,0.0,1.0,0.0,...,7.0,discount,1.0,1.0,1.0,1.0,1,0,0,0
306110,offer viewed,714,0b1e1539f2cc45b7b9fa7c272da2e1d7,60.0,8dda575c2a1d44b9ac8e8b07b93d1f8e,64000.0,2116.0,1.0,0.0,0.0,...,10.0,discount,1.0,0.0,0.0,1.0,0,0,1,1
306112,offer completed,714,fafdcd668e3743c1bb461111dcafc2a4,39.0,8431c16f8e1d440880db371a68f82dd0,39000.0,1824.0,0.0,1.0,0.0,...,10.0,discount,1.0,1.0,1.0,1.0,1,0,0,0


In [None]:
# we not have a single record for each user and offer which indicates if an offer was received, viewed and completed or not

In [None]:
# this should return no results, checking data integrity
offer_complete_df[(offer_complete_df['offer_viewed'] == 1) & (offer_complete_df['offer_received'] ==0)]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,...,duration,offer_type,email,mobile,social,web,offer_completed,offer_received,offer_viewed,offer_viewed_and_completed
12650,offer viewed,0,f19421c1d4aa40978ebb69ca19b0e20d,65.0,389bc3fa690240e798340f5a15918d5c,53000.0,1962.0,0.0,1.0,0.0,...,5.0,bogo,1.0,1.0,1.0,1.0,0,0,1,1
12652,offer viewed,0,4d5c57ea9a6940dd891ad53e9dbe8da0,69.0,102e9454054946fda62242d2e176fdce,57000.0,2506.0,1.0,0.0,0.0,...,5.0,bogo,1.0,1.0,1.0,1.0,0,0,1,1
12653,offer viewed,0,ae264e3637204a6fb9bb56bc8210ddfd,20.0,02c083884c7d45b39cc68e1314fec56c,30000.0,2540.0,1.0,0.0,0.0,...,7.0,bogo,1.0,1.0,1.0,0.0,0,0,1,1
12656,offer viewed,0,2906b810c7d4411798c6938adc9daaa5,42.0,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,96000.0,2716.0,0.0,1.0,0.0,...,7.0,discount,1.0,1.0,0.0,1.0,0,0,1,1
12660,offer viewed,0,2298d6c36e964ae4a3e7e9706d1fb8c2,53.0,8c7df0c393db488aac3e58b06a7ea5f9,72000.0,2545.0,1.0,0.0,0.0,...,7.0,discount,1.0,1.0,1.0,1.0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305942,offer viewed,714,2298d6c36e964ae4a3e7e9706d1fb8c2,44.0,d1a824d43588413981d90146cc781a13,66000.0,1866.0,0.0,1.0,0.0,...,7.0,discount,1.0,1.0,1.0,1.0,0,0,1,1
305996,offer viewed,714,0b1e1539f2cc45b7b9fa7c272da2e1d7,53.0,345f9431137a49b3963221a55567c4a8,77000.0,2272.0,0.0,1.0,0.0,...,10.0,discount,1.0,0.0,0.0,1.0,0,0,1,1
306009,offer viewed,714,2298d6c36e964ae4a3e7e9706d1fb8c2,64.0,eb1ebe497d654c17b2fa99e40eb3d2d0,93000.0,2047.0,1.0,0.0,0.0,...,7.0,discount,1.0,1.0,1.0,1.0,0,0,1,1
306017,offer viewed,714,f19421c1d4aa40978ebb69ca19b0e20d,66.0,8ea3ffcc740848f99c9679e3bc3c9c80,30000.0,2047.0,0.0,1.0,0.0,...,5.0,bogo,1.0,1.0,1.0,1.0,0,0,1,1


In [None]:
# as we are only interested in offers that were viewed and then completed, we can drop all the records where viewed is 0
offer_complete_df = offer_complete_df[offer_complete_df['offer_viewed'] != 0]

In [None]:
# the offer received column is no longer needed, they are all 1
offer_complete_df.drop(columns=['offer_received'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_complete_df.drop(columns=['offer_received'], inplace=True)


#### Multiple user ID/offers investigation

In [None]:
offer_complete_df.offer_received.value_counts()

AttributeError: 'DataFrame' object has no attribute 'offer_received'

In [None]:
offer_complete_df[offer_complete_df.offer_completed>1]

Unnamed: 0,user_id,offer_id,time,age,income,days_as_member,F,M,O,reward,difficulty,duration,email,mobile,social,web,offer_completed,offer_viewed
8,0020c2b971eb4e9188eac86d93036a77,fafdcd668e3743c1bb461111dcafc2a4,912,295.0,450000.0,13345.0,5.0,0.0,0.0,10.0,50.0,50.0,5.0,5.0,5.0,5.0,2,1
13,003d66b6608740288d6cc97a6903f4f0,fafdcd668e3743c1bb461111dcafc2a4,2184,156.0,438000.0,13170.0,6.0,0.0,0.0,12.0,60.0,60.0,6.0,6.0,6.0,6.0,2,2
20,004c5799adbf42868b9cff0396190900,f19421c1d4aa40978ebb69ca19b0e20d,2826,324.0,594000.0,15852.0,0.0,6.0,0.0,30.0,30.0,30.0,6.0,6.0,6.0,6.0,2,2
21,004c5799adbf42868b9cff0396190900,fafdcd668e3743c1bb461111dcafc2a4,2364,324.0,594000.0,15852.0,0.0,6.0,0.0,12.0,60.0,60.0,6.0,6.0,6.0,6.0,2,2
28,00715b6e55c3431cb56ff7307eb19675,0b1e1539f2cc45b7b9fa7c272da2e1d7,1848,290.0,595000.0,10130.0,5.0,0.0,0.0,25.0,100.0,50.0,5.0,0.0,0.0,5.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44162,fff29fb549084123bd046dbc5ceb4faa,4d5c57ea9a6940dd891ad53e9dbe8da0,2640,354.0,558000.0,12744.0,6.0,0.0,0.0,60.0,60.0,30.0,6.0,6.0,6.0,6.0,2,2
44163,fff29fb549084123bd046dbc5ceb4faa,ae264e3637204a6fb9bb56bc8210ddfd,2070,295.0,465000.0,10620.0,5.0,0.0,0.0,50.0,50.0,35.0,5.0,5.0,5.0,0.0,2,1
44172,fff7576017104bcc8677a8d63322b5e1,fafdcd668e3743c1bb461111dcafc2a4,1836,426.0,438000.0,12378.0,0.0,6.0,0.0,12.0,60.0,60.0,6.0,6.0,6.0,6.0,2,2
44176,fffad4f4828548d1b5583907f2e9906b,f19421c1d4aa40978ebb69ca19b0e20d,1476,204.0,204000.0,14064.0,0.0,6.0,0.0,30.0,30.0,30.0,6.0,6.0,6.0,6.0,2,2


In [None]:
offer_complete_df[offer_complete_df.offer_completed >2]

Unnamed: 0,user_id,offer_id,time,age,income,days_as_member,F,M,O,reward,difficulty,duration,email,mobile,social,web,offer_completed,offer_viewed
248,018a49ffb8cf4812903e7c1f56fbb0b0,f19421c1d4aa40978ebb69ca19b0e20d,3414,549.0,306000.0,21564.0,0.0,9.0,0.0,45.0,45.0,45.0,9.0,9.0,9.0,9.0,3,3
511,0335d274249f4eb6b3c51527f02a3216,4d5c57ea9a6940dd891ad53e9dbe8da0,3144,189.0,666000.0,21636.0,9.0,0.0,0.0,90.0,90.0,45.0,9.0,9.0,9.0,9.0,3,3
660,040704e99ab84cd08977858fab9b9276,f19421c1d4aa40978ebb69ca19b0e20d,4476,585.0,792000.0,18144.0,0.0,9.0,0.0,45.0,45.0,45.0,9.0,9.0,9.0,9.0,3,3
694,043bcfeacb874bbc837300701ce25870,ae264e3637204a6fb9bb56bc8210ddfd,3642,630.0,648000.0,18306.0,9.0,0.0,0.0,90.0,90.0,63.0,9.0,9.0,9.0,0.0,3,3
737,0494aa6671414fab9837fa3cd45e72bc,0b1e1539f2cc45b7b9fa7c272da2e1d7,2226,497.0,252000.0,18844.0,7.0,0.0,0.0,35.0,140.0,70.0,7.0,0.0,0.0,7.0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43681,fcdc9e86b659499d882a4714ea53e974,ae264e3637204a6fb9bb56bc8210ddfd,2028,162.0,405000.0,21168.0,0.0,9.0,0.0,90.0,90.0,63.0,9.0,9.0,9.0,0.0,3,3
43780,fd90af4b9b784b268efa9d349a762491,fafdcd668e3743c1bb461111dcafc2a4,2790,270.0,576000.0,16650.0,0.0,9.0,0.0,18.0,90.0,90.0,9.0,9.0,9.0,9.0,3,3
43985,fee2d799672d4f81bfa3237207290f79,fafdcd668e3743c1bb461111dcafc2a4,1896,459.0,666000.0,23292.0,0.0,9.0,0.0,18.0,90.0,90.0,9.0,9.0,9.0,9.0,3,3
44098,ff932c6f8bb641bd816955337d153676,f19421c1d4aa40978ebb69ca19b0e20d,1650,585.0,684000.0,25434.0,0.0,9.0,0.0,45.0,45.0,45.0,9.0,9.0,9.0,9.0,3,3


In [None]:
offer_df[(offer_df.user_id == '0494aa6671414fab9837fa3cd45e72bc') & (offer_df.offer_id == '0b1e1539f2cc45b7b9fa7c272da2e1d7')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
10413,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
37992,offer viewed,78,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
51199,offer completed,150,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
161076,offer received,408,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
181994,offer completed,438,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
211840,offer received,504,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
290819,offer completed,648,0b1e1539f2cc45b7b9fa7c272da2e1d7,71.0,0494aa6671414fab9837fa3cd45e72bc,36000.0,2692.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0


In [None]:
offer_df[(offer_df.user_id == '0020c2b971eb4e9188eac86d93036a77') & (offer_df.offer_id == 'fafdcd668e3743c1bb461111dcafc2a4')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
1889,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
18431,offer viewed,12,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
31327,offer completed,54,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
112684,offer received,336,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
218771,offer completed,510,fafdcd668e3743c1bb461111dcafc2a4,59.0,0020c2b971eb4e9188eac86d93036a77,90000.0,2669.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0


In [None]:
offer_df[(offer_df.user_id == 'edc7b04392144da9979f3077095f268a') & (offer_df.offer_id == 'fafdcd668e3743c1bb461111dcafc2a4')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
64630,offer received,168,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
74568,offer viewed,180,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
91966,offer completed,234,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
122267,offer received,336,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
141346,offer viewed,372,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
158736,offer received,408,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
190920,offer viewed,462,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
194743,offer completed,474,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
201278,offer completed,498,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
209549,offer received,504,fafdcd668e3743c1bb461111dcafc2a4,64.0,edc7b04392144da9979f3077095f268a,51000.0,3027.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0


In [None]:
offer_df[(offer_df.user_id == '1c8cf4af93464dcaa971cfcffc2cc1e5') & (offer_df.offer_id == '2298d6c36e964ae4a3e7e9706d1fb8c2')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
9430,offer received,0,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
17478,offer completed,6,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
25652,offer viewed,30,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
62636,offer received,168,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
74147,offer viewed,180,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
80916,offer completed,198,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
210863,offer received,504,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
220863,offer viewed,510,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
229574,offer completed,528,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
254325,offer received,576,2298d6c36e964ae4a3e7e9706d1fb8c2,73.0,1c8cf4af93464dcaa971cfcffc2cc1e5,97000.0,2161.0,0.0,1.0,0.0,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0


In [None]:
offer_df[(offer_df.user_id == '073fce5708884b30a28b65b3cb15a919') & (offer_df.offer_id == '9b98b8c7a33c4b65b9aebfe6a799e6d9')]

Unnamed: 0,event,time,offer_id,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
2268,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
13189,offer viewed,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
38457,offer completed,84,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
55466,offer received,168,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
66499,offer viewed,168,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
87438,offer completed,222,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
152857,offer received,408,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
168232,offer completed,414,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
171687,offer viewed,420,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
203692,offer received,504,9b98b8c7a33c4b65b9aebfe6a799e6d9,58.0,073fce5708884b30a28b65b3cb15a919,96000.0,2354.0,0.0,1.0,0.0,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0


#### Write data to parquet files

In [None]:
# write output data to parquet files
# combined.to_parquet('data/combined.parquet', engine='pyarrow')
# offer_df.to_parquet('data/offer_df.parquet', engine='pyarrow')
offer_complete_df.to_parquet('data/offer_complete_df.parquet', engine='pyarrow')
transaction_df.to_parquet('data/transaction_df.parquet', engine='pyarrow')

In [None]:
# drop all the id columns as we do not need them for modelling
# offer_complete_df.drop(columns=['user_id', 'offer_id'], inplace=True)