# Data Preprocessing

In [682]:
import pandas as pd
import numpy as np
import math
import json
# % matplotlib inline
import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

## Portfolio

In [683]:
portfolio.dtypes

reward         int64
channels      object
difficulty     int64
duration       int64
offer_type    object
id            object
dtype: object

In [684]:
portfolio.head(5)

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [685]:
# machine learning algorithms can only handle numerical features, change the channels column to dummy variables
# use channel column to create dummy variables
portfolio = pd.concat([portfolio, pd.get_dummies(portfolio['channels'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# drop channel column
portfolio.drop('channels', axis=1, inplace=True)

In [686]:
portfolio= portfolio.rename(columns={'id':'offer_id'})

In [687]:
portfolio.head()

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1


## Profile

In [688]:
profile.dtypes

gender               object
age                   int64
id                   object
became_member_on      int64
income              float64
dtype: object

In [689]:
# change the became_member_on column to datetime
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'], format='%Y%m%d')

In [690]:
# machine learning algorithms cannot consume dates
# change became_member_on to number of days as a member 
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days

# drop became_member_on column
profile.drop('became_member_on', axis=1, inplace=True)

In [691]:
# filter for all the customers who have income data and no gender data
profile[profile['gender'].isnull() & profile['income'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [692]:
profile[profile['income'].isnull() & profile['gender'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


NOTE: because there are only 4 features and all the rows with missing genders also have missing income values, we can drop these rows

In [693]:
profile[profile['age'] >= 118]

Unnamed: 0,gender,age,id,income,days_as_member
0,,118,68be06ca386d4c31939f3a4f0e3dd783,,2323
2,,118,38fe809add3b4fcf9315a9694bb96ff5,,1808
4,,118,a03223e636434f42ac4c3df47e8bac43,,2150
6,,118,8ec6ce2a7e7949b1bf142def7d0e0586,,2098
7,,118,68617ca6246f4fbc85e91a2a49552598,,2091
...,...,...,...,...,...
16980,,118,5c686d09ca4d475a8f750f2ba07e0440,,2487
16982,,118,d9ca82f550ac4ee58b6299cf1e5c824a,,2626
16989,,118,ca45ee1883624304bac1e4c8a114f045,,1937
16991,,118,a9a20fa8b5504360beb4e7c8712f8306,,2716


NOTE: it is also the same records with invalid age values, we will drop these columns.

In [694]:
profile.shape

(17000, 5)

In [695]:
# drop nan values
profile.dropna(inplace=True)

In [696]:
profile.shape

(14825, 5)

In [697]:
print(f"{17000-14825} records dropped")

2175 records dropped


In [698]:
profile[profile['income'].isnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [699]:
profile.head()

Unnamed: 0,gender,age,id,income,days_as_member
1,F,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2170
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2237
5,M,68,e2127556f4f64592b11af22de27a7932,70000.0,1885
8,M,65,389bc3fa690240e798340f5a15918d5c,53000.0,1961
12,M,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2051


In [700]:
profile.gender.value_counts()

M    8484
F    6129
O     212
Name: gender, dtype: int64

In [701]:
# machine learning algorithms can only handle numerical features, change the gender column to dummy variables
# create gender dummy variables
profile = pd.concat([profile, pd.get_dummies(profile['gender'].apply(pd.Series).stack()).sum(level=0)], axis=1)

In [702]:
# drop channel column
profile.drop('gender', axis=1, inplace=True)

In [703]:
profile.head()

Unnamed: 0,age,id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2170,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2237,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1885,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1961,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2051,0,1,0


In [704]:
profile= profile.rename(columns={'id':'user_id'})

In [705]:
profile.head()

Unnamed: 0,age,user_id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2170,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2237,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1885,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1961,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2051,0,1,0


## Transcript

In [706]:
transcript.dtypes

person    object
event     object
value     object
time       int64
dtype: object

In [707]:
transcript.tail()

Unnamed: 0,person,event,value,time
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714
306533,c02b10e8752c4d8e9b73f918558531f7,transaction,{'amount': 4.05},714


In [708]:
transcript[transcript.event == 'transaction']

Unnamed: 0,person,event,value,time
12654,02c083884c7d45b39cc68e1314fec56c,transaction,{'amount': 0.8300000000000001},0
12657,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction,{'amount': 34.56},0
12659,54890f68699049c2a04d415abc25e717,transaction,{'amount': 13.23},0
12670,b2f1cd155b864803ad8334cdf13c4bd2,transaction,{'amount': 19.51},0
12671,fe97aa22dd3e48c8b143116a8403dd52,transaction,{'amount': 18.97},0
...,...,...,...,...
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714


In [709]:
transcript.sample(n=5)

Unnamed: 0,person,event,value,time
154489,5ba3453b92c04d02937e446e4155256d,offer received,{'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'},408
298826,da47c4418d464beb8fcd91c25d143d5b,transaction,{'amount': 2.15},678
157430,7ca6bcfd20fc4b8488d8e7e590f0d44e,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},408
116653,62f918cdbfef4e4c89220fec49a98969,offer received,{'offer id': '5a8bc65990b245e5a138643cd4eb9837'},336
129582,7a26250ec6ff4941a7415e1366b6fb3c,transaction,{'amount': 2.12},342


In [710]:
# convert the json column to usable columns

In [711]:
# transcript['value_type'] = transcript['value'].apply(lambda x: list(x.keys())[0])
# transcript['value_details'] = transcript['value'].apply(lambda x: list(x.values())[0])
# transcript['value_details'] = transcript['value_details'].astype(str)
# this doesnt work for joining later

In [712]:
# unpack the value column
transcript['offer_id'] = transcript['value'].apply(lambda x: x.get('offer_id') or x.get('offer id'))
transcript['amount'] = transcript['value'].apply(lambda x: x.get('amount'))

In [713]:
# drop the value column
transcript.drop(columns=['value'], inplace=True)

In [714]:
# drop all the duplicate records
transcript.drop_duplicates(inplace=True)

In [715]:
# transcript= transcript.rename(columns={'person':'user_id'})

## Combining data

In [716]:
portfolio.head(2)

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1


In [717]:
profile.head(2)

Unnamed: 0,age,user_id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2170,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2237,1,0,0


In [718]:
transcript.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount
97964,bd5b31dca4a94430a759aa9c4e89daeb,transaction,264,,1.1
246486,7b58f6fe0a654970a4299a84fc36d8be,offer received,576,5a8bc65990b245e5a138643cd4eb9837,
130293,7f881de7bb3a4e55b51230bf4daa0994,offer viewed,348,2298d6c36e964ae4a3e7e9706d1fb8c2,
268998,f3949ede37024d96bc306c64aebf3695,offer completed,594,fafdcd668e3743c1bb461111dcafc2a4,
61479,878adaf38ab74ed39fd287be074ef1a0,offer received,168,9b98b8c7a33c4b65b9aebfe6a799e6d9,


In [719]:
transcript.shape

(306137, 5)

In [720]:
combined = pd.merge(transcript, profile, left_on= 'person', right_on='user_id', how='left')
combined = pd.merge(combined, portfolio, on = 'offer_id', how='left')
combined.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
241842,61ab10334b19403a9afb9e02ad05c08b,transaction,564,,23.98,57.0,61ab10334b19403a9afb9e02ad05c08b,93000.0,2619.0,1.0,0.0,0.0,,,,,,,,
191028,fd7ed8ba5ae84ceba227970f2243fad3,transaction,462,,1.28,57.0,fd7ed8ba5ae84ceba227970f2243fad3,74000.0,2982.0,0.0,1.0,0.0,,,,,,,,
55138,34fe7c1f20874ccc81e2c4506407fa72,offer received,168,f19421c1d4aa40978ebb69ca19b0e20d,,68.0,34fe7c1f20874ccc81e2c4506407fa72,39000.0,1893.0,0.0,1.0,0.0,5.0,5.0,5.0,bogo,1.0,1.0,1.0,1.0
278771,30656ebd73d84c0ba0b8784ed2b27bbe,offer viewed,612,3f207df678b143eea3cee63160fa8bed,,56.0,30656ebd73d84c0ba0b8784ed2b27bbe,63000.0,2117.0,1.0,0.0,0.0,0.0,0.0,4.0,informational,1.0,1.0,0.0,1.0
72427,0aaeeafa15f44a7fbd10feb66bb9534e,transaction,180,,1.06,30.0,0aaeeafa15f44a7fbd10feb66bb9534e,40000.0,3009.0,1.0,0.0,0.0,,,,,,,,


In [721]:
# no records gained, correct
combined.shape

(306137, 20)

In [722]:
# find all the nan values
combined.isnull().sum()

person                 0
event                  0
time                   0
offer_id          138953
amount            167184
age                33749
user_id            33749
income             33749
days_as_member     33749
F                  33749
M                  33749
O                  33749
reward            138953
difficulty        138953
duration          138953
offer_type        138953
email             138953
mobile            138953
social            138953
web               138953
dtype: int64

In [723]:
# there are no users associated with these records
combined[combined['user_id'].isnull()].sample(n=15)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
223769,4c6f06b22b6043528d1f194fd5c539f7,offer viewed,516,4d5c57ea9a6940dd891ad53e9dbe8da0,,,,,,,,,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
149604,ed1dc821bebd4d97ae7239ff25d85914,transaction,402,,6.34,,,,,,,,,,,,,,,
212809,570f7cc3a63249d9b295d5fb8a7c1d73,offer received,504,fafdcd668e3743c1bb461111dcafc2a4,,,,,,,,,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
89239,8fdb780cf1844206bd605313965cacf9,transaction,228,,2.45,,,,,,,,,,,,,,,
192655,2d0006044f9b46ee8f65298b3abe8447,transaction,468,,1.45,,,,,,,,,,,,,,,
193911,bdb10049fb154a558df75400bf32afcc,transaction,474,,2.93,,,,,,,,,,,,,,,
280561,2ca26dc17c8644118f094f0c0e887e65,transaction,618,,2.4,,,,,,,,,,,,,,,
30336,a855e5bfef6f4faca5399b09e613db19,transaction,48,,8.2,,,,,,,,,,,,,,,
207225,ecb591a8efae462c9f24f5a36aa4a541,offer received,504,ae264e3637204a6fb9bb56bc8210ddfd,,,,,,,,,10.0,10.0,7.0,bogo,1.0,1.0,1.0,0.0
202863,fb531e0001874b2e9d55076aafb40c4a,offer received,504,2906b810c7d4411798c6938adc9daaa5,,,,,,,,,2.0,10.0,7.0,discount,1.0,1.0,0.0,1.0


In [724]:
profile[profile.user_id == '5ae36f912be1492199ec2da838cc6dda']

Unnamed: 0,age,user_id,income,days_as_member,F,M,O


In [725]:
# since we are interested in how users respond to offers, we can drop all the records where there is no user associated
combined.dropna(subset=['user_id'], inplace=True)

In [726]:
# all transaction records dont correlate with any offers necessarily (no direct link) do we need to keep them?
combined[combined['reward'].isnull()].sample(n=15)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
283398,ab7e052ddbc040938db0e74a56d97b0f,transaction,624,,18.75,37.0,ab7e052ddbc040938db0e74a56d97b0f,96000.0,2375.0,0.0,1.0,0.0,,,,,,,,
295261,3587049edaf84b8f963ecdbb65beb24e,transaction,666,,2.55,46.0,3587049edaf84b8f963ecdbb65beb24e,32000.0,2423.0,0.0,1.0,0.0,,,,,,,,
216252,d836d01e48ac45859eba1f947ff07bf7,transaction,504,,3.4,38.0,d836d01e48ac45859eba1f947ff07bf7,64000.0,2456.0,0.0,1.0,0.0,,,,,,,,
181709,25c34888681c46acb8aff3ef0a842aca,transaction,438,,23.06,51.0,25c34888681c46acb8aff3ef0a842aca,79000.0,3206.0,0.0,1.0,0.0,,,,,,,,
195751,56be387667484ed98219c34cad38d8c7,transaction,480,,29.59,52.0,56be387667484ed98219c34cad38d8c7,91000.0,2154.0,0.0,1.0,0.0,,,,,,,,
52419,0d50a9abf52442e29e484474cc8d0d68,transaction,162,,29.17,49.0,0d50a9abf52442e29e484474cc8d0d68,73000.0,1929.0,1.0,0.0,0.0,,,,,,,,
288806,eb5db7f1468847288b3caa2600d33eeb,transaction,642,,22.01,76.0,eb5db7f1468847288b3caa2600d33eeb,99000.0,2239.0,1.0,0.0,0.0,,,,,,,,
272816,21e4f5128ff94baf89619d6be8bdc77f,transaction,600,,1.11,53.0,21e4f5128ff94baf89619d6be8bdc77f,41000.0,2410.0,0.0,1.0,0.0,,,,,,,,
90204,bb3e095a8e4f4b42a92a19f2d099b6ed,transaction,228,,16.96,29.0,bb3e095a8e4f4b42a92a19f2d099b6ed,73000.0,2251.0,0.0,1.0,0.0,,,,,,,,
230673,03deb7cf0ebd4c7889a8a9e44df1c50a,transaction,534,,1.09,53.0,03deb7cf0ebd4c7889a8a9e44df1c50a,64000.0,2553.0,0.0,1.0,0.0,,,,,,,,


In [727]:
# find all the values where amount is null
combined[combined['amount'].isnull()].event.value_counts()
# this is expected as amount null for offer records

offer received     66501
offer viewed       49860
offer completed    32070
Name: event, dtype: int64

In [728]:
# find all the values where offer_id is null
combined[combined['offer_id'].isnull()].event.value_counts()
# this is expected as offer_id null for transaction records

transaction    123957
Name: event, dtype: int64

### Split transaction and offer records for analysis


In [729]:
# find all the nan values
combined.isnull().sum()

person                 0
event                  0
time                   0
offer_id          123957
amount            148431
age                    0
user_id                0
income                 0
days_as_member         0
F                      0
M                      0
O                      0
reward            123957
difficulty        123957
duration          123957
offer_type        123957
email             123957
mobile            123957
social            123957
web               123957
dtype: int64

In [730]:
# split the data into offer and transaction records
offer_df = combined[combined['offer_id'].notnull()]
transaction_df = combined[combined['offer_id'].isnull()]

In [731]:
print(combined.shape)
print(offer_df.shape)
print(transaction_df.shape)

(272388, 20)
(148431, 20)
(123957, 20)


In [732]:
# drop all the id columns as we do not need them for modelling
# combined.drop(columns=['person', 'user_id', 'offer_id'], inplace=True)

In [733]:
transaction_df.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
216778,842e92322fbd4314abdd73561b694b53,transaction,504,,10.02,36.0,842e92322fbd4314abdd73561b694b53,42000.0,1803.0,0.0,1.0,0.0,,,,,,,,
299122,3fe342e44bcc4bbc8d1771e1cf9d9ca3,transaction,684,,2.81,51.0,3fe342e44bcc4bbc8d1771e1cf9d9ca3,62000.0,2149.0,0.0,1.0,0.0,,,,,,,,
235207,c062ae783c3245aea6203f072ae931c3,transaction,546,,2.28,72.0,c062ae783c3245aea6203f072ae931c3,33000.0,3376.0,0.0,1.0,0.0,,,,,,,,
183286,8cc1d48359af4d51bc4649728bf95ff6,transaction,444,,0.77,34.0,8cc1d48359af4d51bc4649728bf95ff6,73000.0,3386.0,1.0,0.0,0.0,,,,,,,,
83449,576e6eed3c6a4ac682ebd35b7ea672f4,transaction,210,,2.27,20.0,576e6eed3c6a4ac682ebd35b7ea672f4,49000.0,3311.0,0.0,1.0,0.0,,,,,,,,


In [734]:
# drop all the id columns as we do not need them for modelling
transaction_df.drop(columns=['person', 'user_id', 'offer_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transaction_df.drop(columns=['person', 'user_id', 'offer_id'], inplace=True)


In [735]:
# drop all the offer related columns not related to transactions (nan values)
transaction_df.drop(columns=['reward', 'difficulty', 'duration', 'offer_type', 'email', 'mobile', 'social', 'web'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transaction_df.drop(columns=['reward', 'difficulty', 'duration', 'offer_type', 'email', 'mobile', 'social', 'web'], inplace=True)


In [736]:
offer_df.sample(n=5)

Unnamed: 0,person,event,time,offer_id,amount,age,user_id,income,days_as_member,F,M,O,reward,difficulty,duration,offer_type,email,mobile,social,web
216389,ac1fa6dad6284be5bee99d94a353e688,offer viewed,504,fafdcd668e3743c1bb461111dcafc2a4,,59.0,ac1fa6dad6284be5bee99d94a353e688,88000.0,2692.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
194359,892a5522371f4e199fc5b9cc514c1ff9,offer completed,474,0b1e1539f2cc45b7b9fa7c272da2e1d7,,54.0,892a5522371f4e199fc5b9cc514c1ff9,48000.0,2568.0,1.0,0.0,0.0,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
27267,65a9ceafbc3a42c696cf7ff2e45a3363,offer completed,36,fafdcd668e3743c1bb461111dcafc2a4,,63.0,65a9ceafbc3a42c696cf7ff2e45a3363,76000.0,2519.0,1.0,0.0,0.0,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
220405,d555689c336b45f4ae0a7b9fd99fbc8a,offer completed,510,4d5c57ea9a6940dd891ad53e9dbe8da0,,55.0,d555689c336b45f4ae0a7b9fd99fbc8a,61000.0,2175.0,0.0,1.0,0.0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
203626,b070360c5c3d4fec976a188a6972aa08,offer received,504,2906b810c7d4411798c6938adc9daaa5,,44.0,b070360c5c3d4fec976a188a6972aa08,67000.0,1986.0,0.0,1.0,0.0,2.0,10.0,7.0,discount,1.0,1.0,0.0,1.0


In [737]:
# drop all the id columns as we do not need them for modelling
offer_df.drop(columns=['person', 'user_id', 'offer_id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_df.drop(columns=['person', 'user_id', 'offer_id'], inplace=True)


In [738]:
# drop all the transaction related columns not related to offers (nan values)
offer_df.drop(columns=['amount'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  offer_df.drop(columns=['amount'], inplace=True)


In [739]:
# write output data to parquet files
# combined.to_parquet('data/combined.parquet', engine='pyarrow')
offer_df.to_parquet('data/offer_df.parquet', engine='pyarrow')
transaction_df.to_parquet('data/transaction_df.parquet', engine='pyarrow')