# Data Preprocessing

In [143]:
import pandas as pd
import numpy as np
import math
import json
# % matplotlib inline
import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

## Portfolio

In [144]:
portfolio.dtypes

reward         int64
channels      object
difficulty     int64
duration       int64
offer_type    object
id            object
dtype: object

In [145]:
# machine learning algorithms can only handle numerical features, change the channels column to dummy variables
# use channel column to create dummy variables
portfolio = pd.concat([portfolio, pd.get_dummies(portfolio['channels'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# drop channel column
portfolio.drop('channels', axis=1, inplace=True)

In [146]:
portfolio.rename(columns={'id':'offer_id'})

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1


In [147]:
portfolio

Unnamed: 0,reward,difficulty,duration,offer_type,id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1


## Profile

In [148]:
profile.dtypes

gender               object
age                   int64
id                   object
became_member_on      int64
income              float64
dtype: object

In [149]:
# change the became_member_on column to datetime
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'], format='%Y%m%d')

In [150]:
# machine learning algorithms cannot consume dates
# change became_member_on to number of days as a member 
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days

# drop became_member_on column
profile.drop('became_member_on', axis=1, inplace=True)

In [151]:
# filter for all the customers who have income data and no gender data
profile[profile['gender'].isnull() & profile['income'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [152]:
profile[profile['income'].isnull() & profile['gender'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


NOTE: because there are only 4 features and all the rows with missing genders also have missing income values, we can drop these rows

In [153]:
profile[profile['age'] >= 118]

Unnamed: 0,gender,age,id,income,days_as_member
0,,118,68be06ca386d4c31939f3a4f0e3dd783,,2290
2,,118,38fe809add3b4fcf9315a9694bb96ff5,,1775
4,,118,a03223e636434f42ac4c3df47e8bac43,,2117
6,,118,8ec6ce2a7e7949b1bf142def7d0e0586,,2065
7,,118,68617ca6246f4fbc85e91a2a49552598,,2058
...,...,...,...,...,...
16980,,118,5c686d09ca4d475a8f750f2ba07e0440,,2454
16982,,118,d9ca82f550ac4ee58b6299cf1e5c824a,,2593
16989,,118,ca45ee1883624304bac1e4c8a114f045,,1904
16991,,118,a9a20fa8b5504360beb4e7c8712f8306,,2683


NOTE: it is also the same records with invalid age values, we will drop these columns.

In [154]:
profile.shape

(17000, 5)

In [155]:
# drop nan values
profile.dropna(inplace=True)

In [156]:
profile.shape

(14825, 5)

In [157]:
print(f"{17000-14825} records dropped")

2175 records dropped


In [158]:
profile[profile['income'].isnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [159]:
profile.head()

Unnamed: 0,gender,age,id,income,days_as_member
1,F,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2137
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2204
5,M,68,e2127556f4f64592b11af22de27a7932,70000.0,1852
8,M,65,389bc3fa690240e798340f5a15918d5c,53000.0,1928
12,M,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2018


In [160]:
profile.gender.value_counts()

M    8484
F    6129
O     212
Name: gender, dtype: int64

In [161]:
# machine learning algorithms can only handle numerical features, change the gender column to dummy variables
# create gender dummy variables
profile = pd.concat([profile, pd.get_dummies(profile['gender'].apply(pd.Series).stack()).sum(level=0)], axis=1)

In [162]:
# drop channel column
profile.drop('gender', axis=1, inplace=True)

In [163]:
profile

Unnamed: 0,age,id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2137,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2204,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1852,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1928,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2018,0,1,0
...,...,...,...,...,...,...,...
16995,45,6d5f3a774f3d4714ab0c092238f3a1d7,54000.0,1813,1,0,0
16996,61,2cb4f97358b841b9a9773a7aa05a9d77,72000.0,1774,0,1,0
16997,49,01d26f638c274aa0b965d24cefe3183f,73000.0,2307,0,1,0
16998,83,9dc1421481194dcd9400aec7c9ae6366,50000.0,2632,1,0,0


## Transcript

In [164]:
transcript.dtypes

person    object
event     object
value     object
time       int64
dtype: object

In [165]:
transcript.tail()

Unnamed: 0,person,event,value,time
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714
306533,c02b10e8752c4d8e9b73f918558531f7,transaction,{'amount': 4.05},714


In [166]:
transcript[transcript.event == 'transaction']

Unnamed: 0,person,event,value,time
12654,02c083884c7d45b39cc68e1314fec56c,transaction,{'amount': 0.8300000000000001},0
12657,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,transaction,{'amount': 34.56},0
12659,54890f68699049c2a04d415abc25e717,transaction,{'amount': 13.23},0
12670,b2f1cd155b864803ad8334cdf13c4bd2,transaction,{'amount': 19.51},0
12671,fe97aa22dd3e48c8b143116a8403dd52,transaction,{'amount': 18.97},0
...,...,...,...,...
306529,b3a1272bc9904337b331bf348c3e8c17,transaction,{'amount': 1.5899999999999999},714
306530,68213b08d99a4ae1b0dcb72aebd9aa35,transaction,{'amount': 9.53},714
306531,a00058cf10334a308c68e7631c529907,transaction,{'amount': 3.61},714
306532,76ddbd6576844afe811f1a3c0fbb5bec,transaction,{'amount': 3.5300000000000002},714


In [167]:
# drop duplicate records
transcript.drop_duplicates(inplace=True)

TypeError: unhashable type: 'dict'

In [None]:
# unpack the value column
transcript['offer_id'] = transcript['value'].apply(lambda x: x.get('offer_id'))
transcript['amount'] = transcript['value'].apply(lambda x: x.get('amount'))
transcript['offer id'] = transcript['value'].apply(lambda x: x.get('offer id'))