# Data Preprocessing

In [26]:
import pandas as pd
import numpy as np
import math
import json
# % matplotlib inline
import datetime
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

## Portfolio

In [27]:
portfolio.dtypes

reward         int64
channels      object
difficulty     int64
duration       int64
offer_type    object
id            object
dtype: object

In [28]:
# machine learning algorithms can only handle numerical features, change the channels column to dummy variables
# use channel column to create dummy variables
portfolio = pd.concat([portfolio, pd.get_dummies(portfolio['channels'].apply(pd.Series).stack()).sum(level=0)], axis=1)
# drop channel column
portfolio.drop('channels', axis=1, inplace=True)

In [29]:
portfolio

Unnamed: 0,reward,difficulty,duration,offer_type,id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1


In [30]:
# write to csv
# portfolio.to_csv('data/portfolio.csv', index=False)

## Profile

In [31]:
profile.dtypes

gender               object
age                   int64
id                   object
became_member_on      int64
income              float64
dtype: object

In [32]:
# change the became_member_on column to datetime
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'], format='%Y%m%d')

In [33]:
# machine learning algorithms cannot consume dates
# change became_member_on to number of days as a member 
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days

# drop became_member_on column
profile.drop('became_member_on', axis=1, inplace=True)

In [34]:
# filter for all the customers who have income data and no gender data
profile[profile['gender'].isnull() & profile['income'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [35]:
profile[profile['income'].isnull() & profile['gender'].notnull()]

Unnamed: 0,gender,age,id,income,days_as_member


NOTE: because there are only 4 features and all the rows with missing genders also have missing income values, we can drop these rows

In [36]:
profile.shape

(17000, 5)

In [37]:
# drop nan values
profile.dropna(inplace=True)

In [38]:
profile.shape

(14825, 5)

In [39]:
print(f"{17000-14825} records dropped")

2175 records dropped


In [40]:
profile[profile['income'].isnull()]

Unnamed: 0,gender,age,id,income,days_as_member


In [41]:
profile.head()

Unnamed: 0,gender,age,id,income,days_as_member
1,F,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2136
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2203
5,M,68,e2127556f4f64592b11af22de27a7932,70000.0,1851
8,M,65,389bc3fa690240e798340f5a15918d5c,53000.0,1927
12,M,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2017


In [42]:
profile.gender.value_counts()

M    8484
F    6129
O     212
Name: gender, dtype: int64

In [44]:
# machine learning algorithms can only handle numerical features, change the gender column to dummy variables
# create gender dummy variables
profile = pd.concat([profile, pd.get_dummies(profile['gender'].apply(pd.Series).stack()).sum(level=0)], axis=1)

In [46]:
# drop channel column
profile.drop('gender', axis=1, inplace=True)

In [47]:
profile

Unnamed: 0,age,id,income,days_as_member,F,M,O
1,55,0610b486422d4921ae7d2bf64640c50b,112000.0,2136,1,0,0
3,75,78afa995795e4d85b5d9ceeca43f5fef,100000.0,2203,1,0,0
5,68,e2127556f4f64592b11af22de27a7932,70000.0,1851,0,1,0
8,65,389bc3fa690240e798340f5a15918d5c,53000.0,1927,0,1,0
12,58,2eeac8d8feae4a8cad5a6af0499a211d,51000.0,2017,0,1,0
...,...,...,...,...,...,...,...
16995,45,6d5f3a774f3d4714ab0c092238f3a1d7,54000.0,1812,1,0,0
16996,61,2cb4f97358b841b9a9773a7aa05a9d77,72000.0,1773,0,1,0
16997,49,01d26f638c274aa0b965d24cefe3183f,73000.0,2306,0,1,0
16998,83,9dc1421481194dcd9400aec7c9ae6366,50000.0,2631,1,0,0


In [None]:
# rename columns F, M, O to Female, Male, Other
profile