In [1]:
import json
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [44]:
# Load the three line-delimited JSON exports (one record per line).
json_options = dict(orient='records', lines=True)
portfolio = pd.read_json('portfolio.json', **json_options)
profile = pd.read_json('profile.json', **json_options)
transcript = pd.read_json('transcript.json', **json_options)

In [45]:
# (rows, columns) of the three raw tables
portfolio.shape, profile.shape, transcript.shape

((10, 6), (17000, 5), (306534, 4))

In [46]:
# display the raw offer portfolio
portfolio

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7
5,3,"[web, email, mobile, social]",7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2
6,2,"[web, email, mobile, social]",10,10,discount,fafdcd668e3743c1bb461111dcafc2a4
7,0,"[email, mobile, social]",0,3,informational,5a8bc65990b245e5a138643cd4eb9837
8,5,"[web, email, mobile, social]",5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d
9,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5


In [47]:
# first rows of the customer profile table
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [48]:
# first rows of the event transcript, in index order
transcript.sort_index().head()

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


# Format and Merge offer portfolio, customer profile, and transcript data

In [49]:
# Give the two 'id' columns distinct names that match the keys used in the transcript.
portfolio = portfolio.rename(columns={'id': 'offer_id'})
profile = profile.rename(columns={'id': 'person'})

In [50]:
# one-hot encode offer_type, then swap the original column for the indicator columns
offer_type_dummies = pd.get_dummies(portfolio['offer_type'])
portfolio = pd.concat([portfolio, offer_type_dummies], axis=1).drop(columns='offer_type')

In [51]:
# Expand the list-valued 'channels' column into one 0/1 integer indicator column per channel.
# Each column is built in a single pass over 'channels', so the result does not depend on
# the frame having default 0..n-1 row labels and is created as an integer column directly.
CHANNELS = ['web', 'email', 'mobile', 'social']
for channel in CHANNELS:
    portfolio[channel] = portfolio['channels'].apply(
        lambda offered, name=channel: int(name in offered)
    )

In [52]:
# the list-valued column is no longer needed once its indicator columns exist
portfolio = portfolio.drop(columns='channels')

In [53]:
# display the portfolio after encoding offer type and channels
portfolio

Unnamed: 0,reward,difficulty,duration,offer_id,bogo,discount,informational,web,email,mobile,social
0,10,10,7,ae264e3637204a6fb9bb56bc8210ddfd,1,0,0,0,1,1,1
1,10,10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,1,0,0,1,1,1,1
2,0,0,4,3f207df678b143eea3cee63160fa8bed,0,0,1,1,1,1,0
3,5,5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,0,0,1,1,1,0
4,5,20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,1,0,1,1,0,0
5,3,7,7,2298d6c36e964ae4a3e7e9706d1fb8c2,0,1,0,1,1,1,1
6,2,10,10,fafdcd668e3743c1bb461111dcafc2a4,0,1,0,1,1,1,1
7,0,0,3,5a8bc65990b245e5a138643cd4eb9837,0,0,1,0,1,1,1
8,5,5,5,f19421c1d4aa40978ebb69ca19b0e20d,1,0,0,1,1,1,1
9,2,10,7,2906b810c7d4411798c6938adc9daaa5,0,1,0,1,1,1,0


In [54]:
# Expand the dictionaries stored in transcript['value'] into one column per key,
# keeping the transcript's index so the result can be joined back row by row.
# ("pd.json_normalize(transcript['value'])" does the same split, just slightly slower)
value_records = transcript['value'].tolist()
value = pd.DataFrame(value_records, index=transcript.index)

In [55]:
# random sample of the expanded value columns (unseeded, so it differs between runs)
value.sample(n = 5)

Unnamed: 0,offer id,amount,offer_id,reward
252275,2298d6c36e964ae4a3e7e9706d1fb8c2,,,
51814,,28.79,,
126806,2298d6c36e964ae4a3e7e9706d1fb8c2,,,
237599,,12.22,,
140216,ae264e3637204a6fb9bb56bc8210ddfd,,,


In [56]:
# The offer identifier is stored under either 'offer id' or 'offer_id' depending on the row.
# Coalesce the two into a single 'offer' column: take 'offer id' where present, otherwise
# 'offer_id'. Rows with neither key stay NaN, and two ids are never glued together.
value['offer'] = value['offer id'].combine_first(value['offer_id'])

In [57]:
# rows that carry neither offer key have no offer: mark the offer column as missing
no_offer_key = value[['offer id', 'offer_id']].isnull().all(axis=1)
value.loc[no_offer_key, 'offer'] = np.nan

In [58]:
# keep only the combined column and give it the canonical name 'offer_id'
value = (
    value
    .drop(columns=['offer id', 'offer_id'])
    .rename(columns={'offer': 'offer_id'})
)

In [59]:
# attach the expanded value columns to the transcript (aligned on the shared index)
# and discard the original dictionary column
transcript2 = transcript.join(value).drop(columns='value')

transcript2.shape

(306534, 6)

In [60]:
# put the columns in a fixed, readable order
transcript_columns = ['person', 'event', 'offer_id', 'time', 'amount', 'reward']
transcript2 = transcript2[transcript_columns]

In [61]:
# add the customer attributes to every transcript row, matching on the person id
trans_profile = pd.merge(transcript2, profile, how='outer', on='person')

In [62]:
# first rows of the transcript joined with customer attributes
trans_profile.head()

Unnamed: 0,person,event,offer_id,time,amount,reward,gender,age,became_member_on,income
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,,,F,75,20170509,100000.0
1,78afa995795e4d85b5d9ceeca43f5fef,offer viewed,9b98b8c7a33c4b65b9aebfe6a799e6d9,6,,,F,75,20170509,100000.0
2,78afa995795e4d85b5d9ceeca43f5fef,transaction,,132,19.89,,F,75,20170509,100000.0
3,78afa995795e4d85b5d9ceeca43f5fef,offer completed,9b98b8c7a33c4b65b9aebfe6a799e6d9,132,,5.0,F,75,20170509,100000.0
4,78afa995795e4d85b5d9ceeca43f5fef,transaction,,144,17.78,,F,75,20170509,100000.0


In [63]:
# add the offer attributes to every row, matching on the offer id
data = pd.merge(trans_profile, portfolio, how='outer', on='offer_id')

In [64]:
# (rows, columns) of the fully merged table
data.shape

(306534, 20)

In [65]:
# first rows of the fully merged table
data.head()

Unnamed: 0,person,event,offer_id,time,amount,reward_x,gender,age,became_member_on,income,reward_y,difficulty,duration,bogo,discount,informational,web,email,mobile,social
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,,,F,75,20170509,100000.0,5.0,5.0,7.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
1,78afa995795e4d85b5d9ceeca43f5fef,offer viewed,9b98b8c7a33c4b65b9aebfe6a799e6d9,6,,,F,75,20170509,100000.0,5.0,5.0,7.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
2,78afa995795e4d85b5d9ceeca43f5fef,offer completed,9b98b8c7a33c4b65b9aebfe6a799e6d9,132,,5.0,F,75,20170509,100000.0,5.0,5.0,7.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
3,e2127556f4f64592b11af22de27a7932,offer received,9b98b8c7a33c4b65b9aebfe6a799e6d9,408,,,M,68,20180426,70000.0,5.0,5.0,7.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
4,e2127556f4f64592b11af22de27a7932,offer viewed,9b98b8c7a33c4b65b9aebfe6a799e6d9,420,,,M,68,20180426,70000.0,5.0,5.0,7.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0


In [66]:
# non-null counts of the two reward columns produced by the merge:
# reward_x comes from the transcript side, reward_y from the portfolio side
data[['reward_x', 'reward_y']].count()

reward_x     33579
reward_y    167581
dtype: int64

In [67]:
# transcript rewards that disagree with the portfolio reward
# (rows without a transcript reward are selected too, but value_counts skips NaN)
reward_mismatch = data['reward_x'].ne(data['reward_y'])
data.loc[reward_mismatch, 'reward_x'].value_counts()

Series([], Name: reward_x, dtype: int64)

In [68]:
# keep only the portfolio-side reward and restore the plain column name
data = data.drop(columns='reward_x').rename(columns={'reward_y': 'reward'})

In [69]:
# column names after consolidating the reward columns
data.columns

Index(['person', 'event', 'offer_id', 'time', 'amount', 'gender', 'age',
       'became_member_on', 'income', 'reward', 'difficulty', 'duration',
       'bogo', 'discount', 'informational', 'web', 'email', 'mobile',
       'social'],
      dtype='object')

In [70]:
# reorder columns: event fields, then customer fields, then offer fields
column_order = [
    'event', 'time', 'amount',
    'person', 'gender', 'age', 'became_member_on', 'income',
    'offer_id', 'bogo', 'discount', 'informational',
    'web', 'email', 'mobile', 'social',
    'reward', 'difficulty', 'duration',
]
data = data[column_order]

In [75]:
# dtypes and non-null counts per column of the merged table
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306534 entries, 0 to 306533
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   event             306534 non-null  object 
 1   time              306534 non-null  int64  
 2   amount            138953 non-null  float64
 3   person            306534 non-null  object 
 4   gender            272762 non-null  object 
 5   age               306534 non-null  int64  
 6   became_member_on  306534 non-null  int64  
 7   income            272762 non-null  float64
 8   offer_id          167581 non-null  object 
 9   bogo              167581 non-null  float64
 10  discount          167581 non-null  float64
 11  informational     167581 non-null  float64
 12  web               167581 non-null  float64
 13  email             167581 non-null  float64
 14  mobile            167581 non-null  float64
 15  social            167581 non-null  float64
 16  reward            16

In [76]:
# parse the YYYYMMDD integers in became_member_on into datetime values
member_since = pd.to_datetime(data['became_member_on'], format='%Y%m%d')
data = data.assign(became_member_on=member_since)

In [78]:
# random sample of the merged table (unseeded, so it differs between runs)
data.sample(n = 5)

Unnamed: 0,event,time,amount,person,gender,age,became_member_on,income,offer_id,bogo,discount,informational,web,email,mobile,social,reward,difficulty,duration
265707,offer viewed,348,,7336c1e98e9149f7b38a987c1c72614c,F,75,2017-05-06,95000.0,fafdcd668e3743c1bb461111dcafc2a4,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,10.0,10.0
12450,offer viewed,366,,b3883064435140ce8feac3c1da259948,M,48,2018-02-12,36000.0,9b98b8c7a33c4b65b9aebfe6a799e6d9,1.0,0.0,0.0,1.0,1.0,1.0,0.0,5.0,5.0,7.0
280662,offer received,504,,e18b89d585fb495caf439dae566a4d5e,M,21,2017-08-14,39000.0,4d5c57ea9a6940dd891ad53e9dbe8da0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,10.0,10.0,5.0
266573,offer completed,708,,e590c974d8274f89a02fecaa11b6670b,M,44,2018-07-22,41000.0,fafdcd668e3743c1bb461111dcafc2a4,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,10.0,10.0
64986,transaction,72,5.26,31f72426dff344ed8b1b1258366caf8d,M,40,2018-02-05,34000.0,,,,,,,,,,,


In [80]:
# persist the merged table; index=False leaves the row index out of the file
data.to_csv('data.csv', index=False)

# Prepare data for predictive models

In [4]:
# Reload the merged table written to data.csv.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids mixed-type columns; parse_dates restores the datetime dtype
# of became_member_on, which a CSV round trip stores as text.
data = pd.read_csv(
    'data.csv',
    dtype={'income': np.float64},
    parse_dates=['became_member_on'],
    low_memory=False,
)
data.shape

  interactivity=interactivity, compiler=compiler, result=result)


(306534, 19)

In [5]:
# random sample of the reloaded table (unseeded, so it differs between runs)
data.sample(n = 5)

Unnamed: 0,event,time,amount,person,gender,age,became_member_on,income,offer_id,bogo,discount,informational,web,email,mobile,social,reward,difficulty,duration
139503,transaction,588,34.64,73afdeca19e349b98f09e928644610f8,M,50,2016-05-12,86000.0,,,,,,,,,,,
192921,offer viewed,168,,b50987e063ec45fca2b6343837d6a139,M,74,2017-08-17,33000.0,f19421c1d4aa40978ebb69ca19b0e20d,1.0,0.0,0.0,1.0,1.0,1.0,1.0,5.0,5.0,5.0
188121,offer received,408,,0cc13b3094c7473aa4fd1a6b35d25652,F,79,2018-04-10,61000.0,f19421c1d4aa40978ebb69ca19b0e20d,1.0,0.0,0.0,1.0,1.0,1.0,1.0,5.0,5.0,5.0
253088,offer completed,444,,d12bcab81d0e4479817a2c0cbd9909a6,M,34,2017-09-04,53000.0,fafdcd668e3743c1bb461111dcafc2a4,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,10.0,10.0
299301,offer completed,330,,ac76ca1ff69c456dad050ae58ad82647,,118,2017-05-17,,2298d6c36e964ae4a3e7e9706d1fb8c2,0.0,1.0,0.0,1.0,1.0,1.0,1.0,3.0,7.0,7.0


In [8]:
# number of rows per event type
data.event.value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [10]:
# Split the rows into offer events and plain transactions.
# .copy() makes each part an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning or depend on pandas' view/copy behaviour.
is_transaction = data['event'] == 'transaction'
with_offers = data[~is_transaction].copy()
no_offers = data[is_transaction].copy()

In [19]:
# Flag rows whose event shows a reaction to an offer (viewed or completed) with 1.0
# and every remaining row with 0.0, then tally both classes.
reacted = with_offers['event'].isin(['offer viewed', 'offer completed'])
with_offers['responded'] = reacted.astype(float)
with_offers['responded'].value_counts()

1.0    91304
0.0    76277
Name: responded, dtype: int64

In [30]:
# columns available for modelling, including the new 'responded' target
with_offers.columns

Index(['event', 'time', 'amount', 'person', 'gender', 'age',
       'became_member_on', 'income', 'offer_id', 'bogo', 'discount',
       'informational', 'web', 'email', 'mobile', 'social', 'reward',
       'difficulty', 'duration', 'responded'],
      dtype='object')

In [None]:
# Build the feature matrix and target for the response model.
# NOTE(review): the original cell selected no columns (`with_offers[[]]`), which leaves the
# scaler / logistic-regression pipeline with nothing to fit. The list below is a starting
# point of numeric customer and offer attributes — confirm it against the analysis goals.
# 'email' is left out because every offer in the portfolio uses that channel (constant column).
FEATURE_COLUMNS = ['age', 'income', 'bogo', 'discount', 'informational',
                   'web', 'mobile', 'social', 'reward', 'difficulty', 'duration']

# income is missing for some customers; drop those rows so X and Y stay aligned and NaN-free
model_rows = with_offers.dropna(subset=FEATURE_COLUMNS)
X = model_rows[FEATURE_COLUMNS]
Y = model_rows['responded']

# Predictive Modeling

In [None]:
# scikit-learn pieces used by the modelling cells below
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report
import sklearn.metrics as metrics

In [None]:
# Pipeline expects a list of (name, estimator) tuples, applied in order:
# standardise the features, then fit a logistic regression on the scaled values.
steps = [('scaler', StandardScaler()), ('LR', LogisticRegression())]
clf = Pipeline(steps) # define the pipeline object

In [None]:
# hold out 20% of the rows for testing; stratify on Y so both parts keep its class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=30,
    stratify=Y,
)

In [None]:
# fit on the training data
# (every pipeline step is fitted on the training split only)
clf.fit(X_train, y_train)

In [None]:
# accuracy score of the model on the data it was fitted on and on the held-out data
print('training accuracy score:', clf.score(X_train, y_train))
print('testing accuracy score:', clf.score(X_test, y_test))

In [None]:
# confusion matrix (rows: actual class, columns: predicted class)
y_predict = clf.predict(X_test)
confusion = confusion_matrix(y_test, y_predict)
# end on the matrix so the cell actually displays it
confusion

In [None]:
# per-class precision / recall / F1 on the test split
report_text = classification_report(y_test, y_predict, target_names=['0', '1'])
print(report_text)

In [None]:
# ROC Curve
# calculate the fpr and tpr for all thresholds of the classification
probs = clf.predict_proba(X_test)

# column 1 holds the predicted probability of the positive class (responded == 1)
preds = probs[:, 1]

fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# plot the roc curve on an explicit axes object
# (plt is matplotlib.pyplot, imported with the other libraries at the top of the notebook)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
# diagonal reference line: a classifier that guesses at random
ax.plot([0, 1], [0, 1], 'r--')
ax.legend(loc='lower right')

ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
plt.show()

### 1. Who responds to promotional offers (including all offer types)? What characteristics are most important?

### 2. For people responding to different offers, how do their demographic characteristics differ?

### 3. How long does it take for a person to complete an offer? How does this period differ for different offers?

### 4. How much do people spend based on demographics and offer type?

### 5. For people who make purchases without receiving promotions, what types of offers should be sent?