In [72]:
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize

# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

# Starbucks Capstone Challenge

## Project Overview

This data science project aims to provide insights into how Starbucks can better offer promotionals to the Starbucks rewards app users. Using demographics, transactional, and offer data, I will provide exploratory data analysis, perform data wrangling, perform feature engineering, and build analytic models. The challenge of the project is to find out which demographics respond to which types of promotionals. The data is simulated based on real Starbucks reward app usage data. 

## Exploratory Data Analysis

We're provided with 3 data files, 'portfolio' - data about the offers for one Starbuck product, 'profile' - data about Starbucks rewards app users whose transactional data is collected in 'transcript' file, and finally, 'transcript' - transactional data with timestamp related to the promotional offers.

In [73]:
# Explore profile data
print('Total number of users in profile: {}'.format(profile.shape[0]))
profile.head()

Total number of users in profile: 17000


Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [74]:
# Explore missing values in profile
print('Missing gender values: {}'.format(profile.gender.isnull().sum()))
print('Missing income values: {}'.format(profile.income.isnull().sum()))
print('Users with age value of 118: {}'.format(profile[profile.age == 118].shape[0]))

#Check if missing gender and income values also have age value of 118
print('Missing gender and income values also have age value of 118: {}'.format(profile[(profile.gender.isnull()) & (profile.income.isnull()) & (profile.age == 118)].shape[0] == 2175))

Missing gender values: 2175
Missing income values: 2175
Users with age value of 118: 2175
Missing gender and income values also have age value of 118: True


In [75]:
# Explore offer data
print('Total number of offers for this product: {}'.format(portfolio.shape[0]))
portfolio.head(5)

Total number of offers for this product: 10


Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [105]:
portfolio.offer_type.value_counts()

discount         4
bogo             4
informational    2
Name: offer_type, dtype: int64

In [76]:
# Explore transcript data
print('Total number of transcript data: {}'.format(transcript.shape[0]))
transcript.head(5)

Total number of transcript data: 306534


Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


## Data Wrangling

In [77]:
# For profile data, perform these cleaning steps
# 1. Handle missing values
# 2. Convert columns to correct data types
# 3. Change 'id' column name to 'person' to match transcript

In [78]:
# Drop missing values
profile = profile.dropna()

In [79]:
# Convert column to datetime
profile.became_member_on = pd.to_datetime(profile.became_member_on, format='%Y%m%d', errors='coerce')
# Create an int column to store length of membership in days (which will be helpful in the model building section)
profile['membership_length'] = profile.became_member_on -  pd.to_datetime("now")
profile.membership_length = abs(profile.membership_length//np.timedelta64(1,'D')) 

In [80]:
# Rename 'id' column to 'person'
profile = profile.rename(columns={'id':'person'})

In [81]:
# For portfolio data, perform these cleaning steps
# 1. Convert channels column to 3 separate channel columns and drop 'channels' column
# 2. Rename 'id' column to 'offer_id' to match transcript

In [82]:
# 1. Convert channels column to 3 separate columns for mobile, social, and web
# We're not creating a column for email because all offers have email as a channel
portfolio['channel_mobile'] = portfolio['channels'].apply(lambda x: 'mobile' in x)
portfolio['channel_social'] = portfolio['channels'].apply(lambda x: 'social' in x)
portfolio['channel_web'] = portfolio['channels'].apply(lambda x: 'web' in x)
portfolio = portfolio.drop(columns=['channels'])

In [83]:
# 2. Rename 'id' column to 'offer_id' to match transcript
portfolio = portfolio.rename(columns={'id':'offer_id'})

In [84]:
# For transcript data, perform these cleaning steps
# 1. Convert 'value' column into separate columns
# 2. Combine 'offer id' and 'offer_id' columns
# 3. Separate the dataset into 4 separate dataframes, 'received', 'viewed', 'completed', 'transactions' 
# 4. Merge the data frames into a final dataframe for data modeling
# 5. Create a new column to determine whether a completed offer record is 'influenced' by the offer, meaning that the offer was completed after it was viewed and received

In [85]:
# 1. Convert 'value' column into separate columns
transcript_values = json_normalize(transcript['value'])
transcript = pd.concat([transcript, transcript_values],axis=1)
transcript = transcript.drop(columns=['value'])

In [86]:
# 2. Combine 'offer id' and 'offer_id' columns
transcript['offer_id'] = transcript['offer_id'].fillna(transcript['offer id'])
transcript = transcript.drop(columns='offer id')

In [87]:
# 3. Separate the dataset into 4 separate dataframes, 'received', 'viewed', 'completed', 'transactions'
received = transcript[transcript['event']=='offer received']
viewed = transcript[transcript['event']=='offer viewed']
completed = transcript[transcript['event']=='offer completed']
transaction = transcript[transcript['event']=='transaction']

In [88]:
# 4. Merge the data frames into a final dataframe for data modeling

# Merge received and viewed dataframes
received_w_viewed = pd.merge(received[['person','offer_id','time']], 
                             viewed[['person','offer_id','time']], 
                             how='left', 
                             on=['person', 'offer_id'])
received_w_viewed = received_w_viewed.rename(columns={"time_x": "received_time", 
                                                      "time_y": "viewed_time"})

# Merge received and viewed and completed
completed_w_viewed = pd.merge(received_w_viewed[['person','offer_id','received_time','viewed_time']], 
                             completed[['person','offer_id','time','reward']], 
                             how='left', 
                             on=['person', 'offer_id'])
completed_w_viewed = completed_w_viewed.rename(columns={'time':'completed_time'})

# Merge portfolio and completed
completed_offers = pd.merge(completed_w_viewed, 
                             portfolio, 
                             how='left', 
                             on=['offer_id'])

In [89]:
# 5. Create a new column to determine whether a completed offer record is 'influenced' by the offer, meaning that the offer was completed after it was viewed and received
completed_offers['completed_offer'] =  (completed_offers.completed_time != None) & (completed_offers.viewed_time != 0) & (completed_offers.viewed_time != None) & ((completed_offers.completed_time - completed_offers.received_time)/24 < completed_offers.duration) & (completed_offers.viewed_time < completed_offers.completed_time) 

In [90]:
# Merge profile and completed
df = pd.merge(completed_offers, 
              profile, 
              how='left', 
              on=['person'])
df = df.dropna()

In [91]:
df.completed_offer.value_counts()

False    29749
True     29459
Name: completed_offer, dtype: int64

### Observations
#### 1. Dropping monetary transaction data from the dataframe
The `df` dataframe is created by combining `received`, `viewed`, and `completed` data frames using common keys `person` and `offer_id`. There's another dataframe, `transaction`, that was also split from `transcript`, containing transactions that users made with actual dollar amount, that can't not be merged into `completed_offers` because it doesn't contain the id of the offer. 

If some of these transactions imply a completed offer, we'll need to include the data by checking every transaction and see if each transaction happens after an offer is received and before the offer ends. We'll then create a completed offer record in the `df` if we find a transaction that meets this criterion.    

On the other hand, we do not have enough information about the data to know if these purchases include the product that the offers are promoting. We can ignore the monetary transactions in our data analysis until we have further insights on the data

#### 2. Dropping **informational** offer type from the dataframe
By dropping all missing values, we're dropping all **informational** offers from the dataframe. We potentially can check if there's a monetary transaction by the user that occurs after an **informational** offer is viewed and before offer 'expires'. However, similarly to the first observation, since we don't know whether the user purchases the product that the offer promotes, it'd be speculation that the purchase is related to the offer at all.

### Feature Engineering

In [92]:
df.head()

Unnamed: 0,person,offer_id,received_time,viewed_time,completed_time,reward_x,reward_y,difficulty,duration,offer_type,channel_mobile,channel_social,channel_web,completed_offer,gender,age,became_member_on,income,membership_length
0,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,6.0,132.0,5.0,5,5,7,bogo,True,False,True,True,F,75.0,2017-05-09,100000.0,1017.0
7,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,0,0.0,60.0,5.0,5,5,5,bogo,True,True,True,False,M,65.0,2018-02-09,53000.0,741.0
8,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,0,0.0,600.0,5.0,5,5,5,bogo,True,True,True,False,M,65.0,2018-02-09,53000.0,741.0
9,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,0,504.0,60.0,5.0,5,5,5,bogo,True,True,True,False,M,65.0,2018-02-09,53000.0,741.0
10,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,0,504.0,600.0,5.0,5,5,5,bogo,True,True,True,False,M,65.0,2018-02-09,53000.0,741.0


In [93]:
df = df.drop(columns=['received_time','viewed_time','completed_time','reward_x','became_member_on'])
df = df.rename(columns={'reward_y':'reward'})
df = df.replace({True: 1, False: 0})

In [95]:
df = df.reset_index(drop=True)

In [97]:
# Convert categorical variables into dummy variables
cleaned_data = pd.get_dummies(df, columns=['offer_type','gender'])

In [102]:
cleaned_data.head()

Unnamed: 0,person,offer_id,reward,difficulty,duration,channel_mobile,channel_social,channel_web,completed_offer,age,income,membership_length,offer_type_bogo,offer_type_discount,gender_F,gender_M,gender_O
0,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,7,1,0,1,1,75.0,100000.0,1017.0,1,0,1,0,0
1,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,1,1,1,0,65.0,53000.0,741.0,1,0,0,1,0
2,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,1,1,1,0,65.0,53000.0,741.0,1,0,0,1,0
3,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,1,1,1,0,65.0,53000.0,741.0,1,0,0,1,0
4,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,1,1,1,0,65.0,53000.0,741.0,1,0,0,1,0


In [106]:
cleaned_data.shape[0]

59208

In [107]:
cleaned_data = cleaned_data.drop_duplicates()

In [108]:
cleaned_data.shape[0]

28112

## Clustering and Segmentation Modeling

### Feature scaling

In [103]:
clean_df = cleaned_data.drop(columns=['person','offer_id'])

# Apply feature scaling using Standard Scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(clean_df)

# transform data
scaled_data = scaler.transform(clean_df)

In [104]:
scaled_data.shape

(59208, 15)

## Conclusion