In [1]:
#import necessary library
import numpy as np # linear algebra
import pandas as pd # data manipulation and analysis
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
sns.set_style('whitegrid') # set style for visualization
import warnings # ignore warnings
warnings.filterwarnings('ignore')

from initial_report import *

In [2]:
#import customer data
df_customer=pd.read_csv("cleaned_customer_data.csv")

In [3]:
#check df_customer sample
df_customer.sample(5)

Unnamed: 0,customer_id,became_member_on,gender,age,income
14584,61e7bf20af4c434ab2d89cdc565d8046,2017-09-26,M,33,40000.0
5171,322e2846ec574c6e8aee62fef3eea33e,2016-07-06,F,54,61000.0
871,720d1757d1d8444294aea1f0b05cf3fc,2014-03-06,F,20,47000.0
13709,ab4939d2f7a24601bc487a2c711879d3,2016-09-28,M,26,39000.0
4485,d7d5dc9730ff479c840e10eaa190e8bc,2018-06-24,F,36,64000.0


In [4]:
#import offer data
df_offer=pd.read_csv("cleaned_offers.csv")

In [5]:
#check df_event sample
df_offer.sample(5)

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,web,email,mobile,social
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,1,1,0,0
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5,5,7,1,1,1,0
9,2906b810c7d4411798c6938adc9daaa5,discount,10,2,7,1,1,1,0
8,f19421c1d4aa40978ebb69ca19b0e20d,bogo,5,5,5,1,1,1,1
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,1,1,1,1


In [6]:
#import event data
df_event=pd.read_csv("cleaned_events.csv")

In [7]:
#load df_event
df_event.sample(5)

Unnamed: 0,customer_id,event,time,offer_id,amount,reward
205060,5164a5b8f90d4d4d8404d645af18193d,offer received,504,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
163366,99e9a1916821461ebd5d5048dc2dff7d,offer completed,408,0b1e1539f2cc45b7b9fa7c272da2e1d7,,5.0
52284,0872d0fa41104fcaaf143ddc1b81a420,transaction,162,,16.09,
240591,08f5b62adc9143de953cf983f435536b,offer viewed,558,4d5c57ea9a6940dd891ad53e9dbe8da0,,
13848,a6173195564d4c7ba641bc441b28c4f0,offer viewed,0,fafdcd668e3743c1bb461111dcafc2a4,,


In [8]:
#get_initial_report of df_customer
initial_report(df_customer)

 *** DATA CLEANING CHECKLIST ***
----------------------------------------
*** Structure:
- Total Rows: 14825
- Total Columns: 5
- Column Names: ['customer_id', 'became_member_on', 'gender', 'age', 'income']

*** Data Types:
  customer_id: object
  became_member_on: object
  gender: object
  age: int64
  income: float64

*** Mixed Data Types:

*** Distinct Values per Column:
  customer_id: 14825
  became_member_on: 1707
  gender: 3
  age: 84
  income: 91

*** Null Values and Percentages:


*** Duplicates: 0

*** Negative or Zero Values:

*** Basic Statistics:
                age         income
count  14825.000000   14825.000000
mean      54.393524   65404.991568
std       17.383705   21598.299410
min       18.000000   30000.000000
25%       42.000000   49000.000000
50%       55.000000   64000.000000
75%       66.000000   80000.000000
max      101.000000  120000.000000

*** Category Description:
                             customer_id became_member_on gender
count                       

In [9]:
# Convert 'became_member_on' to datetime
df_customer['became_member_on'] = pd.to_datetime(df_customer['became_member_on'])

In [10]:
df_customer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14825 entries, 0 to 14824
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   customer_id       14825 non-null  object        
 1   became_member_on  14825 non-null  datetime64[ns]
 2   gender            14825 non-null  object        
 3   age               14825 non-null  int64         
 4   income            14825 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 579.2+ KB


In [11]:
#bin age,income and membership year for grouped analysis
df_customer['age_group'] = pd.cut(df_customer['age'], bins=[0, 30, 45, 60, 101], labels=['<30', '30-45', '45-60', '60+'])
df_customer['income_group'] = pd.qcut(df_customer['income'], q=4, labels=['low', 'mid-low', 'mid-high', 'high'])
df_customer['membership_year'] = df_customer['became_member_on'].astype(str).str[:4].astype(int)
df_customer.sample(5)

Unnamed: 0,customer_id,became_member_on,gender,age,income,age_group,income_group,membership_year
6995,4455e61955bf4b6ba269ed2856f504e8,2017-10-12,M,21,37000.0,<30,low,2017
12642,a030a3876c974c529ad8acfed1ba0e3e,2016-10-19,M,29,59000.0,<30,mid-low,2016
3515,5998802507bd4845b0531295cafaa2a4,2018-03-17,F,45,61000.0,30-45,mid-low,2018
7825,f8049ce7e45445ba8bd28204bcce33b2,2018-03-11,M,22,44000.0,<30,low,2018
2931,6c1a4e344d4b4215a07e2a3dfd655069,2017-03-03,F,57,55000.0,45-60,mid-low,2017


In [12]:
#combine age+income
df_customer['age_income_segment'] = df_customer['age_group'].astype(str) + ' - ' + df_customer['income_group'].astype(str)
df_customer.sample(5)


Unnamed: 0,customer_id,became_member_on,gender,age,income,age_group,income_group,membership_year,age_income_segment
5504,2059cc920c6f493c811776fbf72d2017,2016-12-22,M,52,72000.0,45-60,mid-high,2016,45-60 - mid-high
1211,015c3d28c67e46aa95e9ec97c27220e8,2016-03-22,M,56,99000.0,45-60,high,2016,45-60 - high
11918,2a7014d0d87945a1aa9e4748ff7ee6df,2016-01-24,F,66,73000.0,60+,mid-high,2016,60+ - mid-high
5916,1bfcd756898445fbb1cf02cbe082b89e,2017-07-26,M,32,32000.0,30-45,low,2017,30-45 - low
912,4fa5fba731c94e74918af175bf6f6bac,2016-08-07,F,64,102000.0,60+,high,2016,60+ - high


In [13]:
#check age_income_segment
df_customer.age_income_segment.value_counts()

age_income_segment
60+ - high          1815
45-60 - high        1510
60+ - mid-high      1417
60+ - mid-low       1280
45-60 - mid-high    1256
45-60 - mid-low     1174
60+ - low           1030
45-60 - low          987
30-45 - low          942
30-45 - mid-low      869
<30 - low            822
30-45 - mid-high     600
<30 - mid-low        540
<30 - mid-high       343
30-45 - high         240
Name: count, dtype: int64

In [14]:
#get a new column offer_combo in df_offer
df_offer['offer_combo'] = (
    df_offer['offer_type'].astype(str) + '-' +
    df_offer['difficulty'].astype(str) + '-' +
    df_offer['reward'].astype(str) + '-' +
    df_offer['duration'].astype(str)
)
df_offer.sample(5)

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,web,email,mobile,social,offer_combo
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,1,1,1,1,bogo-10-10-5
6,fafdcd668e3743c1bb461111dcafc2a4,discount,10,2,10,1,1,1,1,discount-10-2-10
9,2906b810c7d4411798c6938adc9daaa5,discount,10,2,7,1,1,1,0,discount-10-2-7
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,1,1,0,0,discount-20-5-10
5,2298d6c36e964ae4a3e7e9706d1fb8c2,discount,7,3,7,1,1,1,1,discount-7-3-7


In [15]:
#check offer combo counts
df_offer.offer_combo.value_counts()

offer_combo
bogo-10-10-7           1
bogo-10-10-5           1
informational-0-0-4    1
bogo-5-5-7             1
discount-20-5-10       1
discount-7-3-7         1
discount-10-2-10       1
informational-0-0-3    1
bogo-5-5-5             1
discount-10-2-7        1
Name: count, dtype: int64

In [16]:
#merge df_customer and df_event
df_merged = df_event.merge(df_customer, on='customer_id', how='left')

In [17]:
#check_sample
df_merged.sample(5)

Unnamed: 0,customer_id,event,time,offer_id,amount,reward,became_member_on,gender,age,income,age_group,income_group,membership_year,age_income_segment
154532,404bff404fbc4024bfd1da1134594b3f,offer received,408,5a8bc65990b245e5a138643cd4eb9837,,,2016-07-18,F,61.0,58000.0,60+,mid-low,2016.0,60+ - mid-low
291577,d95370555c8145d6a463b7d8a8adffee,offer completed,654,0b1e1539f2cc45b7b9fa7c272da2e1d7,,5.0,2016-10-30,F,97.0,41000.0,60+,low,2016.0,60+ - low
210559,bf4daf7b9b204d9f9b5e65a3c994009a,offer received,504,4d5c57ea9a6940dd891ad53e9dbe8da0,,,2017-04-21,M,46.0,61000.0,45-60,mid-low,2017.0,45-60 - mid-low
42944,fc804c90fc79418180ead4b5b212d927,transaction,102,,9.89,,2015-10-06,F,41.0,39000.0,30-45,low,2015.0,30-45 - low
260841,4e3cfb3b10cb4832b72871f60c0630d0,offer completed,576,4d5c57ea9a6940dd891ad53e9dbe8da0,,10.0,2018-01-04,F,29.0,43000.0,<30,low,2018.0,<30 - low


In [18]:
# drop unnecessary column from df_merged
df_merged = df_merged.drop(columns=['age', 'income', 'became_member_on'])
df_merged.sample(5)

Unnamed: 0,customer_id,event,time,offer_id,amount,reward,gender,age_group,income_group,membership_year,age_income_segment
96353,cbebad9819494b479dbf0c1207b4c0c5,transaction,258,,20.38,,M,60+,mid-high,2017.0,60+ - mid-high
2047,94aabcec49134cf797c18f719afa32d3,offer received,0,ae264e3637204a6fb9bb56bc8210ddfd,,,F,45-60,mid-low,2016.0,45-60 - mid-low
101285,e2e9ab9c2f044040994c79d9819f670d,transaction,276,,11.91,,F,60+,mid-high,2018.0,60+ - mid-high
205119,aa4d0ea86b1b4689a0a49a9290b43e85,offer received,504,2298d6c36e964ae4a3e7e9706d1fb8c2,,,M,30-45,mid-high,2017.0,30-45 - mid-high
81622,f87b27581615423f892a6ff0d40fa696,offer viewed,204,0b1e1539f2cc45b7b9fa7c272da2e1d7,,,M,45-60,mid-high,2017.0,45-60 - mid-high


In [19]:
#trim df_offer
df_offer_trimmed = df_offer.drop(columns=['offer_type', 'difficulty', 'reward',"duration"])

In [20]:
#merge df_merged and df_offer
df_full = df_merged.merge(df_offer_trimmed, on='offer_id', how='left')
df_full.sample(5)

Unnamed: 0,customer_id,event,time,offer_id,amount,reward,gender,age_group,income_group,membership_year,age_income_segment,web,email,mobile,social,offer_combo
179181,640052628384401ea425fba9441a8d2f,transaction,432,,24.81,,M,45-60,mid-high,2016.0,45-60 - mid-high,,,,,
67268,1bfcd756898445fbb1cf02cbe082b89e,offer viewed,168,fafdcd668e3743c1bb461111dcafc2a4,,,M,30-45,low,2017.0,30-45 - low,1.0,1.0,1.0,1.0,discount-10-2-10
47175,f8638593542b4a5b8739b5d12dbb3680,transaction,126,,3.88,,M,45-60,low,2018.0,45-60 - low,,,,,
50193,8583bb774d2e4496a96c1550a036c8b8,transaction,144,,3.14,,M,45-60,mid-low,2014.0,45-60 - mid-low,,,,,
119364,81d8b9eb710c49f6ada2ae0af9fa9be1,offer received,336,0b1e1539f2cc45b7b9fa7c272da2e1d7,,,F,45-60,high,2016.0,45-60 - high,1.0,1.0,0.0,0.0,discount-20-5-10
