In [1]:
# Import required libraries
import numpy as np # linear algebra
import pandas as pd # data manipulation and analysis
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
sns.set_style('whitegrid') # use seaborn's 'whitegrid' style for all plots
import warnings # used on the next line to silence warnings
warnings.filterwarnings('ignore') # NOTE(review): suppresses every warning, including library deprecation notices

# NOTE(review): star import; presumably provides the initial_report() helper called later in the notebook - confirm and import it by name
from initial_report import *

In [2]:
# Load the cleaned customer table from CSV and preview its first five rows
df_customer = pd.read_csv("cleaned_customer_data.csv")
df_customer.head(n=5)

Unnamed: 0,customer_id,became_member_on,gender,age,income
0,0610b486422d4921ae7d2bf64640c50b,2017-07-15,F,55,112000.0
1,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,F,75,100000.0
2,e2127556f4f64592b11af22de27a7932,2018-04-26,M,68,70000.0
3,389bc3fa690240e798340f5a15918d5c,2018-02-09,M,65,53000.0
4,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,M,58,51000.0


In [3]:
# Run the initial_report helper on the customer table; it prints the
# data-cleaning checklist shown in the output below.
initial_report(df_customer)

 *** DATA CLEANING CHECKLIST ***
----------------------------------------
*** Structure:
- Total Rows: 14825
- Total Columns: 5
- Column Names: ['customer_id', 'became_member_on', 'gender', 'age', 'income']

*** Data Types:
  customer_id: object
  became_member_on: object
  gender: object
  age: int64
  income: float64

*** Mixed Data Types:

*** Distinct Values per Column:
  customer_id: 14825
  became_member_on: 1707
  gender: 3
  age: 84
  income: 91

*** Null Values and Percentages:


*** Duplicates: 0

*** Negative or Zero Values:

*** Basic Statistics:
                age         income
count  14825.000000   14825.000000
mean      54.393524   65404.991568
std       17.383705   21598.299410
min       18.000000   30000.000000
25%       42.000000   49000.000000
50%       55.000000   64000.000000
75%       66.000000   80000.000000
max      101.000000  120000.000000

*** Category Description:
                             customer_id became_member_on gender
count                       

Problems:
1. The `became_member_on` column has dtype object - convert it to datetime.

In [4]:
# Parse the membership-date strings into datetime values, store them back on
# the frame, then print the column dtype to confirm the conversion.
member_dates = pd.to_datetime(df_customer['became_member_on'])
df_customer['became_member_on'] = member_dates
print(df_customer['became_member_on'].dtype)

datetime64[ns]


In [5]:
# Bucket each customer's age into a labelled band.
# Every band is inclusive at both ends; an age that falls in no band
# is labelled 'Unknown'.
age = df_customer['age']
age_bands = {
    'Young Adult': (18, 34),
    'Middle Age Adult': (35, 49),
    'Older Adult': (50, 64),
    'Senior': (65, 79),
    'Elderly': (80, 110),
}

df_customer['age_group'] = np.select(
    [age.between(low, high) for low, high in age_bands.values()],
    list(age_bands),
    default='Unknown',
)

# Preview the frame with the new column
df_customer.head()

Unnamed: 0,customer_id,became_member_on,gender,age,income,age_group
0,0610b486422d4921ae7d2bf64640c50b,2017-07-15,F,55,112000.0,Older Adult
1,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,F,75,100000.0,Senior
2,e2127556f4f64592b11af22de27a7932,2018-04-26,M,68,70000.0,Senior
3,389bc3fa690240e798340f5a15918d5c,2018-02-09,M,65,53000.0,Senior
4,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,M,58,51000.0,Older Adult


In [6]:
# Number of customers in each age group, most frequent first
df_customer['age_group'].value_counts()

age_group
Older Adult         5150
Senior              3164
Middle Age Adult    3153
Young Adult         2256
Elderly             1102
Name: count, dtype: int64

In [7]:
# Split income into three labelled bands. With right=True each bin includes
# its upper edge: (0, 44000] -> Low, (44000, 84000] -> Middle,
# (84000, inf] -> High.
labels = ['Low Income', 'Middle Income', 'High Income']
bins = [0, 44000, 84000, np.inf]

income_bands = pd.cut(
    df_customer['income'],
    bins=bins,
    labels=labels,
    right=True,
)
df_customer['income_group'] = income_bands

# Preview the frame with the new column
df_customer.head()

Unnamed: 0,customer_id,became_member_on,gender,age,income,age_group,income_group
0,0610b486422d4921ae7d2bf64640c50b,2017-07-15,F,55,112000.0,Older Adult,High Income
1,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,F,75,100000.0,Senior,High Income
2,e2127556f4f64592b11af22de27a7932,2018-04-26,M,68,70000.0,Senior,Middle Income
3,389bc3fa690240e798340f5a15918d5c,2018-02-09,M,65,53000.0,Senior,Middle Income
4,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,M,58,51000.0,Older Adult,Middle Income


In [8]:
# Number of customers in each income band, most frequent first
df_customer['income_group'].value_counts()

income_group
Middle Income    8941
High Income      3015
Low Income       2869
Name: count, dtype: int64

In [9]:
# Share of customers per gender, as a percentage rounded to two decimals
gender_counts = df_customer.groupby('gender').size()
gender_percent = gender_counts.div(len(df_customer)).mul(100)
gender_percent = gender_percent.round(2)
print(gender_percent)


gender
F    41.34
M    57.23
O     1.43
dtype: float64


In [10]:
# Derive the calendar year in which each customer became a member.
# Read the year through the datetime accessor instead of slicing the first
# four characters of the string form, so the result does not depend on how
# the dates happen to be rendered as text. pd.to_datetime leaves an
# already-converted datetime column unchanged, so this cell works whether or
# not 'became_member_on' has been converted beforehand.
# NOTE: missing dates would give NaN years here; the initial report above
# shows no nulls in this column.
df_customer['membership_year'] = pd.to_datetime(df_customer['became_member_on']).dt.year
df_customer.head()

Unnamed: 0,customer_id,became_member_on,gender,age,income,age_group,income_group,membership_year
0,0610b486422d4921ae7d2bf64640c50b,2017-07-15,F,55,112000.0,Older Adult,High Income,2017
1,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,F,75,100000.0,Senior,High Income,2017
2,e2127556f4f64592b11af22de27a7932,2018-04-26,M,68,70000.0,Senior,Middle Income,2018
3,389bc3fa690240e798340f5a15918d5c,2018-02-09,M,65,53000.0,Senior,Middle Income,2018
4,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,M,58,51000.0,Older Adult,Middle Income,2017


In [11]:
# Number of customers who joined in each year, most frequent first
df_customer['membership_year'].value_counts()

membership_year
2017    5599
2018    3669
2016    3024
2015    1597
2014     662
2013     274
Name: count, dtype: int64

In [12]:
# Drop the raw columns now that their grouped versions
# (membership_year, age_group, income_group) exist.
# errors='ignore' keeps the cell idempotent: df_customer is overwritten here,
# so re-running the cell would otherwise raise KeyError because the columns
# are already gone.
raw_columns = ['became_member_on', 'age', 'income']
df_customer = df_customer.drop(columns=raw_columns, errors='ignore')
df_customer.head()

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year
0,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017
1,78afa995795e4d85b5d9ceeca43f5fef,F,Senior,High Income,2017
2,e2127556f4f64592b11af22de27a7932,M,Senior,Middle Income,2018
3,389bc3fa690240e798340f5a15918d5c,M,Senior,Middle Income,2018
4,2eeac8d8feae4a8cad5a6af0499a211d,M,Older Adult,Middle Income,2017


In [13]:
# Load the cleaned events table from CSV and preview its first five rows
df_event = pd.read_csv("cleaned_events.csv")
df_event.head(n=5)

Unnamed: 0,customer_id,event,time,offer_id,amount,reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,,
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,,
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,,
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,4d5c57ea9a6940dd891ad53e9dbe8da0,,


In [14]:
# Attach each customer's events to their profile row. A left join keeps every
# customer in df_customer, even one with no matching event rows.
df_customer_events = df_customer.merge(
    df_event,
    how='left',
    on='customer_id',
)
df_customer_events.head()

Unnamed: 0,customer_id,gender,age_group,income_group,membership_year,event,time,offer_id,amount,reward
0,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,18,,21.51,
1,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,144,,32.28,
2,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer received,408,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
3,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,offer received,504,3f207df678b143eea3cee63160fa8bed,,
4,0610b486422d4921ae7d2bf64640c50b,F,Older Adult,High Income,2017,transaction,528,,23.22,


In [15]:
# Number of rows per event type in the merged frame, most frequent first
df_customer_events['event'].value_counts()

event
transaction        123957
offer received      66501
offer viewed        49860
offer completed     32070
Name: count, dtype: int64

In [16]:
# Total of the 'amount' column over all merged event rows
# (rows with a missing amount are skipped by sum()).
amounts = df_customer_events['amount']
total_transaction_amount = amounts.sum()
print(total_transaction_amount)

1734942.4


In [17]:
# Mean of the 'amount' column over all merged event rows
# (rows with a missing amount are excluded from the mean).
amounts = df_customer_events['amount']
avg_transaction_amount = amounts.mean()
print(avg_transaction_amount)


13.996324531894123


In [31]:
# Average spend per customer = overall transaction total / distinct customers.
# The denominator counts every customer_id in the merged frame, so it includes
# any customer who has no 'transaction' rows.
is_transaction = df_customer_events['event'] == "transaction"
df_transactions = df_customer_events[is_transaction]

# Single overall total across all transaction rows (not a per-customer breakdown)
total_transaction_amount = df_transactions['amount'].sum()
distinct_customers_count = df_customer_events['customer_id'].nunique()

avg_spend_per_customer = total_transaction_amount / distinct_customers_count
print(avg_spend_per_customer)


117.02815514333896


In [15]:
# Load the cleaned offers table from CSV and preview its first five rows
df_offer = pd.read_csv("cleaned_offers.csv")
df_offer.head(n=5)

Unnamed: 0,offer_id,offer_type,difficulty,reward,duration,web,email,mobile,social
0,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10,10,7,0,1,1,1
1,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10,10,5,1,1,1,1
2,3f207df678b143eea3cee63160fa8bed,informational,0,0,4,1,1,1,0
3,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5,5,7,1,1,1,0
4,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,20,5,10,1,1,0,0
