In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import scipy
import matplotlib.pyplot as mlt
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

# Define Color Pallette

In [2]:
color1 = '#00704A'
color2 = '#FF9FE5'
color3 = '#45062E'
backgroundColor = '#B8A085'
ccs = ['#ACDDA9', '#00704A', '#002F20']
dcs = ['#002619','#008256', '#00de92', '#a1ffdf', '#fdfffe']

# Import Data

In [3]:
df = pd.read_csv('transactions.csv')
df.drop(['Unnamed: 0.1','Unnamed: 0' ], axis =1, inplace=True)
df = df.loc[df['anonymous'] == 0]
df.head()

Unnamed: 0,person,event,time,offer_id,reward,difficulty,duration,offer_type,web,email,mobile,social,gender,age,became_member_on,income,anonymous,income_cluster
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,7,bogo,1,1,1,0,F,75,2017-05-09,100000.0,0,0
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,2,10,7,discount,1,1,1,0,M,68,2018-04-26,70000.0,0,1
5,389bc3fa690240e798340f5a15918d5c,offer received,0,f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,bogo,1,1,1,1,M,65,2018-02-09,53000.0,0,1
7,2eeac8d8feae4a8cad5a6af0499a211d,offer received,0,3f207df678b143eea3cee63160fa8bed,0,0,4,informational,1,1,1,0,M,58,2017-11-11,51000.0,0,1
8,aa4862eba776480b8bb9c68455b8c2e1,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,10,discount,1,1,0,0,F,61,2017-09-11,57000.0,0,1


# Feature Engineering
## We want an indivdual observation for each customer with relevant data

## Offer Count for each Customer   
#### Number of BOGO, Discount, and Information Offers
#### 'offerNumDF'

In [4]:
# Start with getting a offer count for each customer
A = df.groupby(['person', 'offer_type']).count().event.reset_index()
A.columns = ['person', 'offer_type', 'Number of Offers']
A
# Pivot longer
A = A.pivot(columns = 'offer_type', values = 'Number of Offers',index = 'person').reset_index()
A.columns = ['person', 'BOGO_Number', 'Discount_Number', 'Informational_Number']
A.fillna(0, inplace = True)
offerNumDF = A
offerNumDF

Unnamed: 0,person,BOGO_Number,Discount_Number,Informational_Number
0,0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0
1,0011e0d4e6b944f998e987f904e8c1e5,3.0,6.0,4.0
2,0020c2b971eb4e9188eac86d93036a77,4.0,5.0,2.0
3,0020ccbbb6d84e358d3414a3ff76cffd,6.0,3.0,2.0
4,003d66b6608740288d6cc97a6903f4f0,0.0,8.0,4.0
...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,3.0,6.0,3.0
14816,fff7576017104bcc8677a8d63322b5e1,6.0,6.0,0.0
14817,fff8957ea8b240a6b5e634b6ee8eafcf,2.0,2.0,1.0
14818,fffad4f4828548d1b5583907f2e9906b,9.0,0.0,2.0


## Aggregate offers channels (web, mobile, social, email) and Reward, difficulty and offer duration
#### 'aggDF'

In [5]:
A = df.loc[:, ['person', 'reward', 'difficulty', 'duration', 'web', 'email', 'social', 'mobile']]
aggDF = A.groupby('person').agg(
    reward_avg = ('reward', 'mean'),
    difficulty_avg = ('difficulty', 'mean'),
    duration_avg = ('duration', 'mean'),
    web = ('web', 'sum'), 
    email = ('email', 'sum'),
    social = ('social', 'sum'),
    mobile = ('mobile', 'sum')
).reset_index()
aggDF

Unnamed: 0,person,reward_avg,difficulty_avg,duration_avg,web,email,social,mobile
0,0009655768c64bdeb2e877511632db8f,2.083333,5.416667,6.083333,10,12,8,12
1,0011e0d4e6b944f998e987f904e8c1e5,3.000000,7.384615,6.615385,11,13,5,10
2,0020c2b971eb4e9188eac86d93036a77,4.545455,8.181818,7.090909,8,11,11,11
3,0020ccbbb6d84e358d3414a3ff76cffd,3.545455,4.636364,5.727273,9,11,8,11
4,003d66b6608740288d6cc97a6903f4f0,1.833333,8.333333,7.833333,10,12,8,10
...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,2.250000,6.250000,6.750000,9,12,6,12
14816,fff7576017104bcc8677a8d63322b5e1,5.166667,9.166667,8.166667,10,12,10,12
14817,fff8957ea8b240a6b5e634b6ee8eafcf,4.800000,8.000000,6.800000,5,5,4,5
14818,fffad4f4828548d1b5583907f2e9906b,4.090909,4.090909,5.181818,9,11,8,11


## Event aggregation (How many offers received, viewed and completed)
#### 'eventDF'

In [6]:
A = df.loc[:, ['person', 'event', 'web']]
A = A.groupby(['person', 'event']).count().reset_index()
eventDF = A.pivot(index = 'person', columns = 'event', values = 'web').reset_index()
eventDF.fillna(0, inplace = True)
eventDF['ratio_completed'] = round(eventDF['offer completed']/eventDF['offer received'] ,2)
eventDF['ratio_viewed'] = round(eventDF['offer viewed']/ eventDF['offer received'],2)
eventDF

event,person,offer completed,offer received,offer viewed,ratio_completed,ratio_viewed
0,0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0,0.60,0.80
1,0011e0d4e6b944f998e987f904e8c1e5,3.0,5.0,5.0,0.60,1.00
2,0020c2b971eb4e9188eac86d93036a77,3.0,5.0,3.0,0.60,0.60
3,0020ccbbb6d84e358d3414a3ff76cffd,3.0,4.0,4.0,0.75,1.00
4,003d66b6608740288d6cc97a6903f4f0,3.0,5.0,4.0,0.60,0.80
...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,3.0,6.0,3.0,0.50,0.50
14816,fff7576017104bcc8677a8d63322b5e1,3.0,5.0,4.0,0.60,0.80
14817,fff8957ea8b240a6b5e634b6ee8eafcf,0.0,3.0,2.0,0.00,0.67
14818,fffad4f4828548d1b5583907f2e9906b,3.0,4.0,4.0,0.75,1.00


## Create Demographics DF (age, income, year been a  member, gender)
#### 'demoDF'

In [7]:
demoDF = df.loc[:,['person', 'gender', 'age', 'became_member_on', 'income']]
demoDF = demoDF.groupby('person').agg(
    gender = ('gender', 'first'),
    age = ('age', 'first'),
    income = ('income', 'first'),
    became_member_on = ('became_member_on', 'first')
).reset_index()
# convert became a member on to date
demoDF['became_member_on'] = pd.to_datetime(demoDF['became_member_on'])
demoDF['year_became_member'] = demoDF['became_member_on'].dt.year
demoDF.drop(['became_member_on'], axis = 1, inplace = True)
demoDF

Unnamed: 0,person,gender,age,income,year_became_member
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017
...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017


## Attach how much money they spent and most recent transaction date

In [8]:
money = pd.read_csv('transcriptT.csv')
money.drop(['Unnamed: 0', 'value'], axis = 1, inplace = True)
money = money.groupby('person').agg(
    most_recent = ('time', 'last'),
    total_spent = ('spent', 'sum'),
    transactions_number = ('event', 'count')
).reset_index()
money

Unnamed: 0,person,most_recent,total_spent,transactions_number
0,0009655768c64bdeb2e877511632db8f,696,127.60,8
1,00116118485d4dfda04fdbaba9a87b5c,474,4.09,3
2,0011e0d4e6b944f998e987f904e8c1e5,654,79.46,5
3,0020c2b971eb4e9188eac86d93036a77,708,196.86,8
4,0020ccbbb6d84e358d3414a3ff76cffd,672,154.05,12
...,...,...,...,...
16573,fff3ba4757bd42088c044ca26d73817a,552,580.98,11
16574,fff7576017104bcc8677a8d63322b5e1,696,29.94,6
16575,fff8957ea8b240a6b5e634b6ee8eafcf,576,12.15,5
16576,fffad4f4828548d1b5583907f2e9906b,678,88.83,12


## Merge

In [9]:
mergeDF = offerNumDF.merge(aggDF, how = 'left', on = 'person')
mergeDF = mergeDF.merge(eventDF, how = 'left', on = 'person')

mergeDF = demoDF.merge(mergeDF, how = 'left', on = 'person')
mergeDF = mergeDF.merge(money, how = 'left', on = 'person')
mergeDF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,...,social,mobile,offer completed,offer received,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017,3.0,5.0,4.0,2.083333,5.416667,...,8,12,3.0,5.0,4.0,0.60,0.80,696.0,127.60,8.0
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018,3.0,6.0,4.0,3.000000,7.384615,...,5,10,3.0,5.0,5.0,0.60,1.00,654.0,79.46,5.0
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016,4.0,5.0,2.0,4.545455,8.181818,...,11,11,3.0,5.0,3.0,0.60,0.60,708.0,196.86,8.0
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,...,8,11,3.0,4.0,4.0,0.75,1.00,672.0,154.05,12.0
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017,0.0,8.0,4.0,1.833333,8.333333,...,8,10,3.0,5.0,4.0,0.60,0.80,696.0,48.34,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,...,6,12,3.0,6.0,3.0,0.50,0.50,552.0,580.98,11.0
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,...,10,12,3.0,5.0,4.0,0.60,0.80,696.0,29.94,6.0
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018,2.0,2.0,1.0,4.800000,8.000000,...,4,5,0.0,3.0,2.0,0.00,0.67,576.0,12.15,5.0
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017,9.0,0.0,2.0,4.090909,4.090909,...,8,11,3.0,4.0,4.0,0.75,1.00,678.0,88.83,12.0


### Look for NAs

In [10]:
mergeDF.isna().any()

person                  False
gender                  False
age                     False
income                  False
year_became_member      False
BOGO_Number             False
Discount_Number         False
Informational_Number    False
reward_avg              False
difficulty_avg          False
duration_avg            False
web                     False
email                   False
social                  False
mobile                  False
offer completed         False
offer received          False
offer viewed            False
ratio_completed         False
ratio_viewed            False
most_recent              True
total_spent              True
transactions_number      True
dtype: bool

### Only NA values are in actual transactions, meaning they never bought anything, fill nas with 0

In [11]:
mergeDF.fillna(0, inplace=True)
mergeDF.isna().any()

person                  False
gender                  False
age                     False
income                  False
year_became_member      False
BOGO_Number             False
Discount_Number         False
Informational_Number    False
reward_avg              False
difficulty_avg          False
duration_avg            False
web                     False
email                   False
social                  False
mobile                  False
offer completed         False
offer received          False
offer viewed            False
ratio_completed         False
ratio_viewed            False
most_recent             False
total_spent             False
transactions_number     False
dtype: bool

## Calculate RFM score (recency, frequency, monetary)

In [12]:
mergeDF['M_score'] = pd.cut(mergeDF['total_spent'], bins=[-1,
       np.percentile(mergeDF['total_spent'], 20),
       np.percentile(mergeDF['total_spent'], 40),
       np.percentile(mergeDF['total_spent'], 60),
       np.percentile(mergeDF['total_spent'], 80),
       mergeDF['total_spent'].max()],
                           labels = [1,2,3,4,5]).astype('int')

In [13]:
mergeDF[['M_score', 'total_spent']]

Unnamed: 0,M_score,total_spent
0,4,127.60
1,3,79.46
2,5,196.86
3,4,154.05
4,2,48.34
...,...,...
14815,5,580.98
14816,2,29.94
14817,1,12.15
14818,3,88.83


In [14]:
mergeDF['R_score'] = pd.cut(mergeDF['most_recent'], bins=[-1,
       np.percentile(mergeDF['most_recent'], 20),
       np.percentile(mergeDF['most_recent'], 40),
       np.percentile(mergeDF['most_recent'], 60),
       np.percentile(mergeDF['most_recent'], 80),
       mergeDF['most_recent'].max()],
                           labels = [1,2,3,4,5]).astype('int')

In [15]:
mergeDF[['R_score', 'most_recent']]

Unnamed: 0,R_score,most_recent
0,4,696.0
1,3,654.0
2,5,708.0
3,4,672.0
4,4,696.0
...,...,...
14815,1,552.0
14816,4,696.0
14817,2,576.0
14818,4,678.0


In [16]:
mergeDF['F_score'] = pd.cut(mergeDF['transactions_number'], bins=[-1,
       np.percentile(mergeDF['transactions_number'], 20),
       np.percentile(mergeDF['transactions_number'], 40),
       np.percentile(mergeDF['transactions_number'], 60),
       np.percentile(mergeDF['transactions_number'], 80),
       mergeDF['transactions_number'].max()],
                           labels = [1,2,3,4,5]).astype('int')

In [17]:
mergeDF[['F_score', 'transactions_number']]

Unnamed: 0,F_score,transactions_number
0,3,8.0
1,2,5.0
2,3,8.0
3,4,12.0
4,5,18.0
...,...,...
14815,4,11.0
14816,2,6.0
14817,2,5.0
14818,4,12.0


In [18]:
mergeDF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,...,offer received,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017,3.0,5.0,4.0,2.083333,5.416667,...,5.0,4.0,0.60,0.80,696.0,127.60,8.0,4,4,3
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018,3.0,6.0,4.0,3.000000,7.384615,...,5.0,5.0,0.60,1.00,654.0,79.46,5.0,3,3,2
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016,4.0,5.0,2.0,4.545455,8.181818,...,5.0,3.0,0.60,0.60,708.0,196.86,8.0,5,5,3
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,...,4.0,4.0,0.75,1.00,672.0,154.05,12.0,4,4,4
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017,0.0,8.0,4.0,1.833333,8.333333,...,5.0,4.0,0.60,0.80,696.0,48.34,18.0,2,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,...,6.0,3.0,0.50,0.50,552.0,580.98,11.0,5,1,4
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,...,5.0,4.0,0.60,0.80,696.0,29.94,6.0,2,4,2
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018,2.0,2.0,1.0,4.800000,8.000000,...,3.0,2.0,0.00,0.67,576.0,12.15,5.0,1,2,2
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017,9.0,0.0,2.0,4.090909,4.090909,...,4.0,4.0,0.75,1.00,678.0,88.83,12.0,3,4,4


# Dimentionality Reduction with PCA
### ordinal encode gender field
### Scale data

In [19]:
mergeDF['gender_encode'] = np.where(mergeDF['gender']=='M', 0, np.where(mergeDF['gender'] == 'F', 1,2))
mergeDF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,...,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017,3.0,5.0,4.0,2.083333,5.416667,...,4.0,0.60,0.80,696.0,127.60,8.0,4,4,3,0
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018,3.0,6.0,4.0,3.000000,7.384615,...,5.0,0.60,1.00,654.0,79.46,5.0,3,3,2,2
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016,4.0,5.0,2.0,4.545455,8.181818,...,3.0,0.60,0.60,708.0,196.86,8.0,5,5,3,1
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,...,4.0,0.75,1.00,672.0,154.05,12.0,4,4,4,1
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017,0.0,8.0,4.0,1.833333,8.333333,...,4.0,0.60,0.80,696.0,48.34,18.0,2,4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,...,3.0,0.50,0.50,552.0,580.98,11.0,5,1,4,1
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,...,4.0,0.60,0.80,696.0,29.94,6.0,2,4,2,0
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018,2.0,2.0,1.0,4.800000,8.000000,...,2.0,0.00,0.67,576.0,12.15,5.0,1,2,2,0
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017,9.0,0.0,2.0,4.090909,4.090909,...,4.0,0.75,1.00,678.0,88.83,12.0,3,4,4,0


In [20]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [21]:
scaledDF = pd.DataFrame(ss.fit_transform(mergeDF.iloc[:,2:]))
scaledDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,-1.230488,0.305275,0.316911,-0.453489,0.274803,1.585800,-1.352218,-0.976632,-0.468545,0.578338,...,0.492730,0.339644,0.196280,0.650670,0.081163,-0.070019,0.707190,0.776362,0.117739,-0.842316
1,-0.827786,-0.389256,1.151419,-0.453489,0.620259,1.585800,-0.816084,-0.178736,0.025330,0.882792,...,1.267917,0.339644,1.042102,0.310075,-0.289214,-0.648872,0.000143,0.046908,-0.590416,2.969908
2,0.265262,1.138711,-0.517598,-0.106596,0.274803,0.302263,0.087812,0.144486,0.466734,-0.030569,...,-0.282457,0.339644,-0.649542,0.747983,0.614032,-0.070019,1.414237,1.505816,0.117739,1.063796
3,-1.748248,-0.250350,-0.517598,0.587190,-0.416109,0.302263,-0.497062,-1.293002,-0.799057,0.273885,...,0.492730,0.792722,1.042102,0.456044,0.284663,0.701783,0.707190,0.776362,0.825894,1.063796
4,-1.633190,0.351577,0.316911,-1.494167,1.311171,1.585800,-1.498436,0.205917,1.155888,0.578338,...,0.492730,0.339644,0.196280,0.650670,-0.528643,1.859487,-0.706904,0.776362,1.534049,1.063796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,0.840550,0.814597,-1.352107,-0.453489,0.620259,0.944032,-1.254739,-0.638761,0.150287,0.273885,...,-0.282457,0.037591,-1.072453,-0.517085,3.569356,0.508833,1.414237,-1.412001,0.825894,1.063796
14816,0.955608,0.351577,0.316911,0.587190,0.620259,-0.981273,0.451142,0.543789,1.465303,0.578338,...,0.492730,0.339644,0.196280,0.650670,-0.670208,-0.455921,-0.706904,0.776362,-0.590416,-0.842316
14817,0.955608,-0.435558,1.151419,-0.800381,-0.761565,-0.339505,0.236689,0.070769,0.196699,-0.943929,...,-1.057644,-1.472669,-0.353504,-0.322459,-0.807080,-0.648872,-1.413951,-0.682547,-0.590416,-0.842316
14818,-1.172959,-1.454202,0.316911,1.627868,-1.452477,0.302263,-0.178040,-1.514155,-1.305374,0.273885,...,0.492730,0.792722,1.042102,0.504701,-0.217123,0.701783,0.000143,0.776362,0.825894,-0.842316


In [22]:
from sklearn.decomposition import PCA
pca = PCA()

In [23]:
pca.set_params(n_components = None)
pca.fit(scaledDF)

PCA()

In [24]:
pca.explained_variance_ratio_

array([2.97048785e-01, 1.25851418e-01, 1.02472991e-01, 8.61348898e-02,
       8.12626492e-02, 4.71114165e-02, 4.46920181e-02, 3.58557617e-02,
       3.41681723e-02, 2.82355801e-02, 2.68438825e-02, 1.97084254e-02,
       1.59533917e-02, 1.49934666e-02, 1.07067717e-02, 9.77178461e-03,
       5.36986268e-03, 4.96523329e-03, 3.00844780e-03, 2.52966933e-03,
       1.39911457e-03, 1.17918505e-03, 7.37082333e-04, 4.31235893e-33,
       9.94794712e-34])

In [25]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [113]:
fig = go.Figure()

fig.add_trace(go.Bar(x = list(range(25)), y = pca.explained_variance_ratio_))
fig.add_trace(go.Scatter(x= list(range(25)), y=np.cumsum(pca.explained_variance_ratio_)))
fig.update_layout(
    colorway = [color1, color1],
    showlegend= False,
    title = 'Principal Component Analysis <br> Explained Variance',
    xaxis_title = 'Principal Components',
    yaxis_title = 'Explained Variance',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size =25),
        linecolor = 'black',
        range = [0,20]
        #showticklabels = False
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
        
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)

#newnames = {'Percent Anonymous Completed':'Anonymous', 'Percent of Known Completed':'Known'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                         legendgroup = newnames[t.name],
#                                        hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                        )
#                  )

## 90% of variance can be explained by the first 11 components

In [27]:
pca.set_params(n_components = 11)
pcaDF = pd.DataFrame(pca.fit_transform(scaledDF))

# Clustering

In [28]:
from sklearn.cluster import KMeans
from PlottingFunctions import plot_inertia, plot_silhouette
from sklearn.metrics import silhouette_score, silhouette_samples

In [29]:
kmeans = KMeans()
distortions = []
for k in range(1,15):
    kmeanModel = KMeans(n_clusters = k)
    kmeanModel.fit(pcaDF)
    distortions.append(kmeanModel.inertia_)

In [30]:
distortions

[337035.53782382177,
 262136.8473037376,
 242185.20988075377,
 225918.0103742891,
 214156.98561219708,
 204022.15450669013,
 195934.14867914666,
 189291.23636728272,
 183295.35874469703,
 178227.4974723419,
 173893.4288781227,
 170303.70986598323,
 167261.07859448524,
 164559.50006028917]

In [31]:
silhos = []
for k in range(2,15):
    kmeanModel = KMeans(n_clusters = k)
    kmeanModel.fit(pcaDF)
    silhos.append(silhouette_score(pcaDF, kmeanModel.labels_, metric = 'euclidean'))
silhos

[0.19244004956877472,
 0.13020999329884353,
 0.11777643545967983,
 0.1169738614733342,
 0.11632148189470634,
 0.10935946635433813,
 0.10790207307808504,
 0.10617030014174086,
 0.10520830657599758,
 0.10427468869383749,
 0.10246316642691106,
 0.10128060251471181,
 0.10171388537656592]

In [32]:
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x = list(range(1,15)), y = distortions, name = 'Distortion'), secondary_y = False
)
fig.add_trace(
    go.Scatter(x = list(range(2,15)), y = silhos, name = 'Silhouette Score'), secondary_y = True
)
fig.add_trace(
    go.Scatter(x = [4,4],y = [0,338000], name = 'Clusters = 4',line = dict(dash = 'dash')), secondary_y = False
)
fig.update_layout(
    colorway = [color1, color3, 'black'],
    title = 'Inertia and Silhouette Scores K-Means Fit <br> on Transformed PCA Data',
#    xaxis_title = 'Principal Components',
#    yaxis_title = 'Explained Variance',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size = 25),
        linecolor = 'black',
        nticks = 10,
        showgrid = False
    ),
    xaxis = dict(
        tickfont = dict(size=16),
        title = 'Number of Clusters',
        titlefont = dict(size =25),
        linecolor = 'black'
        
          
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.9,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
        
    )   
)
fig.update_yaxes(title_text = 'Distortion', secondary_y=False)
fig.update_yaxes(title_text = 'Silhouette Score', secondary_y=True, title_font = {'size':20}, nticks=12, showgrid=False)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'Percent Anonymous Completed':'Anonymous', 'Percent of Known Completed':'Known'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                         legendgroup = newnames[t.name],
#                                        hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                        )
#                  )

In [33]:
kmeans = KMeans()
kmeans.set_params(n_clusters = 4)
kmeans.fit(pcaDF)

KMeans(n_clusters=4)

In [34]:
kmeans.labels_

array([0, 0, 3, ..., 2, 0, 1])

In [35]:
clusterDF = pd.concat([mergeDF, pd.Series(kmeans.labels_, name = 'cluster')], axis = 1)
clusterDF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017,3.0,5.0,4.0,2.083333,5.416667,...,0.60,0.80,696.0,127.60,8.0,4,4,3,0,0
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018,3.0,6.0,4.0,3.000000,7.384615,...,0.60,1.00,654.0,79.46,5.0,3,3,2,2,0
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016,4.0,5.0,2.0,4.545455,8.181818,...,0.60,0.60,708.0,196.86,8.0,5,5,3,1,3
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,...,0.75,1.00,672.0,154.05,12.0,4,4,4,1,1
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017,0.0,8.0,4.0,1.833333,8.333333,...,0.60,0.80,696.0,48.34,18.0,2,4,5,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,...,0.50,0.50,552.0,580.98,11.0,5,1,4,1,1
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,...,0.60,0.80,696.0,29.94,6.0,2,4,2,0,1
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018,2.0,2.0,1.0,4.800000,8.000000,...,0.00,0.67,576.0,12.15,5.0,1,2,2,0,2
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017,9.0,0.0,2.0,4.090909,4.090909,...,0.75,1.00,678.0,88.83,12.0,3,4,4,0,0


In [36]:
cluster0DF = clusterDF.loc[clusterDF['cluster'] == 0]
cluster1DF = clusterDF.loc[clusterDF['cluster'] == 1]
cluster2DF = clusterDF.loc[clusterDF['cluster'] == 2]
cluster3DF = clusterDF.loc[clusterDF['cluster'] == 3]

# Cluster Analysis

In [37]:
cluster0DF.iloc[:,2:].describe()

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
count,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,...,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0,3579.0
mean,55.731769,65986.309025,2017.061749,4.286952,3.348701,2.650461,3.97711,6.363353,5.974531,7.73596,...,0.292422,0.831618,575.49036,65.405845,5.677843,2.327186,2.455993,2.072646,0.423023,0.0
std,16.942745,21013.293519,1.035609,2.442766,2.397177,1.79027,1.608444,1.982311,0.923382,2.439284,...,0.217769,0.173014,153.074282,65.098965,3.730944,1.120452,1.351956,1.174433,0.524829,0.0
min,18.0,30000.0,2013.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.17,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
25%,45.0,51000.0,2017.0,3.0,2.0,2.0,2.8,5.0,5.333333,6.0,...,0.17,0.75,546.0,21.56,3.0,1.0,1.0,1.0,0.0,0.0
50%,56.0,64000.0,2017.0,4.0,3.0,2.0,3.9,6.363636,5.923077,8.0,...,0.25,0.8,618.0,48.79,5.0,2.0,2.0,2.0,0.0,0.0
75%,67.0,80000.0,2018.0,6.0,5.0,4.0,5.0,7.727273,6.583333,9.0,...,0.5,1.0,672.0,94.095,7.0,3.0,4.0,3.0,1.0,0.0
max,101.0,120000.0,2018.0,13.0,13.0,10.0,10.0,14.0,9.5,15.0,...,1.0,1.0,714.0,1016.93,25.0,5.0,5.0,5.0,2.0,0.0


In [38]:
cluster1DF.iloc[:,2:].describe()

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
count,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0,...,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0,3922.0
mean,55.536206,69258.031617,2016.31846,6.489036,5.935747,1.328914,4.778732,8.119559,6.742818,11.429883,...,0.772746,0.865291,656.134115,210.989258,11.791943,4.196073,3.349567,3.779959,0.507649,1.0
std,16.834488,21172.533443,1.099865,3.100626,3.203615,1.435487,1.470189,1.784812,0.832453,2.667554,...,0.180166,0.155819,49.641298,165.956998,5.075513,0.921037,1.194622,1.105631,0.531168,0.0
min,18.0,30000.0,2013.0,0.0,0.0,0.0,1.2,3.4,4.2,0.0,...,0.17,0.33,384.0,22.87,1.0,1.0,1.0,1.0,0.0,1.0
25%,44.0,54000.0,2016.0,4.0,3.0,0.0,3.714286,6.857143,6.142857,10.0,...,0.67,0.8,630.0,121.645,8.0,4.0,2.0,3.0,0.0,1.0
50%,56.0,68000.0,2016.0,6.0,6.0,1.0,4.6875,8.0,6.714286,12.0,...,0.8,1.0,666.0,176.665,11.0,4.0,3.0,4.0,0.0,1.0
75%,67.0,84000.0,2017.0,9.0,8.0,2.0,5.785714,9.235294,7.25,13.0,...,1.0,1.0,696.0,245.065,15.0,5.0,4.0,5.0,1.0,1.0
max,101.0,120000.0,2018.0,18.0,17.0,7.0,10.0,16.0,9.4,18.0,...,1.0,1.0,714.0,1608.69,36.0,5.0,5.0,5.0,2.0,1.0


In [39]:
cluster2DF.iloc[:,2:].describe()

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
count,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0,...,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0,3467.0
mean,51.432362,58793.76983,2017.040669,2.66859,2.374676,1.277473,4.307212,7.672243,6.424577,4.942025,...,0.168125,0.58019,566.855495,37.052553,5.03692,1.710701,2.430343,1.874243,0.328238,2.0
std,17.824292,20458.72899,1.124052,1.886739,1.814882,1.26211,1.988803,2.905954,1.207701,1.927221,...,0.226863,0.267902,163.92942,45.4782,3.15448,0.940963,1.369009,1.044753,0.49067,0.0
min,18.0,30000.0,2013.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0
25%,37.0,42000.0,2017.0,1.0,1.0,0.0,2.833333,5.833333,5.625,4.0,...,0.0,0.4,528.0,11.395,3.0,1.0,1.0,1.0,0.0,2.0
50%,53.0,56000.0,2017.0,2.0,2.0,1.0,4.25,7.5,6.375,5.0,...,0.0,0.5,618.0,20.81,5.0,1.0,2.0,2.0,0.0,2.0
75%,64.0,71000.0,2018.0,4.0,4.0,2.0,5.666667,9.75,7.166667,6.0,...,0.33,0.75,672.0,47.5,7.0,2.0,4.0,3.0,1.0,2.0
max,101.0,119000.0,2018.0,9.0,9.0,6.0,10.0,20.0,10.0,10.0,...,1.0,1.0,714.0,743.07,23.0,5.0,5.0,5.0,2.0,2.0


In [40]:
cluster3DF.iloc[:,2:].describe()

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
count,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0,...,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0,3852.0
mean,54.634735,66899.53271,2016.138889,3.579699,4.883956,0.917186,4.4728,9.022382,7.147819,7.891745,...,0.665987,0.723424,656.098131,141.392407,10.359813,3.566978,3.414849,3.44107,0.494808,3.0
std,17.625206,22240.278712,1.223068,2.376666,2.506866,1.092286,1.661431,2.320923,0.971652,2.227974,...,0.237578,0.227362,55.722629,115.547687,4.756641,1.084069,1.243531,1.23184,0.529301,0.0
min,18.0,30000.0,2013.0,0.0,0.0,0.0,0.857143,2.5,4.166667,0.0,...,0.0,0.0,360.0,14.03,1.0,1.0,1.0,1.0,0.0,3.0
25%,42.0,49000.0,2015.0,2.0,3.0,0.0,3.222222,7.363636,6.428571,6.0,...,0.5,0.5,630.0,69.61,7.0,3.0,2.0,3.0,0.0,3.0
50%,56.0,65000.0,2016.0,3.0,5.0,0.0,4.333333,8.833333,7.090909,8.0,...,0.67,0.75,672.0,122.965,10.0,4.0,4.0,4.0,0.0,3.0
75%,67.0,83000.0,2017.0,5.0,6.0,2.0,5.545455,10.181818,7.818182,9.0,...,0.8,1.0,696.0,178.18,13.0,4.0,4.0,4.0,1.0,3.0
max,101.0,120000.0,2018.0,11.0,14.0,5.0,10.0,20.0,10.0,14.0,...,1.0,1.0,714.0,1211.76,36.0,5.0,5.0,5.0,2.0,3.0


### Use Radar Plots to compare like metrics of the 4 clusters

In [41]:
radarDF = pd.concat([cluster0DF.iloc[:,2:].describe().loc[['mean']],cluster1DF.iloc[:,2:].describe().loc[['mean']],cluster2DF.iloc[:,2:].describe().loc[['mean']],cluster3DF.iloc[:,2:].describe().loc[['mean']]])
radarDF['cluster'] = [0,1,2,3]
radarDF

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
mean,55.731769,65986.309025,2017.061749,4.286952,3.348701,2.650461,3.97711,6.363353,5.974531,7.73596,...,0.292422,0.831618,575.49036,65.405845,5.677843,2.327186,2.455993,2.072646,0.423023,0
mean,55.536206,69258.031617,2016.31846,6.489036,5.935747,1.328914,4.778732,8.119559,6.742818,11.429883,...,0.772746,0.865291,656.134115,210.989258,11.791943,4.196073,3.349567,3.779959,0.507649,1
mean,51.432362,58793.76983,2017.040669,2.66859,2.374676,1.277473,4.307212,7.672243,6.424577,4.942025,...,0.168125,0.58019,566.855495,37.052553,5.03692,1.710701,2.430343,1.874243,0.328238,2
mean,54.634735,66899.53271,2016.138889,3.579699,4.883956,0.917186,4.4728,9.022382,7.147819,7.891745,...,0.665987,0.723424,656.098131,141.392407,10.359813,3.566978,3.414849,3.44107,0.494808,3


In [42]:
A = pd.melt(radarDF.loc[:, ['BOGO_Number','Discount_Number', 'Informational_Number', 'reward_avg', 'difficulty_avg', 'duration_avg', 'cluster']],id_vars='cluster' )
A

Unnamed: 0,cluster,variable,value
0,0,BOGO_Number,4.286952
1,1,BOGO_Number,6.489036
2,2,BOGO_Number,2.66859
3,3,BOGO_Number,3.579699
4,0,Discount_Number,3.348701
5,1,Discount_Number,5.935747
6,2,Discount_Number,2.374676
7,3,Discount_Number,4.883956
8,0,Informational_Number,2.650461
9,1,Informational_Number,1.328914


In [43]:
px.line_polar(A, r = 'value', theta = 'variable', color = 'cluster')

In [44]:
radarDF.columns

Index(['age', 'income', 'year_became_member', 'BOGO_Number', 'Discount_Number',
       'Informational_Number', 'reward_avg', 'difficulty_avg', 'duration_avg',
       'web', 'email', 'social', 'mobile', 'offer completed', 'offer received',
       'offer viewed', 'ratio_completed', 'ratio_viewed', 'most_recent',
       'total_spent', 'transactions_number', 'M_score', 'R_score', 'F_score',
       'gender_encode', 'cluster'],
      dtype='object')

In [45]:
A = pd.melt(radarDF.loc[:, ['web', 'email', 'social', 'mobile', 'offer completed', 'offer received', 'offer viewed', 'M_score', 'F_score', 'R_score','cluster']],id_vars='cluster' )

In [46]:
px.line_polar(A, r = 'value', theta = 'variable', color = 'cluster')

## A little confusing could be useful in presentation for select metrics

## Let's try a more direct approach to differentiating cluster metrics

In [47]:
pd.set_option("display.max_columns", None)

In [48]:
radarDF

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,email,social,mobile,offer completed,offer received,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
mean,55.731769,65986.309025,2017.061749,4.286952,3.348701,2.650461,3.97711,6.363353,5.974531,7.73596,10.286113,7.130763,9.843811,1.374686,4.899413,4.012015,0.292422,0.831618,575.49036,65.405845,5.677843,2.327186,2.455993,2.072646,0.423023,0
mean,55.536206,69258.031617,2016.31846,6.489036,5.935747,1.328914,4.778732,8.119559,6.742818,11.429883,13.753697,9.575982,12.824834,4.008669,5.242733,4.502295,0.772746,0.865291,656.134115,210.989258,11.791943,4.196073,3.349567,3.779959,0.507649,1
mean,51.432362,58793.76983,2017.040669,2.66859,2.374676,1.277473,4.307212,7.672243,6.424577,4.942025,6.320738,3.763196,5.662244,0.544851,3.733199,2.042688,0.168125,0.58019,566.855495,37.052553,5.03692,1.710701,2.430343,1.874243,0.328238,2
mean,54.634735,66899.53271,2016.138889,3.579699,4.883956,0.917186,4.4728,9.022382,7.147819,7.891745,9.380841,5.668484,8.130062,2.573468,4.013759,2.793614,0.665987,0.723424,656.098131,141.392407,10.359813,3.566978,3.414849,3.44107,0.494808,3


## Cluster 0 represents our best customers

In [49]:
cluster0DF.groupby('gender').count().reset_index()

Unnamed: 0,gender,person,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,email,social,mobile,offer completed,offer received,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
0,F,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402,1402
1,M,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121,2121
2,O,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56


In [50]:
fig = px.bar(cluster0DF.groupby('gender').count().reset_index() ,x = 'gender', y = 'person',color = 'gender', color_discrete_sequence=dcs,
             width = 800,
             #barmode = 'group',
                  title = 'Cluster 0 Gender Distribution',
                   labels = {
                       'gender': 'Gender'
                   }
                  )
fig.update_layout(
    xaxis_title = 'Gender',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                          legendgroup = newnames[t.name],
                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                         )
                   )

In [51]:
fig = px.bar(cluster1DF.groupby('gender').count().reset_index() ,x = 'gender', y = 'person',color = 'gender', color_discrete_sequence=dcs,
             width = 800,
             #barmode = 'group',
                  title = 'Cluster 1 Gender Distribution',
                   labels = {
                       'gender': 'Gender'
                   }
                  )
fig.update_layout(
    xaxis_title = 'Gender',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                          legendgroup = newnames[t.name],
                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                         )
                   )

In [52]:
fig = px.bar(cluster2DF.groupby('gender').count().reset_index() ,x = 'gender', y = 'person',color = 'gender', color_discrete_sequence=dcs,
             width = 800,
             #barmode = 'group',
                  title = 'Cluster 2 Gender Distribution',
                   labels = {
                       'gender': 'Gender'
                   }
                  )
fig.update_layout(
    xaxis_title = 'Gender',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                          legendgroup = newnames[t.name],
                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                         )
                   )

In [53]:
fig = px.bar(cluster3DF.groupby('gender').count().reset_index() ,x = 'gender', y = 'person',color = 'gender', color_discrete_sequence=dcs,
             width = 800,
             #barmode = 'group',
                  title = 'Cluster 3 Gender Distribution',
                   labels = {
                       'gender': 'Gender'
                   }
                  )
fig.update_layout(
    xaxis_title = 'Gender',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                          legendgroup = newnames[t.name],
                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                         )
                   )

In [54]:
fig = px.bar(df.groupby('gender').count().reset_index() ,x = 'gender', y = 'person',color = 'gender', color_discrete_sequence=dcs,
             width = 800,
             #barmode = 'group',
                  title = 'Overall Gender Distribution',
                   labels = {
                       'gender': 'Gender'
                   }
                  )
fig.update_layout(
    xaxis_title = 'Gender',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
                                          legendgroup = newnames[t.name],
                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
                                         )
                   )

In [55]:
G0 = cluster0DF.groupby('gender').count().reset_index().iloc[:, :2]
G0['cluster'] = '0'
G1 = cluster1DF.groupby('gender').count().reset_index().iloc[:, :2]
G1['cluster'] = '1'
G2 = cluster2DF.groupby('gender').count().reset_index().iloc[:, :2]
G2['cluster'] = '2'
G3 = cluster3DF.groupby('gender').count().reset_index().iloc[:, :2]
G3['cluster'] = '3'
GAll = pd.concat([G0, G1, G2, G3])
GAll


Unnamed: 0,gender,person,cluster
0,F,1402,0
1,M,2121,0
2,O,56,0
0,F,1865,1
1,M,1994,1
2,O,63,1
0,F,1068,2
1,M,2364,2
2,O,35,2
0,F,1790,3


In [56]:
fig = px.bar(GAll ,x = 'gender', y = 'person',color = 'cluster', color_discrete_sequence=dcs,
             width = 800,
             barmode = 'group',
                  title = 'Overall Gender Distribution',
                   labels = {
                       'cluster': 'Cluster'
                   }
                  )
fig.update_layout(
    xaxis_title = 'Gender',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                          legendgroup = newnames[t.name],
#                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                         )
#                   )

## Income Distribution Analysis

In [57]:
x1 = cluster0DF.income
x2 = cluster1DF.income
x3 = cluster2DF.income
x4 = cluster3DF.income
histdata = [x1,x2, x3, x4]
group_labels = ['Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution Income by Cluster',
    xaxis_title = 'Income ($)',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

In [114]:
x1 = cluster0DF.income
#x2 = cluster1DF.income
x3 = cluster2DF.income
#x4 = cluster3DF.income
histdata = [x1,x3,]
group_labels = ['Cluster 0', 'Cluster 2']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution Income by Cluster',
    xaxis_title = 'Income ($)',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

In [124]:
#x1 = cluster0DF.income
x2 = cluster1DF.income
#x3 = cluster2DF.income
x4 = cluster3DF.income
histdata = [x2,x4,]
group_labels = ['Cluster 1', 'Cluster 3']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution Income by Cluster',
    xaxis_title = 'Income ($)',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

In [58]:
fig = px.histogram(cluster0DF, x = 'income' ,color_discrete_sequence=[color1,color2 ],nbins=100,
                  title = 'Distribution of Cluster 0 Income',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Income ($)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [59]:
fig = px.histogram(cluster1DF, x = 'income' ,color_discrete_sequence=[color1,color2 ],nbins=100,
                  title = 'Distribution of Cluster 1 Income',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Income ($)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [60]:
fig = px.histogram(cluster2DF, x = 'income' ,color_discrete_sequence=[color1,color2 ],nbins=100,
                  title = 'Distribution of Cluster 2 Income',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Income ($)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [61]:
fig = px.histogram(cluster3DF, x = 'income' ,color_discrete_sequence=[color1,color2 ],nbins=100,
                  title = 'Distribution of Cluster 3 Income',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Income ($)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

## Age Analysis

In [62]:
fig = px.histogram(cluster0DF, x = 'age' ,color_discrete_sequence=[color1,color2 ],nbins=100,
                  title = 'Distribution of Cluster 0 Age',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Age (years)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [63]:
fig = px.histogram(cluster1DF, x = 'age' ,color_discrete_sequence=[color1,color2 ],nbins=100,
                  title = 'Distribution of Cluster 1 Age',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Age (years)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [64]:
fig = px.histogram(cluster2DF, x = 'age' ,color_discrete_sequence=[color1,color2 ],nbins=100,
                  title = 'Distribution of Cluster 2 Age',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Age (years)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [65]:
fig = px.histogram(cluster3DF, x = 'age' ,color_discrete_sequence=[color1,color2 ],nbins=100,
                  title = 'Distribution of Cluster 3 Age',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Age (years)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [66]:
x1 = cluster0DF.age
x2 = cluster1DF.age
x3 = cluster2DF.age
x4 = cluster3DF.age
histdata = [x1,x2, x3, x4]
group_labels = ['Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution Age by Cluster',
    xaxis_title = 'Age (years)',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

In [116]:
x1 = cluster0DF.age
#x2 = cluster1DF.income
x3 = cluster2DF.age
#x4 = cluster3DF.income
histdata = [x1, x3]
group_labels = ['Cluster 0', 'Cluster 2']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution Age by Cluster',
    xaxis_title = 'Age (years)',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

In [125]:
#x1 = cluster0DF.age
x2 = cluster1DF.age
#x3 = cluster2DF.age
x4 = cluster3DF.age
histdata = [x2, x4]
group_labels = ['Cluster 1', 'Cluster 3']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution of Age by Cluster',
    xaxis_title = 'Age (years)',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

## Types of Offers Received

In [67]:
oTypeCols = ['BOGO_Number', 'Discount_Number', 'Informational_Number']
cluster0DF[oTypeCols]

Unnamed: 0,BOGO_Number,Discount_Number,Informational_Number
0,3.0,5.0,4.0
1,3.0,6.0,4.0
11,5.0,0.0,6.0
14,6.0,1.0,4.0
17,5.0,0.0,6.0
...,...,...,...
14795,5.0,5.0,2.0
14800,2.0,2.0,4.0
14802,5.0,0.0,4.0
14808,4.0,5.0,1.0


In [68]:
fig = go.Figure()
fig.add_trace(
    go.Box(x = cluster0DF[oTypeCols]['BOGO_Number'], name = 'BOGO')
)
fig.add_trace(
    go.Box(x = cluster0DF[oTypeCols]['Discount_Number'], name = 'Discount')
)
fig.add_trace(
    go.Box(x = cluster0DF[oTypeCols]['Informational_Number'], name = 'Informational')
)
fig.update_layout(
    legend_traceorder="reversed",
    width =800,
    colorway = dcs,
    title = 'Cluster 0 <br> Number of Offers Received by Type Per Person',
    xaxis_title = 'Number of Offers',
    yaxis_title = 'Offer Type',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
        showticklabels = False
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
# newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
# fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                           legendgroup = newnames[t.name],
#                                          hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                          )
#                    )

In [69]:
fig = go.Figure()
fig.add_trace(
    go.Box(x = cluster1DF[oTypeCols]['BOGO_Number'], name = 'BOGO')
)
fig.add_trace(
    go.Box(x = cluster1DF[oTypeCols]['Discount_Number'], name = 'Discount')
)
fig.add_trace(
    go.Box(x = cluster1DF[oTypeCols]['Informational_Number'], name = 'Informational')
)
fig.update_layout(
    legend_traceorder="reversed",
    width =800,
    colorway = dcs,
    title = 'Cluster 1 <br> Number of Offers Received by Type Per Person',
    xaxis_title = 'Number of Offers',
    yaxis_title = 'Offer Type',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
        showticklabels = False
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
# newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
# fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                           legendgroup = newnames[t.name],
#                                          hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                          )
#                    )

In [70]:
fig = go.Figure()
fig.add_trace(
    go.Box(x = cluster2DF[oTypeCols]['BOGO_Number'], name = 'BOGO')
)
fig.add_trace(
    go.Box(x = cluster2DF[oTypeCols]['Discount_Number'], name = 'Discount')
)
fig.add_trace(
    go.Box(x = cluster2DF[oTypeCols]['Informational_Number'], name = 'Informational')
)
fig.update_layout(
    legend_traceorder="reversed",
    width =800,
    colorway = dcs,
    title = 'Cluster 2 <br> Number of Offers Received by Type Per Person',
    xaxis_title = 'Number of Offers',
    yaxis_title = 'Offer Type',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
        showticklabels = False
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
# newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
# fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                           legendgroup = newnames[t.name],
#                                          hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                          )
#                    )

In [71]:
fig = go.Figure()
fig.add_trace(
    go.Box(x = cluster3DF[oTypeCols]['BOGO_Number'], name = 'BOGO')
)
fig.add_trace(
    go.Box(x = cluster3DF[oTypeCols]['Discount_Number'], name = 'Discount')
)
fig.add_trace(
    go.Box(x = cluster3DF[oTypeCols]['Informational_Number'], name = 'Informational')
)
fig.update_layout(
    legend_traceorder="reversed",
    width =800,
    colorway = dcs,
    title = 'Cluster 3 <br> Number of Offers Received by Type Per Person',
    xaxis_title = 'Number of Offers',
    yaxis_title = 'Offer Type',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
        showticklabels = False
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True
          
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
# newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
# fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                           legendgroup = newnames[t.name],
#                                          hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                          )
#                    )

In [72]:
CR = clusterDF.loc[:, ['cluster', 'reward_avg', 'difficulty_avg', 'duration_avg']].groupby('cluster').mean().reset_index()
CR = pd.melt(CR, id_vars = 'cluster')
CR.columns = ['cluster', 'metric', 'average']
CR


Unnamed: 0,cluster,metric,average
0,0,reward_avg,3.97711
1,1,reward_avg,4.778732
2,2,reward_avg,4.307212
3,3,reward_avg,4.4728
4,0,difficulty_avg,6.363353
5,1,difficulty_avg,8.119559
6,2,difficulty_avg,7.672243
7,3,difficulty_avg,9.022382
8,0,duration_avg,5.974531
9,1,duration_avg,6.742818


In [73]:
fig = px.bar(CR ,x = 'metric', y = 'average',color = 'cluster', color_discrete_sequence=dcs,
             width = 800,
             barmode = 'group',
                  title = 'Overall Reward/Difficulty/Duration Distribution',
                   labels = {
                       'cluster': 'Cluster'
                   }
                  )
fig.update_layout(
    xaxis_title = 'Metric',
    yaxis_title = 'Average Value (0-10)',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True,
        tickmode = 'array',
        tickvals = ['reward_avg', 'difficulty_avg', 'duration_avg'],
        ticktext = ['Reward', 'Difficulty', 'Duration']
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                          legendgroup = newnames[t.name],
#                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                         )
#                   )

In [74]:
CC = clusterDF.loc[:, ['cluster', 'web', 'email', 'social', 'mobile']].groupby('cluster').mean().reset_index()
CC = pd.melt(CC, id_vars = 'cluster')
CC.columns = ['cluster', 'channel', 'average']
CC

Unnamed: 0,cluster,channel,average
0,0,web,7.73596
1,1,web,11.429883
2,2,web,4.942025
3,3,web,7.891745
4,0,email,10.286113
5,1,email,13.753697
6,2,email,6.320738
7,3,email,9.380841
8,0,social,7.130763
9,1,social,9.575982


In [75]:
fig = px.bar(CC ,x = 'channel', y = 'average',color = 'cluster', color_discrete_sequence=dcs,
             width = 800,
             barmode = 'group',
                  title = 'Overall Offer Channel Distribution',
                   labels = {
                       'cluster': 'Cluster'
                   }
                  )
fig.update_layout(
    xaxis_title = 'Channel',
    yaxis_title = 'Average Offers Per User',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True,
        tickmode = 'array',
        tickvals = ['web', 'email', 'social', 'mobile'],
        ticktext = ['Web', 'E-Mail', 'Social', 'Mobile']
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                          legendgroup = newnames[t.name],
#                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                         )
#                   )

In [76]:
CP = clusterDF.loc[:,['cluster', 'ratio_completed', 'ratio_viewed']].groupby('cluster').mean().reset_index()
CP = pd.melt(CP, id_vars = 'cluster')
CP['value'] = round(CP['value'] * 100,1)
CP

Unnamed: 0,cluster,variable,value
0,0,ratio_completed,29.2
1,1,ratio_completed,77.3
2,2,ratio_completed,16.8
3,3,ratio_completed,66.6
4,0,ratio_viewed,83.2
5,1,ratio_viewed,86.5
6,2,ratio_viewed,58.0
7,3,ratio_viewed,72.3


In [77]:
fig = px.bar(CP ,x = 'variable', y = 'value',color = 'cluster', color_discrete_sequence=dcs,
             width = 800,
             barmode = 'group',
                  title = 'Overall Offer Engagement',
                   labels = {
                       'cluster': 'Cluster'
                   }
                  )
fig.update_layout(
    xaxis_title = '',
    yaxis_title = 'Average Percent',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True,
        tickmode = 'array',
        tickvals = ['ratio_completed', 'ratio_viewed'],
        ticktext = ['Completed', 'Viewed']
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                          legendgroup = newnames[t.name],
#                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                         )
#                   )

In [118]:
CP['cluster'] = CP['cluster'].astype('category')
fig = px.bar(CP.loc[CP['cluster'].isin([0,2])] ,x = 'variable', y = 'value',color = 'cluster', color_discrete_sequence=dcs,
             width = 800,
             barmode = 'group',
                  title = 'Overall Offer Engagement',
                   labels = {
                       'cluster': 'Cluster'
                   }
                  )
fig.update_layout(
    xaxis_title = '',
    yaxis_title = 'Average Percent',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True,
        tickmode = 'array',
        tickvals = ['ratio_completed', 'ratio_viewed'],
        ticktext = ['Completed', 'Viewed']
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                          legendgroup = newnames[t.name],
#                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                         )
#                   )

In [78]:
CRFM = clusterDF.loc[:, ['cluster', 'R_score', 'F_score', 'M_score']].groupby('cluster').mean().reset_index()
CRFM = pd.melt(CRFM, id_vars = 'cluster')
CRFM

Unnamed: 0,cluster,variable,value
0,0,R_score,2.455993
1,1,R_score,3.349567
2,2,R_score,2.430343
3,3,R_score,3.414849
4,0,F_score,2.072646
5,1,F_score,3.779959
6,2,F_score,1.874243
7,3,F_score,3.44107
8,0,M_score,2.327186
9,1,M_score,4.196073


In [127]:
CRFM['cluster'] = CRFM['cluster'].astype('category')
fig = px.bar(CRFM ,x = 'variable', y = 'value',color = 'cluster', color_discrete_sequence=dcs,
             width = 800,
             barmode = 'group',
                  title = 'Average RFM Score',
                   labels = {
                       'cluster': 'Cluster'
                   }
                  )
fig.update_layout(
    xaxis_title = 'R-F-M',
    yaxis_title = 'Average Value (0-5)',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True,
        tickmode = 'array',
        tickvals = ['R_score', 'F_score', 'M_score'],
        ticktext = ['Receny Score', 'Frequency Score', 'Monetary Score']
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                          legendgroup = newnames[t.name],
#                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                         )
#                   )

In [80]:
cluster1DF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,email,social,mobile,offer completed,offer received,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,5.727273,9,11,8,11,3.0,4.0,4.0,0.75,1.00,672.0,154.05,12.0,4,4,4,1,1
7,004c5799adbf42868b9cff0396190900,M,54,99000.0,2016,8.0,6.0,0.0,4.428571,7.857143,7.428571,12,14,14,14,5.0,5.0,4.0,1.00,0.80,690.0,347.38,12.0,5,4,4,0,1
10,00715b6e55c3431cb56ff7307eb19675,F,58,119000.0,2017,5.0,8.0,1.0,5.785714,12.857143,7.571429,11,14,5,9,5.0,6.0,3.0,0.83,0.50,702.0,375.12,15.0,5,5,5,1,1
15,0091d2b6a5ea4defaa8393e4e816db60,F,62,81000.0,2016,11.0,3.0,0.0,6.785714,10.000000,6.071429,14,14,11,11,4.0,5.0,5.0,0.80,1.00,540.0,279.16,12.0,5,1,4,1,1
18,00ae03011f9f49b8a4b3e6d416678b0b,M,55,83000.0,2015,5.0,8.0,2.0,4.800000,7.466667,6.466667,8,15,13,15,4.0,6.0,5.0,0.67,0.83,696.0,218.05,8.0,5,4,3,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14811,ffeaa02452ef451082a0361c3ca62ef5,F,67,77000.0,2017,3.0,6.0,4.0,3.230769,6.923077,5.769231,6,13,7,13,3.0,5.0,5.0,0.60,1.00,654.0,234.05,12.0,5,3,4,1,1
14814,fff29fb549084123bd046dbc5ceb4faa,F,59,93000.0,2017,14.0,3.0,0.0,7.705882,9.117647,6.470588,12,17,17,17,6.0,6.0,5.0,1.00,0.83,684.0,291.47,11.0,5,4,4,1,1
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,6.750000,9,12,6,12,3.0,6.0,3.0,0.50,0.50,552.0,580.98,11.0,5,1,4,1,1
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,8.166667,10,12,10,12,3.0,5.0,4.0,0.60,0.80,696.0,29.94,6.0,2,4,2,0,1


In [81]:
x1 = cluster0DF.total_spent
x2 = cluster1DF.total_spent
x3 = cluster2DF.total_spent
x4 = cluster3DF.total_spent
histdata = [x1,x2, x3, x4]
group_labels = ['Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution Total Spent by Cluster',
    xaxis_title = 'Total Spent ($)',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        range = [0,800],
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

In [126]:
#x1 = cluster0DF.total_spent
x2 = cluster1DF.total_spent
#x3 = cluster2DF.total_spent
x4 = cluster3DF.total_spent
histdata = [x2, x4]
group_labels = [ 'Cluster 1', 'Cluster 3']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution Total Spent by Cluster',
    xaxis_title = 'Total Spent ($)',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        range = [0,800],
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

In [123]:
x1 = cluster0DF.total_spent
#x2 = cluster1DF.total_spent
x3 = cluster2DF.total_spent
#x4 = cluster3DF.total_spent
histdata = [x1, x3,]
group_labels = ['Cluster 0', 'Cluster 2']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution Total Spent by Cluster <br> Over One Month',
    xaxis_title = 'Total Spent ($)',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        range = [0,300],
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  
        
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

In [82]:
fig = px.histogram(cluster0DF, x = 'total_spent' ,color_discrete_sequence=[color1,color2 ],nbins=250,
                  title = 'Distribution of Cluster 0 Revenue',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Revenue ($)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
        range = [0,500]
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
        range = [0,800]
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [83]:
fig = px.histogram(cluster1DF, x = 'total_spent' ,color_discrete_sequence=[color1,color2 ],nbins=250,
                  title = 'Distribution of Cluster 1 Revenue',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Revenue ($)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
        range = [0,500]
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        range = [0,800]
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [84]:
fig = px.histogram(cluster2DF, x = 'total_spent' ,color_discrete_sequence=[color1,color2 ],nbins=250,
                  title = 'Distribution of Cluster 2 Revenue',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Revenue ($)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
        range = [0,500]
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        range = [0,800]
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [85]:
fig = px.histogram(cluster3DF, x = 'total_spent' ,color_discrete_sequence=[color1,color2 ],nbins=250,
                  title = 'Distribution of Cluster 3 Revenue',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Revenue ($)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        range = [0,800]
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [86]:
x1 = cluster0DF.transactions_number
x2 = cluster1DF.transactions_number
x3 = cluster2DF.transactions_number
x4 = cluster3DF.transactions_number
histdata = [x1,x2, x3, x4]
group_labels = ['Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3']
colors = dcs
fig = ff.create_distplot(histdata, group_labels, show_hist = False, show_rug = False, colors = colors, curve_type = 'normal')
fig.update_layout(
    title_text = 'Distribution of Total Number of Transactions Cluster',
    xaxis_title = 'Number of Transactions',
    yaxis_title = 'Density',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black'  ,
   
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
fig.show()

In [87]:
fig = px.histogram(cluster0DF, x = 'transactions_number' ,color_discrete_sequence=[color1,color2 ],nbins=50,
                  title = 'Distribution of Cluster 0 Number of Transactions',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Number of Transactions',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
      
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [88]:
fig = px.histogram(cluster1DF, x = 'transactions_number' ,color_discrete_sequence=[color1,color2 ],nbins=50,
                  title = 'Distribution of Cluster 1 Number of Transactions',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Number of Transactions',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
      
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [89]:
fig = px.histogram(cluster2DF, x = 'transactions_number' ,color_discrete_sequence=[color1,color2 ],nbins=50,
                  title = 'Distribution of Cluster 2 Number of Transactions',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Number of Transactions',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
      
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [90]:
fig = px.histogram(cluster3DF, x = 'transactions_number' ,color_discrete_sequence=[color1,color2 ],nbins=25,
                  title = 'Distribution of Cluster 3 Number of Transactions',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Number of Transactions',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        range = [0,35]
      
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

# Let's add the cluster # for each row in our original DF by joining

In [91]:
PC = clusterDF.loc[:, ['person', 'cluster']]
dfWC = df.merge(PC, how = 'left', on = 'person')
dfWC

Unnamed: 0,person,event,time,offer_id,reward,difficulty,duration,offer_type,web,email,mobile,social,gender,age,became_member_on,income,anonymous,income_cluster,cluster
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,7,bogo,1,1,1,0,F,75,2017-05-09,100000.0,0,0,0
1,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,2,10,7,discount,1,1,1,0,M,68,2018-04-26,70000.0,0,1,0
2,389bc3fa690240e798340f5a15918d5c,offer received,0,f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,bogo,1,1,1,1,M,65,2018-02-09,53000.0,0,1,1
3,2eeac8d8feae4a8cad5a6af0499a211d,offer received,0,3f207df678b143eea3cee63160fa8bed,0,0,4,informational,1,1,1,0,M,58,2017-11-11,51000.0,0,1,2
4,aa4862eba776480b8bb9c68455b8c2e1,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,10,discount,1,1,0,0,F,61,2017-09-11,57000.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148800,84fb57a7fe8045a8bf6236738ee73a0f,offer viewed,714,5a8bc65990b245e5a138643cd4eb9837,0,0,3,informational,0,1,1,1,F,64,2017-06-15,113000.0,0,0,1
148801,abc4359eb34e4e2ca2349da2ddf771b6,offer viewed,714,3f207df678b143eea3cee63160fa8bed,0,0,4,informational,1,1,1,0,M,51,2014-08-04,68000.0,0,1,1
148802,8dda575c2a1d44b9ac8e8b07b93d1f8e,offer viewed,714,0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,10,discount,1,1,0,0,F,60,2017-09-08,64000.0,0,1,1
148803,8431c16f8e1d440880db371a68f82dd0,offer completed,714,fafdcd668e3743c1bb461111dcafc2a4,2,10,10,discount,1,1,1,1,M,39,2018-06-27,39000.0,0,2,3


In [92]:
dfWC.loc[dfWC['cluster'] == 0,['person', 'time']].groupby('person').diff().dropna()

Unnamed: 0,time
11024,0.0
11027,0.0
11035,0.0
11037,0.0
11038,0.0
...,...
148780,90.0
148794,138.0
148795,138.0
148798,174.0


In [93]:
AW = dfWC.loc[dfWC['cluster'] == 0,['person', 'time']].groupby('person').diff().dropna()
AW = AW.loc[AW['time']!= 0 ]
fig = px.histogram(AW, x = 'time' ,color_discrete_sequence=[color1,color2 ],nbins=150,
                  title = 'Distribution of Cluster 0 Hours between Transactions',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Hours',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        range = [0,500]
        
      
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [94]:
AW = dfWC.loc[dfWC['cluster'] == 1,['person', 'time']].groupby('person').diff().dropna()
AW = AW.loc[AW['time']!= 0 ]
fig = px.histogram(AW, x = 'time' ,color_discrete_sequence=[color1,color2 ],nbins=150,
                  title = 'Distribution of Cluster 1 Hours between Transactions',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Hours',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        range = [0,500]
      
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [95]:
AW = dfWC.loc[dfWC['cluster'] == 2,['person', 'time']].groupby('person').diff().dropna()
AW = AW.loc[AW['time']!= 0 ]
fig = px.histogram(AW, x = 'time' ,color_discrete_sequence=[color1,color2 ],nbins=150,
                  title = 'Distribution of Cluster 2 Hours between Transactions',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Hours',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',
    

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        range = [0,500]
      
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

In [96]:
AW = dfWC.loc[dfWC['cluster'] == 3,['person', 'time']].groupby('person').diff().dropna()
AW = AW.loc[AW['time']!= 0 ]
fig = px.histogram(AW, x = 'time' ,color_discrete_sequence=[color1,color2 ],nbins=150,
                  title = 'Distribution of Cluster 3 Hours between Transactions',
                   labels = {
                       'variable':''
                   }
                  )
fig.update_layout(
    xaxis_title = 'Hours',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.01,

    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black',          

    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        range = [0,500]
      
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.98
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"

)

<h2> Cluster Analysis Gender  </h2>
<ul style = "list-style-type:square">
    <li> All Clusters are predominately male (is inline with total distribution)   </li>
    <li> Cluster 0 and 2 Have the closest Male to female ratio  </li>
    <li> All clusters except for 3 have reasonalby even distribtution of 'other' gender  </li>
    <li> Clusters 1 and 3  has the fewest female members with 3 being slightly less overall    </li>
</ul>

<img src="./pics/genderCLuster0.png" style = "float:left;width:20%"/>
<img src="./pics/genderCluster1.png" style = "float:left;width:20%"/>
<img src="./pics/genderCluster2.png"style = "float:left;width:20%"/>
<img src="./pics/genderCLuster3.png"style = "float:left;width:20%"/> 
<br>
<img src="./pics/genderOverall.png"style = "float:left;width:40%"/>
<img src="./pics/genderOverallBar.png" style = "float:left;width:40%"/> 
<p style="clear:both;">
 
<p>
<br>
<h2> Cluster Analysis Income    </h2>
<ul style = "list-style-type:square">
    <li> Cluster 3 is the only significant outlier with noticably lower income    </li>
</ul>
</p>
<img src="./pics/incomeCluster0.png" style = "float:left;width:20%"/>   
<img src="./pics/incomeCluster1.png" style = "float:left;width:20%"/> 
<img src="./pics/incomeCluster2.png" style = "float:left;width:20%"/>   
<img src="./pics/incomeCluster3.png" style = "float:left;width:20%"/>
<br>
<img src="./pics/incomeOverall.png" style = "float:left;width:80%"/>     
<p style="clear:both;">
<br>
<h2> Cluster Analysis Age    </h2>
<ul style = "list-style-type:square">
    <li> Most of the younger population is put in cluster 3  </li>
    <li>  Cluster 2 has a slightly higher distribution of younder people  </li>
</ul>


<img src="./pics/ageCluster0.png" style = "float:left;width:20%"/>     
<img src="./pics/ageCluster1.png" style = "float:left;width:20%"/>  
<img src="./pics/ageCluster2.png" style = "float:left;width:20%"/>     
<img src="./pics/ageCluster3.png" style = "float:left;width:20%"/>  
<br>
<img src="./pics/ageClusterOverall.png" style = "float:left;width:80%"/>   
<p style="clear:both;">
<br> 
<h2> Cluster Analysis Offer Types     </h2>
<ul style = "list-style-type:square">
    <li>  Cluster 2 received less informational and BOGO offers and the most discount offers  </li>
<li> Cluster 1 Received the most BOGO and informational offers  </li>
    <li> Cluster 3 received the lowest amount of offers   </li>
    <li> Cluster 0 and cluster 3 received an even number of BOGO and Discount offers </li>
</ul>
<img src="./pics/offerTypeCluster0.png" style = "float:left;width:20%"/>   
<img src="./pics/offerTypeCluster1.png" style = "float:left;width:20%"/>   
<img src="./pics/offerTypeCluster2.png" style = "float:left;width:20%"/>   
<img src="./pics/offerTypeCluster3.png" style = "float:left;width:20%"/>    
<p style="clear:both;">
<br>
<h2> Cluster Analysis Reward - Difficulty - Duration </h2>
<ul style = "list-style-type:square">
    <li> only significant deviation is cluster 1 receiving less difficult offers in general  </li>
</ul>
<img src="./pics/RewardDifficultyDuration.png" style = "float:center;width:40%"/>  
<p style="clear:both;">   
<br>
<h2> Cluster Analysis Channels </h2>
<ul style = "list-style-type:square">
    <li> Cluster 2 received more web offers than the overall trend </li>
</ul>
<img src="./pics/channels.png" style = "float:center;width:40%"/>  
<p style="clear:both;"> 
<br>
<h2> Offer Engagement  </h2>
<ul style = "list-style-type:square">
<li> Cluster 0 and 2 are our clear best customers, they complete a large percent of the offers they receive and view most of them  </li>
    <li> Cluster 3 barely views the offers and completes very few  </li>
    <li> Cluster 1 views a large amount of offers, but barely completes any  </li>
</ul>

<img src="./pics/ratioCompleted.png" style = "float:center;width:40%"/>     
<p style="clear:both;"> 
<br>   
<h2> RFM  </h2>
<ul style = "list-style-type:square">
    <li> Cluster 0 and 2 are clearly our best customers  </li>
    <li> Cluster 1 and 3 are lowest on all metrics</li>
</ul>
<img src="./pics/RFM.png" style = "float:center;width:40%"/> 
<p style="clear:both;"> 
<br>

<h2> Revenue Analysis </h2>
<ul style = "list-style-type:square">
    <li> Cluster 0 spending about 150 a month, with an even distribution on both sides with some spending 100s of dollars </li>
    <li> Cluster 1 is low spending over this month with some purchasing more </li>
    <li> Cluster 2 is an even distribution of relatively high spending </li>
    <li> Cluster 3 is  </li>
</ul>
<img src="./pics/totalSpentCluster0.png" style = "float:left;width:20%"/> 
<img src="./pics/totalSpentCluster1.png" style = "float:left;width:20%"/> 
<img src="./pics/totalSpentCluster2.png" style = "float:left;width:20%"/> 
<img src="./pics/totalSpentCluster3.png" style = "float:left;width:20%"/> 
<br>
<img src="./pics/totalSpentOverall.png" style = "float:left;width:80%"/> 
<p style="clear:both;"> 
<br>
<h2> Transactions Analysis </h2>
<ul style = "list-style-type:square">
    <li> Cluster 1 and 3 looks like weekend buyers only 4-6 transactions in a month </li>
    <li> Cluster 0 and 2 look like frequent weekday buyers with transactions ~15-20 </li>
</ul>
<img src="./pics/transactionsCluster0.png" style = "float:left;width:20%"/> 
<img src="./pics/transactionsCluster1.png" style = "float:left;width:20%"/> 
<img src="./pics/transactionsCluster2.png" style = "float:left;width:20%"/> 
<img src="./pics/transactionsCluster3.png" style = "float:left;width:20%"/> 
<br>
<img src="./pics/transactionsOverall.png" style = "float:left;width:80%"/> 
<p style="clear:both;"> 
<br>
<h2> Time Between Transactions </h2>
<ul style = "list-style-type:square">
    <li> Cluster 0 represents a large number of customers who purchase every 3 or 6 days and some multi times a day </li>
    <li> Cluster 1 represents a large number of multi-purchase customers in a day </li>
    <li> Cluster 2 represents a large number of every 3 day purchasers and every week purchasers and a smaller number of every day purchasers </li>
    <li> Cluster 3 is extremely similar to cluster 2 in frequency behavior </li>
</ul>
<img src="./pics/timeBetweenCluster0.png" style = "float:left;width:50%"/> 
<img src="./pics/timeBetweenCluster1.png" style = "float:left;width:50%"/> 
<p style="clear:both;"> 
<br>
<img src="./pics/timeBetweenCluster2.png" style = "float:left;width:50%"/> 
<img src="./pics/timeBetweenCluster3.png" style = "float:left;width:50%"/> 
<br>
<p style="clear:both;"> 

In [97]:
clusterDF.columns

Index(['person', 'gender', 'age', 'income', 'year_became_member',
       'BOGO_Number', 'Discount_Number', 'Informational_Number', 'reward_avg',
       'difficulty_avg', 'duration_avg', 'web', 'email', 'social', 'mobile',
       'offer completed', 'offer received', 'offer viewed', 'ratio_completed',
       'ratio_viewed', 'most_recent', 'total_spent', 'transactions_number',
       'M_score', 'R_score', 'F_score', 'gender_encode', 'cluster'],
      dtype='object')

In [98]:
# write clustered DF to csv
dfWC.to_csv('Clustered.csv', index = False)

In [99]:
clusterDF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,email,social,mobile,offer completed,offer received,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017,3.0,5.0,4.0,2.083333,5.416667,6.083333,10,12,8,12,3.0,5.0,4.0,0.60,0.80,696.0,127.60,8.0,4,4,3,0,0
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018,3.0,6.0,4.0,3.000000,7.384615,6.615385,11,13,5,10,3.0,5.0,5.0,0.60,1.00,654.0,79.46,5.0,3,3,2,2,0
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016,4.0,5.0,2.0,4.545455,8.181818,7.090909,8,11,11,11,3.0,5.0,3.0,0.60,0.60,708.0,196.86,8.0,5,5,3,1,3
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,5.727273,9,11,8,11,3.0,4.0,4.0,0.75,1.00,672.0,154.05,12.0,4,4,4,1,1
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017,0.0,8.0,4.0,1.833333,8.333333,7.833333,10,12,8,10,3.0,5.0,4.0,0.60,0.80,696.0,48.34,18.0,2,4,5,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,6.750000,9,12,6,12,3.0,6.0,3.0,0.50,0.50,552.0,580.98,11.0,5,1,4,1,1
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,8.166667,10,12,10,12,3.0,5.0,4.0,0.60,0.80,696.0,29.94,6.0,2,4,2,0,1
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018,2.0,2.0,1.0,4.800000,8.000000,6.800000,5,5,4,5,0.0,3.0,2.0,0.00,0.67,576.0,12.15,5.0,1,2,2,0,2
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017,9.0,0.0,2.0,4.090909,4.090909,5.181818,9,11,8,11,3.0,4.0,4.0,0.75,1.00,678.0,88.83,12.0,3,4,4,0,0


In [107]:
fig = px.histogram(clusterDF ,x = 'total_spent', color_discrete_sequence=[color1],nbins = 500,
             width = 1200,
             barmode = 'group',
                  title = 'Total Spent for Each Customer<br> Over One Month',
                   labels = {
                       'cluster': 'Cluster'
                   }
                  )
fig.update_layout(
    xaxis_title = 'Spent ($)',
    yaxis_title = 'Count',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size =25),
        linecolor = 'black',
        showticklabels = True,
        range = [0,400]
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'M':'Male', 'F':'Female', 'O':'Other'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                          legendgroup = newnames[t.name],
#                                         hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                         )
#                   )