In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import scipy
import matplotlib.pyplot as mlt
import plotly.express as px

# Define Color Pallette

In [2]:
color1 = '#00704A'
color2 = '#FF9FE5'
color3 = '#45062E'
backgroundColor = '#B8A085'
ccs = ['#ACDDA9', '#00704A', '#002F20']
dcs = ['#002619','#008256', '#00de92', '#a1ffdf', '#fdfffe']

# Import Data

In [3]:
df = pd.read_csv('transactions.csv')
df.drop(['Unnamed: 0.1','Unnamed: 0' ], axis =1, inplace=True)
df = df.loc[df['anonymous'] == 0]
df.head()

Unnamed: 0,person,event,time,offer_id,reward,difficulty,duration,offer_type,web,email,mobile,social,gender,age,became_member_on,income,anonymous,income_cluster
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,7,bogo,1,1,1,0,F,75,2017-05-09,100000.0,0,0
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,2,10,7,discount,1,1,1,0,M,68,2018-04-26,70000.0,0,1
5,389bc3fa690240e798340f5a15918d5c,offer received,0,f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,bogo,1,1,1,1,M,65,2018-02-09,53000.0,0,1
7,2eeac8d8feae4a8cad5a6af0499a211d,offer received,0,3f207df678b143eea3cee63160fa8bed,0,0,4,informational,1,1,1,0,M,58,2017-11-11,51000.0,0,1
8,aa4862eba776480b8bb9c68455b8c2e1,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,10,discount,1,1,0,0,F,61,2017-09-11,57000.0,0,1


# Feature Engineering
## We want an indivdual observation for each customer with relevant data

## Offer Count for each Customer   
#### Number of BOGO, Discount, and Information Offers
#### 'offerNumDF'

In [4]:
# Start with getting a offer count for each customer
A = df.groupby(['person', 'offer_type']).count().event.reset_index()
A.columns = ['person', 'offer_type', 'Number of Offers']
A
# Pivot longer
A = A.pivot(columns = 'offer_type', values = 'Number of Offers',index = 'person').reset_index()
A.columns = ['person', 'BOGO_Number', 'Discount_Number', 'Informational_Number']
A.fillna(0, inplace = True)
offerNumDF = A
offerNumDF

Unnamed: 0,person,BOGO_Number,Discount_Number,Informational_Number
0,0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0
1,0011e0d4e6b944f998e987f904e8c1e5,3.0,6.0,4.0
2,0020c2b971eb4e9188eac86d93036a77,4.0,5.0,2.0
3,0020ccbbb6d84e358d3414a3ff76cffd,6.0,3.0,2.0
4,003d66b6608740288d6cc97a6903f4f0,0.0,8.0,4.0
...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,3.0,6.0,3.0
14816,fff7576017104bcc8677a8d63322b5e1,6.0,6.0,0.0
14817,fff8957ea8b240a6b5e634b6ee8eafcf,2.0,2.0,1.0
14818,fffad4f4828548d1b5583907f2e9906b,9.0,0.0,2.0


## Aggregate offers channels (web, mobile, social, email) and Reward, difficulty and offer duration
#### 'aggDF'

In [5]:
A = df.loc[:, ['person', 'reward', 'difficulty', 'duration', 'web', 'email', 'social', 'mobile']]
aggDF = A.groupby('person').agg(
    reward_avg = ('reward', 'mean'),
    difficulty_avg = ('difficulty', 'mean'),
    duration_avg = ('duration', 'mean'),
    web = ('web', 'sum'), 
    email = ('email', 'sum'),
    social = ('social', 'sum'),
    mobile = ('mobile', 'sum')
).reset_index()
aggDF

Unnamed: 0,person,reward_avg,difficulty_avg,duration_avg,web,email,social,mobile
0,0009655768c64bdeb2e877511632db8f,2.083333,5.416667,6.083333,10,12,8,12
1,0011e0d4e6b944f998e987f904e8c1e5,3.000000,7.384615,6.615385,11,13,5,10
2,0020c2b971eb4e9188eac86d93036a77,4.545455,8.181818,7.090909,8,11,11,11
3,0020ccbbb6d84e358d3414a3ff76cffd,3.545455,4.636364,5.727273,9,11,8,11
4,003d66b6608740288d6cc97a6903f4f0,1.833333,8.333333,7.833333,10,12,8,10
...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,2.250000,6.250000,6.750000,9,12,6,12
14816,fff7576017104bcc8677a8d63322b5e1,5.166667,9.166667,8.166667,10,12,10,12
14817,fff8957ea8b240a6b5e634b6ee8eafcf,4.800000,8.000000,6.800000,5,5,4,5
14818,fffad4f4828548d1b5583907f2e9906b,4.090909,4.090909,5.181818,9,11,8,11


## Event aggregation (How many offers received, viewed and completed)
#### 'eventDF'

In [6]:
A = df.loc[:, ['person', 'event', 'web']]
A = A.groupby(['person', 'event']).count().reset_index()
eventDF = A.pivot(index = 'person', columns = 'event', values = 'web').reset_index()
eventDF.fillna(0, inplace = True)
eventDF['ratio_completed'] = round(eventDF['offer completed']/eventDF['offer received'] ,2)
eventDF['ratio_viewed'] = round(eventDF['offer viewed']/ eventDF['offer received'],2)
eventDF

event,person,offer completed,offer received,offer viewed,ratio_completed,ratio_viewed
0,0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0,0.60,0.80
1,0011e0d4e6b944f998e987f904e8c1e5,3.0,5.0,5.0,0.60,1.00
2,0020c2b971eb4e9188eac86d93036a77,3.0,5.0,3.0,0.60,0.60
3,0020ccbbb6d84e358d3414a3ff76cffd,3.0,4.0,4.0,0.75,1.00
4,003d66b6608740288d6cc97a6903f4f0,3.0,5.0,4.0,0.60,0.80
...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,3.0,6.0,3.0,0.50,0.50
14816,fff7576017104bcc8677a8d63322b5e1,3.0,5.0,4.0,0.60,0.80
14817,fff8957ea8b240a6b5e634b6ee8eafcf,0.0,3.0,2.0,0.00,0.67
14818,fffad4f4828548d1b5583907f2e9906b,3.0,4.0,4.0,0.75,1.00


## Create Demographics DF (age, income, year been a  member, gender)
#### 'demoDF'

In [7]:
demoDF = df.loc[:,['person', 'gender', 'age', 'became_member_on', 'income']]
demoDF = demoDF.groupby('person').agg(
    gender = ('gender', 'first'),
    age = ('age', 'first'),
    income = ('income', 'first'),
    became_member_on = ('became_member_on', 'first')
).reset_index()
# convert became a member on to date
demoDF['became_member_on'] = pd.to_datetime(demoDF['became_member_on'])
demoDF['year_became_member'] = demoDF['became_member_on'].dt.year
demoDF.drop(['became_member_on'], axis = 1, inplace = True)
demoDF

Unnamed: 0,person,gender,age,income,year_became_member
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017
...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017


## Attach how much money they spent and most recent transaction date

In [8]:
money = pd.read_csv('transcriptT.csv')
money.drop(['Unnamed: 0', 'value'], axis = 1, inplace = True)
money = money.groupby('person').agg(
    most_recent = ('time', 'last'),
    total_spent = ('spent', 'sum'),
    transactions_number = ('event', 'count')
).reset_index()
money

Unnamed: 0,person,most_recent,total_spent,transactions_number
0,0009655768c64bdeb2e877511632db8f,696,127.60,8
1,00116118485d4dfda04fdbaba9a87b5c,474,4.09,3
2,0011e0d4e6b944f998e987f904e8c1e5,654,79.46,5
3,0020c2b971eb4e9188eac86d93036a77,708,196.86,8
4,0020ccbbb6d84e358d3414a3ff76cffd,672,154.05,12
...,...,...,...,...
16573,fff3ba4757bd42088c044ca26d73817a,552,580.98,11
16574,fff7576017104bcc8677a8d63322b5e1,696,29.94,6
16575,fff8957ea8b240a6b5e634b6ee8eafcf,576,12.15,5
16576,fffad4f4828548d1b5583907f2e9906b,678,88.83,12


## Merge

In [9]:
mergeDF = offerNumDF.merge(aggDF, how = 'left', on = 'person')
mergeDF = mergeDF.merge(eventDF, how = 'left', on = 'person')

mergeDF = demoDF.merge(mergeDF, how = 'left', on = 'person')
mergeDF = mergeDF.merge(money, how = 'left', on = 'person')
mergeDF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,...,social,mobile,offer completed,offer received,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017,3.0,5.0,4.0,2.083333,5.416667,...,8,12,3.0,5.0,4.0,0.60,0.80,696.0,127.60,8.0
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018,3.0,6.0,4.0,3.000000,7.384615,...,5,10,3.0,5.0,5.0,0.60,1.00,654.0,79.46,5.0
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016,4.0,5.0,2.0,4.545455,8.181818,...,11,11,3.0,5.0,3.0,0.60,0.60,708.0,196.86,8.0
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,...,8,11,3.0,4.0,4.0,0.75,1.00,672.0,154.05,12.0
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017,0.0,8.0,4.0,1.833333,8.333333,...,8,10,3.0,5.0,4.0,0.60,0.80,696.0,48.34,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,...,6,12,3.0,6.0,3.0,0.50,0.50,552.0,580.98,11.0
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,...,10,12,3.0,5.0,4.0,0.60,0.80,696.0,29.94,6.0
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018,2.0,2.0,1.0,4.800000,8.000000,...,4,5,0.0,3.0,2.0,0.00,0.67,576.0,12.15,5.0
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017,9.0,0.0,2.0,4.090909,4.090909,...,8,11,3.0,4.0,4.0,0.75,1.00,678.0,88.83,12.0


### Look for NAs

In [10]:
mergeDF.isna().any()

person                  False
gender                  False
age                     False
income                  False
year_became_member      False
BOGO_Number             False
Discount_Number         False
Informational_Number    False
reward_avg              False
difficulty_avg          False
duration_avg            False
web                     False
email                   False
social                  False
mobile                  False
offer completed         False
offer received          False
offer viewed            False
ratio_completed         False
ratio_viewed            False
most_recent              True
total_spent              True
transactions_number      True
dtype: bool

### Only NA values are in actual transactions, meaning they never bought anything, fill nas with 0

In [11]:
mergeDF.fillna(0, inplace=True)
mergeDF.isna().any()

person                  False
gender                  False
age                     False
income                  False
year_became_member      False
BOGO_Number             False
Discount_Number         False
Informational_Number    False
reward_avg              False
difficulty_avg          False
duration_avg            False
web                     False
email                   False
social                  False
mobile                  False
offer completed         False
offer received          False
offer viewed            False
ratio_completed         False
ratio_viewed            False
most_recent             False
total_spent             False
transactions_number     False
dtype: bool

## Calculate RFM score (recency, frequency, monetary)

In [12]:
mergeDF['M_score'] = pd.cut(mergeDF['total_spent'], bins=[-1,
       np.percentile(mergeDF['total_spent'], 20),
       np.percentile(mergeDF['total_spent'], 40),
       np.percentile(mergeDF['total_spent'], 60),
       np.percentile(mergeDF['total_spent'], 80),
       mergeDF['total_spent'].max()],
                           labels = [1,2,3,4,5]).astype('int')

In [13]:
mergeDF[['M_score', 'total_spent']]

Unnamed: 0,M_score,total_spent
0,4,127.60
1,3,79.46
2,5,196.86
3,4,154.05
4,2,48.34
...,...,...
14815,5,580.98
14816,2,29.94
14817,1,12.15
14818,3,88.83


In [14]:
mergeDF['R_score'] = pd.cut(mergeDF['most_recent'], bins=[-1,
       np.percentile(mergeDF['most_recent'], 20),
       np.percentile(mergeDF['most_recent'], 40),
       np.percentile(mergeDF['most_recent'], 60),
       np.percentile(mergeDF['most_recent'], 80),
       mergeDF['most_recent'].max()],
                           labels = [1,2,3,4,5]).astype('int')

In [15]:
mergeDF[['R_score', 'most_recent']]

Unnamed: 0,R_score,most_recent
0,4,696.0
1,3,654.0
2,5,708.0
3,4,672.0
4,4,696.0
...,...,...
14815,1,552.0
14816,4,696.0
14817,2,576.0
14818,4,678.0


In [16]:
mergeDF['F_score'] = pd.cut(mergeDF['transactions_number'], bins=[-1,
       np.percentile(mergeDF['transactions_number'], 20),
       np.percentile(mergeDF['transactions_number'], 40),
       np.percentile(mergeDF['transactions_number'], 60),
       np.percentile(mergeDF['transactions_number'], 80),
       mergeDF['transactions_number'].max()],
                           labels = [1,2,3,4,5]).astype('int')

In [17]:
mergeDF[['F_score', 'transactions_number']]

Unnamed: 0,F_score,transactions_number
0,3,8.0
1,2,5.0
2,3,8.0
3,4,12.0
4,5,18.0
...,...,...
14815,4,11.0
14816,2,6.0
14817,2,5.0
14818,4,12.0


In [18]:
mergeDF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,...,offer received,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017,3.0,5.0,4.0,2.083333,5.416667,...,5.0,4.0,0.60,0.80,696.0,127.60,8.0,4,4,3
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018,3.0,6.0,4.0,3.000000,7.384615,...,5.0,5.0,0.60,1.00,654.0,79.46,5.0,3,3,2
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016,4.0,5.0,2.0,4.545455,8.181818,...,5.0,3.0,0.60,0.60,708.0,196.86,8.0,5,5,3
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,...,4.0,4.0,0.75,1.00,672.0,154.05,12.0,4,4,4
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017,0.0,8.0,4.0,1.833333,8.333333,...,5.0,4.0,0.60,0.80,696.0,48.34,18.0,2,4,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,...,6.0,3.0,0.50,0.50,552.0,580.98,11.0,5,1,4
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,...,5.0,4.0,0.60,0.80,696.0,29.94,6.0,2,4,2
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018,2.0,2.0,1.0,4.800000,8.000000,...,3.0,2.0,0.00,0.67,576.0,12.15,5.0,1,2,2
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017,9.0,0.0,2.0,4.090909,4.090909,...,4.0,4.0,0.75,1.00,678.0,88.83,12.0,3,4,4


# Dimentionality Reduction with PCA
### ordinal encode gender field
### Scale data

In [21]:
mergeDF['gender_encode'] = np.where(mergeDF['gender']=='M', 0, np.where(mergeDF['gender'] == 'F', 1,2))
mergeDF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,...,offer viewed,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017,3.0,5.0,4.0,2.083333,5.416667,...,4.0,0.60,0.80,696.0,127.60,8.0,4,4,3,0
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018,3.0,6.0,4.0,3.000000,7.384615,...,5.0,0.60,1.00,654.0,79.46,5.0,3,3,2,2
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016,4.0,5.0,2.0,4.545455,8.181818,...,3.0,0.60,0.60,708.0,196.86,8.0,5,5,3,1
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,...,4.0,0.75,1.00,672.0,154.05,12.0,4,4,4,1
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017,0.0,8.0,4.0,1.833333,8.333333,...,4.0,0.60,0.80,696.0,48.34,18.0,2,4,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,...,3.0,0.50,0.50,552.0,580.98,11.0,5,1,4,1
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,...,4.0,0.60,0.80,696.0,29.94,6.0,2,4,2,0
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018,2.0,2.0,1.0,4.800000,8.000000,...,2.0,0.00,0.67,576.0,12.15,5.0,1,2,2,0
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017,9.0,0.0,2.0,4.090909,4.090909,...,4.0,0.75,1.00,678.0,88.83,12.0,3,4,4,0


In [22]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

In [30]:
scaledDF = pd.DataFrame(ss.fit_transform(mergeDF.iloc[:,2:]))
scaledDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,-1.230488,0.305275,0.316911,-0.453489,0.274803,1.585800,-1.352218,-0.976632,-0.468545,0.578338,...,0.492730,0.339644,0.196280,0.650670,0.081163,-0.070019,0.707190,0.776362,0.117739,-0.842316
1,-0.827786,-0.389256,1.151419,-0.453489,0.620259,1.585800,-0.816084,-0.178736,0.025330,0.882792,...,1.267917,0.339644,1.042102,0.310075,-0.289214,-0.648872,0.000143,0.046908,-0.590416,2.969908
2,0.265262,1.138711,-0.517598,-0.106596,0.274803,0.302263,0.087812,0.144486,0.466734,-0.030569,...,-0.282457,0.339644,-0.649542,0.747983,0.614032,-0.070019,1.414237,1.505816,0.117739,1.063796
3,-1.748248,-0.250350,-0.517598,0.587190,-0.416109,0.302263,-0.497062,-1.293002,-0.799057,0.273885,...,0.492730,0.792722,1.042102,0.456044,0.284663,0.701783,0.707190,0.776362,0.825894,1.063796
4,-1.633190,0.351577,0.316911,-1.494167,1.311171,1.585800,-1.498436,0.205917,1.155888,0.578338,...,0.492730,0.339644,0.196280,0.650670,-0.528643,1.859487,-0.706904,0.776362,1.534049,1.063796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,0.840550,0.814597,-1.352107,-0.453489,0.620259,0.944032,-1.254739,-0.638761,0.150287,0.273885,...,-0.282457,0.037591,-1.072453,-0.517085,3.569356,0.508833,1.414237,-1.412001,0.825894,1.063796
14816,0.955608,0.351577,0.316911,0.587190,0.620259,-0.981273,0.451142,0.543789,1.465303,0.578338,...,0.492730,0.339644,0.196280,0.650670,-0.670208,-0.455921,-0.706904,0.776362,-0.590416,-0.842316
14817,0.955608,-0.435558,1.151419,-0.800381,-0.761565,-0.339505,0.236689,0.070769,0.196699,-0.943929,...,-1.057644,-1.472669,-0.353504,-0.322459,-0.807080,-0.648872,-1.413951,-0.682547,-0.590416,-0.842316
14818,-1.172959,-1.454202,0.316911,1.627868,-1.452477,0.302263,-0.178040,-1.514155,-1.305374,0.273885,...,0.492730,0.792722,1.042102,0.504701,-0.217123,0.701783,0.000143,0.776362,0.825894,-0.842316


In [31]:
from sklearn.decomposition import PCA
pca = PCA()

In [32]:
pca.set_params(n_components = None)
pca.fit(scaledDF)

PCA()

In [33]:
pca.explained_variance_ratio_

array([2.97048785e-01, 1.25851418e-01, 1.02472991e-01, 8.61348898e-02,
       8.12626492e-02, 4.71114165e-02, 4.46920181e-02, 3.58557617e-02,
       3.41681723e-02, 2.82355801e-02, 2.68438825e-02, 1.97084254e-02,
       1.59533917e-02, 1.49934666e-02, 1.07067717e-02, 9.77178461e-03,
       5.36986268e-03, 4.96523329e-03, 3.00844780e-03, 2.52966933e-03,
       1.39911457e-03, 1.17918505e-03, 7.37082333e-04, 4.31235893e-33,
       9.94794712e-34])

In [79]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [73]:
fig = go.Figure()
fig.add_trace(go.Bar(x = list(range(25)), y = pca.explained_variance_ratio_))
fig.add_trace(go.Scatter(x= list(range(25)), y=np.cumsum(pca.explained_variance_ratio_)))
fig.update_layout(
    colorway = [color1, color1],
    showlegend= False,
    title = 'Principal Component Analysis <br> Explained Variance',
    xaxis_title = 'Principal Components',
    yaxis_title = 'Explained Variance',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size = 25),
        linecolor = 'black'     
    ),
    xaxis = dict(
        tickfont = dict(size=16),
        titlefont = dict(size =25),
        linecolor = 'black',
        #showticklabels = False
          
    ),
    legend=dict(
        #yanchor='top',
        #y = .98,
        #xanchor='right',
        #x = 0.98,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
        
    )   
)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'Percent Anonymous Completed':'Anonymous', 'Percent of Known Completed':'Known'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                         legendgroup = newnames[t.name],
#                                        hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                        )
#                  )

## 90% of variance can be explained by the first 11 components

In [61]:
pca.set_params(n_components = 11)
pcaDF = pd.DataFrame(pca.fit_transform(scaledDF))

# Clustering

In [69]:
from sklearn.cluster import KMeans
from PlottingFunctions import plot_inertia, plot_silhouette
from sklearn.metrics import silhouette_score, silhouette_samples

In [66]:
kmeans = KMeans()
distortions = []
for k in range(1,15):
    kmeanModel = KMeans(n_clusters = k)
    kmeanModel.fit(pcaDF)
    distortions.append(kmeanModel.inertia_)

In [67]:
distortions

[337035.5378238354,
 262136.7289420032,
 242195.31193214588,
 225920.13488698588,
 214152.85355983494,
 204024.43683052494,
 195933.81532991957,
 189290.52172483326,
 183296.5603622581,
 178220.5200032758,
 173898.4763411912,
 170302.5462425968,
 167253.60497675062,
 164799.89055233268]

In [83]:
silhos = []
for k in range(2,15):
    kmeanModel = KMeans(n_clusters = k)
    kmeanModel.fit(pcaDF)
    silhos.append(silhouette_score(pcaDF, kmeanModel.labels_, metric = 'euclidean'))
silhos

[0.1924348738858318,
 0.13018680561835458,
 0.11783195590299685,
 0.11700631302313469,
 0.11634007327980221,
 0.10921469380997997,
 0.1079052687445897,
 0.10612453255085759,
 0.10517880727841374,
 0.10448923442817783,
 0.1024054817310237,
 0.10111423256623354,
 0.1008660301914601]

In [106]:
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    go.Scatter(x = list(range(1,15)), y = distortions, name = 'Distortion'), secondary_y = False
)
fig.add_trace(
    go.Scatter(x = list(range(2,15)), y = silhos, name = 'Silhouette Score'), secondary_y = True
)
fig.add_trace(
    go.Scatter(x = [4,4],y = [0,338000], name = 'Clusters = 4',line = dict(dash = 'dash')), secondary_y = False
)
fig.update_layout(
    colorway = [color1, color3, 'black'],
    title = 'Inertia and Silhouette Scores K-Means Fit <br> on Transformed PCA Data',
#    xaxis_title = 'Principal Components',
#    yaxis_title = 'Explained Variance',
    title_x=0.5,
    plot_bgcolor = backgroundColor,
    title_font = dict(size = 25),
    bargap = 0.2,
    yaxis = dict(
        tickfont = dict(size=12),
        titlefont = dict(size = 25),
        linecolor = 'black',
        nticks = 10,
        showgrid = False
    ),
    xaxis = dict(
        tickfont = dict(size=16),
        title = 'Number of Clusters',
        titlefont = dict(size =25),
        linecolor = 'black'
        
          
    ),
    legend=dict(
        yanchor='top',
        y = .98,
        xanchor='right',
        x = 0.9,
        bgcolor = '#DFBA8F',
        bordercolor = 'black',
        borderwidth = 2,
        font = dict(
            color = 'black'
        )
        
    )   
)
fig.update_yaxes(title_text = 'Distortion', secondary_y=False)
fig.update_yaxes(title_text = 'Silhouette Score', secondary_y=True, title_font = {'size':20}, nticks=12, showgrid=False)
fig.update_traces(
    marker_line_width=1,
    marker_line_color="black"
)
#newnames = {'Percent Anonymous Completed':'Anonymous', 'Percent of Known Completed':'Known'}
#fig.for_each_trace(lambda t: t.update(name = newnames[t.name],
#                                         legendgroup = newnames[t.name],
#                                        hovertemplate = t.hovertemplate.replace(t.name, newnames[t.name])
#                                        )
#                  )

In [107]:
kmeans = KMeans()
kmeans.set_params(n_clusters = 4)
kmeans.fit(pcaDF)

KMeans(n_clusters=4)

In [108]:
kmeans.labels_

array([3, 3, 2, ..., 1, 3, 0])

In [112]:
clusterDF = pd.concat([mergeDF, pd.Series(kmeans.labels_, name = 'cluster')], axis = 1)
clusterDF

Unnamed: 0,person,gender,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
0,0009655768c64bdeb2e877511632db8f,M,33,72000.0,2017,3.0,5.0,4.0,2.083333,5.416667,...,0.60,0.80,696.0,127.60,8.0,4,4,3,0,3
1,0011e0d4e6b944f998e987f904e8c1e5,O,40,57000.0,2018,3.0,6.0,4.0,3.000000,7.384615,...,0.60,1.00,654.0,79.46,5.0,3,3,2,2,3
2,0020c2b971eb4e9188eac86d93036a77,F,59,90000.0,2016,4.0,5.0,2.0,4.545455,8.181818,...,0.60,0.60,708.0,196.86,8.0,5,5,3,1,2
3,0020ccbbb6d84e358d3414a3ff76cffd,F,24,60000.0,2016,6.0,3.0,2.0,3.545455,4.636364,...,0.75,1.00,672.0,154.05,12.0,4,4,4,1,0
4,003d66b6608740288d6cc97a6903f4f0,F,26,73000.0,2017,0.0,8.0,4.0,1.833333,8.333333,...,0.60,0.80,696.0,48.34,18.0,2,4,5,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14815,fff3ba4757bd42088c044ca26d73817a,F,69,83000.0,2015,3.0,6.0,3.0,2.250000,6.250000,...,0.50,0.50,552.0,580.98,11.0,5,1,4,1,0
14816,fff7576017104bcc8677a8d63322b5e1,M,71,73000.0,2017,6.0,6.0,0.0,5.166667,9.166667,...,0.60,0.80,696.0,29.94,6.0,2,4,2,0,0
14817,fff8957ea8b240a6b5e634b6ee8eafcf,M,71,56000.0,2018,2.0,2.0,1.0,4.800000,8.000000,...,0.00,0.67,576.0,12.15,5.0,1,2,2,0,1
14818,fffad4f4828548d1b5583907f2e9906b,M,34,34000.0,2017,9.0,0.0,2.0,4.090909,4.090909,...,0.75,1.00,678.0,88.83,12.0,3,4,4,0,3


In [113]:
cluster0DF = clusterDF.loc[clusterDF['cluster'] == 0]
cluster1DF = clusterDF.loc[clusterDF['cluster'] == 1]
cluster2DF = clusterDF.loc[clusterDF['cluster'] == 2]
cluster3DF = clusterDF.loc[clusterDF['cluster'] == 3]

# Cluster Analysis

In [115]:
cluster0DF.iloc[:,2:].describe()

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
count,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,...,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0
mean,55.608146,69351.125727,2016.325828,6.470276,5.948394,1.321528,4.777743,8.133931,6.749847,11.423982,...,0.772198,0.865054,655.938275,211.002168,11.757652,4.192006,3.34809,3.770301,0.509992,0.0
std,16.820287,21180.498616,1.100737,3.102449,3.197429,1.426891,1.470308,1.784977,0.832844,2.662692,...,0.180269,0.155918,50.047712,166.605445,5.082264,0.924978,1.197905,1.112525,0.531844,0.0
min,18.0,30000.0,2013.0,0.0,0.0,0.0,1.2,3.4,4.2,0.0,...,0.17,0.33,372.0,21.47,1.0,1.0,1.0,1.0,0.0,0.0
25%,45.0,54000.0,2016.0,4.0,3.0,0.0,3.714286,6.882353,6.142857,10.0,...,0.67,0.8,630.0,121.08,8.0,4.0,2.0,3.0,0.0,0.0
50%,56.0,68000.0,2016.0,6.0,6.0,1.0,4.6875,8.0,6.714286,12.0,...,0.8,0.83,666.0,176.53,11.0,4.0,3.0,4.0,0.0,0.0
75%,67.0,85000.0,2017.0,9.0,8.0,2.0,5.785714,9.25,7.272727,13.0,...,1.0,1.0,696.0,244.96,15.0,5.0,4.0,5.0,1.0,0.0
max,101.0,120000.0,2018.0,18.0,17.0,7.0,10.0,16.0,10.0,18.0,...,1.0,1.0,714.0,1608.69,36.0,5.0,5.0,5.0,2.0,0.0


In [116]:
cluster1DF.iloc[:,2:].describe()

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
count,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,...,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0,3458.0
mean,51.508676,58807.981492,2017.053788,2.675824,2.420185,1.239734,4.329273,7.742314,6.456664,4.970214,...,0.166408,0.580951,564.034702,36.323809,4.976287,1.693464,2.417582,1.857432,0.331116,1.0
std,17.862411,20504.248625,1.112944,1.889453,1.833263,1.235885,1.985255,2.88638,1.199843,1.941726,...,0.225777,0.26813,167.430806,45.069734,3.140618,0.932347,1.36837,1.036604,0.491723,0.0
min,18.0,30000.0,2013.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
25%,37.0,42000.0,2017.0,1.0,1.0,0.0,2.857143,5.857143,5.666667,4.0,...,0.0,0.4,528.0,11.1725,3.0,1.0,1.0,1.0,0.0,1.0
50%,53.0,56000.0,2017.0,2.0,2.0,1.0,4.25,7.625,6.4,5.0,...,0.0,0.5,612.0,20.37,5.0,1.0,2.0,2.0,0.0,1.0
75%,64.0,71000.0,2018.0,4.0,4.0,2.0,5.666667,10.0,7.222222,6.0,...,0.33,0.75,672.0,45.5075,6.0,2.0,4.0,2.0,1.0,1.0
max,101.0,119000.0,2018.0,9.0,9.0,6.0,10.0,20.0,10.0,10.0,...,1.0,1.0,714.0,743.07,23.0,5.0,5.0,5.0,2.0,1.0


In [117]:
cluster2DF.iloc[:,2:].describe()

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
count,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0,...,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0,3847.0
mean,54.597089,66854.951911,2016.146088,3.587991,4.876787,0.906161,4.480861,9.02329,7.147975,7.888485,...,0.665214,0.722506,655.654796,139.899607,10.327008,3.551859,3.404731,3.430205,0.489992,2.0
std,17.601156,22235.625769,1.225901,2.383094,2.505926,1.083743,1.661536,2.306576,0.968942,2.224477,...,0.237599,0.227427,56.075755,113.698182,4.768336,1.086652,1.247237,1.238468,0.528286,0.0
min,18.0,30000.0,2013.0,0.0,0.0,0.0,1.0,2.5,4.166667,0.0,...,0.0,0.0,360.0,12.2,1.0,1.0,1.0,1.0,0.0,2.0
25%,42.0,49000.0,2015.0,2.0,3.0,0.0,3.25,7.363636,6.428571,6.0,...,0.5,0.5,630.0,69.015,7.0,3.0,2.0,3.0,0.0,2.0
50%,56.0,65000.0,2016.0,3.0,5.0,0.0,4.333333,8.846154,7.090909,8.0,...,0.67,0.75,672.0,121.66,10.0,4.0,4.0,4.0,0.0,2.0
75%,67.0,83000.0,2017.0,5.0,6.0,2.0,5.545455,10.181818,7.818182,9.0,...,0.8,1.0,696.0,176.83,13.0,4.0,4.0,4.0,1.0,2.0
max,101.0,120000.0,2018.0,11.0,14.0,5.0,10.0,20.0,10.0,14.0,...,1.0,1.0,714.0,1211.76,36.0,5.0,5.0,5.0,2.0,2.0


In [122]:
cluster3DF.iloc[:,2:].describe()

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
count,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0,...,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0,3562.0
mean,55.607805,65871.982033,2017.038181,4.267546,3.275407,2.712802,3.942639,6.270153,5.93151,7.679674,...,0.291561,0.831053,578.31443,66.479239,5.761931,2.348681,2.474453,2.097979,0.421954,3.0
std,16.968288,20956.606079,1.053312,2.442258,2.37964,1.773086,1.606145,1.965949,0.912677,2.454272,...,0.216833,0.173994,149.606797,65.720322,3.757988,1.126004,1.353143,1.183383,0.524813,0.0
min,18.0,30000.0,2013.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.17,0.0,0.0,0.0,1.0,1.0,1.0,0.0,3.0
25%,45.0,51000.0,2017.0,3.0,2.0,2.0,2.777778,5.0,5.3,6.0,...,0.17,0.75,546.0,21.95,3.0,1.0,1.0,1.0,0.0,3.0
50%,56.0,64000.0,2017.0,4.0,3.0,2.0,3.851648,6.25,5.909091,8.0,...,0.25,0.8,624.0,49.67,5.0,2.0,2.0,2.0,0.0,3.0
75%,67.0,80000.0,2018.0,6.0,5.0,4.0,5.0,7.555556,6.545455,9.0,...,0.5,1.0,672.0,95.425,7.0,3.0,4.0,3.0,1.0,3.0
max,101.0,120000.0,2018.0,13.0,12.0,10.0,10.0,13.571429,9.5,15.0,...,1.0,1.0,714.0,1016.93,25.0,5.0,5.0,5.0,2.0,3.0


### Use Radar Plots to compare like metrics of the 4 clusters

In [131]:
radarDF = pd.concat([cluster0DF.iloc[:,2:].describe().loc[['mean']],cluster1DF.iloc[:,2:].describe().loc[['mean']],cluster2DF.iloc[:,2:].describe().loc[['mean']],cluster3DF.iloc[:,2:].describe().loc[['mean']]])
radarDF['cluster'] = [0,1,2,3]
radarDF

Unnamed: 0,age,income,year_became_member,BOGO_Number,Discount_Number,Informational_Number,reward_avg,difficulty_avg,duration_avg,web,...,ratio_completed,ratio_viewed,most_recent,total_spent,transactions_number,M_score,R_score,F_score,gender_encode,cluster
mean,55.608146,69351.125727,2016.325828,6.470276,5.948394,1.321528,4.777743,8.133931,6.749847,11.423982,...,0.772198,0.865054,655.938275,211.002168,11.757652,4.192006,3.34809,3.770301,0.509992,0
mean,51.508676,58807.981492,2017.053788,2.675824,2.420185,1.239734,4.329273,7.742314,6.456664,4.970214,...,0.166408,0.580951,564.034702,36.323809,4.976287,1.693464,2.417582,1.857432,0.331116,1
mean,54.597089,66854.951911,2016.146088,3.587991,4.876787,0.906161,4.480861,9.02329,7.147975,7.888485,...,0.665214,0.722506,655.654796,139.899607,10.327008,3.551859,3.404731,3.430205,0.489992,2
mean,55.607805,65871.982033,2017.038181,4.267546,3.275407,2.712802,3.942639,6.270153,5.93151,7.679674,...,0.291561,0.831053,578.31443,66.479239,5.761931,2.348681,2.474453,2.097979,0.421954,3


In [139]:
A = pd.melt(radarDF.loc[:, ['BOGO_Number','Discount_Number', 'Informational_Number', 'reward_avg', 'difficulty_avg', 'duration_avg', 'cluster']],id_vars='cluster' )
A

Unnamed: 0,cluster,variable,value
0,0,BOGO_Number,6.470276
1,1,BOGO_Number,2.675824
2,2,BOGO_Number,3.587991
3,3,BOGO_Number,4.267546
4,0,Discount_Number,5.948394
5,1,Discount_Number,2.420185
6,2,Discount_Number,4.876787
7,3,Discount_Number,3.275407
8,0,Informational_Number,1.321528
9,1,Informational_Number,1.239734


In [140]:
px.line_polar(A, r = 'value', theta = 'variable', color = 'cluster')

In [141]:
radarDF.columns

Index(['age', 'income', 'year_became_member', 'BOGO_Number', 'Discount_Number',
       'Informational_Number', 'reward_avg', 'difficulty_avg', 'duration_avg',
       'web', 'email', 'social', 'mobile', 'offer completed', 'offer received',
       'offer viewed', 'ratio_completed', 'ratio_viewed', 'most_recent',
       'total_spent', 'transactions_number', 'M_score', 'R_score', 'F_score',
       'gender_encode', 'cluster'],
      dtype='object')

In [143]:
A = pd.melt(radarDF.loc[:, ['web', 'email', 'social', 'mobile', 'offer completed', 'offer received', 'offer viewed', 'M_score', 'F_score', 'R_score','cluster']],id_vars='cluster' )

In [144]:
px.line_polar(A, r = 'value', theta = 'variable', color = 'cluster')