In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline
import numpy as np
import scipy as sp
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering as AC


In [None]:
# Read the base customer table from a relative path into a DataFrame.
customer_csv = 'customer_data.csv'
customer_data = pd.read_csv(customer_csv)

In [None]:
# Sanity-check the load: .shape is a (n_rows, n_columns) tuple for the frame read above.
customer_data.shape #check the dimensions

In [None]:
# Preview the first five rows (five is also the default for head()).
customer_data.head(n=5)

In [None]:
# Keep only the columns at positions 3 and 4 as a NumPy array; per the original
# note these are annual income and spending score.
data = customer_data.iloc[:, 3:5].values

# Raw scatter of the two features, before any clustering is applied.
fig, ax = plt.subplots()
ax.scatter(data[:, 0], data[:, 1])
ax.set_title('Customer spending habits based on household income')
ax.set_xlabel('Household income')
ax.set_ylabel('Spending score')

### The plot shows some clear grouping

In [None]:
# Build the complete-linkage hierarchy first, then draw it.
# Change `method` here to compare other linkage criteria.
linkage_matrix = sch.linkage(data, method='complete')

plt.figure(figsize=(10, 7))
plt.title("Customer Dendrograms")
dend = sch.dendrogram(linkage_matrix)

#### Also, take a moment to examine your dendrogram for distinct clusters at different "heights"

In [None]:
# Agglomerative clustering into 5 groups using complete linkage.
# The distance metric is left at its default (Euclidean): the `affinity` keyword
# was deprecated in scikit-learn 1.2 and removed in 1.4 in favour of `metric`,
# so passing affinity='euclidean' raises a TypeError on current releases, while
# omitting it behaves the same on both old and new versions.
cluster = AC(n_clusters=5, linkage='complete')
cluster.fit_predict(data)  # cluster index (0-4) assigned to each row of `data`

In [None]:
# Same income/spending scatter, now coloured by each point's agglomerative cluster label.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(data[:, 0], data[:, 1], c=cluster.labels_, cmap='rainbow')
ax.set_xlabel('Household income')
ax.set_ylabel('Spending score')
ax.set_title('Customer spending habits based on household income')
# What inferences might we make from these clusters?

## Let's look at the same problem utilizing K-means clustering


In [None]:
#import Kmeans tool from sklearn
from sklearn.cluster import KMeans

Let's plot an elbow test!

As a rule of thumb, the elbow plot suggests a reasonable number of clusters to use.
We will pick the number of clusters at which the curve begins to flatten.

In [None]:
# Candidate cluster counts for the elbow test: 1 through MAX_CLUSTERS inclusive
# (the previous comment said "1 to 10", but the range has always covered 1..19).
MAX_CLUSTERS = 19
nk = range(1, MAX_CLUSTERS + 1)

In [None]:
# One unfitted K-means estimator per candidate cluster count in `nk`.
# random_state pins the centroid initialisation so the elbow curve is the same
# on every Restart & Run All; n_init=10 is spelled out because its default
# changed from 10 to 'auto' in scikit-learn 1.4.
kmeans = [KMeans(n_clusters=k, random_state=0, n_init=10) for k in nk]

In [None]:
# Fit every estimator on `data` and record its score on that same data,
# keeping the order of `kmeans` (and therefore of `nk`).
score = [model.fit(data).score(data) for model in kmeans]

In [None]:
# Elbow plot: K-means score against the number of clusters tried.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(nk, score)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Score')

In [None]:
# Final K-means model with 5 clusters.
# random_state fixes the initialisation so the labels (and the plot colours that
# depend on them) are reproducible; n_init=10 is explicit because its default
# changed to 'auto' in scikit-learn 1.4.
# NOTE: this rebinds `kmeans` from the list of estimators built for the elbow
# test to a single estimator, so the scoring cell cannot be re-run after this
# one without first re-running the cell that builds the list.
kmeans = KMeans(n_clusters=5, random_state=0, n_init=10)
kmeans_out = kmeans.fit(data)  # fit() returns the fitted estimator itself
kmeans_out

In [None]:
# Income/spending scatter coloured by the K-means label of each point.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(data[:, 0], data[:, 1], c=kmeans_out.labels_, cmap='rainbow')
ax.set_title('Customer spending habits based on household income')
ax.set_xlabel('Household income')
ax.set_ylabel('Spending score')

## Dimensionality reduction with PCA

We just got a little more data on our customers! 

However, we now have several variables to work with.

You know that there are very good tools for dimensionality reduction, and you are going to implement PCA
on this data to make sure you are examining the most relevant variables of your data.





In [None]:
# Read the extended customer table from a relative path.
# This rebinds `customer_data`, replacing the frame loaded at the top of the notebook.
extended_csv = 'customer_data_extended.csv'
customer_data = pd.read_csv(extended_csv)

In [None]:
# Dimensions of the extended frame as a (n_rows, n_columns) tuple.
customer_data.shape

In [None]:
# Preview the first five rows to see which fields the extended frame carries.
customer_data.head(n=5)

In [None]:
#We now have Gender, Age, Annual Income, Spending Score, and Number of items bought for each customer
#Let's run PCA to find out which variables are responsible for most of the variance in our dataset!

#import necessary tools
from mpl_toolkits.mplot3d import Axes3D #for plotting 3D graphs

from sklearn.preprocessing import StandardScaler
from sklearn import decomposition #for PCA tool
from sklearn import datasets 



# Keep the columns at positions 3, 4 and 5 as a NumPy array (presumably annual
# income, spending score and number of items bought, going by the field list
# above; confirm against customer_data.head()).
# NOTE: this rebinds `data` from the two-column array used in the clustering
# section to a three-column array.
data = customer_data.iloc[:, 3:6].values
print(data.shape)  # a bare `data.shape` mid-cell is evaluated but never displayed

# Standardise each feature (zero mean, unit variance) so that PCA is not
# dominated by whichever column has the largest numeric scale.
X = StandardScaler().fit_transform(data)
X[:5]  # preview the first rows instead of dumping the whole array

In [None]:
# Fit PCA on the standardised features and project the data onto the components.
pca=decomposition.PCA() #PCA estimator with default settings (no n_components given)
pc=pca.fit_transform(X) #fit on X and return X expressed in principal-component coordinates
pc.shape #sanity check (What are the expected dimensions?)

In [None]:
#Covariance matrix of the data as estimated by the fitted PCA model
pca.get_covariance() #What dimensions should this matrix have? Why?

In [None]:
# Share of the total variance captured by each principal component.
explained_var=pca.explained_variance_ratio_ #one ratio per component, not a 0-100 percentage
explained_var #how many items should be in this array?

In [None]:
# Bar chart comparing the explained-variance ratio of each principal component.
# Bar positions are derived from the array itself rather than a hard-coded 3, so
# the cell keeps working if the number of input features (and hence components)
# changes. Bars are labelled PC1, PC2, ... to match the axis labels used in the
# PCA scatter plots.
component_idx = range(len(explained_var))
component_names = ['PC{}'.format(i + 1) for i in component_idx]

plt.figure(figsize=(10, 7))
plt.bar(component_idx, explained_var, tick_label=component_names)
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.title('Explained variance ratio of principal components')

In [None]:
#Which components should we keep, and which should we throw away?

In [None]:
# Scatter of the first two columns of `data` in their original units, for
# comparison with the PCA projection.
# The title previously read 'Costumer ...'; it now matches the 'Customer ...'
# title used by the other scatter plots in this notebook.
plt.figure(figsize=(10, 7))
plt.scatter(data[:, 0], data[:, 1])
plt.title('Customer spending habits based on household income')
plt.ylabel('Spending score')
plt.xlabel('Household income')

In [None]:
# The same customers plotted on their first two principal components.
fig, ax = plt.subplots(figsize=(10, 7))
ax.set_title('PCA of customer spending habits')
ax.scatter(pc[:, 0], pc[:, 1])
ax.set_ylabel('PC2')
ax.set_xlabel('PC1')

In [None]:
# For comparison: the three original (unscaled) features in a 3D scatter.
col_x, col_y, col_z = data[:, 0], data[:, 1], data[:, 2]

fig = plt.figure(figsize=(10, 7))
ax = plt.axes(projection='3d')
ax.scatter(col_x, col_y, col_z)
ax.set(
    xlabel='Household income',
    ylabel='Spending score',
    zlabel='Number of items bought',
)
ax.set_title('Consumer spending habits')

In [None]:
# For comparison: the same points expressed in the first three principal components.
pc1, pc2, pc3 = pc[:, 0], pc[:, 1], pc[:, 2]

fig = plt.figure(figsize=(10, 7))
ax = plt.axes(projection='3d')
ax.scatter(pc1, pc2, pc3)
ax.set(xlabel='PC1', ylabel='PC2', zlabel='PC3')
ax.set_title('PCA for Consumer spending habits')