In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
from matplotlib import cm

import seaborn as sns
import viz

import wrangle_zillow as w
from pydataset import data

In [12]:
def cluster(df, feature1, feature2, k):
    X = df[[feature1, feature2]]

    kmeans = KMeans(n_clusters=k).fit(X)
    
    df['cluster'] = kmeans.labels_
    df.cluster = df.cluster.astype('category')
    
    df['cluster'] = kmeans.predict(X)

    centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)

    df.groupby('cluster')[feature1, feature2].mean()
    
    plt.figure(figsize=(9, 7))
    
    for cluster, subset in df.groupby('cluster'):
        plt.scatter(subset[feature2], subset[feature1], label='cluster ' + str(cluster), alpha=.6)
    
    centroids.plot.scatter(y=feature1, x=feature2, c='black', marker='x', s=100, ax=plt.gca(), label='centroid')
    
    plt.legend()
    plt.xlabel(feature1)
    plt.ylabel(feature2)
    plt.title('Visualizing Cluster Centers')

    return

def inertia(df, feature1, feature2, r1, r2):
    cols = [feature1, feature2]
    X = df[cols]
    
    inertias = {}
    
    for k in range(r1, r2):
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(X)
        inertias[k] = kmeans.inertia_
    
    pd.Series(inertias).plot(xlabel='k', ylabel='Inertia', figsize=(9, 7))
    plt.grid()
    return

# def scale_minmax(df):
#     scaler = MinMaxScaler()
#     df_scaled = scaler.fit_transform(df)
#     df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
#     return df_scaled


In [3]:
iris = data('iris')

In [4]:
iris.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [5]:
iris.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species']

In [6]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [11]:
scaled_iris = scale_minmax(iris)


ValueError: could not convert string to float: 'setosa'

### 1.Clustering with the Iris Dataset
Using this lesson as a guide, perform clustering on the iris dataset.

Choose features other than the ones used in the lesson. Visualize the results of your clustering. Use the elbow method to visually select a good value for k. Repeat the clustering, this time with 3 different features.

In [None]:
inertia(iris, 'sepal_width', 'petal_width', 2, 6)

In [None]:
X = iris[['sepal_width', 'petal_width']]
kmeans = KMeans(n_clusters = 3)
kmeans.fit(X)
kmeans.predict(X)


In [None]:
iris['cluster'] = kmeans.predict(X)

In [None]:
iris.head()

### Clustering Centers

In [None]:
kmeans.cluster_centers_


In [None]:
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=X.columns)
centroids

In [None]:
cluster(iris, 'sepal_width', 'petal_width', 3)

In [None]:
with plt.style.context('seaborn-whitegrid'):
     plt.figure(figsize=(20, 9))
     plt.subplot(1, 3, 1)
     sns.scatterplot(x = iris.sepal_width, y = iris.petal_width, data=iris)
     plt.title('Original Data')
     plt.subplot(1, 3, 2)
     sns.scatterplot(x = iris.sepal_width, y = iris.petal_width, hue=iris.species, data=iris)
     plt.title('Clustered Data')
     plt.subplot(1, 3, 3)
     sns.scatterplot(x = iris.sepal_width, y = iris.petal_width, hue=iris.cluster, data=iris)
     plt.title('Original Data with Cluster Labels')

In [None]:
inertia(iris,'petal_length', 'petal_width', 2, 6)


In [None]:
cluster(iris, 'petal_length', 'petal_width', 3)

In [None]:
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(20, 9))
    plt.subplot(1, 3, 1)
    sns.scatterplot(x = 'petal_length', y = iris.petal_width, data=iris)
    plt.title('Original Data')
    plt.subplot(1, 3, 2)
    sns.scatterplot(x = 'petal_length', y = iris.petal_width, hue=iris.species, data=iris)
    plt.title('Clustered Data')
    plt.subplot(1, 3, 3)
    sns.scatterplot(x = 'petal_length', y = iris.petal_width, hue=iris.cluster, data=iris)
    plt.title('Original Data with Cluster Labels')

In [None]:
sns.relplot( data = iris, x = 'petal_length', y = 'petal_width', hue = 'cluster')

In [None]:
cluster(iris, 'petal_length', 'petal_width', 3)

### 2.) Use the techniques discussed in this lesson, as well as the insights gained from the exploration exercise to perform clustering on the mall customers dataset. Be sure to visualize your results!

In [None]:
mall = w.get_mall_data()

In [None]:
mall.head()

In [None]:
mall.dtypes

In [None]:
# get dummies for gender column
dummy_df = pd.get_dummies(mall.gender, drop_first=True)
mall = pd.concat([mall, dummy_df], axis=1).drop(columns = ['gender'])
mall.rename(columns= {'Male': 'is_male'}, inplace = True)


In [None]:
mall.head()

In [None]:
inertia(mall, 'age', 'spending_score', 2, 6)

In [None]:
X = mall[['age', 'spending_score']]
kmeans = KMeans(n_clusters = 4)
kmeans.fit(X)
kmeans.predict(X)

In [None]:
mall['cluster'] = kmeans.predict(X)

In [None]:
mall.head()

In [None]:
cluster(mall, 'age', 'spending_score', 4)

In [None]:
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(20, 9))
    plt.subplot(1, 3, 1)
    sns.scatterplot(x = 'age', y = 'spending_score', data=mall)
    plt.title('Original Data')
    plt.subplot(1, 3, 2)
    sns.scatterplot(x = 'age', y = 'spending_score', hue='annual_income', data=mall)
    plt.title('Clustered Data')
    plt.subplot(1, 3, 3)
    sns.scatterplot(x = 'age', y = 'spending_score', hue='cluster', data=mall)
    plt.title('Original Data with Cluster Labels')