In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_style("ticks")

In [None]:
# Load the customer table; the path is relative to the notebook's working directory.
df = pd.read_csv("Mall_Customers.csv")

In [None]:
df.head()

In [None]:
# Map the two verbose survey headers to short identifiers that are easier to type.
rename_cols = {
    "Annual Income (k$)": "Income",
    "Spending Score (1-100)": "SpendingScore",
}
df = df.rename(rename_cols, axis="columns")

In [None]:
df.shape

In [None]:
df.dtypes

# Exploratory Data Analysis (EDA)

In [None]:
df["Gender"].value_counts()

In [None]:
df["Gender"].unique()

In [None]:
df.describe()

In [None]:
# One panel per numeric feature: a density-normalised histogram with a KDE overlay.
# sns.histplot replaces sns.distplot, which seaborn deprecated in v0.11; the
# stat/kde_kws settings reproduce distplot's default appearance.
panels = [
    ("Age", "Age Distribution"),
    ("Income", "Income Distribution"),
    ("SpendingScore", "Spending Score Distribution"),
]
fig, ax = plt.subplots(nrows=1, ncols=len(panels), figsize=(18, 6))

for axis, (column, title) in zip(ax, panels):
    sns.histplot(df[column], kde=True, stat="density", kde_kws={"cut": 3}, ax=axis)
    axis.set_title(title)
plt.show()

In [None]:
df["Age"].hist()

In [None]:
# Income against age on a 10x8 canvas, one colour per gender.
income_age_ax = plt.figure(figsize=(10, 8)).gca()
sns.scatterplot(x="Income", y="Age", hue="Gender", data=df, ax=income_age_ax);

In [None]:
def scatter_plot_2d(df, x, y, hue):
    """Draw a 10x8 seaborn scatter plot of two columns of ``df``.

    ``x`` and ``y`` are forwarded to seaborn as the axis variables. ``hue`` is
    forwarded as both the colour and the marker-style grouping, so each group
    gets its own colour and its own marker shape. Nothing is returned.
    """
    _, axes = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=x, y=y, hue=hue, style=hue, data=df, ax=axes)

In [None]:
scatter_plot_2d(df, "Age", "SpendingScore", "Gender")

In [None]:
scatter_plot_2d(df, "Income", "SpendingScore", "Gender")

# K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
X = df[["Age", "SpendingScore"]]
# model = KMeans(n_clusters=2, init='k-means++')
# init='random' draws the starting centroids at random, so pin random_state to
# make the labels reproducible on re-run, and set n_init explicitly (as the
# later fits in this notebook do) instead of relying on the version-dependent default.
model = KMeans(n_clusters=2, init="random", n_init=10, random_state=42)
clusters = model.fit(X)

In [None]:
X.head()

In [None]:
# Ask the fitted model for a cluster index for every row of X, then display the array.
cluster_numbers = clusters.predict(X)
cluster_numbers

In [None]:
# Store the labels on df as column 'new' and show how many rows fall in each cluster.
# NOTE(review): this mutates df in place, so frames derived from df later
# (e.g. via drop) will also carry the 'new' column.
df['new'] = cluster_numbers
df['new'].value_counts()

In [None]:
df[df['new']==1].head()

In [None]:
# NOTE(review): scatter_plot_2d is already defined with equivalent behaviour in
# the EDA section; this redefinition shadows it and the cell could be removed.
def scatter_plot_2d(df, x, y, hue):
    """Draw a 10x8 seaborn scatter plot of two columns of ``df``.

    ``x`` and ``y`` are forwarded to seaborn as the axis variables. ``hue`` is
    forwarded as both the colour and the marker-style grouping, so each group
    gets its own colour and its own marker shape. Nothing is returned.
    """
    _, axes = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=x, y=y, hue=hue, style=hue, data=df, ax=axes)

In [None]:
scatter_plot_2d(df, "Age", "SpendingScore", cluster_numbers)

In [None]:
scatter_plot_2d(df, "Income", "Age", cluster_numbers)

In [None]:
scatter_plot_2d(df, "Income", "SpendingScore", cluster_numbers)

In [None]:
# Age vs. SpendingScore with colour taken from the cluster labels and marker
# shape taken from Gender, so both groupings are visible in one figure.
sns.scatterplot(data=df, x='Age', y='SpendingScore', hue=cluster_numbers, style='Gender')
plt.show()

## Determining the optimal number of clusters

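We use an elbow plot: fit K-Means for a range of cluster counts, plot the inertia of each fit, and look for the point where the curve stops dropping sharply.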

In [None]:
X1 = df[["Age", "SpendingScore"]]
# init="random" makes the fit stochastic; a fixed random_state keeps the
# inertia reported for this model identical on every run.
clusters1 = KMeans(n_clusters=2, n_init=10, init="random", random_state=42).fit(X1)

In [None]:
clusters1.inertia_

In [None]:
# Fit K-Means for n = 1..9 clusters on the same two features and record each inertia.
X_n = df[["Age", "SpendingScore"]]  # identical for every n, so select it once
results = []
for i in range(1, 10):
    # random_state keeps the curve reproducible despite the random initialisation
    clusters_n = KMeans(n_clusters=i, n_init=10, init="random", random_state=42).fit(X_n)
    results.append({"n": i, "inertia": clusters_n.inertia_})
elbow_plot = pd.DataFrame(results)

In [None]:
elbow_plot

In [None]:
results

In [None]:
# Inertia against the number of clusters, with readable axis labels.
plt.figure(figsize=(12, 8))
elbow_ax = sns.lineplot(data=elbow_plot, x="n", y="inertia")
elbow_ax.set(xlabel="Number of clusters (n)", ylabel="Inertia")
# Trailing semicolon stops the Text object returned by plt.title from being echoed.
plt.title("Optimal Number of Clusters");

In [None]:
def plot_elbow(df, dims, max_clusters=9, random_state=None):
    """Plot K-Means inertia against the number of clusters for ``df[dims]``.

    Parameters
    ----------
    df : DataFrame holding the feature columns.
    dims : list of column names to cluster on.
    max_clusters : largest cluster count to try; fits are run for
        1..max_clusters (default 9, the range originally hard-coded).
    random_state : forwarded to ``KMeans``; pass an int for a reproducible
        curve. The default ``None`` leaves the initialisation unseeded.

    Returns
    -------
    None. A 12x8 line plot is drawn on a new matplotlib figure.
    """
    X_n = df[dims]  # same features for every fit, so select once
    results = []
    for i in range(1, max_clusters + 1):
        clusters_n = KMeans(
            n_clusters=i, n_init=10, init="random", random_state=random_state
        ).fit(X_n)
        # Record the cluster count actually fitted; this was previously i+1,
        # which shifted the whole curve one step to the right.
        results.append({"n": i, "inertia": clusters_n.inertia_})
    elbow_plot = pd.DataFrame(results)

    plt.figure(figsize=(12, 8))
    sns.lineplot(data=elbow_plot, x="n", y="inertia")
    plt.title("Optimal Number of Clusters")

In [None]:
plot_elbow(df, ["Age", "SpendingScore", 'Income'])

In [None]:
def cluster_2d(df, x, y, n_clusters):
    """Cluster ``df`` on columns ``x`` and ``y`` and plot the result.

    A K-Means model with ``n_clusters`` clusters (random initialisation,
    ``n_init=10``) is fitted on the two columns; the labels it predicts for
    those same rows are handed to ``scatter_plot_2d`` as the grouping.
    """
    features = df[[x, y]]
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, init="random")
    kmeans.fit(features)
    labels = kmeans.predict(features)
    scatter_plot_2d(df, x, y, labels)

In [None]:
cluster_2d(df, "Age", "SpendingScore", 4)

In [None]:
# Fit a four-cluster model on Age/SpendingScore and label every row with it.
age_score = df[["Age", "SpendingScore"]]  # select once; used for both fit and predict
# random_state pins the random initialisation so the labels are stable across runs.
model = KMeans(n_clusters=4, n_init=10, init="random", random_state=42)
clusters = model.fit(age_score)
cluster_numbers = clusters.predict(age_score)
cluster_numbers

In [None]:
# Attach the current cluster labels and drop the identifier column in one chain.
# The second step used to restart from df, which silently discarded `cluster`.
X2 = df.assign(cluster=cluster_numbers).drop(columns="CustomerID")

In [None]:
# Reassign rather than mutate in place; errors="ignore" keeps the cell safe to
# re-run once Gender has already been removed.
X2 = X2.drop(columns="Gender", errors="ignore")

In [None]:
X2.head()

In [None]:
# Refit the existing `model` object, this time on three features.
# NOTE(review): cluster_numbers computed earlier still come from the
# two-feature fit and are not refreshed here.
clusters = model.fit(df[["Age", "SpendingScore",'Income']])

In [None]:
import plotly
import plotly.graph_objs as go

In [None]:
# scatter = go.Scatter3d(x=X2["Age"],
#                      y=X2["Income"],
#                      z=X2["SpendingScore"],
#                      mode="markers",
#                      marker={"color": clusters.predict(X2),
#                              "size": 8,
#                              "line": {"color": clusters.predict(X2),
#                                       "width":12}});

# layout = go.Layout(title="Clusters",
#                    scene={"xaxis": {"title": "Age"},
#                           "yaxis": {"title": "Income"},
#                           "zaxis": {"title": "SpendingScore"}});

# fig = go.Figure(data=scatter, layout=layout);
# plotly.offline.iplot(fig)