In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import All Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,StandardScaler,MinMaxScaler
from sklearn.cluster import KMeans,DBSCAN

In [None]:
from yellowbrick.cluster import KElbow,KElbowVisualizer,SilhouetteVisualizer,ClusteringScoreVisualizer
from sklearn.metrics import silhouette_score

## Load Data

In [None]:
# Read the Mall Customers CSV from the Kaggle input directory and display it.
csv_path = '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
df = pd.read_csv(csv_path)
df

## Exploratory Data Analysis

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Remove the CustomerID column from the working frame.
# errors='ignore' keeps this cell re-runnable: executing it a second time no
# longer raises KeyError once the column has already been dropped.
df = df.drop(columns='CustomerID', errors='ignore')
df

In [None]:
# Count how many rows carry each Gender value.
df.Gender.value_counts()

In [None]:
# Pie chart of the Gender counts, with each slice labelled by its percentage.
gender_counts = df['Gender'].value_counts()
ax = gender_counts.plot.pie(autopct="%.1f%%")
ax.set_title('Gender Distributions Data')
ax.legend()
plt.show()

## Checking Whether the Data Is Normally Distributed

In [None]:
# Kernel density estimate of the Age column.
sns.kdeplot(data=df, x='Age')
plt.show()

In [None]:
# Kernel density estimate of the Annual Income (k$) column.
sns.kdeplot(data=df, x='Annual Income (k$)')
plt.show()

In [None]:
# Kernel density estimate of the Spending Score (1-100) column.
sns.kdeplot(data=df, x='Spending Score (1-100)')
plt.show()

In [None]:
# Encode Gender numerically: Female -> 0, Male -> 1.
# The explicit cast guarantees an integer column. Recent pandas versions
# deprecate the automatic downcast of the object-dtype result of `replace`,
# which would otherwise leave Gender as object dtype for the numeric plots
# and the scaler that follow. A value outside the mapping now fails loudly here.
df['Gender'] = df['Gender'].replace({'Female':0,'Male':1}).astype('int64')

In [None]:
# Preview the first rows after the Gender replacement above.
df.head()

### KDE Plot

In [None]:
# Draw one KDE panel per column of df.
# The grid height is derived from the number of columns, so the cell keeps
# working if df gains columns later (a fixed 2x2 grid raises ValueError as soon
# as a fifth panel is requested).
GRID_COLS = 2
grid_rows = max(1, -(-len(df.columns) // GRID_COLS))  # ceiling division
fig, axes = plt.subplots(grid_rows, GRID_COLS, figsize=(12,10), squeeze=False)
panels = axes.ravel()
for ax, col in zip(panels, df.columns):
    sns.kdeplot(df[col], ax=ax)
# Hide any panels left over when the columns do not fill the grid.
for ax in panels[len(df.columns):]:
    ax.set_visible(False)
plt.show()

### Histogram Plot

In [None]:
# Draw one histogram panel per column of df.
# The grid height is derived from the number of columns, so the cell keeps
# working if df gains columns later (a fixed 2x2 grid raises ValueError as soon
# as a fifth panel is requested).
GRID_COLS = 2
grid_rows = max(1, -(-len(df.columns) // GRID_COLS))  # ceiling division
fig, axes = plt.subplots(grid_rows, GRID_COLS, figsize=(10,10), squeeze=False)
panels = axes.ravel()
for ax, col in zip(panels, df.columns):
    sns.histplot(df[col], ax=ax)
# Hide any panels left over when the columns do not fill the grid.
for ax in panels[len(df.columns):]:
    ax.set_visible(False)
plt.show()

## Outlier Detection

In [None]:
# Draw one box plot per column of df to inspect outliers.
# The grid height is derived from the number of columns, so the cell keeps
# working if df gains columns later (a fixed 2x2 grid raises ValueError as soon
# as a fifth panel is requested).
GRID_COLS = 2
grid_rows = max(1, -(-len(df.columns) // GRID_COLS))  # ceiling division
fig, axes = plt.subplots(grid_rows, GRID_COLS, figsize=(10,10), squeeze=False)
panels = axes.ravel()
for ax, col in zip(panels, df.columns):
    df[[col]].boxplot(ax=ax)
# Hide any panels left over when the columns do not fill the grid.
for ax in panels[len(df.columns):]:
    ax.set_visible(False)
plt.show()

## Scale The Data

In [None]:
# Preview the first rows of df right before it is passed to the scaler.
df.head()

In [None]:
# Standardise every column of df: fit the scaler first, then apply it,
# and show the shape of the resulting array.
scaler = StandardScaler().fit(df)
df_scaled = scaler.transform(df)
df_scaled.shape

In [None]:
# Wrap the scaled array in a DataFrame that reuses df's column labels.
df_scaled_data = pd.DataFrame(
    data=df_scaled,
    columns=df.columns,
)
df_scaled_data

## Kmeans Model

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fit K-Means for k = 1..14 and record each model's inertia for the elbow plot.
# random_state=1 (the same seed the KElbowVisualizer cells use) pins the
# centroid initialisation so the curve is identical on every run.
inertia_data = []
for k in range(1,15):
    model = KMeans(n_clusters=k, random_state=1)
    model.fit(df_scaled_data)
    inertia_data.append(model.inertia_)

In [None]:
# Display the inertia values collected by the loop above (one per k in range(1,15)).
inertia_data

In [None]:
# Elbow curve: model inertia against the number of clusters tried.
cluster_counts = range(1,15)
fig, ax = plt.subplots()
ax.plot(
    cluster_counts,
    inertia_data,
    color='green',
    marker='o',
    linestyle='dashed',
    linewidth=2,
    markersize=12,
)
ax.set_title("Elbow Method")
ax.set_xlabel("Number Of Clusters")
ax.set_ylabel("Inertia Of Model")
plt.show()

## KElbowVisualizer

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Elbow search with yellowbrick's default metric, k given as the tuple (1,15).
model = KMeans(random_state=1)
visualizer = KElbowVisualizer(model, k=(1,15)).fit(df_scaled_data)
visualizer.show()
plt.show()

In [None]:
from yellowbrick.cluster import KElbowVisualizer

# Elbow search scored with the silhouette metric, k given as the tuple (2,15).
model = KMeans(random_state=1)
visualizer = KElbowVisualizer(model, k=(2,15),metric='silhouette').fit(df_scaled_data)
visualizer.show()
plt.show()

### SilhouetteVisualizer

### Metric: distortion (k = 5)

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
# Silhouette plot for a 5-cluster K-Means model.
# random_state=1 (as in the KElbowVisualizer cells) fixes the initialisation so
# the clusters, and therefore the plot, are the same on every run.
model = KMeans(n_clusters=5, random_state=1)
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
visualizer.fit(df_scaled_data)
visualizer.show()
plt.show()

### Metric: silhouette (k = 10)

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
# Silhouette plot for a 10-cluster K-Means model.
# random_state=1 (as in the KElbowVisualizer cells) fixes the initialisation so
# the clusters, and therefore the plot, are the same on every run.
model = KMeans(n_clusters=10, random_state=1)
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
visualizer.fit(df_scaled_data)
visualizer.show()
plt.show()

## Silhouette Score

In [None]:
# Calculate silhouette score for different values of k
# Average silhouette score for k = 2..14 (the loop starts at 2, not 1).
# random_state=1 (as in the KElbowVisualizer cells) makes the scores
# reproducible from run to run.
silhouette_scores = []
for k in range(2, 15):
    kmeans = KMeans(n_clusters=k, random_state=1)
    cluster_labels = kmeans.fit_predict(df_scaled_data)
    silhouette_avg = silhouette_score(df_scaled_data, cluster_labels)
    silhouette_scores.append(silhouette_avg)

In [None]:
# Display the silhouette scores collected above (one per k in range(2, 15)).
silhouette_scores

In [None]:
# Plot the average silhouette score for each k.
# The number of clusters goes on the x-axis and the score on the y-axis, the
# same orientation as the elbow plot, so the two curves read the same way.
plt.plot(range(2,15),silhouette_scores,color='green', marker='o', linestyle='dashed',linewidth=2, markersize=12)
plt.title("Silhouette Score and Cluster Distributions")
plt.xlabel("No. Of Cluster")
plt.ylabel("Silhouette Score")
plt.show()

## Distribution of Predicted Cluster Labels

In [None]:
# Final 5-cluster model and its predicted label for every row.
# random_state=1 keeps the label ids stable between runs; without it the ids,
# and with them the colours assigned in the scatter plot, can change each run.
m = KMeans(n_clusters=5, random_state=1)
y_pred = m.fit_predict(df_scaled_data)

In [None]:
# Display the label array returned by fit_predict for the 5-cluster model.
y_pred    # predicted cluster labels for K = 5

In [None]:
# Final 10-cluster model and its predicted label for every row.
# random_state=1 keeps the label ids stable between runs.
m1 = KMeans(n_clusters=10, random_state=1)
y_pred1 = m1.fit_predict(df_scaled_data)
y_pred1    # Predicted Labels if K = 10

In [None]:
# Attach both label arrays to df as new columns, then display the frame.
label_columns = (
    ('K_5_y_pred', y_pred),
    ('K_10_y_pred', y_pred1),
)
for column_name, labels in label_columns:
    df[column_name] = labels
df

In [None]:
# Age vs. annual income, coloured by the 5-cluster labels with a fixed palette.
sns.scatterplot(
    data=df,
    x='Age',
    y='Annual Income (k$)',
    hue='K_5_y_pred',
    palette=['Red','Blue','Green','Yellow','Violet'],
)
plt.title(f"No. Of Cluster:{5} Distributions Labels")
plt.show()

In [None]:
# Age vs. annual income, coloured by the 10-cluster labels (seaborn's default palette).
sns.scatterplot(
    data=df,
    x='Age',
    y='Annual Income (k$)',
    hue='K_10_y_pred',
)
plt.title(f"No. Of Cluster:{10} Distributions Labels")
plt.show()