# Import Modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from scipy import stats
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, adjusted_rand_score
from sklearn.cluster import KMeans,DBSCAN,MeanShift,SpectralClustering,AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.impute import SimpleImputer, KNNImputer
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTENC,SMOTE
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import matplotlib.cm as cm
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.metrics import silhouette_samples, silhouette_score
import tensorflow.keras

# Data Ingestion

In [None]:
# Load the raw customer table from a CSV file in the working directory and display it.
customer_df=pd.read_csv(r"customer_segmentation.csv")
customer_df

# EDA

### Data Info

In [None]:
# Print column names, dtypes and non-null counts.
customer_df.info()

In [None]:
# DataFrame.duplicated() flags every row that repeats an earlier row; summing
# that boolean mask gives ONE scalar count for the whole frame. The previous
# `duplicate[duplicate>0]` was the per-column filter idiom (valid for
# isnull().sum()) misapplied to a scalar, so the count is displayed directly.
duplicate = customer_df.duplicated().sum()
duplicate

### Statistical Info

In [None]:
# Summary statistics for the numeric columns.
customer_df.describe()

In [None]:
# Summary statistics (count / unique / top / freq) for the object-typed columns.
customer_df.describe(include="object")

In [None]:
# Convert the Dt_Customer column to pandas datetimes.
# NOTE(review): no explicit format/dayfirst is passed — confirm the day/month
# order of the raw strings is inferred the way you expect.
customer_df['Dt_Customer'] = pd.to_datetime(customer_df['Dt_Customer'])
# customer_df.info()

### Analysis

### Univariate

In [None]:
# One histogram (with KDE overlay) per numeric column, ID excluded, on a 5x5 grid.
numeric_features = customer_df.select_dtypes(include="number").drop("ID", axis=1)
fig, axes = plt.subplots(5, 5, figsize=(15, 15))
axes = axes.flatten()
for i, col in enumerate(numeric_features.columns):
    panel = axes[i]
    sns.histplot(numeric_features[col], ax=panel, kde=True)
    panel.set_title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of the 20 most frequent values for every object-typed column.
categorical_cols = customer_df.select_dtypes(include="object").columns
fig, axes = plt.subplots(1, 3, figsize=(10, 3))
axes = axes.flatten()
for i, col in enumerate(categorical_cols):
    top_counts = customer_df[col].value_counts(ascending=False).head(20)
    top_counts.plot(kind='bar', ax=axes[i])
    axes[i].set_title(f'Count of {col}')
plt.tight_layout()
plt.show()

### Bivariate

In [None]:
# Scatter Income against each spending / purchase-count column, one panel each.
income_targets = [
    'MntMeatProducts', 'MntFishProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'MntSweetProducts', 'MntWines', 'MntFruits',
]
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.flatten()
for panel, target in zip(axes, income_targets):
    sns.scatterplot(data=customer_df, x="Income", y=target, ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# Scatter Z_CostContact against nine response / purchase columns, one panel each.
cost_contact_targets = [
    'NumWebVisitsMonth', 'NumStorePurchases', 'Response',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'MntSweetProducts', 'MntWines', 'MntFruits',
]
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.flatten()
for panel, target in zip(axes, cost_contact_targets):
    sns.scatterplot(data=customer_df, x="Z_CostContact", y=target, ax=panel)
plt.tight_layout()
plt.show()

### Multivariate

In [None]:
# Same Income scatter grid as the bivariate view, with points coloured by Education.
income_targets = [
    'MntMeatProducts', 'MntFishProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'MntSweetProducts', 'MntWines', 'MntFruits',
]
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.flatten()
for panel, target in zip(axes, income_targets):
    sns.scatterplot(data=customer_df, x="Income", y=target, hue="Education", ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# Income scatter grid with points coloured by Marital_Status.
income_targets = [
    'MntMeatProducts', 'MntFishProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'MntSweetProducts', 'MntWines', 'MntFruits',
]
fig, axes = plt.subplots(3, 3, figsize=(15, 15))
axes = axes.flatten()
for panel, target in zip(axes, income_targets):
    sns.scatterplot(data=customer_df, x="Income", y=target, hue="Marital_Status", ax=panel)
plt.tight_layout()
plt.show()

In [None]:
# Annotated correlation heatmap over all numeric columns.
corr_matrix = customer_df.select_dtypes(include="number").corr()
plt.figure(figsize=(30, 20))
sns.heatmap(corr_matrix, annot=True)
plt.show()

### Outlier Detection

In [None]:
# One boxplot per numeric column (ID excluded) to eyeball outliers.
numeric_features = customer_df.select_dtypes(include="number").drop("ID", axis=1)
fig, axes = plt.subplots(5, 5, figsize=(20, 10))
axes = axes.flatten()
for i, col in enumerate(numeric_features.columns):
    panel = axes[i]
    sns.boxplot(numeric_features[col], ax=panel)
    panel.set_title(f'Outlier Detection of {col}')
plt.tight_layout()
plt.show()

### Null Value Detection

In [None]:
# Count missing values per column and show only the columns that have any.
# The check runs on customer_df: customer_df_reduce is only created later in
# the Feature Selection section, so referencing it here raised NameError on a
# top-to-bottom run.
null = customer_df.isnull().sum()
null[null > 0]

### Preprocessing

In [None]:
# Remove the ID column in place. Re-running this cell raises KeyError because
# the column is already gone.
customer_df.drop(["ID"],axis=1,inplace=True)

In [None]:
# Remove Z_Revenue and Z_CostContact in place; later cells must not reference them.
customer_df.drop(["Z_Revenue","Z_CostContact"],axis=1,inplace=True)

# Feature Engineering

### Outlier Handling

In [None]:
def handle_outlier(df,col):
    """Cap ``df[col]`` in place at the 1.5 * IQR fences.

    Values below ``Q1 - 1.5*IQR`` or above ``Q3 + 1.5*IQR`` are replaced by the
    respective fence; everything else is left untouched. The frame is mutated
    and nothing is returned.
    """
    first_quartile, third_quartile = (df[col].quantile(q) for q in (0.25, 0.75))
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread
    df[col] = df[col].clip(lower_fence, upper_fence)

In [None]:
# Clip every numeric column except the 0/1 campaign, complaint and response flags.
binary_flags = ["AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5", "AcceptedCmp1",
                "AcceptedCmp2", "Complain", "Response"]
columns_to_clip = customer_df.drop(binary_flags, axis=1).select_dtypes(include="number").columns
for col in columns_to_clip:
    handle_outlier(customer_df, col)

### Null Value Handling

In [None]:
# Impute missing Income from the 20 nearest customers.
# KNNImputer measures distance on the *other* features of each row. Fitting it
# on the Income column alone leaves no feature to measure distance with, so
# every gap silently fell back to the column mean. All numeric columns are
# supplied as context and only the imputed Income is written back.
# NOTE(review): assumes no numeric column is entirely NaN (such columns are
# dropped from the imputer output and would shift the positional lookup), and
# distances are computed on unscaled features — confirm this is acceptable.
knn_imputer = KNNImputer(n_neighbors=20)
numeric_cols = customer_df.select_dtypes(include="number").columns
imputed_numeric = knn_imputer.fit_transform(customer_df[numeric_cols])
customer_df["Income"] = imputed_numeric[:, numeric_cols.get_loc("Income")]

### Data Transformation

In [None]:
# Re-plot the numeric distributions after clipping and imputation to spot skew.
numeric_features = customer_df.select_dtypes(include="number")
fig, axes = plt.subplots(5, 5, figsize=(15, 15))
axes = axes.flatten()
for i, col in enumerate(numeric_features.columns):
    panel = axes[i]
    sns.histplot(numeric_features[col], ax=panel, kde=True)
    panel.set_title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

##### Right-Skew

In [None]:
# Right-skewed spending columns that are candidates for a normalising transform.
# Z_CostContact and Z_Revenue were dropped from customer_df during
# preprocessing, so they are no longer listed (indexing with them would raise
# KeyError).
right_skew = ["MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts",
              "MntSweetProducts", "MntGoldProds"]
# Transformers reused by the comparison cells below.
pt = PowerTransformer(method="yeo-johnson")
quantile = QuantileTransformer(output_distribution='normal', random_state=0)

In [None]:
# Compare candidate normalising transforms for MntWines side by side.
# sns.distplot is deprecated (scheduled for removal from seaborn), so each panel
# uses histplot with a KDE overlay on a density scale, which is its documented
# replacement.
fig, axes = plt.subplots(2, 4, figsize=(10, 5))
# Box-Cox needs strictly positive input, hence the +1 shift.
box_cox_dfz, _ = stats.boxcox(customer_df["MntWines"] + 1)
yoe_dfz = pt.fit_transform(customer_df[["MntWines"]])
quantile_dfz = quantile.fit_transform(customer_df[["MntWines"]])
panels = [
    ('Original of MntWines', customer_df["MntWines"]),
    ('Log', np.log(customer_df["MntWines"] + 0.0001)),
    ('Square Root', np.sqrt(customer_df["MntWines"])),
    ('Cube Root', np.cbrt(customer_df["MntWines"])),
    ('Reciprocal', 1 / (customer_df["MntWines"] + 1)),
    ('Box-Cox', box_cox_dfz),
    ('Yeo-Johnson', yoe_dfz),
    ('Quantile', quantile_dfz),
]
for panel, (title, values) in zip(axes.flatten(), panels):
    # ravel(): the sklearn transformers return (n, 1) arrays.
    sns.histplot(x=np.ravel(values), kde=True, stat="density", ax=panel).set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Replace MntWines with its cube root. Not idempotent: re-running this cell
# takes the root again.
customer_df["MntWines"]=np.cbrt(customer_df["MntWines"])

In [None]:
# Compare candidate normalising transforms for MntFruits side by side.
# sns.distplot is deprecated (scheduled for removal from seaborn), so each panel
# uses histplot with a KDE overlay on a density scale, which is its documented
# replacement.
fig, axes = plt.subplots(2, 4, figsize=(10, 5))
# Box-Cox needs strictly positive input, hence the +1 shift.
box_cox_dfz, _ = stats.boxcox(customer_df["MntFruits"] + 1)
yoe_dfz = pt.fit_transform(customer_df[["MntFruits"]])
quantile_dfz = quantile.fit_transform(customer_df[["MntFruits"]])
panels = [
    ('Original of MntFruits', customer_df["MntFruits"]),
    ('Log', np.log(customer_df["MntFruits"] + 0.0001)),
    ('Square Root', np.sqrt(customer_df["MntFruits"])),
    ('Cube Root', np.cbrt(customer_df["MntFruits"])),
    ('Reciprocal', 1 / (customer_df["MntFruits"] + 1)),
    ('Box-Cox', box_cox_dfz),
    ('Yeo-Johnson', yoe_dfz),
    ('Quantile', quantile_dfz),
]
for panel, (title, values) in zip(axes.flatten(), panels):
    # ravel(): the sklearn transformers return (n, 1) arrays.
    sns.histplot(x=np.ravel(values), kde=True, stat="density", ax=panel).set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Overwrite MntFruits with its Box-Cox transform (input shifted by +1).
# The fitted lambda is discarded, and re-running the cell transforms the
# already-transformed values again.
box_cox_dfz,_=stats.boxcox(customer_df["MntFruits"]+1)
customer_df["MntFruits"]=box_cox_dfz

In [None]:
# Compare candidate normalising transforms for MntMeatProducts side by side.
# sns.distplot is deprecated (scheduled for removal from seaborn), so each panel
# uses histplot with a KDE overlay on a density scale, which is its documented
# replacement.
fig, axes = plt.subplots(2, 4, figsize=(10, 5))
# Box-Cox needs strictly positive input, hence the +1 shift.
box_cox_dfz, _ = stats.boxcox(customer_df["MntMeatProducts"] + 1)
yoe_dfz = pt.fit_transform(customer_df[["MntMeatProducts"]])
quantile_dfz = quantile.fit_transform(customer_df[["MntMeatProducts"]])
panels = [
    ('Original of MntMeatProducts', customer_df["MntMeatProducts"]),
    ('Log', np.log(customer_df["MntMeatProducts"] + 0.0001)),
    ('Square Root', np.sqrt(customer_df["MntMeatProducts"])),
    ('Cube Root', np.cbrt(customer_df["MntMeatProducts"])),
    ('Reciprocal', 1 / (customer_df["MntMeatProducts"] + 1)),
    ('Box-Cox', box_cox_dfz),
    ('Yeo-Johnson', yoe_dfz),
    ('Quantile', quantile_dfz),
]
for panel, (title, values) in zip(axes.flatten(), panels):
    # ravel(): the sklearn transformers return (n, 1) arrays.
    sns.histplot(x=np.ravel(values), kde=True, stat="density", ax=panel).set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Overwrite MntMeatProducts with its Box-Cox transform (input shifted by +1).
# The fitted lambda is discarded, and re-running the cell transforms the
# already-transformed values again.
box_cox_dfz,_=stats.boxcox(customer_df["MntMeatProducts"]+1)
customer_df["MntMeatProducts"]=box_cox_dfz

In [None]:
# Compare candidate normalising transforms for MntFishProducts side by side.
# sns.distplot is deprecated (scheduled for removal from seaborn), so each panel
# uses histplot with a KDE overlay on a density scale, which is its documented
# replacement.
fig, axes = plt.subplots(2, 4, figsize=(10, 5))
# Box-Cox needs strictly positive input, hence the +1 shift.
box_cox_dfz, _ = stats.boxcox(customer_df["MntFishProducts"] + 1)
yoe_dfz = pt.fit_transform(customer_df[["MntFishProducts"]])
quantile_dfz = quantile.fit_transform(customer_df[["MntFishProducts"]])
panels = [
    ('Original of MntFishProducts', customer_df["MntFishProducts"]),
    ('Log', np.log(customer_df["MntFishProducts"] + 0.0001)),
    ('Square Root', np.sqrt(customer_df["MntFishProducts"])),
    ('Cube Root', np.cbrt(customer_df["MntFishProducts"])),
    ('Reciprocal', 1 / (customer_df["MntFishProducts"] + 1)),
    ('Box-Cox', box_cox_dfz),
    ('Yeo-Johnson', yoe_dfz),
    ('Quantile', quantile_dfz),
]
for panel, (title, values) in zip(axes.flatten(), panels):
    # ravel(): the sklearn transformers return (n, 1) arrays.
    sns.histplot(x=np.ravel(values), kde=True, stat="density", ax=panel).set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Overwrite MntFishProducts with its Box-Cox transform (input shifted by +1).
# The fitted lambda is discarded, and re-running the cell transforms the
# already-transformed values again.
box_cox_dfz,_=stats.boxcox(customer_df["MntFishProducts"]+1)
customer_df["MntFishProducts"]=box_cox_dfz

In [None]:
# Compare candidate normalising transforms for MntSweetProducts side by side.
# sns.distplot is deprecated (scheduled for removal from seaborn), so each panel
# uses histplot with a KDE overlay on a density scale, which is its documented
# replacement.
fig, axes = plt.subplots(2, 4, figsize=(10, 5))
# Box-Cox needs strictly positive input, hence the +1 shift.
box_cox_dfz, _ = stats.boxcox(customer_df["MntSweetProducts"] + 1)
yoe_dfz = pt.fit_transform(customer_df[["MntSweetProducts"]])
quantile_dfz = quantile.fit_transform(customer_df[["MntSweetProducts"]])
panels = [
    ('Original of MntSweetProducts', customer_df["MntSweetProducts"]),
    ('Log', np.log(customer_df["MntSweetProducts"] + 0.0001)),
    ('Square Root', np.sqrt(customer_df["MntSweetProducts"])),
    ('Cube Root', np.cbrt(customer_df["MntSweetProducts"])),
    ('Reciprocal', 1 / (customer_df["MntSweetProducts"] + 1)),
    ('Box-Cox', box_cox_dfz),
    ('Yeo-Johnson', yoe_dfz),
    ('Quantile', quantile_dfz),
]
for panel, (title, values) in zip(axes.flatten(), panels):
    # ravel(): the sklearn transformers return (n, 1) arrays.
    sns.histplot(x=np.ravel(values), kde=True, stat="density", ax=panel).set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Overwrite MntSweetProducts with its Box-Cox transform (input shifted by +1).
# The fitted lambda is discarded, and re-running the cell transforms the
# already-transformed values again.
box_cox_dfz,_=stats.boxcox(customer_df["MntSweetProducts"]+1)
customer_df["MntSweetProducts"]=box_cox_dfz

In [None]:
# Compare candidate normalising transforms for MntGoldProds side by side.
# sns.distplot is deprecated (scheduled for removal from seaborn), so each panel
# uses histplot with a KDE overlay on a density scale, which is its documented
# replacement.
fig, axes = plt.subplots(2, 4, figsize=(10, 5))
# Box-Cox needs strictly positive input, hence the +1 shift.
box_cox_dfz, _ = stats.boxcox(customer_df["MntGoldProds"] + 1)
yoe_dfz = pt.fit_transform(customer_df[["MntGoldProds"]])
quantile_dfz = quantile.fit_transform(customer_df[["MntGoldProds"]])
panels = [
    ('Original of MntGoldProds', customer_df["MntGoldProds"]),
    ('Log', np.log(customer_df["MntGoldProds"] + 0.0001)),
    ('Square Root', np.sqrt(customer_df["MntGoldProds"])),
    ('Cube Root', np.cbrt(customer_df["MntGoldProds"])),
    ('Reciprocal', 1 / (customer_df["MntGoldProds"] + 1)),
    ('Box-Cox', box_cox_dfz),
    ('Yeo-Johnson', yoe_dfz),
    ('Quantile', quantile_dfz),
]
for panel, (title, values) in zip(axes.flatten(), panels):
    # ravel(): the sklearn transformers return (n, 1) arrays.
    sns.histplot(x=np.ravel(values), kde=True, stat="density", ax=panel).set_title(title)
plt.tight_layout()
plt.show()

In [None]:
# Overwrite MntGoldProds with its Box-Cox transform (input shifted by +1).
# The fitted lambda is discarded, and re-running the cell transforms the
# already-transformed values again.
box_cox_dfz,_=stats.boxcox(customer_df["MntGoldProds"]+1)
customer_df["MntGoldProds"]=box_cox_dfz

### Encoding

##### Label Encoding

In [None]:
# Replace Marital_Status strings with integer codes.
# NOTE(review): the integer codes impose an ordering on what looks like a
# nominal category; distance-based clustering will treat it as ordinal —
# confirm this is intended.
le=LabelEncoder()
customer_df.Marital_Status=le.fit_transform(customer_df.Marital_Status)

##### Ordinal Encoding

In [None]:
# Encode Education with the explicit rank order declared below.
# The cell previously built this OrdinalEncoder but then called
# le.fit_transform, which ignored the declared order and also re-fitted the
# LabelEncoder used for Marital_Status.
# NOTE(review): the category order is kept exactly as originally written —
# confirm that 'Graduation' should rank below '2n Cycle'.
oe = OrdinalEncoder(categories=[['Basic','Graduation', '2n Cycle', 'Master', 'PhD']])
# OrdinalEncoder expects a 2-D input and returns an (n, 1) array.
customer_df["Education"] = oe.fit_transform(customer_df[["Education"]]).ravel()

# Feature Selection

In [None]:
# Working copy for feature selection, without the datetime column Dt_Customer.
customer_df_reduce=customer_df.drop(["Dt_Customer"],axis=1)

### Feature Selection using PCA

In [None]:
# Project the feature table onto its first 23 principal components.
# NOTE(review): PCA runs here on unscaled features (scaling happens later),
# so large-magnitude columns will dominate the components — confirm intended.
pca = PCA(n_components=23)
customer_df_reduce_pca = pca.fit_transform(customer_df_reduce)
# Wrap the component scores in a DataFrame (columns are 0..22).
customer_reduce_pca_df=pd.DataFrame(customer_df_reduce_pca)
customer_reduce_pca_df

In [None]:
# Cumulative share of variance explained by the first k components.
explained_var_ratio = pca.explained_variance_ratio_
cumulative_var_ratio = np.cumsum(explained_var_ratio)
cumulative_var_ratio

### Feature Selection using Variance Threshold (VarianceThreshold)

In [None]:
# Keep only the columns whose variance exceeds 0.1.
vt = VarianceThreshold(threshold=0.1)
customer_df_reduce_vt = vt.fit_transform(customer_df_reduce)
# Boolean mask over the input columns marking the ones that were kept.
selected_features_mask_vt = vt.get_support()
selected_features_vt = customer_df_reduce.columns[selected_features_mask_vt]
# Re-select from the original frame so the surviving columns keep their names.
customer_reduce_vt_df = pd.DataFrame(customer_df_reduce[selected_features_vt])
customer_reduce_vt_df

# Scaling

In [None]:
# One scaler instance shared by the cells below; each fit_transform call re-fits it.
minmax=MinMaxScaler()

### Scaling for Features Selected using Variance Threshold (VarianceThreshold)

In [None]:
# Min-max scale the variance-threshold features; the result is a NumPy array.
customer_reduce_vt_df_scale=minmax.fit_transform(customer_reduce_vt_df)
customer_reduce_vt_df_scale

### Scaling for Features Selected using PCA

In [None]:
# Min-max scale the PCA component scores; this re-fits the shared scaler, so
# it no longer holds the variance-threshold fit afterwards.
customer_reduce_pca_df_scale=minmax.fit_transform(customer_reduce_pca_df)
customer_reduce_pca_df_scale

# Model Training and Evaluation

### KMeans

##### Variance Threshold

In [None]:
# Elbow method: within-cluster sum of squares (inertia) for k = 1..10.
sns.set_style("whitegrid")
sse = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, random_state=2).fit(customer_reduce_vt_df_scale)
    sse.append(km.inertia_)
g = sns.lineplot(x=range(1, 11), y=sse)
g.set(
    xlabel="Number of cluster (k)",
    ylabel="Sum Squared Error",
    title='Elbow Method',
)
plt.show()

In [None]:
# Two-cluster K-Means on the variance-threshold features.
# fit_predict() fits the model itself, so the separate fit() call that used to
# precede it only repeated the same work.
kmeans = KMeans(n_clusters = 2, random_state = 2)
customer_reduce_vt_df_scale_pred_1 = kmeans.fit_predict(customer_reduce_vt_df_scale)
customer_reduce_vt_df_scale_pred_1

In [None]:
# Internal validity metrics take the FEATURE matrix plus the cluster labels.
# Passing the predicted labels as the data scored the labels against
# themselves, which yields degenerate "perfect" values that say nothing about
# the clustering.
print("Silhouette Score: ",silhouette_score(customer_reduce_vt_df_scale, kmeans.labels_))
print("Calinski-Harabasz Index: ",calinski_harabasz_score(customer_reduce_vt_df_scale, kmeans.labels_))
print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_vt_df_scale, kmeans.labels_))

In [None]:
# Show the number of clusters the current KMeans model was configured with.
kmeans.n_clusters

In [None]:
# 2-D PCA projection of the scaled features, used only to visualise the clusters.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_vt_df_scale)
# Cluster centres go through the same projection as the data.
centers_pca = pca.transform(kmeans.cluster_centers_)
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=kmeans.labels_)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1])
# Emphasise every centre with a large red triangle.
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], marker='^', c='red', s=200, edgecolor='k')

In [None]:
# Plot cluster members and centres on the first two scaled features.
for k, cluster_center in enumerate(kmeans.cluster_centers_):
    my_members = kmeans.labels_ == k
    color = cm.nipy_spectral(float(k) / kmeans.n_clusters, 2)
    members_xy = customer_reduce_vt_df_scale[my_members]
    plt.plot(members_xy[:, 0], members_xy[:, 1], ".", c=color)
    plt.plot(
        cluster_center[0],
        cluster_center[1],
        "o",
        markerfacecolor=color,
        markeredgecolor="k",
        markersize=6,
    )
plt.show()

In [None]:
# Yellowbrick silhouette plot for the current KMeans model on the
# variance-threshold features.
visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
visualizer.fit(customer_reduce_vt_df_scale)

##### PCA

In [None]:
# Elbow method on the scaled PCA scores: inertia for k = 1..10.
sns.set_style("whitegrid")
sse = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, random_state=2).fit(customer_reduce_pca_df_scale)
    sse.append(km.inertia_)
g = sns.lineplot(x=range(1, 11), y=sse)
g.set(
    xlabel="Number of cluster (k)",
    ylabel="Sum Squared Error",
    title='Elbow Method',
)
plt.show()

In [None]:
# Two-cluster K-Means on the scaled PCA scores.
# fit_predict() fits the model itself, so the separate fit() call that used to
# precede it only repeated the same work.
kmeans = KMeans(n_clusters = 2, random_state = 2)
customer_reduce_pca_df_scale_pred_1 = kmeans.fit_predict(customer_reduce_pca_df_scale)
customer_reduce_pca_df_scale_pred_1

In [None]:
# Internal validity metrics take the FEATURE matrix plus the cluster labels.
# Passing the predicted labels as the data scored the labels against
# themselves, which yields degenerate "perfect" values that say nothing about
# the clustering.
print("Silhouette Score: ",silhouette_score(customer_reduce_pca_df_scale, kmeans.labels_))
print("Calinski-Harabasz Index: ",calinski_harabasz_score(customer_reduce_pca_df_scale, kmeans.labels_))
print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_pca_df_scale, kmeans.labels_))

In [None]:
# 2-D PCA projection of the scaled PCA scores, used only to visualise the clusters.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_pca_df_scale)
# Cluster centres go through the same projection as the data.
centers_pca = pca.transform(kmeans.cluster_centers_)
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=kmeans.labels_)
plt.scatter(centers_pca[:, 0], centers_pca[:, 1])
# Emphasise every centre with a large red triangle.
plt.scatter(centers_pca[:, 0], centers_pca[:, 1], marker='^', c='red', s=200, edgecolor='k')

In [None]:
# Plot cluster members and centres on the first two scaled PCA scores.
for k, cluster_center in enumerate(kmeans.cluster_centers_):
    my_members = kmeans.labels_ == k
    color = cm.nipy_spectral(float(k) / kmeans.n_clusters, 2)
    members_xy = customer_reduce_pca_df_scale[my_members]
    plt.plot(members_xy[:, 0], members_xy[:, 1], ".", c=color)
    plt.plot(
        cluster_center[0],
        cluster_center[1],
        "o",
        markerfacecolor=color,
        markeredgecolor="k",
        markersize=6,
    )
plt.show()

In [None]:
# Yellowbrick silhouette plot for the current KMeans model on the scaled PCA scores.
visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')
visualizer.fit(customer_reduce_pca_df_scale)

### Agglomerative Clustering

##### Variance Threshold

In [None]:
# Two-cluster agglomerative clustering on the variance-threshold features.
# fit_predict() fits the model itself; the extra fit() call built the whole
# hierarchy a second time for no benefit.
ac = AgglomerativeClustering(n_clusters=2)
customer_reduce_vt_df_scale_pred_2 = ac.fit_predict(customer_reduce_vt_df_scale)
customer_reduce_vt_df_scale_pred_2

In [None]:
# Internal validity metrics take the FEATURE matrix plus the cluster labels.
# Passing the predicted labels as the data scored the labels against
# themselves, which yields degenerate "perfect" values that say nothing about
# the clustering.
print("Silhouette Score: ",silhouette_score(customer_reduce_vt_df_scale, ac.labels_))
print("Calinski-Harabasz Index: ",calinski_harabasz_score(customer_reduce_vt_df_scale, ac.labels_))
print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_vt_df_scale, ac.labels_))

In [None]:
# 2-D PCA projection of the agglomerative clusters.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_vt_df_scale)
# Colour by the agglomerative labels. The plot previously used kmeans.labels_,
# i.e. it showed the K-Means partition under an agglomerative heading.
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=ac.labels_)
unique_labels = np.unique(ac.labels_)
# AgglomerativeClustering has no cluster_centers_; use per-cluster means of the
# projected points as display centroids.
centroids = np.array([data_pca[ac.labels_ == label].mean(axis=0) for label in unique_labels])
plt.scatter(centroids[:,0], centroids[:,1], marker='^', c='red', s=200, edgecolor='k')

In [None]:
# Distinct cluster labels produced by the agglomerative model.
np.unique(ac.labels_)

In [None]:
# Plot cluster members on the first two scaled features, with one centre per
# cluster.
for k in range(ac.n_clusters):
    my_members = ac.labels_ == k
    color = cm.nipy_spectral(float(k) / ac.n_clusters)
    plt.plot(customer_reduce_vt_df_scale[my_members, 0], customer_reduce_vt_df_scale[my_members, 1], ".", c=color)
    # AgglomerativeClustering exposes no centres, so use the member mean.
    # The marker was previously drawn at (labels[0], labels[1]) — the fixed
    # point (0, 1) — which is a pair of label ids, not a location in feature space.
    cluster_center = customer_reduce_vt_df_scale[my_members].mean(axis=0)
    plt.plot(cluster_center[0], cluster_center[1], "o", markerfacecolor=color, markeredgecolor="k", markersize=6)
plt.show()

In [None]:
# Per-sample silhouette plot for the agglomerative clusters.
# The cluster mask is built from ac.labels_. The loop previously indexed with a
# global `labels` that is not defined until the DBSCAN section (NameError on a
# top-to-bottom run, or another model's labels on a re-run).
silhouette_avg = silhouette_score(customer_reduce_vt_df_scale, ac.labels_)
silhouette_values = silhouette_samples(customer_reduce_vt_df_scale, ac.labels_)
cluster_ids = np.unique(ac.labels_)
fig, ax = plt.subplots(figsize=(10, 6))
y_lower = 10
for position, i in enumerate(cluster_ids):
    # Sorted silhouette values give each cluster its characteristic "knife" shape.
    ith_cluster_silhouette_values = np.sort(silhouette_values[ac.labels_ == i])
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = plt.cm.nipy_spectral(float(position) / len(cluster_ids))
    ax.fill_betweenx(np.arange(y_lower, y_upper),0, ith_cluster_silhouette_values,facecolor=color, edgecolor=color, alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    # Leave a 10-row gap before the next cluster's band.
    y_lower = y_upper + 10
ax.set_title("The silhouette plot for the various clusters.")
ax.set_xlabel("The silhouette coefficient values")
ax.set_ylabel("Cluster label")
# Dashed line marks the mean silhouette over all samples.
ax.axvline(x=silhouette_avg, color="red", linestyle="--")
plt.show()

##### PCA

In [None]:
# Two-cluster agglomerative clustering on the scaled PCA scores.
# fit_predict() fits the model itself; the extra fit() call built the whole
# hierarchy a second time for no benefit.
ac = AgglomerativeClustering(n_clusters=2)
customer_reduce_pca_df_scale_pred_2 = ac.fit_predict(customer_reduce_pca_df_scale)
customer_reduce_pca_df_scale_pred_2

In [None]:
# Internal validity metrics take the FEATURE matrix plus the cluster labels.
# Passing the predicted labels as the data scored the labels against
# themselves, which yields degenerate "perfect" values that say nothing about
# the clustering.
print("Silhouette Score: ",silhouette_score(customer_reduce_pca_df_scale, ac.labels_))
print("Calinski-Harabasz Index: ",calinski_harabasz_score(customer_reduce_pca_df_scale, ac.labels_))
print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_pca_df_scale, ac.labels_))

In [None]:
# 2-D PCA projection of the agglomerative clusters (PCA feature set).
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_pca_df_scale)
# Colour by the agglomerative labels. The plot previously used kmeans.labels_,
# i.e. it showed the K-Means partition under an agglomerative heading.
plt.scatter(data_pca[:, 0], data_pca[:, 1], c=ac.labels_)
unique_labels = np.unique(ac.labels_)
# AgglomerativeClustering has no cluster_centers_; use per-cluster means of the
# projected points as display centroids.
centroids = np.array([data_pca[ac.labels_ == label].mean(axis=0) for label in unique_labels])
plt.scatter(centroids[:,0], centroids[:,1], marker='^', c='red', s=200, edgecolor='k')

In [None]:
# Plot cluster members on the first two scaled PCA scores, with one centre per
# cluster.
for k in range(ac.n_clusters):
    my_members = ac.labels_ == k
    color = cm.nipy_spectral(float(k) / ac.n_clusters)
    plt.plot(customer_reduce_pca_df_scale[my_members, 0], customer_reduce_pca_df_scale[my_members, 1], ".", c=color)
    # AgglomerativeClustering exposes no centres, so use the member mean.
    # The marker was previously drawn at (labels[0], labels[1]) — the fixed
    # point (0, 1) — which is a pair of label ids, not a location in feature space.
    cluster_center = customer_reduce_pca_df_scale[my_members].mean(axis=0)
    plt.plot(cluster_center[0], cluster_center[1], "o", markerfacecolor=color, markeredgecolor="k", markersize=6)
plt.show()

In [None]:
# Per-sample silhouette plot for the agglomerative clusters (PCA feature set).
# The cluster mask is built from ac.labels_. The loop previously indexed with a
# global `labels` that is not defined until the DBSCAN section (NameError on a
# top-to-bottom run, or another model's labels on a re-run).
silhouette_avg = silhouette_score(customer_reduce_pca_df_scale, ac.labels_)
silhouette_values = silhouette_samples(customer_reduce_pca_df_scale, ac.labels_)
cluster_ids = np.unique(ac.labels_)
fig, ax = plt.subplots(figsize=(10, 6))
y_lower = 10
for position, i in enumerate(cluster_ids):
    # Sorted silhouette values give each cluster its characteristic "knife" shape.
    ith_cluster_silhouette_values = np.sort(silhouette_values[ac.labels_ == i])
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = plt.cm.nipy_spectral(float(position) / len(cluster_ids))
    ax.fill_betweenx(np.arange(y_lower, y_upper),0, ith_cluster_silhouette_values,facecolor=color, edgecolor=color, alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    # Leave a 10-row gap before the next cluster's band.
    y_lower = y_upper + 10
ax.set_title("The silhouette plot for the various clusters.")
ax.set_xlabel("The silhouette coefficient values")
ax.set_ylabel("Cluster label")
# Dashed line marks the mean silhouette over all samples.
ax.axvline(x=silhouette_avg, color="red", linestyle="--")
plt.show()

### DBSCAN

##### Variance Threshold

In [None]:
# DBSCAN on the variance-threshold features; label -1 marks noise points.
# fit_predict() fits the model itself, so the separate fit() call that used to
# precede it only repeated the same work.
db = DBSCAN(eps=0.8, min_samples=20)
customer_reduce_vt_df_scale_pred_3 = db.fit_predict(customer_reduce_vt_df_scale)
customer_reduce_vt_df_scale_pred_3

In [None]:
# Internal validity metrics need at least two distinct labels and must be
# computed on the FEATURE matrix. Passing the predicted labels as the data
# scored the labels against themselves.
# NOTE(review): DBSCAN noise (label -1) is scored as if it were a cluster.
if len(np.unique(db.labels_))>1:
    print("Silhouette Score: ",silhouette_score(customer_reduce_vt_df_scale, db.labels_))
    print("Calinski-Harabasz Index: ",calinski_harabasz_score(customer_reduce_vt_df_scale, db.labels_))
    print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_vt_df_scale, db.labels_))
# NOTE(review): both arguments are labelings produced by this DBSCAN setup, so
# the score is 1.0 by construction when they come from the same fit; ARI is
# only informative against an independent reference labeling.
print("Adjusted Rand Index:" , adjusted_rand_score(customer_reduce_vt_df_scale_pred_3, db.labels_))

In [None]:
# Number of distinct DBSCAN labels; this count includes the noise label -1 if present.
len(np.unique(db.labels_))

In [None]:
# 2-D PCA projection of the DBSCAN result: noise as black crosses, clusters in colour.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_vt_df_scale)
unique_labels = np.unique(db.labels_)
# plt.cm.get_cmap was deprecated and later removed from matplotlib;
# pyplot.get_cmap accepts the same (name, lut) arguments.
colors = plt.get_cmap("tab10", len(unique_labels))
for label in unique_labels:
    if label == -1:
        color = "black"
        marker = "x"
        label_name = "Noise"
    else:
        color = colors(label)
        marker = "o"
        label_name = f"Cluster {label}"
    plt.scatter(data_pca[db.labels_ == label, 0], data_pca[db.labels_ == label, 1],
                c=[color], marker=marker, label=label_name)
# Display centroids are per-cluster means of the projected points. Skip them
# when DBSCAN found only noise: the empty array cannot be column-indexed.
cluster_ids = [label for label in unique_labels if label != -1]
if cluster_ids:
    centroids = np.array([data_pca[db.labels_ == label].mean(axis=0) for label in cluster_ids])
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='^', c='red', s=200, edgecolor='k', label='Centroids')
plt.legend()
plt.show()

In [None]:
# Plot the DBSCAN core samples on the first two scaled features.
unique_labels = set(db.labels_)
# Boolean mask that is True for samples DBSCAN marked as core points.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    class_member_mask = db.labels_ == k
    xy = customer_reduce_vt_df_scale[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], "o", markerfacecolor=tuple(col), markeredgecolor="k", markersize=14)
# The title previously ended at the colon with no number; count the clusters
# here (noise label -1 excluded) so the figure is self-explanatory.
n_db_clusters = len(unique_labels - {-1})
plt.title(f"Estimated number of clusters: {n_db_clusters}")
plt.show()

In [None]:
# Per-sample silhouette plot for the DBSCAN result (variance-threshold features).
# Fixes: the mask used a global `labels` that is undefined at this point in a
# top-to-bottom run; the loop assumed exactly clusters 0 and 1; and
# silhouette_score raises when DBSCAN returns a single label.
cluster_ids = np.unique(db.labels_)
if len(cluster_ids) < 2:
    print("Silhouette plot skipped: DBSCAN produced fewer than two distinct labels.")
else:
    silhouette_avg = silhouette_score(customer_reduce_vt_df_scale, db.labels_)
    silhouette_values = silhouette_samples(customer_reduce_vt_df_scale, db.labels_)
    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10
    # Colour by position, not by label value, because the noise label is -1.
    for position, i in enumerate(cluster_ids):
        ith_cluster_silhouette_values = np.sort(silhouette_values[db.labels_ == i])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = plt.cm.nipy_spectral(float(position) / len(cluster_ids))
        ax.fill_betweenx(np.arange(y_lower, y_upper),0, ith_cluster_silhouette_values,facecolor=color, edgecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Leave a 10-row gap before the next band.
        y_lower = y_upper + 10
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Dashed line marks the mean silhouette over all samples.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.show()

##### PCA

In [None]:
# DBSCAN on the scaled PCA scores; label -1 marks noise points.
# fit_predict() fits the model itself, so the separate fit() call that used to
# precede it only repeated the same work.
db = DBSCAN(eps=0.8, min_samples=30)
customer_reduce_pca_df_scale_pred_3 = db.fit_predict(customer_reduce_pca_df_scale)
customer_reduce_pca_df_scale_pred_3

In [None]:
# Internal validity metrics need at least two distinct labels and must be
# computed on the FEATURE matrix. Passing the predicted labels as the data
# scored the labels against themselves.
# NOTE(review): DBSCAN noise (label -1) is scored as if it were a cluster.
if len(np.unique(db.labels_))>1:
    print("Silhouette Score: ",silhouette_score(customer_reduce_pca_df_scale, db.labels_))
    print("Calinski-Harabasz Index: ",calinski_harabasz_score(customer_reduce_pca_df_scale, db.labels_))
    print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_pca_df_scale, db.labels_))
# NOTE(review): both arguments are labelings produced by this DBSCAN setup, so
# the score is 1.0 by construction when they come from the same fit; ARI is
# only informative against an independent reference labeling.
print("Adjusted Rand Index:" , adjusted_rand_score(customer_reduce_pca_df_scale_pred_3, db.labels_))

In [None]:
# 2-D PCA projection of the DBSCAN result: noise as black crosses, clusters in colour.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_pca_df_scale)
unique_labels = np.unique(db.labels_)
# plt.cm.get_cmap was deprecated and later removed from matplotlib;
# pyplot.get_cmap accepts the same (name, lut) arguments.
colors = plt.get_cmap("tab10", len(unique_labels))
for label in unique_labels:
    if label == -1:
        color = "black"
        marker = "x"
        label_name = "Noise"
    else:
        color = colors(label)
        marker = "o"
        label_name = f"Cluster {label}"
    plt.scatter(data_pca[db.labels_ == label, 0], data_pca[db.labels_ == label, 1],
                c=[color], marker=marker, label=label_name)
# Display centroids are per-cluster means of the projected points. Skip them
# when DBSCAN found only noise: the empty array cannot be column-indexed.
cluster_ids = [label for label in unique_labels if label != -1]
if cluster_ids:
    centroids = np.array([data_pca[db.labels_ == label].mean(axis=0) for label in cluster_ids])
    plt.scatter(centroids[:, 0], centroids[:, 1], marker='^', c='red', s=200, edgecolor='k', label='Centroids')
plt.legend()
plt.show()

In [None]:
# Re-run DBSCAN on the scaled PCA scores with a tighter neighbourhood.
db = DBSCAN(eps=0.3, min_samples=10)
db.fit(customer_reduce_pca_df_scale)
labels = db.labels_

# Cluster count excludes the noise label (-1); noise points are tallied separately.
n_clusters_ = len(set(labels) - {-1})
n_noise_ = int((labels == -1).sum())

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)

In [None]:
# Internal validity metrics need at least two distinct labels and must be
# computed on the FEATURE matrix. Passing an earlier run's predicted labels as
# the data did not measure cluster quality at all.
# NOTE(review): DBSCAN noise (label -1) is scored as if it were a cluster.
if len(np.unique(db.labels_))>1:
    print("Silhouette Score: ",silhouette_score(customer_reduce_pca_df_scale, db.labels_))
    print("Calinski-Harabasz Index: ",calinski_harabasz_score(customer_reduce_pca_df_scale, db.labels_))
    print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_pca_df_scale, db.labels_))
# ARI compares whatever labeling customer_reduce_pca_df_scale_pred_3 currently
# holds with the current model's labels.
# NOTE(review): in file order that is the earlier DBSCAN configuration versus
# this one, i.e. agreement between two parameter settings — confirm intended.
print("Adjusted Rand Index:" , adjusted_rand_score(customer_reduce_pca_df_scale_pred_3, db.labels_))

In [None]:
# Plot the DBSCAN result on the first two scaled PCA scores: core samples as
# large markers, border samples as small ones, noise in black.
unique_labels = set(labels)
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = labels == k

    # Core points first (size 14), then the remaining members (size 6).
    for subset_mask, marker_size in ((core_samples_mask, 14), (~core_samples_mask, 6)):
        xy = customer_reduce_pca_df_scale[class_member_mask & subset_mask]
        plt.plot(
            xy[:, 0],
            xy[:, 1],
            "o",
            markerfacecolor=tuple(col),
            markeredgecolor="k",
            markersize=marker_size,
        )

plt.title(f"Estimated number of clusters: {n_clusters_}")
plt.show()

In [None]:
# Per-sample silhouette plot for the current DBSCAN model (PCA feature set).
# Fixes: the loop assumed exactly clusters 0 and 1, so further clusters and the
# noise group were never drawn; the mask depended on a separately assigned
# global `labels`; and silhouette_score raises when there is a single label.
cluster_ids = np.unique(db.labels_)
if len(cluster_ids) < 2:
    print("Silhouette plot skipped: DBSCAN produced fewer than two distinct labels.")
else:
    silhouette_avg = silhouette_score(customer_reduce_pca_df_scale, db.labels_)
    silhouette_values = silhouette_samples(customer_reduce_pca_df_scale, db.labels_)
    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10
    # Colour by position, not by label value, because the noise label is -1.
    for position, i in enumerate(cluster_ids):
        ith_cluster_silhouette_values = np.sort(silhouette_values[db.labels_ == i])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = plt.cm.nipy_spectral(float(position) / len(cluster_ids))
        ax.fill_betweenx(np.arange(y_lower, y_upper),0, ith_cluster_silhouette_values,facecolor=color, edgecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Leave a 10-row gap before the next band.
        y_lower = y_upper + 10
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Dashed line marks the mean silhouette over all samples.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.show()

### MeanShift

##### Variance Threshold

In [None]:
# Mean-shift clustering on the variance-threshold features (bandwidth estimated
# by the estimator's defaults).
# fit_predict() fits the model itself, so the separate fit() call that used to
# precede it ran the whole mean-shift procedure twice.
ms = MeanShift()
customer_reduce_vt_df_scale_pred_4 = ms.fit_predict(customer_reduce_vt_df_scale)
customer_reduce_vt_df_scale_pred_4

In [None]:
# Internal validation metrics for the MeanShift / variance-threshold run.
# They are computed on the feature matrix that was clustered; scoring the
# label vector against itself gives degenerate, data-independent values.
if len(np.unique(ms.labels_)) > 1:
    print("Silhouette Score: ", silhouette_score(customer_reduce_vt_df_scale, ms.labels_))
    print("Calinski-Harabasz Index: ", calinski_harabasz_score(customer_reduce_vt_df_scale, ms.labels_))
    print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_vt_df_scale, ms.labels_))
# NOTE(review): when the cells run in order, the predictions are the output of
# ms.fit_predict and therefore equal ms.labels_, so this ARI is 1.0 by
# construction. A meaningful ARI needs an independent reference labelling.
print("Adjusted Rand Index:" , adjusted_rand_score(customer_reduce_vt_df_scale_pred_4, ms.labels_))

In [None]:
# Project the variance-threshold features onto two principal components and
# draw one scatter series per MeanShift label, plus the mean of each
# projected cluster as a red triangle.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_vt_df_scale)
unique_labels = np.unique(ms.labels_)
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments.
colors = plt.get_cmap("tab10", len(unique_labels))
for label in unique_labels:
    if label == -1:
        # Label -1 is rendered as noise: black crosses.
        color = "black"
        marker = "x"
        label_name = "Noise"
    else:
        color = colors(label)
        marker = "o"
        label_name = f"Cluster {label}"
    plt.scatter(data_pca[ms.labels_ == label, 0], data_pca[ms.labels_ == label, 1],
                c=[color], marker=marker, label=label_name)
# Centroids are the per-cluster means in the projected 2-D space (noise excluded).
centroids = np.array([data_pca[ms.labels_ == label].mean(axis=0) for label in unique_labels if label != -1])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='^', c='red', s=200, edgecolor='k', label='Centroids')
plt.legend()
plt.show()

In [None]:
# Plot the first two columns of the variance-threshold features per MeanShift
# cluster, together with each fitted cluster centre.
# The cluster count comes from this model's cluster_centers_; the global
# n_clusters_ used before is the value shown in the DBSCAN plot title.
n_ms_clusters = len(ms.cluster_centers_)
# pyplot.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9.
cmap = plt.get_cmap("tab10", n_ms_clusters)
markers = '^'
for k in range(n_ms_clusters):
    my_members = ms.labels_ == k
    cluster_center = ms.cluster_centers_[k]
    # One colour per cluster (not one per sample).
    plt.plot(customer_reduce_vt_df_scale[my_members, 0], customer_reduce_vt_df_scale[my_members, 1],
             markers, color=cmap(k))
    plt.plot(cluster_center[0], cluster_center[1], markers,
             markerfacecolor="red", markeredgecolor="k", markersize=14)
plt.title(f"Estimated number of clusters: {n_ms_clusters:d}")
plt.show()

In [None]:
# Silhouette plot for the MeanShift labelling of customer_reduce_vt_df_scale.
# The per-cluster mask uses ms.labels_ (the labels being scored) rather than
# the unrelated global `labels`, and every cluster found is drawn instead of
# a fixed range(2).
sil_labels = ms.labels_
sil_cluster_ids = np.unique(sil_labels)
if len(sil_cluster_ids) > 1:
    silhouette_avg = silhouette_score(customer_reduce_vt_df_scale, sil_labels)
    silhouette_values = silhouette_samples(customer_reduce_vt_df_scale, sil_labels)
    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10  # vertical offset of the first band
    for band_index, cluster_id in enumerate(sil_cluster_ids):
        # Sorted copy of this cluster's coefficients so the band has a smooth edge.
        cluster_silhouette_values = np.sort(silhouette_values[sil_labels == cluster_id])
        size_cluster = cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster
        color = plt.cm.nipy_spectral(float(band_index) / len(sil_cluster_ids))
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size_cluster, str(cluster_id))
        y_lower = y_upper + 10  # gap between consecutive bands
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Dashed red line marks the mean silhouette coefficient.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.show()
else:
    # silhouette_score raises when fewer than two distinct labels exist.
    print("Silhouette plot skipped: fewer than 2 distinct labels found.")

##### PCA

In [None]:
# Cluster the PCA-reduced features with MeanShift.
# fit_predict() fits the estimator and returns the labels in one pass, so the
# separate fit() call that preceded it only repeated the same work.
ms = MeanShift()
customer_reduce_pca_df_scale_pred_4 = ms.fit_predict(customer_reduce_pca_df_scale)
customer_reduce_pca_df_scale_pred_4

In [None]:
# Internal validation metrics for the MeanShift / PCA run.
# They are computed on the feature matrix that was clustered; scoring the
# label vector against itself gives degenerate, data-independent values.
if len(np.unique(ms.labels_)) > 1:
    print("Silhouette Score: ", silhouette_score(customer_reduce_pca_df_scale, ms.labels_))
    print("Calinski-Harabasz Index: ", calinski_harabasz_score(customer_reduce_pca_df_scale, ms.labels_))
    print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_pca_df_scale, ms.labels_))
# NOTE(review): when the cells run in order, the predictions are the output of
# ms.fit_predict and therefore equal ms.labels_, so this ARI is 1.0 by
# construction. A meaningful ARI needs an independent reference labelling.
print("Adjusted Rand Index:" , adjusted_rand_score(customer_reduce_pca_df_scale_pred_4, ms.labels_))

In [None]:
# Project customer_reduce_pca_df_scale onto two principal components and draw
# one scatter series per MeanShift label, plus the mean of each projected
# cluster as a red triangle.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_pca_df_scale)
unique_labels = np.unique(ms.labels_)
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments.
colors = plt.get_cmap("tab10", len(unique_labels))
for label in unique_labels:
    if label == -1:
        # Label -1 is rendered as noise: black crosses.
        color = "black"
        marker = "x"
        label_name = "Noise"
    else:
        color = colors(label)
        marker = "o"
        label_name = f"Cluster {label}"
    plt.scatter(data_pca[ms.labels_ == label, 0], data_pca[ms.labels_ == label, 1],
                c=[color], marker=marker, label=label_name)
# Centroids are the per-cluster means in the projected 2-D space (noise excluded).
centroids = np.array([data_pca[ms.labels_ == label].mean(axis=0) for label in unique_labels if label != -1])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='^', c='red', s=200, edgecolor='k', label='Centroids')
plt.legend()
plt.show()

In [None]:
# Plot the first two columns of the PCA-reduced features per MeanShift
# cluster, together with each fitted cluster centre.
# The cluster count comes from this model's cluster_centers_; the global
# n_clusters_ used before is the value shown in the DBSCAN plot title.
n_ms_clusters = len(ms.cluster_centers_)
# pyplot.get_cmap replaces plt.cm.get_cmap, which was removed in Matplotlib 3.9.
cmap = plt.get_cmap("tab10", n_ms_clusters)
markers = '^'
for k in range(n_ms_clusters):
    my_members = ms.labels_ == k
    cluster_center = ms.cluster_centers_[k]
    # One colour per cluster (not one per sample).
    plt.plot(customer_reduce_pca_df_scale[my_members, 0], customer_reduce_pca_df_scale[my_members, 1],
             markers, color=cmap(k))
    plt.plot(cluster_center[0], cluster_center[1], markers,
             markerfacecolor="red", markeredgecolor="k", markersize=14)
plt.title(f"Estimated number of clusters: {n_ms_clusters:d}")
plt.show()

In [None]:
# Silhouette plot for the MeanShift labelling of customer_reduce_pca_df_scale.
# The per-cluster mask uses ms.labels_ (the labels being scored) rather than
# the unrelated global `labels`, and every cluster found is drawn instead of
# a fixed range(2).
sil_labels = ms.labels_
sil_cluster_ids = np.unique(sil_labels)
if len(sil_cluster_ids) > 1:
    silhouette_avg = silhouette_score(customer_reduce_pca_df_scale, sil_labels)
    silhouette_values = silhouette_samples(customer_reduce_pca_df_scale, sil_labels)
    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10  # vertical offset of the first band
    for band_index, cluster_id in enumerate(sil_cluster_ids):
        # Sorted copy of this cluster's coefficients so the band has a smooth edge.
        cluster_silhouette_values = np.sort(silhouette_values[sil_labels == cluster_id])
        size_cluster = cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster
        color = plt.cm.nipy_spectral(float(band_index) / len(sil_cluster_ids))
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size_cluster, str(cluster_id))
        y_lower = y_upper + 10  # gap between consecutive bands
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Dashed red line marks the mean silhouette coefficient.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.show()
else:
    # silhouette_score raises when fewer than two distinct labels exist.
    print("Silhouette plot skipped: fewer than 2 distinct labels found.")

### Spectral Clustering

##### Variance Threshold

In [None]:
# Spectral clustering (RBF affinity, 2 clusters) of the variance-threshold
# features. fit_predict() fits the estimator and returns the labels in one
# pass, so the separate fit() call that preceded it only repeated the work.
sc = SpectralClustering(n_clusters=2, affinity='rbf')
customer_reduce_vt_df_scale_pred_5 = sc.fit_predict(customer_reduce_vt_df_scale)
customer_reduce_vt_df_scale_pred_5

In [None]:
# Internal validation metrics for the spectral (RBF) / variance-threshold run.
# They are computed on the feature matrix that was clustered; scoring the
# label vector against itself gives degenerate, data-independent values.
if len(np.unique(sc.labels_)) > 1:
    print("Silhouette Score: ", silhouette_score(customer_reduce_vt_df_scale, sc.labels_))
    print("Calinski-Harabasz Index: ", calinski_harabasz_score(customer_reduce_vt_df_scale, sc.labels_))
    print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_vt_df_scale, sc.labels_))
# NOTE(review): when the cells run in order, the predictions are the output of
# sc.fit_predict and therefore equal sc.labels_, so this ARI is 1.0 by
# construction. A meaningful ARI needs an independent reference labelling.
print("Adjusted Rand Index:" , adjusted_rand_score(customer_reduce_vt_df_scale_pred_5, sc.labels_))

In [None]:
# Project the variance-threshold features onto two principal components and
# draw one scatter series per spectral-clustering label, plus the mean of
# each projected cluster as a red triangle.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_vt_df_scale)
unique_labels = np.unique(sc.labels_)
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments.
colors = plt.get_cmap("tab10", len(unique_labels))
for label in unique_labels:
    if label == -1:
        # Label -1 is rendered as noise: black crosses.
        color = "black"
        marker = "x"
        label_name = "Noise"
    else:
        color = colors(label)
        marker = "o"
        label_name = f"Cluster {label}"
    plt.scatter(data_pca[sc.labels_ == label, 0], data_pca[sc.labels_ == label, 1],
                c=[color], marker=marker, label=label_name)
# Centroids are the per-cluster means in the projected 2-D space (noise excluded).
centroids = np.array([data_pca[sc.labels_ == label].mean(axis=0) for label in unique_labels if label != -1])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='^', c='red', s=200, edgecolor='k', label='Centroids')
plt.legend()
plt.show()

In [None]:
# Scatter of the first two variance-threshold feature columns, coloured by
# spectral-clustering label. The legend text is read from the fitted model's
# n_clusters so it cannot drift from the configuration actually used.
plt.scatter(customer_reduce_vt_df_scale[:, 0], customer_reduce_vt_df_scale[:, 1],
            s=5, c=sc.labels_, label=f"n_cluster-{sc.n_clusters}")
plt.legend()
plt.show()

In [None]:
# Silhouette plot for the spectral (RBF) labelling of customer_reduce_vt_df_scale.
# The per-cluster mask uses sc.labels_ (the labels being scored) rather than
# the unrelated global `labels`, and every cluster found is drawn instead of
# a fixed range(2).
sil_labels = sc.labels_
sil_cluster_ids = np.unique(sil_labels)
if len(sil_cluster_ids) > 1:
    silhouette_avg = silhouette_score(customer_reduce_vt_df_scale, sil_labels)
    silhouette_values = silhouette_samples(customer_reduce_vt_df_scale, sil_labels)
    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10  # vertical offset of the first band
    for band_index, cluster_id in enumerate(sil_cluster_ids):
        # Sorted copy of this cluster's coefficients so the band has a smooth edge.
        cluster_silhouette_values = np.sort(silhouette_values[sil_labels == cluster_id])
        size_cluster = cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster
        color = plt.cm.nipy_spectral(float(band_index) / len(sil_cluster_ids))
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size_cluster, str(cluster_id))
        y_lower = y_upper + 10  # gap between consecutive bands
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Dashed red line marks the mean silhouette coefficient.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.show()
else:
    # silhouette_score raises when fewer than two distinct labels exist.
    print("Silhouette plot skipped: fewer than 2 distinct labels found.")

##### PCA

In [None]:
# Spectral clustering (RBF affinity, 2 clusters) of the PCA-reduced features.
# fit_predict() fits the estimator and returns the labels in one pass, so the
# separate fit() call that preceded it only repeated the same work.
sc = SpectralClustering(n_clusters=2, affinity='rbf')
customer_reduce_pca_df_scale_pred_5 = sc.fit_predict(customer_reduce_pca_df_scale)
customer_reduce_pca_df_scale_pred_5

In [None]:
# Internal validation metrics for the spectral (RBF) / PCA run.
# They are computed on the feature matrix that was clustered; scoring the
# label vector against itself gives degenerate, data-independent values.
if len(np.unique(sc.labels_)) > 1:
    print("Silhouette Score: ", silhouette_score(customer_reduce_pca_df_scale, sc.labels_))
    print("Calinski-Harabasz Index: ", calinski_harabasz_score(customer_reduce_pca_df_scale, sc.labels_))
    print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_pca_df_scale, sc.labels_))
# NOTE(review): when the cells run in order, the predictions are the output of
# sc.fit_predict and therefore equal sc.labels_, so this ARI is 1.0 by
# construction. A meaningful ARI needs an independent reference labelling.
print("Adjusted Rand Index:" , adjusted_rand_score(customer_reduce_pca_df_scale_pred_5, sc.labels_))

In [None]:
# Project customer_reduce_pca_df_scale onto two principal components and draw
# one scatter series per spectral-clustering label, plus the mean of each
# projected cluster as a red triangle.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_pca_df_scale)
unique_labels = np.unique(sc.labels_)
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments.
colors = plt.get_cmap("tab10", len(unique_labels))
for label in unique_labels:
    if label == -1:
        # Label -1 is rendered as noise: black crosses.
        color = "black"
        marker = "x"
        label_name = "Noise"
    else:
        color = colors(label)
        marker = "o"
        label_name = f"Cluster {label}"
    plt.scatter(data_pca[sc.labels_ == label, 0], data_pca[sc.labels_ == label, 1],
                c=[color], marker=marker, label=label_name)
# Centroids are the per-cluster means in the projected 2-D space (noise excluded).
centroids = np.array([data_pca[sc.labels_ == label].mean(axis=0) for label in unique_labels if label != -1])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='^', c='red', s=200, edgecolor='k', label='Centroids')
plt.legend()
plt.show()

In [None]:
# Scatter of the first two PCA-reduced feature columns, coloured by
# spectral-clustering label. The legend text is read from the fitted model's
# n_clusters so it cannot drift from the configuration actually used.
plt.scatter(customer_reduce_pca_df_scale[:, 0], customer_reduce_pca_df_scale[:, 1],
            s=5, c=sc.labels_, label=f"n_cluster-{sc.n_clusters}")
plt.legend()
plt.show()

In [None]:
# Silhouette plot for the spectral (RBF) labelling of customer_reduce_pca_df_scale.
# The per-cluster mask uses sc.labels_ (the labels being scored) rather than
# the unrelated global `labels`, and every cluster found is drawn instead of
# a fixed range(2).
sil_labels = sc.labels_
sil_cluster_ids = np.unique(sil_labels)
if len(sil_cluster_ids) > 1:
    silhouette_avg = silhouette_score(customer_reduce_pca_df_scale, sil_labels)
    silhouette_values = silhouette_samples(customer_reduce_pca_df_scale, sil_labels)
    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10  # vertical offset of the first band
    for band_index, cluster_id in enumerate(sil_cluster_ids):
        # Sorted copy of this cluster's coefficients so the band has a smooth edge.
        cluster_silhouette_values = np.sort(silhouette_values[sil_labels == cluster_id])
        size_cluster = cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster
        color = plt.cm.nipy_spectral(float(band_index) / len(sil_cluster_ids))
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size_cluster, str(cluster_id))
        y_lower = y_upper + 10  # gap between consecutive bands
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Dashed red line marks the mean silhouette coefficient.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.show()
else:
    # silhouette_score raises when fewer than two distinct labels exist.
    print("Silhouette plot skipped: fewer than 2 distinct labels found.")

##### Variance Threshold (nearest-neighbors affinity)

In [None]:
# Spectral clustering (nearest-neighbors affinity, 2 clusters) of the
# variance-threshold features. fit_predict() fits the estimator and returns
# the labels in one pass, so the separate fit() call only repeated the work.
sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors')
customer_reduce_vt_df_scale_pred_5_2 = sc.fit_predict(customer_reduce_vt_df_scale)
customer_reduce_vt_df_scale_pred_5_2

In [None]:
# Internal validation metrics for the spectral (nearest-neighbors) /
# variance-threshold run. They are computed on the feature matrix that was
# clustered; scoring the label vector against itself gives degenerate,
# data-independent values.
if len(np.unique(sc.labels_)) > 1:
    print("Silhouette Score: ", silhouette_score(customer_reduce_vt_df_scale, sc.labels_))
    print("Calinski-Harabasz Index: ", calinski_harabasz_score(customer_reduce_vt_df_scale, sc.labels_))
    print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_vt_df_scale, sc.labels_))
# NOTE(review): when the cells run in order, the predictions are the output of
# sc.fit_predict and therefore equal sc.labels_, so this ARI is 1.0 by
# construction. A meaningful ARI needs an independent reference labelling.
print("Adjusted Rand Index:" , adjusted_rand_score(customer_reduce_vt_df_scale_pred_5_2, sc.labels_))

In [None]:
# Project the variance-threshold features onto two principal components and
# draw one scatter series per spectral-clustering label, plus the mean of
# each projected cluster as a red triangle.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_vt_df_scale)
unique_labels = np.unique(sc.labels_)
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments.
colors = plt.get_cmap("tab10", len(unique_labels))
for label in unique_labels:
    if label == -1:
        # Label -1 is rendered as noise: black crosses.
        color = "black"
        marker = "x"
        label_name = "Noise"
    else:
        color = colors(label)
        marker = "o"
        label_name = f"Cluster {label}"
    plt.scatter(data_pca[sc.labels_ == label, 0], data_pca[sc.labels_ == label, 1],
                c=[color], marker=marker, label=label_name)
# Centroids are the per-cluster means in the projected 2-D space (noise excluded).
centroids = np.array([data_pca[sc.labels_ == label].mean(axis=0) for label in unique_labels if label != -1])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='^', c='red', s=200, edgecolor='k', label='Centroids')
plt.legend()
plt.show()

In [None]:
# Scatter of the first two variance-threshold feature columns, coloured by
# spectral-clustering label. The legend text is read from the fitted model's
# n_clusters so it cannot drift from the configuration actually used.
plt.scatter(customer_reduce_vt_df_scale[:, 0], customer_reduce_vt_df_scale[:, 1],
            s=5, c=sc.labels_, label=f"n_cluster-{sc.n_clusters}")
plt.legend()
plt.show()

In [None]:
# Silhouette plot for the spectral (nearest-neighbors) labelling of
# customer_reduce_vt_df_scale. The per-cluster mask uses sc.labels_ (the
# labels being scored) rather than the unrelated global `labels`, and every
# cluster found is drawn instead of a fixed range(2).
sil_labels = sc.labels_
sil_cluster_ids = np.unique(sil_labels)
if len(sil_cluster_ids) > 1:
    silhouette_avg = silhouette_score(customer_reduce_vt_df_scale, sil_labels)
    silhouette_values = silhouette_samples(customer_reduce_vt_df_scale, sil_labels)
    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10  # vertical offset of the first band
    for band_index, cluster_id in enumerate(sil_cluster_ids):
        # Sorted copy of this cluster's coefficients so the band has a smooth edge.
        cluster_silhouette_values = np.sort(silhouette_values[sil_labels == cluster_id])
        size_cluster = cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster
        color = plt.cm.nipy_spectral(float(band_index) / len(sil_cluster_ids))
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size_cluster, str(cluster_id))
        y_lower = y_upper + 10  # gap between consecutive bands
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Dashed red line marks the mean silhouette coefficient.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.show()
else:
    # silhouette_score raises when fewer than two distinct labels exist.
    print("Silhouette plot skipped: fewer than 2 distinct labels found.")

##### PCA (nearest-neighbors affinity)

In [None]:
# Spectral clustering (nearest-neighbors affinity, 4 clusters) of the
# PCA-reduced features. fit_predict() fits the estimator and returns the
# labels in one pass, so the separate fit() call only repeated the work.
sc = SpectralClustering(n_clusters=4, affinity='nearest_neighbors')
customer_reduce_pca_df_scale_pred_5_2 = sc.fit_predict(customer_reduce_pca_df_scale)
customer_reduce_pca_df_scale_pred_5_2

In [None]:
# Internal validation metrics for the spectral (nearest-neighbors) / PCA run.
# They are computed on the feature matrix that was clustered; scoring the
# label vector against itself gives degenerate, data-independent values.
if len(np.unique(sc.labels_)) > 1:
    print("Silhouette Score: ", silhouette_score(customer_reduce_pca_df_scale, sc.labels_))
    print("Calinski-Harabasz Index: ", calinski_harabasz_score(customer_reduce_pca_df_scale, sc.labels_))
    print("Davies-Bouldin Index: ", davies_bouldin_score(customer_reduce_pca_df_scale, sc.labels_))
# NOTE(review): when the cells run in order, the predictions are the output of
# sc.fit_predict and therefore equal sc.labels_, so this ARI is 1.0 by
# construction. A meaningful ARI needs an independent reference labelling.
print("Adjusted Rand Index:" , adjusted_rand_score(customer_reduce_pca_df_scale_pred_5_2, sc.labels_))

In [None]:
# Project customer_reduce_pca_df_scale onto two principal components and draw
# one scatter series per spectral-clustering label, plus the mean of each
# projected cluster as a red triangle.
pca = PCA(n_components=2)
data_pca = pca.fit_transform(customer_reduce_pca_df_scale)
unique_labels = np.unique(sc.labels_)
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments.
colors = plt.get_cmap("tab10", len(unique_labels))
for label in unique_labels:
    if label == -1:
        # Label -1 is rendered as noise: black crosses.
        color = "black"
        marker = "x"
        label_name = "Noise"
    else:
        color = colors(label)
        marker = "o"
        label_name = f"Cluster {label}"
    plt.scatter(data_pca[sc.labels_ == label, 0], data_pca[sc.labels_ == label, 1],
                c=[color], marker=marker, label=label_name)
# Centroids are the per-cluster means in the projected 2-D space (noise excluded).
centroids = np.array([data_pca[sc.labels_ == label].mean(axis=0) for label in unique_labels if label != -1])
plt.scatter(centroids[:, 0], centroids[:, 1], marker='^', c='red', s=200, edgecolor='k', label='Centroids')
plt.legend()
plt.show()

In [None]:
# Scatter of the first two PCA-reduced feature columns, coloured by
# spectral-clustering label. The legend text is read from the fitted model's
# n_clusters: the hard-coded "2" did not match the n_clusters=4 used to fit
# this model.
plt.scatter(customer_reduce_pca_df_scale[:, 0], customer_reduce_pca_df_scale[:, 1],
            s=5, c=sc.labels_, label=f"n_cluster-{sc.n_clusters}")
plt.legend()
plt.show()

In [None]:
# Silhouette plot for the spectral (nearest-neighbors) labelling of
# customer_reduce_pca_df_scale. The per-cluster mask uses sc.labels_ (the
# labels being scored) rather than the unrelated global `labels`, and every
# cluster found is drawn — a fixed range(2) showed only two clusters of the
# n_clusters=4 model.
sil_labels = sc.labels_
sil_cluster_ids = np.unique(sil_labels)
if len(sil_cluster_ids) > 1:
    silhouette_avg = silhouette_score(customer_reduce_pca_df_scale, sil_labels)
    silhouette_values = silhouette_samples(customer_reduce_pca_df_scale, sil_labels)
    fig, ax = plt.subplots(figsize=(10, 6))
    y_lower = 10  # vertical offset of the first band
    for band_index, cluster_id in enumerate(sil_cluster_ids):
        # Sorted copy of this cluster's coefficients so the band has a smooth edge.
        cluster_silhouette_values = np.sort(silhouette_values[sil_labels == cluster_id])
        size_cluster = cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster
        color = plt.cm.nipy_spectral(float(band_index) / len(sil_cluster_ids))
        ax.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)
        ax.text(-0.05, y_lower + 0.5 * size_cluster, str(cluster_id))
        y_lower = y_upper + 10  # gap between consecutive bands
    ax.set_title("The silhouette plot for the various clusters.")
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")
    # Dashed red line marks the mean silhouette coefficient.
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.show()
else:
    # silhouette_score raises when fewer than two distinct labels exist.
    print("Silhouette plot skipped: fewer than 2 distinct labels found.")