# Customer segregation using KMeans clustering

To segregate the customers based on the SGN and Revenue data, we'll follow the steps below.
1. Loading the data
2. Basic Data Visualization
3. Data analysis
4. Exploratory Data Analysis 
5. Visualizing the data distribution among all the features
6. Finding out the correlation between the features
7. Feature Selection
8. Model Training
9. Using the Elbow method to validate the value of K

#### Step1: Load the necessary libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

### plotly
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px

from scipy.spatial.distance import cdist
import pickle


#### Step2: Load the data

In [None]:
# Load the customer dataset. A raw string keeps the Windows-style backslashes
# literal instead of depending on '\d' being an unrecognised escape sequence.
df = pd.read_csv(r'..\dataset\dataset.csv')

In [None]:
# Display the loaded DataFrame as the cell output.
df

#### Step3: Exploratory Data Analysis

In [None]:
### Basic Data Visualization
# Scatter the second column of the frame (x) against the first column (y).
# No `cmap` is passed: without colour data (`c`) matplotlib ignores it and
# emits a warning. The x values are passed directly rather than inside a list.
data = df.to_numpy()
plt.scatter(data[:, 1], data[:, 0])
plt.grid()

In [None]:
##### see the information in the data frame
# Print pandas' built-in summary of the DataFrame.
df.info()

In [None]:
### check the basic data description
# Show the descriptive statistics table computed by DataFrame.describe().
df.describe()

In [None]:
### data cleaning
# Count the missing entries in every column.
df.isna().sum()

In [None]:
# List every row that has an exact duplicate; keep=False marks all copies,
# not only the repeated occurrences.
df[df.duplicated(keep=False)]

In [None]:
### show the non-null count of 'Revenue ', the frame's shape and the distinct SGN values
df['Revenue '].count(),df.shape,df.SGN.unique()

The SGN column contains only continuous, unique values, so there is no meaningful correlation between SGN and Revenue.
<br>
Hence the SGN column is discarded from here on.

In [None]:
# Take the revenue column and arrange it as a single-column 2-D array,
# then show the resulting shape.
data = np.reshape(df['Revenue '].values, (-1, 1))
data.shape

In [None]:
# Drop the SGN column. The guard keeps the cell re-runnable: popping a column
# that has already been removed would raise KeyError.
if 'SGN' in df.columns:
    _ = df.pop('SGN')

# Visualize the distribution of each feature.
# sns.distplot is deprecated; histplot with a KDE overlay and density
# normalisation is the replacement that draws the equivalent figure.
feature_names = df.describe().columns
plt.figure(figsize=(12,16))
for i, j in enumerate(feature_names):
    plt.subplot(5,2, i+1)
    sns.histplot(x=df[j], kde=True, stat='density')
    plt.xlabel(j)
    plt.title('{} Distribution'.format(j))
# One layout pass after all subplots exist is sufficient.
plt.tight_layout()
plt.show()

In [None]:
# Draw one boxplot per column reported by describe(), laid out on a 3x3 grid.
plt.figure(figsize=(12, 10))
for position, column in enumerate(df.describe().columns, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(x=df[column])
    plt.title(f'{column} Boxplot')
    plt.tight_layout()

plt.show()

In [100]:
# ## check the feature mapping
# fig = plt.figure(figsize=(8,6))
# sns.scatterplot(x=data[:,1], y=data[:,1])
# plt.title('SGN vs. Revenue Scatterplot')
# plt.savefig('scatter.png')
# plt.show()

In [101]:
# pairplot = sns.pairplot(df, corner=True)
# plt.show(pairplot)

In [102]:
# ### check the correlation between the available features

# correlation_metrics=df.corr()
# fig = plt.figure(figsize=(14,9))
# sns.heatmap(correlation_metrics,square=True, annot=True, vmax=1, vmin=-1, cmap='RdBu')
# plt.title('Correlation Between Variables', size=14)
# plt.show()


The correlation analysis is skipped because every value in the SGN column is unique.
<br>
Hence, the SGN feature is excluded from further consideration.

#### Step4: Feature Selection

In [103]:
# data = data[:, 1] #### exclude the SGN feature from consideration 

#### Step5: Data Preprocessing

In [None]:
## Scale the data
# Shape the values as one column, then fit the min-max scaler and transform
# the same data in a single call.
data_train = np.reshape(data, (len(data), 1))
scaler = MinMaxScaler()
data_norm = scaler.fit_transform(data_train)
data_norm

For now, we pick the value of K arbitrarily; let's start with K = 4.
<br>

#### Step 6: Model Training

In [None]:
# Fit a 4-cluster KMeans model on the scaled data; random_state is fixed so
# repeated runs use the same seed.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(data_norm)

In [None]:
# Centroids of the fitted clusters (expressed in the scaled data space).
kmeans.cluster_centers_

In [None]:
# Cluster index assigned to each training sample.
kmeans.labels_

In [None]:
# Place every scaled value on the vertical line x = 0 and colour it by the
# cluster label the model assigned.
x_zeros = np.zeros(len(data_norm))
plt.scatter(x_zeros, data_norm, c=kmeans.labels_, cmap='rainbow')
plt.grid()

#### Step7: Validate the value of K with Elbow method

In [None]:
#### validation using the Elbow method
# For k = 1..9, fit a KMeans model and record two quality measures:
#   distortions / mapping1 : mean squared distance to the nearest centroid
#   inertias    / mapping2 : the model's inertia_ value
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}

K = range(1, 10)

for k in K:
    kmeans_model = KMeans(n_clusters=k, random_state=42).fit(data_norm)

    # Distortion is measured against each sample's NEAREST centroid, so take
    # the minimum (not the mean) over the centroid axis before squaring.
    nearest = np.min(cdist(data_norm, kmeans_model.cluster_centers_, 'euclidean'), axis=1)
    distortions.append(np.mean(nearest ** 2))

    inertias.append(kmeans_model.inertia_)

    mapping1[k] = distortions[-1]
    mapping2[k] = inertias[-1]


print('Inertia values ::')

for key, val in mapping2.items():
    print(f'{key} : {val}')

# 'bo-' = blue line with circle markers. Giving the marker once avoids the
# conflict between a format-string marker and a separate marker= keyword.
plt.plot(K, inertias, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('The elbow method using Inertias')
plt.grid()
plt.savefig(r'..\results\elbow.png')
plt.show()

Let us analyse the inertia values: the inertia becomes nearly constant beyond k = 6,
<br>
hence a good value of k for this dataset could be 5.
<br>
Let us visualize the data with different values of K

In [None]:
# Initialize a range of k values
k_range = range(2, 7)

# Fit and plot data for each k value
for k in k_range:
    kmeans = KMeans(n_clusters=k,  random_state=42)
    y_kmeans = kmeans.fit_predict(data_norm)

    # Plot the clustered data points
    points = plt.scatter(np.zeros(len(data_norm)), data_norm, c=y_kmeans, cmap='viridis', marker='o', edgecolor='k', s=100)
    plt.title(f'K-means Clustering (k={k})')
    plt.ylabel('Revenue')
    # Build the legend from the scatter's colour mapping; a bare plt.legend()
    # finds no labelled artists here and would render nothing but a warning.
    plt.legend(*points.legend_elements(), title='Cluster')
    plt.grid()
    plt.show()


As per the Elbow method, the optimal value of K for this dataset is 5.
<br>
Hence, re-train the model with K=5 and see the final results

#### Step8: Final Results

In [111]:
# Train the final 5-cluster model. fit_predict fits the estimator and returns
# the cluster label of every sample, so a separate fit() call beforehand
# would only repeat the same training.
kmeans = KMeans(n_clusters=5, random_state=42)
label = kmeans.fit_predict(data_norm)

In [None]:
### save the trained model
# Raw string: keeps the backslashes literal instead of depending on '\m' and
# '\k' being unrecognised escape sequences.
with open(r"..\model\kmeans_model.pkl", "wb") as f:
    pickle.dump(kmeans, f)

In [None]:
# Five discrete colours, one per cluster. pyplot.get_cmap is used because
# matplotlib.cm.get_cmap is no longer available in recent matplotlib releases.
cmap = plt.get_cmap('rainbow', 5)
u_labels = np.unique(label)
for i in u_labels:
    cluster_values = data[label==i]
    data_len = len(cluster_values)
    # Pass the colour explicitly: without colour data (`c`), scatter ignores
    # `cmap`, so the classes would not be drawn in the colormap's colours.
    plt.scatter(np.zeros(data_len), cluster_values, color=cmap(int(i)), marker='o', s=50, label="Class_"+str(i+1))
plt.legend(loc="upper right")
plt.axis('equal')
plt.ylabel('Revenue')
# Use a standalone mappable so the colour bar shows the same five class
# colours; the individual scatter calls carry no colour-mapped data.
class_colors = matplotlib.cm.ScalarMappable(norm=plt.Normalize(vmin=0, vmax=5), cmap=cmap)
plt.colorbar(class_colors, ax=plt.gca(), ticks=[])
plt.grid()
plt.savefig(r"..\results\final_results.png")
plt.show()