# Clustering Algorithms

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

In [None]:
# Load every benchmark clustering dataset from ./datasets, in a fixed order.
flame, agg, spiral, r15, path, jain, comp = (
    pd.read_csv(csv_file)
    for csv_file in (
        './datasets/flame.csv',
        './datasets/aggregation.csv',
        './datasets/spiral.csv',
        './datasets/r15.csv',
        './datasets/pathbased.csv',
        './datasets/jain.csv',
        './datasets/compound.csv',
    )
)

In [None]:
# Short display name -> loaded frame, so all datasets can be iterated together.
datasets = dict(
    flame=flame,
    agg=agg,
    spiral=spiral,
    r15=r15,
    path=path,
    jain=jain,
    comp=comp,
)

In [None]:
# Preview the first rows of the compound dataset.
comp.head()

In [None]:
def plot_dataset(title, dataset, *, figsize=(8, 6), cmap='seismic'):
    """Scatter-plot a 2-D dataset on a new figure, coloring points by label.

    Args:
        title: Text used as the plot title.
        dataset: Object indexable by 'x', 'y' and 'label' (the notebook
            passes pandas DataFrames with those columns).
        figsize: Keyword-only figure size forwarded to ``plt.figure``.
            Defaults to the original hard-coded (8, 6).
        cmap: Keyword-only colormap used to map label values to colors.
            Defaults to the original hard-coded 'seismic'.

    Returns:
        None. The plot is drawn on a freshly created current figure.
    """
    plt.figure(figsize=figsize)
    plt.title(title)
    plt.scatter(dataset['x'], dataset['y'], c=dataset['label'], cmap=cmap)

In [None]:
# Draw one labelled scatter plot per loaded dataset.
for name, frame in datasets.items():
    plot_dataset(title=name, dataset=frame)

### Fit and compare k-means, agglomerative clustering, and DBSCAN on the flame dataset

In [None]:
# Keep only the coordinates; the 'label' column is withheld from the models.
flame_no_label = flame[['x', 'y']]

In [None]:
## k-means on the flame coordinates (3 clusters requested)

kmeans_mod = KMeans(n_clusters=3)
# fit_predict fits the model and returns the assignments stored in labels_.
kmeans = flame_no_label.assign(label=kmeans_mod.fit_predict(flame_no_label))

In [None]:
## agglomerative clustering on the flame coordinates (2 clusters requested)

agglom_mod = AgglomerativeClustering(n_clusters=2).fit(flame_no_label)
# Work on a copy so the unlabelled frame stays reusable for the other models.
agglom = flame_no_label.copy()
agglom['label'] = agglom_mod.labels_

In [None]:
## DBSCAN on the flame coordinates (eps = .82, min_samples = 3)

dbscan_mod = DBSCAN(eps=.82, min_samples=3)
dbscan_labels = dbscan_mod.fit(flame_no_label).labels_
dbscan = flame_no_label.assign(label=dbscan_labels)

In [None]:
# Original labels first, then each model's assignments, for visual comparison.
for panel_title, panel_data in (('flame', flame), ('kmeans', kmeans),
                                ('agglom', agglom), ('dbscan', dbscan)):
    plot_dataset(panel_title, panel_data)

### Let's try spiral

In [None]:
# Keep only the coordinates; the 'label' column is withheld from the models.
spiral_no_label = spiral[['x', 'y']]

In [None]:
## k-means on the spiral coordinates (3 clusters requested)

kmeans_mod = KMeans(n_clusters=3)
spiral_kmeans_labels = kmeans_mod.fit_predict(spiral_no_label)
kmeans = spiral_no_label.assign(label=spiral_kmeans_labels)

In [None]:
## agglomerative clustering on the spiral coordinates (3 clusters requested)

agglom_mod = AgglomerativeClustering(n_clusters=3).fit(spiral_no_label)
agglom = spiral_no_label.assign(label=agglom_mod.labels_)

In [None]:
## DBSCAN on the spiral coordinates (eps = 2, min_samples = 4)

dbscan_mod = DBSCAN(eps=2, min_samples=4).fit(spiral_no_label)
# Copy first so the label column is not written into the unlabelled frame.
dbscan = spiral_no_label.copy()
dbscan['label'] = dbscan_mod.labels_

In [None]:
# Original labels first, then each model's assignments, for visual comparison.
for panel_title, panel_data in (('spiral', spiral), ('kmeans', kmeans),
                                ('agglom', agglom), ('dbscan', dbscan)):
    plot_dataset(panel_title, panel_data)

## Credit Card Customer Clustering

In [None]:
# Load the credit card customer data used for the segmentation section.
cc_df = pd.read_csv('./datasets/CC_GENERAL.csv')

In [None]:
# Preview the first rows of the raw data.
cc_df.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
cc_df.info()

In [None]:
# Summary statistics, transposed so each column of cc_df becomes a row.
cc_df.describe().transpose()

In [None]:
# Count missing values per column.
cc_df.isna().sum()

In [None]:
# Show the rows whose CREDIT_LIMIT is missing.
credit_limit_missing = cc_df['CREDIT_LIMIT'].isna()
cc_df.loc[credit_limit_missing]

In [None]:
## Impute missing MINIMUM_PAYMENTS with the column median.
## (Why these payments are missing deserves more research -- might be a promotion, etc.)

min_payment_median = cc_df['MINIMUM_PAYMENTS'].median()
cc_df['MINIMUM_PAYMENTS'] = cc_df['MINIMUM_PAYMENTS'].fillna(min_payment_median)

In [None]:
# Drop every row that still contains a missing value; cc_df is modified in place.
cc_df.dropna(inplace=True)

In [None]:
# Preview the data again after imputation and row removal.
cc_df.head()

In [None]:
from sklearn.preprocessing import StandardScaler, power_transform
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Remove the CUST_ID column in place so it is not part of the feature matrix.
cc_df.drop(columns='CUST_ID', inplace=True)

In [None]:
# Work on a copy so the transformations below leave cc_df untouched.
X = cc_df.copy()

In [None]:
# Apply a Yeo-Johnson power transform to the features; X is rebound to the result.
# NOTE(review): per the scikit-learn docs power_transform standardizes by default,
# so the separate StandardScaler step may be redundant -- confirm.
X = power_transform(X, method='yeo-johnson')

In [None]:
# Scaler used to produce X_sc from the power-transformed features.
scaler = StandardScaler()

In [None]:
# Fit the scaler on X and keep the scaled result in X_sc.
# NOTE(review): the k-means cells in this notebook fit on X, not X_sc -- confirm which is intended.
X_sc = scaler.fit_transform(X)

In [None]:
# Display the scaled feature matrix.
X_sc

In [None]:
# Elbow search: record the k-means inertia for k = 1 .. n_clusters - 1.
n_clusters = 30
cost = [KMeans(k).fit(X).inertia_ for k in range(1, n_clusters)]

In [None]:
# Elbow plot: inertia against the number of clusters that produced it.
# cost[0] belongs to k = 1, so the x values start at 1 instead of the
# list index 0; otherwise the elbow is read one cluster too low.
plt.xlabel('number of clusters (k)')
plt.ylabel('inertia')
plt.plot(range(1, len(cost) + 1), cost, 'bx-')

In [None]:
# Final model: k-means with 5 clusters on the transformed features.
kmean = KMeans(n_clusters=5)
# fit_predict fits the model and returns the assignments stored in labels_.
labels = kmean.fit_predict(X)

In [None]:
# Wrap the transformed values in a DataFrame again, reusing cc_df's column names.
X = pd.DataFrame(data=X, columns=cc_df.columns)

In [None]:
# Append each row's cluster assignment as a new 'cluster' column.
cluster_column = pd.DataFrame({'cluster': labels})
clusters = pd.concat([X, cluster_column], axis=1)
clusters.head()

In [None]:
# For every column (including 'cluster' itself), draw one histogram panel per cluster.
for column in clusters.columns:
    sns.FacetGrid(clusters, col='cluster').map(plt.hist, column)

In [None]:
# Pairwise cosine distance between rows of X (1 - cosine similarity).
dist = 1 - cosine_similarity(X)

# Fit a 2-component PCA on the distance matrix, then project that matrix with it.
pca = PCA(n_components=2).fit(dist)
X_PCA = pca.transform(dist)
X_PCA.shape

In [None]:
# Scatter the two PCA components, drawing each k-means cluster in its own color.
x, y = X_PCA[:, 0], X_PCA[:, 1]

# Color and legend text per cluster id.
colors = {0: 'red',
          1: 'blue',
          2: 'green',
          3: 'yellow',
          4: 'orange',
          5: 'purple'}

names = {0: 'cluster 0',
         1: 'cluster 1',
         2: 'cluster 2',
         3: 'cluster 3',
         4: 'cluster 4',
         5: 'cluster 5'}

df = pd.DataFrame({'x': x, 'y': y, 'label': labels})
groups = df.groupby('label')

fig, ax = plt.subplots(figsize=(20, 13))

# One marker-only series per cluster so each gets its own legend entry.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=5,
            color=colors[name], label=names[name], mec='none')

# Axis styling does not depend on the cluster, so it is applied once.
ax.set_aspect('auto')
# Booleans are required here: the string 'off' is truthy, so it would leave
# the ticks and tick labels visible instead of hiding them.
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

ax.legend()
ax.set_title("Customers Segmentation based on their Credit Card usage bhaviour.")
plt.show()