# CLUSTERING Snippets e Scheletro

## 1. Elaborazione dei Dati

### Import e preparazione delle strutture dati

In [None]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Variables
file_name= 'File_Name.csv'
file_name_2 = 'File_Name_2.csv'
separator = 'Separator'
random_state = 42
target = 'Class_Target'

# Directives
%matplotlib inline
np.random.seed(random_state)


### Caricamento delle strutture dati

In [None]:
# Alternative loading snippets: every variant rebinds `df`, so keep only the
# one that matches the layout of the input file.

# Load file (the first row holds the column labels, the first column holds the index)
df = pd.read_csv(file_name, delimiter = separator, header = 0, index_col = 0)

# Load file (dataset without a header row and without an index column)
df = pd.read_csv(file_name, delimiter = separator, header=None, index_col=None)

# Load file (no header row in the file: the column names are supplied explicitly)
df = pd.read_csv(file_name, delimiter = separator, header=None, index_col=None, names=['colonna1', 'colonna2'])

# Load data from a .txt file through NumPy and wrap the resulting array in a DataFrame
text = np.loadtxt(file_name, delimiter = separator)
df = pd.DataFrame(text)


### Mostra dei dati (SIZE, DESCRIBE, BOXPLOT, PAIRPLOT, CORRELATION MATRIX)

In [None]:
# Show the DataFrame (All)
df

# Show summary statistics of the columns
df.describe()

# Show the head of the dataframe
df.head()

# Show the distinct values and how many times each one occurs.
# NOTE(review): without an `axis` argument np.unique works on the flattened
# data, so the counts refer to the whole dataframe, not to single columns.
np.unique(df, return_counts = True)

# Show the number of rows and columns
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns in this dataset")

# Show Shape
print ("The shape is: {}".format(df.shape))

# Show the size of the dataframe (df.size is the total number of cells)
print(f"The dataframe has size: {df.size}")

# Pairplot (relationships between the attributes, coloured by the target)
# String-valued columns are left out of the plot (no error is raised)
sns.pairplot(df, hue = target)

# How to select columns: here the columns labelled 1 and 2 are selected
# NOTE(review): X is only assigned further down in this file (section 2),
# so that cell has to be run before this snippet.
pippo = X[[1,2]]
pippo
sns.pairplot(pippo)



# Boxplot of every numeric column (useful to spot outliers).
# Non-numeric columns (e.g. strings) cannot be box-plotted, so they are
# filtered out by dtype instead of inspecting the value stored at label 0,
# which fails when the index has no label 0.
numeric_columns = df.select_dtypes(include = 'number').columns

# Three plots per row; at least 4 rows (the original layout), more when
# there are over 12 numeric columns so that plt.subplot never overflows.
n_plot_cols = 3
n_plot_rows = max(4, -(-len(numeric_columns) // n_plot_cols))

plt.figure(figsize=(15,15))
for pos, column in enumerate(numeric_columns, start = 1):
    plt.subplot(n_plot_rows, n_plot_cols, pos)
    # Explicit keyword: the meaning of the first positional argument of
    # sns.boxplot differs between seaborn versions.
    sns.boxplot(x = df[column])

# Correlation matrix, computed on the numeric columns only: string columns
# make DataFrame.corr raise on recent pandas versions.
correlation_matrix = df.select_dtypes(include = 'number').corr()
sns.heatmap(correlation_matrix, cmap="YlGnBu", annot=True)

# Check the number of rows with missing values
rows_missingvalues = df.isna().any(axis=1).sum()
print("Rows with missing values: {}".format(rows_missingvalues))

# Histogram of numeric data
df.hist(figsize=[15,15]);

# Histogram of the target column (works even if it holds strings).
# The column name comes from the `target` variable set in the first cell.
df[target].hist()

# Scatter plot (x = first column, y = second column of df), coloured by target
sns.scatterplot(x=df.columns[0], y=df.columns[1], data=df, hue=target)


### Esempi di Commenti sui grafici dei dati

In [None]:
# The boxplots show that there are no outliers, the distribution of 0 and 3 is very similar.
#  1 and 2 have a similar median value but different distribution of values. There doesn't seem to be any particular situation showing.

# From the pairplot it is clear that the columns 1 and 2 tend to form quite distinct clusters. They're probably our best bet for our clustering efforts.

# Column 1 and 2 are the most interesting attribute. !!! 1 e 2 sono gli attributi piu' facilmente separabili

# Both the silhouette scores and the inertia elbow suggest that the best number of clusters is 4,  !!! Quando si stampa silhouette e inertia
# which is in line with what we were expecting, given the initial pairplots

# The pairplots don't seem to show any particular pattern in the data.




### Modifica del DataSet

In [None]:
# Merge the two dataframes with the 'outer' how, as to perform a SQL-like full outer join
# on the two indexes, adding suffixes as requested (default option)
# (Both frames have an index and a header row; the suffixes tell apart
# columns with the same name in both frames, e.g. the target)
df = first_df.merge(second_df, how = 'outer', left_index = True, right_index = True, suffixes = ('_x', '_y'))

# Drop the rows whose index labels are listed in indexes_to_delete
df = df.drop(index = indexes_to_delete, axis = 0)

# Drop specific column
df = df.drop(columns = 'Column_Name', axis = 1)

# Drop more than 1 column
df = df.drop(columns = ['Column_Name1', 'Column_Name2'], axis = 1)

# Rename specific columns (old name -> new name)
df = df.rename(columns = {'Old_Name1':'New_Name1', 'Old_Name2':'New_name2'})

# Get the column names
column_names = list(df.columns)

# Reindex the dataframe columns according to column_names
# (edit the list first to reorder or select columns)
df = df.reindex(columns = column_names)

# Eliminate the rows containing null values
df = df.dropna()



In [None]:
# Give positional integer names (0, 1, 2, ...) to the columns when the
# original dataset has no column names
columns = list(range(df.shape[1]))
df.columns = columns

# Same as above, but the last column is named after the class target.
# The name comes from the `target` variable set in the first cell, so that
# later snippets using `hue = target` find the column.
columns = list(range(df.shape[1]))
columns[-1] = target
df.columns = columns

### Trasformazione dei dati per Grafici o altro

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# Name of the column to encode
column_target = 'target'

# Set the transformer data type (if required)
transf_dtype = np.int32

# NOTE(review): the two snippets below are alternatives. The one-hot snippet
# drops column_target from df, so the ordinal snippet cannot run after it on
# the same dataframe.

# OneHotEncoder (from nominal to numerical): one new column per category,
# named after the category, then the original column is dropped
encoder = OneHotEncoder(dtype = transf_dtype)
transformed = encoder.fit_transform(df[[column_target]])
df[encoder.categories_[0]] = transformed.toarray()
df = df.drop(column_target, axis = 1)

# OrdinalEncoder (from ordinal to numerical): the column is replaced in place
encoder = OrdinalEncoder(dtype = transf_dtype)
df[column_target] = encoder.fit_transform(df[[column_target]])

In [None]:
from sklearn.preprocessing import OneHotEncoder

# We will transform into integers
transf_dtype = np.int32

# The keyword that requests a dense result was renamed across scikit-learn
# releases (`sparse` -> `sparse_output`) and the old one was later removed.
# Keeping the default sparse output and densifying it explicitly works on
# every version.
encoder = OneHotEncoder(handle_unknown = 'ignore', dtype = transf_dtype)

# Fit and transform the data, then convert the sparse result to a dense array
X_e = encoder.fit_transform(df).toarray()
X_ohe = pd.DataFrame(X_e)
X_ohe.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

def get_ohe(df):
  """Return a one-hot encoded copy of df as a new DataFrame.

  Every column of df is treated as categorical and expanded into one
  int32 indicator column per distinct value. The returned frame has
  positional integer column names and a fresh default index.

  Args:
    df: dataframe whose columns are all to be one-hot encoded.

  Returns:
    A pandas DataFrame holding the dense one-hot encoding of df.
  """
  # We will transform into integers
  transf_dtype = np.int32

  # The keyword that requests a dense result was renamed across scikit-learn
  # releases (`sparse` -> `sparse_output`) and the old one was later removed.
  # Keeping the default sparse output and densifying it explicitly works on
  # every version.
  encoder = OneHotEncoder(handle_unknown = 'ignore', dtype = transf_dtype)

  # Fit and transform the data, then convert the sparse result to a dense array
  X_e = encoder.fit_transform(df).toarray()
  X_ohe = pd.DataFrame(X_e)
  return X_ohe

X_ohe = get_ohe(df)


In [None]:
# The 'Sex' column takes the values M, F, I => replace them with numeric codes
# NOTE(review): OrdinalEncoder is imported in an earlier cell; the code given
# to each value can be checked in oe.categories_ after fitting.

oe = OrdinalEncoder()
df['Sex'] = oe.fit_transform(df['Sex'].values.reshape(-1,1))

### Snippets utili (Liste)

In [None]:
# Remove an item (target) from a list
list_name.remove(target)

# Sort the values
list_name.sort()

# Append an item (target) to a list (put it last)
list_name.append(target)


# Sorting [size, label] pairs
# Example: the discovered clusters listed by decreasing size
# df : dataframe
# "cluster_n" : name of the column holding the cluster labels
val, counts = np.unique(df["cluster_n"], return_counts=True)

# One [size, label] pair per cluster
val_count = [[size, label] for label, size in zip(val, counts)]

# Decreasing sizes => reverse=True (only the size is used as sort key)
val_count = sorted(val_count, key=lambda pair: pair[0], reverse=True)
print(val_count)

### Plot

In [None]:
sns.pairplot(X)

## 2. Clustering

* Il cluster si applica a valori numerici X, quindi vanno tolte (o trasformate) quelle colonne di valori stringhe
* Il salvataggio degli altri parametri potrebbe essere messo in vettore y

In [None]:
# Name of the column that is kept apart from the clustering features
target_column = 'y'

# Put in X all the columns except target_column
X = df.drop(target_column, axis = 1)

# Save target_column in y
y = df[target_column]

### KMeans

#### i. Trovare parametri

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Candidate numbers of clusters
k_range = range(2,11)

# One entry per candidate: inertia (distortion), silhouette score and
# size deviation index
distortions = []
silhouette_scores = []
size_deviation = []

for i in k_range:

    # Fit KMeans with i clusters and get the label of every sample
    km = KMeans(n_clusters = i, init = 'k-means++', n_init = 10,
                max_iter = 300, random_state = random_state)
    y_km = km.fit_predict(X)

    # Size deviation index: standard deviation of the cluster sizes
    # divided by the number of clusters
    cluster_sizes = np.unique(y_km, return_counts = True)[1]
    deviation = np.sqrt(cluster_sizes.var())/i

    # Record the three measures for this number of clusters
    distortions.append(km.inertia_)
    silhouette_scores.append(silhouette_score(X,y_km))
    size_deviation.append(deviation)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
def km_parameters(min_n_cluster,max_n_cluster,random_state,X):
    """Fit KMeans for a range of cluster counts and collect quality measures.

    The candidate cluster counts are range(min_n_cluster, max_n_cluster),
    i.e. max_n_cluster itself is NOT tried.

    Args:
        min_n_cluster: first number of clusters to try.
        max_n_cluster: end of the range (excluded).
        random_state: seed passed to every KMeans instance.
        X: data to cluster.

    Returns:
        A tuple (distortions, silhouette_scores, size_deviation, k_range):
        three lists with one value per tried cluster count (inertia,
        silhouette score, size deviation index) and the range itself.
    """
    k_range = range(min_n_cluster,max_n_cluster)
    distortions, silhouette_scores, size_deviation = [], [], []

    for n_clusters in k_range:
        model = KMeans(n_clusters = n_clusters, init = 'k-means++', n_init = 10,
                       max_iter = 300, random_state = random_state)
        labels = model.fit_predict(X)

        # Size deviation index: standard deviation of the cluster sizes
        # divided by the number of clusters
        cluster_sizes = np.unique(labels, return_counts = True)[1]
        deviation = np.sqrt(cluster_sizes.var())/n_clusters

        distortions.append(model.inertia_)
        silhouette_scores.append(silhouette_score(X,labels))
        size_deviation.append(deviation)

    return distortions,silhouette_scores,size_deviation,k_range

In [None]:
# Fit KMeans with the chosen number of clusters and report the measures
# recorded for it during the parameter search.
# NOTE(review): `k` has to be assigned (a value inside k_range) before
# running this cell; distortions and silhouette_scores come from the search.
km = KMeans(n_clusters=k,
            random_state=random_state)
y_km = km.fit_predict(X)
print("Number of clusters = {}\t- Distortion = {:6.2f}\t- Silhouette score = {:4.2f}"\
    .format(k,distortions[k_range.index(k)],silhouette_scores[k_range.index(k)]))

In [None]:
# Size of each cluster: clust_sizes_km[0] holds the cluster labels,
# clust_sizes_km[1] the number of samples assigned to each label
clust_sizes_km = np.unique(y_km,return_counts=True)

# Pie chart with one slice per cluster, annotated with its percentage
fig = plt.figure(figsize=(16,8))
data = clust_sizes_km[1]
labels = clust_sizes_km[0]
plt.pie(data,
    labels = labels,
    autopct='%1.1f%%',
    shadow=True)
plt.legend()
plt.show()

# Print clust_sizes_km in order to see the size of each cluster


#### ii. Grafico Inertia - Silhouette

In [None]:
# Inertia (left axis, red) and silhouette score (right axis, blue) against
# the number of clusters, drawn on the same figure
fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Number of clusters')
ax1.set_ylabel('Inertia', color=color)
ax1.plot(k_range, distortions, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('Silhouette scores', color=color)  # we already handled the x-label with ax1
ax2.plot(k_range, silhouette_scores, color=color)
ax2.tick_params(axis='y', labelcolor=color)
# Only the [0, 1] part of the axis is shown.
# NOTE(review): silhouette scores can also be negative (down to -1); such
# values would fall outside the visible area.
ax2.set_ylim(0,1)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()

#### ii.a Grafico generico 2 parametri

In [None]:
def two_plots(x, y1, y2, xlabel, y1label, y2label, y2lim=(0, 1)):
    """Draw two series over the same x values, each on its own y-axis.

    y1 is drawn in red against the left axis, y2 in blue against a twin
    right axis. The figure is shown immediately; nothing is returned.

    Args:
        x: x values shared by both series.
        y1: values plotted against the left axis.
        y2: values plotted against the right axis.
        xlabel: label of the x-axis.
        y1label: label of the left y-axis.
        y2label: label of the right y-axis.
        y2lim: (bottom, top) limits of the right axis. The default (0, 1)
            keeps the range this helper always used (handy for silhouette
            scores); pass None to let matplotlib fit the axis to the data.
    """
    fig, ax1 = plt.subplots()

    color = 'tab:red'
    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(y1label, color=color)
    ax1.plot(x, y1, color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

    color = 'tab:blue'
    ax2.set_ylabel(y2label, color=color)  # we already handled the x-label with ax1
    ax2.plot(x, y2, color=color)
    ax2.tick_params(axis='y', labelcolor=color)
    # A fixed range only makes sense for bounded measures, hence the option
    # to switch it off for generic data
    if y2lim is not None:
        ax2.set_ylim(*y2lim)

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()


# Usage
# inertias_km e silhouette_scores_km potrebbero essere presi da  i. Trovare parametri
two_plots(x=k_range, y1=inertias_km, y2=silhouette_scores_km, xlabel='Number of clusters', y1label='Inertias', y2label='Silhouette scores' )

#### iii. Grafico Inertia - Size deviation

In [None]:
fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('Number of clusters')
ax1.set_ylabel('Inertia', color=color)
ax1.plot(k_range, distortions, color=color)
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('Size deviation index', color=color)  # we already handled the x-label with ax1
ax2.plot(k_range, size_deviation, color=color)
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()

#### iv. Grafico Silhouette

In [None]:
fig, ax = plt.subplots()
color = 'tab:red'
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette scores', color=color)
ax.plot(k_range, silhouette_scores, color=color)
ax.tick_params(axis='y', labelcolor=color)
fig.tight_layout()
plt.show()

#### v. Grafico Silhouette con plot_silhouette. A plot of the silhouette index for the data points, grouped according to the clusters

In [None]:
# silhouette_samples is imported here because no earlier cell imports it
from sklearn.metrics import silhouette_samples
from plot_silhouette import plot_silhouette

# IF IT DOESN'T WORK TRY TRANSFORMING X TO AN ARRAY
# X_array = X.to_numpy()

# Per-sample silhouette values, drawn grouped by cluster with plot_silhouette
silhouette_score_samples = silhouette_samples(X, y_km, metric='euclidean')

# The number of clusters shown in the title is read from the labels being
# plotted, so the cell does not depend on a separately maintained variable
n_found_clusters = len(np.unique(y_km))
plt.title(f'Silhouette score for samples with {n_found_clusters} clusters')
plot_silhouette(silhouette_score_samples, y_km)

#### v.1 Grafico Silhouette con plot_silhouette e silhouette_samples

In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples
# Silhouette value of every single sample for the labels in y_km
silhouette = silhouette_samples(X,y_km)
# Uncomment the import below if plot_silhouette has not been imported yet
# from plot_silhouette import plot_silhouette  # python script provided separately
plot_silhouette(silhouette,y_km)

#### vi. Istanziamento + fitting

In [None]:
## CHANGE THE n_clusters PARAMETER to the number of clusters chosen above
# Re-instantiate
km = KMeans(n_clusters = 3, init = 'k-means++', n_init = 10, max_iter = 300, random_state = random_state)

# Fit and predict
y_km = km.fit_predict(X)

In [None]:
print("Number of clusters = {}\t- Distortion = {:6.2f}\t- Silhouette score = {:4.2f}".format(k,distortions[k_range.index(k)],silhouette_scores[k_range.index(k)]))

#### vii. plot pairplot . Input : dataframe , numero_di_cluster , random_state

In [None]:

def plot_pairplot(df,n_cluster,random_state):
    """Cluster df with KMeans and draw a pairplot coloured by cluster.

    Args:
        df: dataframe with the features to cluster (left untouched).
        n_cluster: number of clusters passed to KMeans.
        random_state: seed passed to KMeans.

    Returns:
        A copy of df with an extra "cluster_n" column holding the cluster
        label assigned to every row.
    """
    best_km = KMeans(n_cluster, init = 'k-means++', random_state=random_state)
    y_best_km = best_km.fit_predict(df)
    target = "cluster_n"

    # Work on a copy: adding the label column to the caller's dataframe would
    # make it a feature in any later clustering of the same frame (including
    # a second call of this function).
    df_y = df.copy()
    df_y[target] = y_best_km
    sns.pairplot(df_y, hue = target)

    # Returned so that callers can still get at the labelled data
    return df_y

### DBSCAN

#### i. Trovare parametri

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid

# !!! WARNING: eps must be chosen according to the distances between the points !!!
# Grid suited to small distances (e.g. features rescaled to a small range)
param_grid = {'eps': list(np.arange(0.01, 1, 0.01)), 'min_samples': list(range(1,10,1))}
# Alternative grid for data with larger distances
# param_grid = {'eps': list(np.arange(60, 120, 20)), 'min_samples': list(range(5,30,5))}
params = list(ParameterGrid(param_grid))

# Results table: starts with 0 rows and 6 columns, one row is added for every
# parameter combination that produces at least two clusters
dbscan_out = pd.DataFrame(columns = ['eps','min_samples','n_clusters', 'size deviation index','silhouette', 'unclust%'])
print("{:11}\t{:11}\t{:11}\t{:11}\t{:11}\t{:11}".format('        eps','min_samples',' n_clusters',' silhouette', '    unclust%', '    size deviation'))

for param_set in params:
    db = DBSCAN(**param_set)
    y_db = db.fit_predict(X)

    # Distinct labels without the noise label (-1)
    cluster_labels_all = np.unique(y_db)
    cluster_labels = cluster_labels_all[cluster_labels_all != -1]
    n_clusters = len(cluster_labels)

    # The silhouette score needs at least two clusters
    if n_clusters > 1:
        X_cl = X.iloc[y_db!=-1,:]       # data without the noise points
        y_db_cl = y_db[y_db!=-1]        # labels without the noise points

        silhouette = silhouette_score(X_cl,y_db_cl)
        # Size deviation index: standard deviation of the cluster sizes
        # divided by the number of clusters (not by the position of the
        # combination in the grid, which is 0 for the first one)
        deviation = np.sqrt(np.unique(y_db_cl, return_counts = True)[1].var())/n_clusters
        uncl_p = (1 - y_db_cl.shape[0]/y_db.shape[0]) * 100                 # % of unclustered data
        dbscan_out.loc[len(dbscan_out)] = [db.eps, db.min_samples, n_clusters, deviation, silhouette, uncl_p]
        print("{:11.2f}\t{:11}\t{:11}\t{:11.2f}\t{:11.2f}\t{:11.2f}".format(db.eps, db.min_samples, n_clusters, silhouette, uncl_p, deviation))

In [None]:
sil_thr = 0.7  # visualize results only for combinations with silhouette above the threshold
unc_thr = 100 # visualize results only for combinations with unclustered% below the threshold
n_clu_max_thr = 10 # visualize results only for combinations with at most this many clusters

# Filter: keep the rows of dbscan_out that satisfy all three thresholds
db_sort = dbscan_out[(dbscan_out['silhouette']>=sil_thr)\
         & (dbscan_out['unclust%']<=unc_thr)\
         & (dbscan_out['n_clusters']<=n_clu_max_thr)]

# Best silhouette first (the sorted view is displayed, db_sort is not modified)
db_sort.sort_values('silhouette', ascending=False)

In [None]:
# Metodo alternativo per trovare i parametri con stampa


In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid

# !!!! WARNING: use the grid that matches the kind of data !!!!
# Grid for continuous variables (reference only: the grid actually used in
# this cell is the discrete one defined below)
# param_grid = {'eps': list(np.arange(0.01, 1, 0.01)), 'min_samples': list(range(1,10,1))}

# Grid for discrete variables, e.g. one-hot encoded data
# Example values
number_of_column = 30 # df.shape[1]
number_of_row = 1000 # df.shape[0]
delta_item = 10
min_number_of_item_for_cluster = 30

param_grid = {'eps': list(np.arange(1, number_of_column, 1)),
              'min_samples': list(range(min_number_of_item_for_cluster,number_of_row,delta_item))}
params = list(ParameterGrid(param_grid))

# Results table: starts with 0 rows and 6 columns, one row is added for every
# parameter combination that produces at least two clusters
dbscan_out = pd.DataFrame(columns = ['eps','min_samples','n_clusters', 'size deviation index','silhouette', 'unclust%'])
print("{:11}\t{:11}\t{:11}\t{:11}\t{:11}\t{:11}".format('        eps','min_samples',' n_clusters',' silhouette', '    unclust%', '    size deviation'))

for param_set in params:
    db = DBSCAN(**param_set)
    # X_ohe is the one-hot encoded dataframe built in the encoding cells
    y_db = db.fit_predict(X_ohe)

    # Distinct labels without the noise label (-1)
    cluster_labels_all = np.unique(y_db)
    cluster_labels = cluster_labels_all[cluster_labels_all != -1]
    n_clusters = len(cluster_labels)

    # The silhouette score needs at least two clusters
    if n_clusters > 1:
        X_cl = X_ohe.iloc[y_db!=-1,:]   # data without the noise points
        y_db_cl = y_db[y_db!=-1]        # labels without the noise points

        silhouette = silhouette_score(X_cl,y_db_cl)
        # Size deviation index: standard deviation of the cluster sizes
        # divided by the number of clusters (not by the position of the
        # combination in the grid, which is 0 for the first one)
        deviation = np.sqrt(np.unique(y_db_cl, return_counts = True)[1].var())/n_clusters
        uncl_p = (1 - y_db_cl.shape[0]/y_db.shape[0]) * 100    # % of unclustered data
        dbscan_out.loc[len(dbscan_out)] = [db.eps, db.min_samples, n_clusters, deviation, silhouette, uncl_p]
        print("{:11.2f}\t{:11}\t{:11}\t{:11.2f}\t{:11.2f}\t{:11.2f}".format(db.eps, db.min_samples, n_clusters, silhouette, uncl_p, deviation))

#### Size of a cluster

In [None]:
# Example parameters: replace them with the combination chosen from the search
db = DBSCAN(eps = 100.00, min_samples = 15)

# Fit and predict
y_db = db.fit_predict(X.to_numpy())
y_db_cl = y_db[y_db!=-1]                                            # labels without the noise points (-1)

# Size of each DBSCAN cluster, noise excluded: index 0 holds the labels,
# index 1 the counts (the variable keeps the `_km` name used by the KMeans cell)
clust_sizes_km = np.unique(y_db_cl, return_counts=True)

# Pie chart with one slice per cluster, annotated with its percentage
fig = plt.figure(figsize=(16,8))
data = clust_sizes_km[1]
labels = clust_sizes_km[0]
plt.pie(data,
    labels = labels,
    autopct='%1.1f%%',
    shadow=True)
plt.legend()
plt.show()

#### ii. Istanziamento + fitting

In [None]:
## CAMBIARE PARAMETRI
# Re-instantiate
db = DBSCAN(eps = 0.14, min_samples = 7)

# Fit and predict
y_db = db.fit_predict(X)

### Agglomerative Clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid

# Every combination of number of clusters (k_range comes from the KMeans
# search) and linkage criterion
parameters = [{'n_clusters': k_range
                    , 'linkage' : ['ward', 'complete', 'average', 'single']}]
pg = list(ParameterGrid(parameters))

# One [linkage, n_clusters, silhouette, size deviation] entry per combination
result_ac = []
for param_set in pg:
    ac = AgglomerativeClustering(**param_set)
    y_ac = ac.fit_predict(X)
    # Size deviation index: standard deviation of the cluster sizes divided
    # by the number of clusters (not by the position of the combination in
    # the grid, which is 0 for the first one)
    deviation = np.sqrt(np.unique(y_ac, return_counts = True)[1].var())/param_set['n_clusters']
    result_ac.append([param_set['linkage'],param_set['n_clusters'], silhouette_score(X,y_ac), deviation])

In [None]:
df_result_ac = pd.DataFrame(data = result_ac, columns=['linkage','n_clusters','silhouette_score', 'deviation'])
df_result_ac.sort_values(by='silhouette_score', ascending=False).head(5)

#### Istanziamento + fitting

In [None]:
## CAMBIARE PARAMETRO CLUSTER
# Re-instantiate
ac = AgglomerativeClustering(n_clusters=5)

# Fit and predict
y_ac = ac.fit_predict(X)

## 3. Confronto

### Performance con migliori parametri, matrici di confusione (Pair) e adjusted_rand_score

In [None]:
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.metrics.cluster import pair_confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score

# Output the silhouette score
print(f"The silhouette score for best_k = 3 was {silhouette_score(X,y_km)}")

# Pairplot coloured by the class produced by the clustering
# NOTE(review): this needs a 'Target' column in X (e.g. the cluster labels
# stored under that name) — no cell in this file adds it.
sns.pairplot(X, hue = 'Target')

# Adjusted_rand_score (agreement between the KMeans and the DBSCAN labelling)
adjusted_rand_score(y_km, y_db)

# Pair_confusion_matrix (classifies pairs of samples as TP FP TN FN)
pair_confusion_matrix(y_km, y_db)

# Confusion matrix (compares the gold-standard y with the cluster labels)
cm = confusion_matrix(y, y_ac)
CMD = ConfusionMatrixDisplay(cm)
CMD.plot()

# Accuracy scores
# NOTE(review): cluster labels are presumably arbitrary numbers, so these
# scores are only meaningful once the labels have been matched to the
# classes in y (see the gold standard / max_diag sections) — confirm.
DB_accuracy = accuracy_score(y, y_db) * 100
print(f"The accuracy for DB was {DB_accuracy:.2f}%")

KM_accuracy = accuracy_score(y, y_km) * 100
print(f"The accuracy for KM was {KM_accuracy:.2f}%")

AC_accuracy = accuracy_score(y, y_ac) * 100
print(f"The accuracy for AC was {AC_accuracy:.2f}%")

### A sorted list of the discovered clusters for decreasing sizes

In [None]:
# Distinct cluster labels and the number of rows assigned to each of them
val, counts = np.unique(df["cluster"], return_counts=True)

# [size, label] pairs, largest cluster first (ties are ordered by label,
# also decreasing, because whole pairs are compared)
val_count = sorted(([size, label] for label, size in zip(val, counts)), reverse=True)

# Keep only the labels, in decreasing order of cluster size
sorted_clusters = [pair[1] for pair in val_count]
sorted_clusters

## 4. Gold Standard

### Rimappare gli y_km derivati dal clustering secondo il gold standard y

In [None]:
# Remap the cluster labels in y_km onto the gold-standard labels in y.
# y_km is read as a sequence of runs of identical consecutive labels; every
# run is relabelled (in place) with the gold-standard class that is most
# frequent in the same positions of y.
# NOTE(review): this presumably expects the samples to be ordered so that
# each cluster forms one contiguous run, and y to hold non-negative integers
# (required by np.bincount) — confirm before relying on the result.
gold_values = np.asarray(y)
n_samples = len(y_km)

# Start position of every run, computed before any label is changed. The end
# of the data closes the list of boundaries, so the last run is remapped as
# well: a remap triggered only by a label change never reaches it.
run_starts = [0] + [idx for idx in range(1, n_samples) if y_km[idx] != y_km[idx - 1]]
run_bounds = run_starts + [n_samples]

changes = 0
for start_idx, end_idx in zip(run_bounds[:-1], run_bounds[1:]):

    # Nothing to remap when y_km is empty
    if end_idx == start_idx:
        continue

    # Count the occurrences of each gold-standard class in the run
    occurrences = np.bincount(gold_values[start_idx:end_idx])

    # Find which class is the most frequent
    gold_standard = np.argmax(occurrences)

    # Remap the run, counting the labels that actually change
    for i in range(start_idx, end_idx):

        if y_km[i] != gold_standard:
            y_km[i] = gold_standard
            changes = changes + 1

# Output the number of labels changed by the remapping
print(f"This remapping performed {changes} changes to y_km subsets")

## 5. Trasformazioni dei valori

### Miglioramento dei risultati (operazione Logaritmica)

In [None]:
# Replace with its natural logarithm every column of X that contains no
# value less than or equal to zero; the other columns are left as they are
for column in X.columns:

    has_non_positive = (X[column] <= 0).any()
    if not has_non_positive:
        X[column] = np.log(X[column])

### MinMax Scaler: trasformazione tutti valori in un range da 0 a 1

In [None]:
# remap on the 0:1 range with MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
# fit_transform returns a plain array: rebuild the DataFrame with the original
# column names AND the original index, so that the rows of X can still be
# matched by label with y / df (e.g. after rows were dropped or when the file
# was loaded with index_col = 0)
X = pd.DataFrame(mms.fit_transform(X), columns = X.columns, index = X.index)
X.head()

### Square Root transformation

In [None]:
# square root transformation - the first two columns are not transformed
# np.sqrt works on the whole block of columns at once, replacing the
# element-by-element DataFrame.applymap call (deprecated in recent pandas).
# Note: a negative value becomes NaN (with a RuntimeWarning) instead of
# stopping the cell with a ValueError.
X_sqrt = pd.concat([X.iloc[:,:2], np.sqrt(X.iloc[:,2:])], axis=1)

### MaxDiag

In [None]:
# max_diag is a helper provided with the course material (a local
# max_diag.py module, not part of scikit-learn)
from max_diag import max_diag

# Apply on a confusion matrix
# NOTE(review): `cm` and ConfusionMatrixDisplay come from the comparison
# cell; there `cm` is built from y_ac even though the result is named cm_km.
cm_km = max_diag(cm)
CMD = ConfusionMatrixDisplay(cm_km)
CMD.plot()

# To see why it is useful
help(max_diag)

### PowerTransformer,StandardScaler

In [None]:
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.pipeline import make_pipeline

# Two-step preprocessing: StandardScaler with the variance scaling switched
# off (with_std=False), followed by a PowerTransformer that standardizes
# its output
preprocessor = make_pipeline(StandardScaler(with_std=False),
                        PowerTransformer(standardize=True))
# Work on a copy so that X itself is not modified
X_fit= X.copy()
# NOTE(review): the result of fit_transform is presumably a NumPy array, so
# X_fit no longer carries the column names of X — confirm before using them
X_fit= preprocessor.fit_transform(X_fit)
X_fit