In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Load the dataset from the project-relative data directory.
iris = pd.read_csv('data/iris.csv')

In [4]:
# Hold out 30% of the rows as a test split; the fixed seed keeps the split reproducible.
train, test = train_test_split(
    iris,
    test_size=0.3,
    random_state=123,
)

In [5]:
# Feature columns used by every model below: all columns except the last one.
var_list = iris.columns[:-1]
var_list

Index(['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth'], dtype='object')

In [6]:
# Work on copies so the raw train/test frames stay untouched by the scaling step.
train_m, test_m = train.copy(), test.copy()

In [7]:
# Learn the per-feature min/max on the training split only, so no information
# from the test split leaks into the scaling.
minmax = MinMaxScaler()
minmax.fit(train[var_list])

# Apply the same fitted scaler to both splits, writing into the working copies.
for raw_frame, scaled_frame in ((train, train_m), (test, test_m)):
    scaled_frame[var_list] = minmax.transform(raw_frame[var_list])

In [8]:
# Cluster the min-max-scaled training features into three groups.
# random_state pins the centroid initialisation so a fresh kernel run gives the
# same clusters; n_init is set explicitly because its default differs between
# scikit-learn releases. Cluster ids themselves are arbitrary labels.
km = KMeans(n_clusters=3, n_init=10, random_state=123).fit(train_m[var_list])

In [9]:
# Centroid of each k-means cluster, expressed in the scaled feature space
# (one row per cluster, one column per entry of var_list).
km.cluster_centers_

array([[0.71891327, 0.39184953, 0.78433665, 0.80603448],
       [0.18181818, 0.55255682, 0.07997881, 0.06510417],
       [0.45661157, 0.26239669, 0.57241911, 0.54734848]])

In [10]:
# Cluster index assigned to each training row, in the row order of train_m.
km.labels_

array([0, 0, 2, 1, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, 1, 0, 2, 1, 1, 0, 2,
       0, 0, 1, 2, 2, 2, 1, 0, 2, 0, 1, 0, 2, 1, 1, 2, 2, 0, 1, 1, 2, 1,
       2, 0, 1, 0, 1, 1, 0, 1, 1, 2, 0, 2, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 0, 0, 2, 2, 0, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 2, 2, 1, 2, 0, 2,
       0, 0, 1, 2, 1, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 0])

In [11]:
# Attach the k-means cluster id to each training row as column 'gr'.
train_m = train_m.assign(gr=km.labels_)

In [12]:
# Agglomerative (hierarchical) clustering of the scaled training features into two groups.
hist_clust = (
    AgglomerativeClustering(n_clusters=2)
    .fit(train_m[var_list])
)

In [13]:
# Cluster index assigned to each training row by the agglomerative model.
hist_clust.labels_

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype=int64)

# PCA

In [14]:
# Fit PCA on the unscaled training features; no component limit is given,
# so every component is kept.
pca = PCA()
pca.fit(train[var_list])

# Principal axes: one row per component, one column per input feature.
pca.components_

array([[ 0.35836592, -0.08537123,  0.85849429,  0.35675367],
       [ 0.67903416,  0.70688269, -0.1733278 , -0.09584857],
       [-0.54380181,  0.61223521,  0.05028193,  0.57176863],
       [-0.33877141,  0.34379742,  0.48001746, -0.73254385]])

In [15]:
# Variance captured by each principal component, in the same order as pca.components_.
pca.explained_variance_

array([3.97184808, 0.20866195, 0.08429116, 0.02135265])

In [18]:
# Share of total variance explained by each component, indexed by component position.
var_ratio = pd.Series(
    pca.explained_variance_ratio_,
    index=range(len(var_list)),
)
var_ratio

0    0.926670
1    0.048683
2    0.019666
3    0.004982
dtype: float64

In [19]:
# Running total of the explained-variance ratios: the share of variance
# retained when keeping the first k components.
var_ratio.cumsum()

0    0.926670
1    0.975352
2    0.995018
3    1.000000
dtype: float64

In [20]:
# Project the held-out test rows onto the principal axes learned from the training split.
pca.transform(test[var_list])

array([[ 1.20919369e+00, -3.06687641e-01, -3.64655950e-01,
        -2.09380253e-02],
       [ 2.07483981e+00,  2.24764960e-01,  4.27910895e-02,
        -1.69940856e-01],
       [ 2.07009248e+00, -2.15142880e-01,  2.00069827e-01,
        -1.28444416e-01],
       [ 1.57505766e-01, -2.70738258e-01,  1.67543649e-01,
         1.50595478e-01],
       [-2.89061209e+00,  2.61070783e-01,  9.36624665e-02,
         1.77019403e-01],
       [ 1.08017497e+00, -1.68378339e-01,  2.71104594e-01,
        -1.51727923e-02],
       [ 7.25670058e-01, -3.42230565e-01, -6.26597684e-01,
        -1.83197469e-01],
       [-3.08582062e+00, -3.53451450e-01,  1.72817957e-01,
         8.76300083e-02],
       [-2.97435982e+00, -5.82849036e-01, -5.82441261e-03,
         3.24925272e-02],
       [ 3.77690586e-01, -6.63808231e-01, -6.50625379e-02,
         2.64213273e-01],
       [ 1.83009229e+00,  4.13518048e-01,  1.43875384e-01,
        -5.07954010e-01],
       [-2.69113992e+00,  1.08303603e+00,  1.91899371e-01,
      