In [8]:
#Take Abalone dataset. Run some clustering method (except k-means) with different number of clusters.
#Which number of clusters is better? (use "score")

#Take Abalone dataset. Reduce it’s dimensionality to 3 with Kernel PCA, then run clustering with 10 clusters.
#What age have the cluster centriods?
#Don’t forget to apply feature normalisation before clustering, and inverse feature normalisation for the cluster centres!

#Label propagation:
#cluster Abalone dataset without age (rings) feature using Gaussian Mixture Model
#take randomly 20 samples with age from the original Abalone dataset, predict their probabilities with GaussianMM ("predict_proba")
#compute class distribution for each Gaussian component
#compute Gaussian components for each original data point
#compute labels of original data points by multiplying their Gaussian components by class distribution of these components
#How correct was this label propagation?

In [1]:
import pandas as pd

In [2]:
filename = 'abalone.data'

In [3]:
df = pd.DataFrame()

In [4]:
names = [ 'Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']

In [5]:
df_small = pd.read_csv(filename,names = names, delimiter=',').sample(20)
df_small

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
1001,M,0.595,0.475,0.165,1.213,0.621,0.2435,0.274,9
2810,M,0.72,0.55,0.205,2.165,1.1055,0.525,0.404,10
1657,M,0.6,0.48,0.165,0.9165,0.4135,0.1965,0.2725,9
854,M,0.57,0.435,0.13,0.7535,0.349,0.1755,0.194,10
2885,I,0.505,0.4,0.125,0.5605,0.2255,0.1435,0.17,8
3097,M,0.545,0.435,0.145,0.9385,0.3685,0.1245,0.345,11
465,I,0.175,0.125,0.05,0.0235,0.008,0.0035,0.008,5
711,M,0.375,0.3,0.1,0.2465,0.104,0.0475,0.083,11
1295,I,0.525,0.4,0.125,0.6965,0.369,0.1385,0.164,9
2886,M,0.505,0.365,0.115,0.521,0.25,0.096,0.15,8


In [6]:
replace_list = {"Sex" : {"M": 0, "F" : 1, "I": 2}}
df_small.replace(replace_list,inplace=True)

In [7]:
df = pd.read_csv(filename ,names = names, delimiter=',')
del df['Rings']

In [8]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [9]:
replace_list = {"Sex" : {"M": 0, "F" : 1, "I": 2}}
df.replace(replace_list,inplace=True)

In [10]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,0,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,1,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,0,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,2,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [11]:
from sklearn import cluster, datasets, mixture

In [12]:
g = mixture.GaussianMixture(n_components=12)

In [13]:
x = df.iloc[:,:8].values

In [25]:
g.fit(x)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=None, n_components=12, n_init=1, precisions_init=None,
        random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None)

In [26]:
g.predict(x)[0]

1

In [27]:
g.predict_proba(x)[563]

array([6.34992882e-01, 0.00000000e+00, 0.00000000e+00, 2.54180214e-03,
       0.00000000e+00, 0.00000000e+00, 3.62465312e-01, 0.00000000e+00,
       0.00000000e+00, 4.30325183e-09, 0.00000000e+00, 0.00000000e+00])

In [28]:
clusters_small = g.predict_proba(df_small.iloc[:,:8])
rings_small = df_small['Rings']

In [29]:
g.score(x)

19.800600536800047

In [30]:
clusters_small[2]

array([0.00000000e+00, 9.11899234e-01, 0.00000000e+00, 0.00000000e+00,
       2.55274699e-15, 7.75316619e-04, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 8.73254491e-02])

In [31]:
pd.get_dummies(rings_small).head()

Unnamed: 0,5,6,8,9,10,11,14,15,20
1001,0,0,0,1,0,0,0,0,0
2810,0,0,0,0,1,0,0,0,0
1657,0,0,0,1,0,0,0,0,0
854,0,0,0,0,1,0,0,0,0
2885,0,0,1,0,0,0,0,0,0


In [32]:
import numpy as np
np.set_printoptions(precision=3, suppress=True)

In [33]:
cluster_age = clusters_small.T @ pd.get_dummies(rings_small)
cluster_age = cluster_age / cluster_age.sum(1)[:,None]
cluster_age

array([[0.   , 0.   , 0.   , 0.534, 0.466, 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.344, 0.317, 0.335, 0.004, 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.603, 0.004, 0.   , 0.   , 0.   , 0.393, 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.001, 0.   , 0.999, 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 1.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.003, 0.562, 0.015, 0.   , 0.   , 0.419],
       [0.   , 0.   , 0.   , 0.16 , 0.839, 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.098, 0.002, 0.01 , 0.   , 0.   , 0.89 , 0.001, 0.   ],
       [1.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 1.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.183, 0.11 , 0.377, 0.   , 0.   , 0.316, 0.014, 0.   ],
       [0.   , 0.   , 0.002, 0.46 , 0.014, 0.415, 0.   , 0.   , 0.109]])

In [34]:
predicted_classes = g.predict_proba(x) @ cluster_age
predicted_classes[:3]

array([[0.   , 0.   , 0.344, 0.317, 0.335, 0.005, 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 1.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.427, 0.572, 0.   , 0.001, 0.   , 0.   ]])

In [35]:
age_labels = np.array(pd.get_dummies(rings_small).columns)
age_labels

array([ 5,  6,  8,  9, 10, 11, 14, 15, 20], dtype=int64)

In [36]:
predicted_age = predicted_classes @ age_labels
predicted_age.shape

(4177,)

In [37]:
predicted_age[:5]

array([ 9.003, 10.999,  9.579,  9.022,  8.12 ])

In [39]:
from sklearn.decomposition import PCA, KernelPCA
#np.random.seed(0)

# X, y = make_circles(n_samples=400, factor=.3, noise=.05)

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
X_kpca = kpca.fit_transform(x)
X_back = kpca.inverse_transform(X_kpca)
pca = PCA()
X_pca = pca.fit_transform(x)

NameError: name 'KernelPCA' is not defined