<a href="https://colab.research.google.com/github/jidemaestri/Machine-Learning/blob/master/clustering_images_semisupervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Clustering for semisupervised learning

In [6]:
from pathlib import Path
import numpy as np
import pandas as pd

In [8]:
from sklearn.datasets import load_digits

# Load the digits data as (features, targets) arrays, then keep the first
# 1400 samples for training and hold out the remainder as the test set.
X_digits, y_digits = load_digits(return_X_y=True)
X_train, X_test = X_digits[:1400], X_digits[1400:]
y_train, y_test = y_digits[:1400], y_digits[1400:]

In [38]:
# Peek at the raw feature matrix (NumPy abbreviates large arrays when printed).
print(X_digits)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]


In [9]:
from sklearn.linear_model import LogisticRegression

# Baseline: pretend only the first `n_labeled` training samples have labels
# and train a classifier on those alone.
n_labeled = 50
X_labeled, y_labeled = X_train[:n_labeled], y_train[:n_labeled]

log_reg = LogisticRegression(max_iter=10_000)
log_reg.fit(X_labeled, y_labeled)

LogisticRegression(max_iter=10000)

In [10]:
# Accuracy on the held-out test split of the model currently bound to
# `log_reg` (the name is rebound by later cells, so run cells in order).
log_reg.score(X_test, y_test)

0.7481108312342569

#### Clustering the training set

In [12]:
# Cluster the training set into k groups and, for each cluster, pick the
# training image closest to its centroid as that cluster's representative.
from sklearn.cluster import KMeans

k = 50
# n_init is pinned explicitly: its default changed from 10 to "auto" in
# scikit-learn 1.4, which would alter the clustering (and the order of the
# representatives) and silently invalidate the hard-coded labels in
# `y_representative_digits`.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
# fit_transform returns the distance of every training sample to every
# centroid: one row per sample, one column per cluster.
X_digits_dist = kmeans.fit_transform(X_train)
# argmin over axis 0 gives, per cluster column, the row index of the
# training sample nearest to that cluster's centroid.
representative_digit_idx = X_digits_dist.argmin(axis=0)
X_representative_digits = X_train[representative_digit_idx]

In [13]:
# Hard-coded labels for the 50 representative images, one per cluster.
# Entry i is the label for X_representative_digits[i], so the order must match
# the cluster order produced by the KMeans cell above.
# NOTE(review): presumably assigned by visually inspecting each representative
# image; they must be redone if the clustering changes (different k, seed,
# training split or KMeans settings).
y_representative_digits = np.array([
    1, 3, 6, 0, 7, 9, 2, 4, 8, 9,
    5, 4, 7, 1, 2, 6, 1, 2, 5, 1,
    4, 1, 3, 3, 8, 8, 2, 5, 6, 9,
    1, 4, 0, 6, 8, 3, 4, 6, 7, 2,
    4, 1, 0, 7, 5, 1, 9, 9, 3, 7
])

In [22]:
# Train on the 50 representative images only (same number of labeled samples
# as the baseline), then report accuracy on the held-out test split.
# `fit` returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=10_000).fit(
    X_representative_digits, y_representative_digits
)
log_reg.score(X_test, y_test)

0.8488664987405542

In [18]:
# Sanity check: length of the feature vector of one representative image.
X_representative_digits[1].shape

(64,)

In [19]:
# Inspect the raw feature values of the second representative image.
X_representative_digits[1]

array([ 0.,  0.,  4., 13., 16., 11.,  0.,  0.,  0.,  9., 16.,  9., 10.,
       15.,  0.,  0.,  0.,  5.,  4.,  0., 12., 11.,  0.,  0.,  0.,  0.,
        0.,  5., 16., 12.,  1.,  0.,  0.,  0.,  0.,  1.,  9., 15.,  8.,
        0.,  0.,  0.,  0.,  0.,  0.,  8., 12.,  0.,  0.,  0.,  1.,  6.,
        8., 16.,  8.,  0.,  0.,  0.,  5., 16., 15.,  9.,  1.,  0.])

In [23]:
# Labeling instances is often costly and painful because it has to be done manually.



#### Label propagation

In [24]:
# Number of training samples that will receive a propagated label.
len(X_train)

1400

In [21]:
# One label per representative image; the length is expected to equal k.
y_representative_digits.shape

(50,)

In [25]:
# Propagate labels: every training sample inherits the label of the
# representative of the cluster it was assigned to. `kmeans.labels_` holds one
# cluster id per training sample, so using it as an integer index into the
# per-cluster label array performs the lookup for all samples at once.
y_train_propagated = y_representative_digits[kmeans.labels_].astype(np.int64)

In [26]:
# Every training sample should now have a label: expect (len(X_train),).
y_train_propagated.shape

(1400,)

In [27]:
# The cluster ids covered by label propagation: 0 .. k-1.
range(k)

range(0, 50)

In [28]:
# Spot-check cluster 5: all of its members should carry the same propagated label.
y_train_propagated[kmeans.labels_ == 5] 

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

In [29]:
# The label assigned to cluster 5's representative; the members of
# cluster 5 are expected to share this value after propagation.
y_representative_digits[5]

9

In [37]:
# Retrain on the whole training set using the propagated labels, then report
# accuracy on the held-out test split (scored against the true test labels).
# `fit` returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=10_000).fit(X_train, y_train_propagated)
log_reg.score(X_test, y_test)

0.8942065491183879