## CHAPTER 9 - Unsupervised Learning Techniques

In [4]:
import sys
import os

from datetime import datetime, timedelta

import pandas as pd
import numpy as np

import re
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_rows=500
pd.options.display.max_columns=500

In [5]:
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data[:,2:]
y = iris.target

In [6]:
from sklearn.cluster import KMeans

k = 5
kmeans = KMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)

In [7]:
y_pred

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 4, 4, 4, 0, 4, 4, 4, 0, 4, 0, 0, 4, 0, 4, 0, 4,
       4, 0, 4, 0, 4, 0, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 4, 4, 4, 4, 4,
       0, 0, 4, 4, 0, 0, 0, 0, 0, 4, 0, 0, 3, 1, 3, 1, 1, 3, 4, 3, 1, 3,
       1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 1, 1, 3, 4, 1, 3, 4, 4, 1, 1, 3, 3,
       1, 4, 1, 3, 1, 1, 4, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1])

In [8]:
y_pred is kmeans.labels_

True

In [9]:
kmeans.cluster_centers_

array([[3.83181818, 1.16818182],
       [5.39666667, 2.05333333],
       [1.462     , 0.246     ],
       [6.28461538, 2.13846154],
       [4.64857143, 1.5       ]])

### Centroid initialization methods

In [10]:
good_init = np.array([[-3,3], [-3,2], [-3,1], [-1,2], [0,2]])
kmeans = KMeans(n_clusters = 5, init=good_init, n_init=1)
y_pred = kmeans.fit_predict(X)
kmeans.transform(X[:2])

array([[5.52489316, 4.63790422, 3.63704489, 2.69331025, 0.07720104],
       [5.52489316, 4.63790422, 3.63704489, 2.69331025, 0.07720104]])

### Inertia performance measure

In [11]:
kmeans.inertia_

14.701112010796223

In [12]:
kmeans.score(X)

-14.701112010796226

### Kmeans Mini-batching w/ memmap

In [13]:
from sklearn.cluster import MiniBatchKMeans

#X_mm = np.memmap(filename, dtype="float32", mode="readonly", shape=(m,n))
X_mm = X

minibatch_kmeans = MiniBatchKMeans(n_clusters=5)
minibatch_kmeans.fit(X_mm)

MiniBatchKMeans(n_clusters=5)

### Choosing K order for a given dataset, the Inertia is not the good metrics to evaluate the best K

In [14]:
from sklearn.metrics import silhouette_score

silhouette_score(X, kmeans.labels_)

0.6011169787402741

## Clustering for image - Color segment for each pixel:

In [15]:
from matplotlib.image import imread

image = imread(os.path.join("imaes", "unsupervised_learning", "ladybug.png"))
image.shape

x = image.reshape(-1,3)
kmean = KMeans(n_Cluster=8).fit(X)
segmented_img = kmeans.cluster_centers_[kmeans.labels_]
segmented_img = segmented_img.reshape(image.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'imaes\\unsupervised_learning\\ladybug.png'

## Using clustering for Preprocessing

In [19]:
from sklearn.datasets import load_digits

digits = load_digits()
X_digits = digits.data
y_digits = digits.target

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits)

In [22]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

log_reg.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9733333333333334

In [23]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ("kmeans", KMeans(n_clusters=50)),
    ("log_reg", LogisticRegression())
])

pipeline.fit(X_train, y_train)

pipeline.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9777777777777777

## DBSCAN

In [26]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=1000, noise=0.05)
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)

DBSCAN(eps=0.05)

In [27]:
dbscan.labels_

array([ 0,  1,  2,  3,  0,  1,  1,  1,  3,  4,  5,  3, -1,  3, -1,  6,  3,
        1,  0,  4,  3,  3,  4,  1,  4,  3,  4,  1,  7,  3,  1,  6,  0,  3,
        4,  7,  0,  1,  3,  0,  3,  8,  0,  5,  3,  3,  1,  3,  3,  0,  1,
        1,  2,  4,  4,  0, -1,  6,  4,  1, -1,  0,  3,  0,  1,  3,  3, -1,
        3,  0,  5,  0,  1,  9,  0,  1,  3,  8,  4,  0,  4,  6,  3,  4,  1,
        1,  4, -1,  8,  3,  3,  4,  8,  3,  3,  0,  0,  3,  3,  5,  3,  0,
        1,  4,  1,  2,  3, -1,  6, -1,  4,  3,  0,  1,  1,  1,  1,  8,  0,
        3,  0,  0, -1,  1, -1,  6,  6,  7,  0,  6,  4,  1,  9,  4, -1,  0,
        6,  3, -1, -1,  3,  3,  3,  3,  3,  3,  1,  4,  3,  1,  3, -1,  1,
        6,  2,  3,  0,  1,  5, -1,  3,  1,  5,  3,  4,  3,  1,  5,  3,  7,
        4,  1,  4,  0,  3,  3,  8,  0,  1,  5,  1,  4,  4,  3,  0,  3,  6,
        3, -1,  1,  2,  3,  4,  0,  0,  8,  6,  4,  4,  0,  4,  3,  0,  8,
        3,  6,  4, -1,  3,  3,  3,  3,  4,  3,  0,  0,  4,  2, -1,  0,  0,
        3,  5,  4, -1,  3

In [28]:
dbscan.components_

array([[ 0.77874447, -0.45220564],
       [-0.17358935,  1.01387626],
       [-0.00208964,  0.41547679],
       ...,
       [ 1.8135501 ,  0.05948812],
       [ 1.91076856,  0.20284228],
       [ 1.02259512,  0.29893665]])

### DBSCAN dos not have predict() function so:

In [29]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])

KNeighborsClassifier(n_neighbors=50)

In [31]:
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])
knn.predict(X_new)

array([8, 5, 3, 4], dtype=int64)

In [32]:
knn.predict_proba(X_new)

array([[0.  , 0.02, 0.  , 0.  , 0.  , 0.18, 0.  , 0.3 , 0.5 , 0.  ],
       [0.  , 0.  , 0.38, 0.  , 0.  , 0.52, 0.  , 0.  , 0.  , 0.1 ],
       [0.02, 0.  , 0.  , 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  , 0.  ]])