# Unsupervised Learning

### Loading Libraries

In [None]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt

# statsmodels
from statsmodels. datasets import get_rdataset

# SciPy
from scipy.cluster. hierarchy import (dendrogram, cut_tree)

# Scikit-Learn
from sklearn. decomposition import PCA
from sklearn. preprocessing import StandardScaler
from sklearn.cluster import (KMeans, AgglomerativeClustering)

# ISLP
from ISLP import load_data
from ISLP.cluster import compute_linkage

### Principal Components Analysis

In [None]:
# Fetch the 'USArrests' R dataset through statsmodels and keep its `.data`
# attribute (used below like a pandas table: .columns, .mean(), .var()).
USArrests = get_rdataset('USArrests').data

# Bare expression so the notebook displays the table.
USArrests

In [None]:
# Show the variable (column) names of the dataset.
USArrests.columns

In [None]:
# Mean of each variable; compare with the variances shown by `.var()`.
USArrests.mean()

In [None]:
# Variance of each variable; large differences in scale motivate standardizing
# the data before running PCA.
USArrests.var()

In [None]:
# Standardize every variable: centering (with_mean) and scaling to unit
# standard deviation (with_std) are both requested explicitly.
scaler = StandardScaler(with_mean=True, with_std=True)

# Learn the per-column statistics, then apply them to the same data.
scaler.fit(USArrests)
USArrests_scaled = scaler.transform(USArrests)

In [None]:
# PCA estimator with default settings (no limit on the number of components).
pcaUS = PCA()

In [None]:
# Fit the principal components on the standardized data.
pcaUS.fit(USArrests_scaled)

In [None]:
# Per-variable mean stored by the fit; expected to be ~0 here because the
# input was already centered by StandardScaler(with_mean=True).
pcaUS.mean_

In [None]:
# Principal component scores: the standardized data projected onto the
# fitted components (one column per component, as indexed in the plots below).
scores = pcaUS.transform(USArrests_scaled)

In [None]:
# Loadings: indexed below as components_[component, variable].
pcaUS.components_

In [None]:
# Biplot: PC scores as points, variable loadings as arrows from the origin.
i, j = 0, 1  # which components to plot (0-based indices into scores / components_)
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# Index the scores with i and j (not literal 0 and 1) so the points always
# agree with the axis labels and the loading arrows drawn below.
ax.scatter(scores[:, i], scores[:, j])
ax.set_xlabel('PC%d' % (i+1))
ax.set_ylabel('PC%d' % (j+1))

# One arrow and one text label per original variable (column of components_).
for k in range(pcaUS.components_.shape[1]):
    ax.arrow(0, 0, pcaUS.components_[i, k], pcaUS.components_[j, k])
    ax.text(pcaUS.components_[i, k],
            pcaUS.components_[j, k],
            USArrests.columns[k])

plt.grid(True)
plt.show()

In [None]:
# Biplot redrawn with the y component sign-flipped and the arrows lengthened.
# Relies on `i` and `j` chosen in the previous plotting cell.
scale_arrow = s_ = 2  # factor applied to the loading arrows for visibility

# Flip the sign of the y-axis component in BOTH the scores and the loadings so
# they stay consistent with each other (a principal component is only defined
# up to sign). Use j rather than a literal 1 so the flip follows the plotted
# y component.
# NOTE: both operations are in place; re-running this cell flips the sign back.
scores[:, j] *= -1
pcaUS.components_[j] *= -1

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
# Index with i and j so the points match the axis labels and arrows.
ax.scatter(scores[:, i], scores[:, j])
ax.set_xlabel('PC%d' % (i+1))
ax.set_ylabel('PC%d' % (j+1))

# One scaled arrow and one label per original variable.
for k in range(pcaUS.components_.shape[1]):
    ax.arrow(0, 0,
             s_*pcaUS.components_[i, k],
             s_*pcaUS.components_[j, k])
    ax.text(s_*pcaUS.components_[i, k],
            s_*pcaUS.components_[j, k],
            USArrests.columns[k])

plt.grid(True)
plt.show()

In [None]:
# Sample standard deviation (ddof=1) of each score column (axis 0).
scores.std(0, ddof =1)

In [None]:
# Variance explained by each component as reported by the fitted PCA.
pcaUS.explained_variance_

In [None]:
# Proportion of total variance explained by each component.
pcaUS.explained_variance_ratio_

In [None]:
%%capture
fig, axes = plt. subplots(1, 2, figsize=(15, 6))

ticks = np.arange(pcaUS.n_components_)+1
ax = axes[0]
ax.plot(ticks,
        pcaUS.explained_variance_ratio_,
        marker='o')
ax.set_xlabel('Principal Component ');
ax.set_ylabel('Proportion of Variance Explained ')
ax.set_ylim([0 ,1])
ax.set_xticks(ticks)

In [None]:
# Right panel: cumulative proportion of variance explained.
ax = axes[1]
ax.plot(ticks, pcaUS.explained_variance_ratio_.cumsum(), marker='o')
ax.set_xticks(ticks)
ax.set_ylim([0, 1])
ax.set_xlabel('Principal Component')
ax.set_ylabel('Cumulative Proportion of Variance Explained')

# Bare expression: display the completed two-panel figure.
fig

In [None]:
# Tiny illustration of a running total (cumulative sum) on a 1-D array.
a = np.array([1, 2, 8, -3])

a.cumsum()

### Matrix Completion

In [None]:
# Work on the standardized data under the shorter name X.
X = USArrests_scaled

# Thin SVD (full_matrices=False). Per NumPy's svd documentation the third
# return value holds the right singular vectors as rows (i.e. V transposed),
# even though it is named V here.
U, D, V = np.linalg.svd(X, full_matrices =False)
U.shape, D.shape, V.shape

In [None]:
# Third factor returned by the SVD; compare with pcaUS.components_.
V

In [None]:
# PCA loadings for comparison with V (rows may differ in sign; the second row
# was sign-flipped in place by the plotting cell earlier in the notebook).
pcaUS.components_

In [None]:
# Scale each column of U by its singular value and show the first three rows
# (presumably for comparison with the first three rows of the PCA scores).
(U * D[None, :])[:3]

In [None]:
# First three rows of the PCA scores.
scores[:3]

In [None]:
# Create a copy of X with n_omit entries replaced by NaN at random positions.
np.random.seed(15)
n_omit = 20

# Rows are drawn without replacement (each chosen row loses exactly one value);
# columns are drawn with replacement, so a column may be hit several times.
row_pool = np.arange(X.shape[0])
r_idx = np.random.choice(row_pool, size=n_omit, replace=False)

col_pool = np.arange(X.shape[1])
c_idx = np.random.choice(col_pool, size=n_omit, replace=True)

# Blank out the selected (row, column) pairs on a copy; X itself is untouched.
Xna = X.copy()
Xna[r_idx, c_idx] = np.nan

In [None]:
def low_rank(X, M=1):
    """Return a rank-M approximation of X built from its leading singular triplets.

    Parameters
    ----------
    X : 2-D array
        Matrix to approximate; must not contain NaN.
    M : int, default 1
        Number of leading singular values/vectors to keep.

    Returns
    -------
    ndarray
        Array with the same shape as X, equal to
        ``U[:, :M] @ diag(D[:M]) @ Vt[:M]``.
    """
    # Thin SVD: only the first M columns of U are used, so there is no need to
    # build the full square U that the default full_matrices=True would return.
    U, D, Vt = np.linalg.svd(X, full_matrices=False)
    # Scale the leading M left singular vectors by their singular values.
    L = U[:, :M] * D[None, :M]
    return L.dot(Vt[:M])

In [None]:
# Initial fill: replace each missing entry by the mean of the observed values
# in its column (nanmean ignores the NaNs).
Xbar = np.nanmean(Xna, axis=0)

Xhat = Xna.copy()
Xhat[r_idx, c_idx] = Xbar[c_idx]

In [None]:
# Bookkeeping for the iterative imputation loop.
thresh = 1e-7          # stop once the relative improvement is at or below this
rel_err, count = 1, 0  # start above thresh so the loop runs at least once

# Mask of missing cells and its complement (the observed cells).
ismiss = np.isnan(Xna)
observed = ~ismiss

# Mean of squares over observed cells: baseline for normalizing (mss0) and the
# starting value for the loop's previous-iteration error (mssold).
mss0 = np.mean(Xna[observed] ** 2)
mssold = np.mean(Xhat[observed] ** 2)

In [None]:
# Alternate between a rank-1 fit and re-filling the missing cells until the
# relative decrease in observed-cell error is no longer above `thresh`.
while rel_err > thresh:
    count += 1
    # Step 2(a): rank-1 approximation of the current filled-in matrix
    Xapp = low_rank(Xhat, M=1)
    # Step 2(b): overwrite only the missing cells with the approximation
    Xhat[ismiss] = Xapp[ismiss]
    # Step 2(c): mean squared residual on the observed cells and its
    # improvement relative to the baseline mss0
    resid = (Xna - Xapp)[~ismiss]
    mss = np.mean(resid ** 2)
    rel_err = (mssold - mss) / mss0
    mssold = mss
    message = "Iteration : {0}, MSS :{1:.3f}, Rel.Err {2:.2e}"
    print(message.format(count, mss, rel_err))

In [None]:
# Correlation between the imputed values and the true values that were removed
# (off-diagonal entry of the 2x2 correlation matrix).
np. corrcoef(Xapp[ismiss], X[ismiss])[0, 1]

### Clustering

#### K-Means Clustering

In [None]:
# Simulated two-cluster data: 50 standard-normal points in 2-D, with the first
# 25 shifted by +3 in the first coordinate and -4 in the second.
# NOTE: this rebinds X, which previously held the standardized USArrests data.
np.random.seed(0)

X = np.random.standard_normal((50, 2))
X[:25] += np.array([3, -4])

In [None]:
# K-means with K=2; n_init=20 runs 20 random initializations, and
# random_state fixes the seed for reproducibility.
kmeans = KMeans(n_clusters=2, n_init=20, random_state=2)
kmeans = kmeans.fit(X)

In [None]:
# Cluster label assigned to each observation by the fitted model.
kmeans.labels_

In [None]:
# Scatter plot of the simulated points, colored by their K=2 cluster label.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title("K-Means Clustering Results with K=2")
ax.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)

plt.grid(True)
plt.show()

In [None]:
# Refit with K=3 (replaces the K=2 model held in `kmeans`) and plot the result.
kmeans = KMeans(n_clusters=3, n_init=20, random_state=3)
kmeans = kmeans.fit(X)

fig, ax = plt.subplots(figsize=(8, 8))
ax.set_title("K-Means Clustering Results with K=3")
ax.scatter(X[:, 0], X[:, 1], c=kmeans.labels_)

plt.grid(True)
plt.show()

In [None]:
# Compare a single random initialization against 20 of them, with everything
# else held equal, by looking at the final within-cluster sum of squares.
shared = dict(n_clusters=3, random_state=3)

kmeans1 = KMeans(n_init=1, **shared).fit(X)
kmeans20 = KMeans(n_init=20, **shared).fit(X)

# Displayed as a tuple: (inertia with n_init=1, inertia with n_init=20).
kmeans1.inertia_, kmeans20.inertia_

In [None]:
#### Hierarchical Clustering