# Unsupervised learning

Unsupervised learning finds patterns in data without a specific prediction task in mind.


In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import warnings
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Global plotting defaults shared by every figure in the notebook.
sns.set_style('whitegrid')
# set_palette installs the palette as the default colour cycle. The previous
# color_palette(...) call only returned a palette object that was discarded,
# so it had no effect on any plot.
sns.set_palette("Spectral")
# Silence all warnings to keep cell output readable (this also hides
# deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')

In [None]:
# https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
# Load scikit-learn's bundled iris dataset; later cells read its
# .data, .target and .feature_names attributes.
iris = load_iris()

In [None]:
# Check what kind of object load_iris() returned.
type(iris)

## KMeans




In [None]:
from sklearn.cluster import KMeans

# Ask for three clusters. random_state pins the centroid initialisation so the
# labels and centres shown below are the same on every run; n_init=10 keeps the
# number of restarts fixed regardless of the installed scikit-learn version.
model = KMeans(n_clusters=3, n_init=10, random_state=42)

# Learn the cluster centres from the iris measurements (no labels are used).
model.fit(iris.data)

In [None]:
# Coordinates of the centroids found by the fitted KMeans model.
model.cluster_centers_

In [None]:
# Tabulate the iris measurements next to their known class so seaborn can
# colour each point by it.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(target=iris.target)

# One scatter plot per pair of features, coloured by the 'target' column.
sns.pairplot(data=df, hue='target')

## Evaluating a clustering

How we evaluate a clustering depends on whether or not we already have labels for the observations.

In [None]:
# Cluster index that the fitted model assigned to each training sample.
model.labels_

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the known iris classes, columns are the KMeans cluster ids.
# Cluster numbering is arbitrary, so look for one dominant cell per row
# rather than for a strong diagonal.
confusion_matrix(y_true=iris.target, y_pred=model.labels_)

In [None]:
# Same comparison as a labelled table: known class (rows) vs. cluster id (columns).
pd.crosstab(index=iris.target, columns=model.labels_)

If we don't have labels, we can still measure the quality of the clusters.

**Inertia** measures how spread out the clusters are (lower is better). It is the sum of the squared distances from each sample to the centroid of its cluster.

## Number of clusters

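As we add more clusters to the clustering, inertia keeps dropping, so we cannot simply pick the number of clusters with the lowest inertia.
Instead, use the elbow rule: pick the number of clusters at the point where inertia starts decreasing more slowly.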


In [None]:
# Candidate numbers of clusters and the inertia obtained for each of them.
ks = range(1, 6)
inertias = []

for k in ks:
    # Fixed seed and restart count so the elbow curve is identical on every
    # run; without them each execution can produce slightly different inertias.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(iris.data)
    # Record the inertia of the fitted model for this k
    inertias.append(model.inertia_)

# Plot ks vs inertias; the "elbow" of this curve suggests a good k.
plt.plot(ks, inertias, '-o')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

## Transforming features for better clusterings


In [None]:
# Load the wine dataset from the repository's data folder and display it.
wines = pd.read_csv('../data/wine.csv')
wines

In [None]:
# Count missing values per column.
wines.isna().sum()

In [None]:
# Number of wines in each class_name category.
wines.class_name.value_counts()

In [None]:
# Cluster the raw (unscaled) wine measurements into three groups.
# random_state and n_init are pinned so that the labels shown below are
# reproducible across runs and scikit-learn versions.
model = KMeans(n_clusters=3, n_init=10, random_state=42)

# Only the numeric measurement columns are used; class_name is held back so
# the clustering can be compared against it afterwards.
model.fit(wines[['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 
                 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 
                 'color_intensity', 'hue', 'od280', 'proline']])
model.labels_

In [None]:
# Known wine class (rows) against the cluster id from the unscaled fit (columns).
pd.crosstab(index=wines['class_name'], columns=model.labels_)

The result doesn't look so great.

In [None]:
# Pairwise scatter plots of the wine columns, coloured by the known class.
sns.pairplot(data=wines, hue='class_name')

In [None]:
# Summary statistics per numeric column, to compare the scales of the features.
wines.describe()

Means and standard deviations vary a lot from feature to feature. Let's try standardizing the features.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every measurement column so that features with large values
# do not dominate the distances KMeans relies on.
scaler = StandardScaler()
wines_scaled = scaler.fit_transform(wines[['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 
                 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 
                 'color_intensity', 'hue', 'od280', 'proline']])

# Same clustering as before, now on the standardised data. The seed and the
# restart count are pinned so the comparison table below is reproducible.
model = KMeans(n_clusters=3, n_init=10, random_state=42)

model.fit(wines_scaled)

# Cluster id (rows) against the known wine class (columns).
pd.crosstab(model.labels_, wines.class_name)


## Unsupervised Visualizing Techniques

### Visualizing hierarchies

Hierarchical clustering helps organize samples into hierarchies.



In [None]:
# Load the Eurovision 2016 dataset from the repository's data folder and display it.
eurovision = pd.read_csv('../data/eurovision-2016.csv')
eurovision

Hierarchical clustering can be *agglomerative*, which starts with one cluster per element and merges them step by step, or *divisive*, which starts with one single cluster and splits it step by step.


In [None]:
# 42 hard-coded samples with 7 numeric measurements each. Rows are paired
# one-to-one, in order, with the labels in `varieties` when plotting the
# dendrogram. (Presumably grain measurements -- TODO confirm the column meanings.)
samples=[[14.88  , 14.57  ,  0.8811,  5.554 ,  3.333 ,  1.018 ,  4.956 ],
       [14.69  , 14.49  ,  0.8799,  5.563 ,  3.259 ,  3.586 ,  5.219 ],
       [14.03  , 14.16  ,  0.8796,  5.438 ,  3.201 ,  1.717 ,  5.001 ],
       [13.99  , 13.83  ,  0.9183,  5.119 ,  3.383 ,  5.234 ,  4.781 ],
       [14.11  , 14.26  ,  0.8722,  5.52  ,  3.168 ,  2.688 ,  5.219 ],
       [13.02  , 13.76  ,  0.8641,  5.395 ,  3.026 ,  3.373 ,  4.825 ],
       [15.49  , 14.94  ,  0.8724,  5.757 ,  3.371 ,  3.412 ,  5.228 ],
       [16.2   , 15.27  ,  0.8734,  5.826 ,  3.464 ,  2.823 ,  5.527 ],
       [13.5   , 13.85  ,  0.8852,  5.351 ,  3.158 ,  2.249 ,  5.176 ],
       [15.36  , 14.76  ,  0.8861,  5.701 ,  3.393 ,  1.367 ,  5.132 ],
       [15.78  , 14.91  ,  0.8923,  5.674 ,  3.434 ,  5.593 ,  5.136 ],
       [14.46  , 14.35  ,  0.8818,  5.388 ,  3.377 ,  2.802 ,  5.044 ],
       [11.23  , 12.63  ,  0.884 ,  4.902 ,  2.879 ,  2.269 ,  4.703 ],
       [14.34  , 14.37  ,  0.8726,  5.63  ,  3.19  ,  1.313 ,  5.15  ],
       [16.84  , 15.67  ,  0.8623,  5.998 ,  3.484 ,  4.675 ,  5.877 ],
       [17.32  , 15.91  ,  0.8599,  6.064 ,  3.403 ,  3.824 ,  5.922 ],
       [18.72  , 16.19  ,  0.8977,  6.006 ,  3.857 ,  5.324 ,  5.879 ],
       [18.88  , 16.26  ,  0.8969,  6.084 ,  3.764 ,  1.649 ,  6.109 ],
       [18.76  , 16.2   ,  0.8984,  6.172 ,  3.796 ,  3.12  ,  6.053 ],
       [19.31  , 16.59  ,  0.8815,  6.341 ,  3.81  ,  3.477 ,  6.238 ],
       [17.99  , 15.86  ,  0.8992,  5.89  ,  3.694 ,  2.068 ,  5.837 ],
       [18.85  , 16.17  ,  0.9056,  6.152 ,  3.806 ,  2.843 ,  6.2   ],
       [19.38  , 16.72  ,  0.8716,  6.303 ,  3.791 ,  3.678 ,  5.965 ],
       [18.96  , 16.2   ,  0.9077,  6.051 ,  3.897 ,  4.334 ,  5.75  ],
       [18.14  , 16.12  ,  0.8772,  6.059 ,  3.563 ,  3.619 ,  6.011 ],
       [18.65  , 16.41  ,  0.8698,  6.285 ,  3.594 ,  4.391 ,  6.102 ],
       [18.94  , 16.32  ,  0.8942,  6.144 ,  3.825 ,  2.908 ,  5.949 ],
       [17.36  , 15.76  ,  0.8785,  6.145 ,  3.574 ,  3.526 ,  5.971 ],
       [13.32  , 13.94  ,  0.8613,  5.541 ,  3.073 ,  7.035 ,  5.44  ],
       [11.43  , 13.13  ,  0.8335,  5.176 ,  2.719 ,  2.221 ,  5.132 ],
       [12.01  , 13.52  ,  0.8249,  5.405 ,  2.776 ,  6.992 ,  5.27  ],
       [11.34  , 12.87  ,  0.8596,  5.053 ,  2.849 ,  3.347 ,  5.003 ],
       [12.02  , 13.33  ,  0.8503,  5.35  ,  2.81  ,  4.271 ,  5.308 ],
       [12.44  , 13.59  ,  0.8462,  5.319 ,  2.897 ,  4.924 ,  5.27  ],
       [11.55  , 13.1   ,  0.8455,  5.167 ,  2.845 ,  6.715 ,  4.956 ],
       [11.26  , 13.01  ,  0.8355,  5.186 ,  2.71  ,  5.335 ,  5.092 ],
       [12.46  , 13.41  ,  0.8706,  5.236 ,  3.017 ,  4.987 ,  5.147 ],
       [11.81  , 13.45  ,  0.8198,  5.413 ,  2.716 ,  4.898 ,  5.352 ],
       [11.27  , 12.86  ,  0.8563,  5.091 ,  2.804 ,  3.985 ,  5.001 ],
       [12.79  , 13.53  ,  0.8786,  5.224 ,  3.054 ,  5.483 ,  4.958 ],
       [12.67  , 13.32  ,  0.8977,  4.984 ,  3.135 ,  2.3   ,  4.745 ],
       [11.23  , 12.88  ,  0.8511,  5.14  ,  2.795 ,  4.325 ,  5.003 ]]

In [None]:
# Variety label for each of the 42 samples, in sample order:
# 14 Kama, then 14 Rosa, then 14 Canadian.
varieties = (
    ['Kama wheat'] * 14
    + ['Rosa wheat'] * 14
    + ['Canadian wheat'] * 14
)

In [None]:
# Hierarchical-clustering helpers from SciPy, plus matplotlib for display
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt

# Build the merge tree with complete linkage
mergings = linkage(samples, "complete")

# Draw the tree with one leaf per sample, labelled by wheat variety; the
# leaf labels are rotated and shrunk so all of them fit on the axis
dendrogram(mergings, labels=varieties, leaf_rotation=90, leaf_font_size=6)
plt.show()



The intermediate clusters and the height on a dendrogram can be useful too.

The height on a dendrogram is the distance between merging clusters.

The distance between clusters is defined by the linkage method.

In complete linkage the distance between clusters is the max distance between their samples. 

The cluster labels for any intermediate level can be retrieved with the `fcluster()` function.


In [None]:
from scipy.cluster.hierarchy import fcluster

# Rebuild the complete-linkage merge tree so this cell can run on its own.
mergings = linkage(samples, "complete")

# Cut the tree at height 5: samples whose clusters merge below that distance
# receive the same flat cluster label.
labels = fcluster(mergings, t=5, criterion='distance')

print(labels)

In [None]:
# Cross-tabulate the flat cluster labels against the known varieties.
# Both inputs are wrapped in named Series: `varieties` is a plain Python list,
# which older pandas versions interpret as a list of separate arrays rather
# than as a single column, and the names give the table readable axis titles.
ct = pd.crosstab(pd.Series(labels, name='labels'),
                 pd.Series(varieties, name='varieties'))

In [None]:
# Display the cluster-label vs. variety table.
ct

## t-SNE (t-distributed Stochastic Neighbor Embedding)

Maps samples to a 2D (or 3D) space

The map approximately preserves the nearness of samples

Great for inspecting datasets



In [None]:
from sklearn.manifold import TSNE

# Embed the iris measurements with t-SNE (fit and transform happen in one step).
model = TSNE(learning_rate=100)
transformed = model.fit_transform(iris.data)

# Column 0 and column 1 of the embedding become the plot coordinates.
xs, ys = transformed[:, 0], transformed[:, 1]

# Colour each embedded point by its known iris class.
sns.scatterplot(x=xs, y=ys, hue=iris.target)

TSNE doesn't have separate fit and transform methods, only fit_transform, so the learned map cannot be extended to new samples: each time we want a t-SNE embedding we have to start from scratch.

A poorly chosen learning_rate can make all the data points bunch together in the scatterplot. Normally it is enough to try a few values between 50 and 200.

The axes of a t-SNE plot don't have any interpretable meaning.

## Dimension Reduction

Finds patterns in data and uses them to express the data in a compressed form.

### Principal Component Analysis (PCA)

It's a fundamental dimension reduction technique. Its first step, decorrelation, does the following:
- Rotates data samples to be aligned with the axes -> decorrelates the data
- Shifts data samples so they have mean 0
- Loses no information.

The PCA implementation in scikit learn follows the fit/transform pattern.

Principal components are the directions in the n-dimensional space of the observations along which the data varies the most.


In [None]:
# Load the grain measurements; later cells treat column 0 as width and
# column 1 as length.
# NOTE(review): read_csv uses the first row as the header by default -- if this
# file has no header row, the first sample is consumed as column names; confirm.
grains = pd.read_csv('../data/seeds-width-vs-length.csv')

In [None]:
from sklearn.decomposition import PCA

# Learn the principal components of the grain measurements...
model = PCA().fit(grains)
# ...then express the same samples in the principal-component coordinates.
pca_samples = model.transform(grains)

# The learned principal directions, one per row.
model.components_

In [None]:
# Plotting plus the Pearson correlation helper
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# Column 0 holds the widths, column 1 the lengths
width, length = grains.values[:, 0], grains.values[:, 1]

# Width against length, with equal axis scales so the slope is not distorted
plt.scatter(width, length)
plt.axis('equal')
plt.show()

# Linear correlation between the two raw measurements (p-value is not shown)
correlation, pvalue = pearsonr(x=width, y=length)
print(correlation)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the grain measurements and project them in a single step
model = PCA()
pca_features = model.fit_transform(grains.values)

# First and second PCA feature of every sample
xs, ys = pca_features[:, 0], pca_features[:, 1]

# Plot the PCA features on equally scaled axes
plt.scatter(xs, ys)
plt.axis('equal')
plt.show()

# Correlation between the two PCA features, to check that PCA decorrelated them
correlation, pvalue = pearsonr(x=xs, y=ys)
print(correlation)

## Intrinsic Dimension

The intrinsic dimension of a dataset is the number of features needed to approximate the dataset. It's an essential idea behind dimension reduction. It can be detected using PCA: it suffices to count the number of PCA features with significant variance. Yet, depending on the dataset, the intrinsic dimension may not be that clear-cut.





In [None]:
# Raw grain measurements: column 0 on the x axis, column 1 on the y axis
plt.scatter(grains.values[:, 0], grains.values[:, 1])

# Fit PCA to the same points
model = PCA().fit(grains.values)

# Centre of the data and the first principal direction
mean = model.mean_
first_pc = model.components_[0, :]

# Draw the first principal component as a red arrow anchored at the mean
plt.arrow(x=mean[0], y=mean[1], dx=first_pc[0], dy=first_pc[1],
          color='red', width=0.01)

# Equal axis scales keep the arrow's direction visually faithful
plt.axis('equal')
plt.show()

In [None]:
# Imported here so the cell does not depend on earlier cells having run.
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features first, then run PCA on the standardised data.
scaler = StandardScaler()
pca = PCA()
pipeline = make_pipeline(scaler, pca)

# Fit the pipeline to the wine measurements. The feature columns are listed
# explicitly (the same list the KMeans cells use) instead of dropping
# 'class_name', so any other non-feature column in the CSV cannot end up
# in the PCA.
pipeline.fit(wines[['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
                    'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins',
                    'color_intensity', 'hue', 'od280', 'proline']].values)

# Plot the variance explained by each PCA feature; the number of bars that
# stand out suggests the intrinsic dimension of the dataset.
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.xticks(features)
plt.show()


## Dimension Reduction 

Dimension reduction is a technique used to represent the same data using fewer features.

It's an important part of many machine-learning techniques.

It can be done with scikit-learn's PCA using the `n_components` parameter.

## Non-Negative Matrix Factorization (NMF)

NMF is a dimension reduction technique.

NMF models are interpretable (images are combinations of patterns, documents are combinations of topics...).

All sample features must be non-negative

The sklearn implementation of NMF follows the fit/transform pattern. Unlike with PCA, the desired number of components (`n_components`) should always be specified.

NMF has components (like PCA), and they have the same number of dimensions as the samples.

The NMF features are non negative and they can be used to reconstruct the samples.

In [None]:
from sklearn.decomposition import NMF

# Learn a two-component non-negative factorisation of the samples.
model = NMF(n_components=2).fit(samples)

# Express every sample through its two NMF features.
nmf_features = model.transform(samples)



In [None]:
# NMF features of every sample (two values per sample, since n_components=2).
nmf_features

In [None]:
# Components learned by the fitted NMF model.
model.components_

In [None]:
# Original samples, shown for comparison with the NMF features and components above.
samples