# Unsupervised Learning + Model Evaluation

**Goal: predict the Body Mass from the other columns.**

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('../data/penguins_simple.csv', sep=';')
df.head()

In [None]:
sns.scatterplot(data=df, x='Culmen Length (mm)', y='Culmen Depth (mm)')

In [None]:
train, test = train_test_split(df, random_state=777)
train.shape, test.shape

### K-Means Clustering

* clusters are spherical
* all clusters have the same size
* every cluster has a center point
* you set the number of clusters before

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

In [None]:
df.columns

In [None]:
X = train
X.shape

In [None]:
col = ColumnTransformer([
    ('onehot', OneHotEncoder(handle_unknown='error', drop='first'), ['Species', 'Sex']),
    ('scale', MinMaxScaler(), ['Culmen Length (mm)', 'Culmen Depth (mm)',
       'Flipper Length (mm)', 'Body Mass (g)'])
])

In [None]:
col.fit(X)
Xt = col.transform(X)
Xt.shape

In [None]:
Xt[0]

In [None]:
km = KMeans(n_clusters=3)
km.fit(Xt)
km.cluster_centers_

In [None]:
clusters = km.predict(Xt)

In [None]:
clusters

In [None]:
train = train.copy()
train['cluster'] = clusters

In [None]:
sns.scatterplot(data=train, x='Culmen Length (mm)', y='Body Mass (g)', hue='cluster')

In [None]:
sns.scatterplot(data=train, x='Culmen Length (mm)', y='Culmen Depth (mm)', hue='cluster')

#### Caveat:

* clustering with Euclidean distance does not work well with many features

### Evaluation metrics
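* silhouette score: compares, for every point, the distance to its own cluster with the distance to the nearest other cluster (ranges from -1 to 1; higher means compact, well-separated clusters, so large, spread-out clusters are penalized)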
* compare to a reference set (e.g. Species)
* calculate clusters for different hyperparameters and compare some metric

### DBSCAN clustering

* you get the number of clusters as a result
* two hyperparameters: 
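  * minimum number of points in a neighbourhood for it to count as a dense region, i.e. part of a cluster (`min_samples`)
  * maximum distance between two points for them to count as neighbours in the same cluster (`eps`)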
* finds outliers (-1)

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
m = DBSCAN(eps=0.13, min_samples=5, metric='euclidean')
m.fit(Xt)
train['dbscan'] = m.fit_predict(Xt)

In [None]:
sns.scatterplot(data=train, x='Culmen Length (mm)', y='Culmen Depth (mm)', hue='dbscan')

### Distance Metrics:

* euclidean : works well only <10 features
* manhattan : works well only <10 features
* cosine similarity : angle between two vectors, good for large number of features
* Jaccard distance / Tanimoto score : for large numbers of binary columns

----

## Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
Xt.shape

In [None]:
# we measure statistical dependence of features with correlation coefficients
# 1.0 = identical, 0.0 = independent
sns.heatmap(pd.DataFrame(Xt, columns=['sp1', 'sp2', 'sex', 'beak_len', 'beak_wid', 'flipper', 'mass']).corr().round(2), annot=True)

In [None]:
# we need to have a mean of zero for PCA
sc = StandardScaler()
Xs = sc.fit_transform(Xt)

In [None]:
pca = PCA(n_components=7)
Xp = pca.fit_transform(Xs)
Xp.shape

PCA performs a linear transformation. All the features get transformed into new features.

In [None]:
# The output features of the PCA are *orthogonal*, i.e. linearly uncorrelated
# (off-diagonal correlations are ~0).
# --> super important for linear models
component_corr = pd.DataFrame(Xp).corr()
sns.heatmap(component_corr.round(2), annot=True)

The output features of the PCA are *ranked*: the first feature is the most important one (it explains the most variance), the second feature is the second most important, etc.

In [None]:
# we can use only the most representative features -> Dimensionality Reduction
# --> models can be trained faster
# --> we avoid overfitting

## Disadvantage: we don't know what the new features after PCA mean (no labels)

In [None]:
# how many components to use?
# Plot the share of the total variance explained by each principal component.
explained = pd.Series(pca.explained_variance_ratio_)
explained.plot.line()
# first new feature explains 50% of the variance in the data
# second new feature explains 25% of the variance
# features 3-6 only explain noise, we can remove them

### What to use PCA for?

* use the output as an input for further modeling (as a preprocessing step)
* use the output for plotting / clustering to explore the shape

In [None]:
# The rows of Xp are in the same order as the rows of `train`, but a bare
# DataFrame(Xp) gets a fresh 0..n-1 index while `train` keeps the shuffled
# original row labels. seaborn aligns `hue` with `data` on the index, so the
# components are given train's index to keep every point paired with the
# correct penguin.
sns.scatterplot(data=pd.DataFrame(Xp, index=train.index), x=0, y=1, hue=train['Sex'])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

### Clean Pipeline

In [None]:
col = ColumnTransformer([
    # check Andreas Müllers scikit videos on this
    # for how to write your own preprocessors
    ('onehot', OneHotEncoder(handle_unknown='error', drop='first'), ['Species', 'Sex']),
    ('scale', 'passthrough', ['Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)'])
])

In [None]:
pipeline = make_pipeline(
    col,
    StandardScaler(),
    PCA(n_components=4),
    LinearRegression()
)

In [None]:
# Select the model inputs by name instead of by position: a positional slice
# such as iloc[:, :-2] changes its meaning whenever a helper column
# ('cluster', 'dbscan', 'residual', ...) is added to `train`, and it leaves
# the target 'Body Mass (g)' inside the feature frame.
feature_columns = [
    'Species',
    'Sex',
    'Culmen Length (mm)',
    'Culmen Depth (mm)',
    'Flipper Length (mm)',
]
Xtrain = train[feature_columns]
ytrain = train['Body Mass (g)']

# Fit the whole pipeline on the training data and predict the same rows
# to obtain the training error.
pipeline.fit(Xtrain, ytrain)
ypred_train = pipeline.predict(Xtrain)

### How to debug/inspect models

In [None]:
Xtrain.shape

In [None]:
# 1. validation score (should be easy with a pipeline)
Xval = test
yval = test['Body Mass (g)']

## NEVER FIT ANYTHING ON val/test DATA!!! pipeline.fit(Xval, yval)
ypred_val = pipeline.predict(Xval)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
mean_squared_error(ytrain, ypred_train)

In [None]:
mean_squared_error(yval, ypred_val)

In [None]:
mean_absolute_error(ytrain, ypred_train).round() # g in penguin weight

In [None]:
mean_absolute_error(yval, ypred_val).round()

In [None]:
# 2. cross-validation
#    good rule of thumb, 5x training time
from sklearn.model_selection import cross_validate

# The scores are *negative* mean absolute errors (closer to 0 is better);
# the resulting table has one row per fold.
cv_options = {
    'cv': 5,
    'scoring': 'neg_mean_absolute_error',
    'return_train_score': True,
}
cv = cross_validate(pipeline, Xtrain, ytrain, **cv_options)
pd.DataFrame(cv).round()

In [None]:
# look for:
# - big differences between test/train scores in the same row (e.g. overfitting)
# - lots of variation in the same column


In [None]:
# 3. examine residuals (errors)
#    (in classification: inspect some misclassified points)
residual = ytrain - ypred_train
residual.hist(bins=20)
# what are the biggest/smallest errors?
# is it a gaussian distribution

In [None]:
# look for dependence of the errors with some feature (e.g. time in a time series)
train['residual'] = residual
sns.scatterplot(data=train, y='residual', x='Body Mass (g)')
# are the residuals distributed evenly?

In [None]:
# 4. in linear regression, check other assumptions
#    (includes some statistical tests and more plots)
#    in time series you *must* check for autocorrelation

In [None]:
# 5. bootstrapping
#    (resamples the dataset 100-1000 times) -> takes much more training time
#    gives you a reliable estimate of your training/test score with confidence interval
#    -> do this at the very end of a training with the test data

In [None]:
# 6. inspect the influence of different features
#    linear regression: check coefficients directly
#    better output with statsmodels (p-values and confidence intervals for each coefficient)
#    NOTE: in this pipeline the regression sits behind StandardScaler + PCA, so
#    each coefficient belongs to a principal component, not to an original column.
linreg = pipeline.named_steps['linearregression']
linreg.coef_

In [None]:
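# "1 cm of beak length means 385 g of penguin weight"
# (this direct reading only holds for a regression fitted on the raw features;
#  behind StandardScaler + PCA the coefficients refer to principal components)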

In [None]:
#    random forest / boosting
#    - m.feature_importances_
#    - Shapley values (in catboost)