<a href="https://colab.research.google.com/github/krakowiakpawel9/ml_course/blob/master/ul/11_pca_math.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn
Strona biblioteki: [https://scikit-learn.org](https://scikit-learn.org)  

Dokumentacja/User Guide: [https://scikit-learn.org/stable/user_guide.html](https://scikit-learn.org/stable/user_guide.html)

Podstawowa biblioteka do uczenia maszynowego w języku Python.

Aby zainstalować bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install scikit-learn
```
Aby zaktualizować do najnowszej wersji bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install --upgrade scikit-learn
```
Kurs stworzony w oparciu o wersję `0.22.1`

### Spis treści:
1. [Import bibliotek](#0)
2. [Załadowanie danych](#1)
3. [Standaryzacja danych](#2)
4. [PCA krok po kroku: macierz kowariancji, wartości i wektory własne](#3)
5. [PCA w scikit-learn](#4)




### <a name='0'></a> Import bibliotek

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Notebook-wide NumPy display settings: fixed-point output (no scientific notation),
# 8 decimals, wide lines, and 5 leading/trailing items shown when arrays are summarized.
np.set_printoptions(
    linewidth=200,
    edgeitems=5,
    suppress=True,
    precision=8,
)

In [0]:
from sklearn.datasets import load_iris

In [0]:
# Load the bundled Iris dataset and pull out the pieces used in the rest of the notebook.
raw_data = load_iris()
data, target = raw_data['data'], raw_data['target']  # feature matrix and class codes
feature_names = list(raw_data['feature_names'])

In [4]:
# Show the species names in the dataset's own order (position = numeric target code).
raw_data['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
# Feature frame: one row per flower, one column per measurement.
df = pd.DataFrame(data=data, columns=feature_names)
# Label every row with its species name by indexing the dataset's own name table with the
# integer target codes. This avoids a hand-written code->name map (which would silently go
# stale if the names changed) and avoids coercing the integer codes to float first.
df['class'] = raw_data['target_names'][target]
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [0]:
# Split the frame into the measurement columns (features) and the species label (target).
X = df[feature_names]
y = df['class']

In [7]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature (zero mean, unit variance) so that no measurement dominates
# the covariance structure merely because of its scale.
scaler = StandardScaler().fit(X)
X_std = scaler.transform(X)
X_std

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ],
       [-0.53717756,  1.93979142, -1.16971425, -1.05217993],
       [-1.50652052,  0.78880759, -1.34022653, -1.18381211],
       [-1.02184904,  0.78880759, -1.2833891 , -1.3154443 ],
       [-1.74885626, -0.36217625, -1.34022653, -1.3154443 ],
       [-1.14301691,  0.09821729, -1.2833891 , -1.44707648],
       [-0.53717756,  1.47939788, -1.2833891 , -1.3154443 ],
       [-1.26418478,  0.78880759, -1.22655167, -1.3154443 ],
       [-1.26418478, -0.13197948, -1.34022653, -1.44707648],
       [-1.87002413, -0.13197948, -1.51073881, -1.44707648],
       [-0.05250608,  2.16998818, -1.45390138, -1.3154443 ],
       [-0.17367395,  3.09077525, -1.2833891 , -1.05217993],
       [-0.53717756,  1.

In [8]:
# Covariance matrix of the standardized features; np.cov expects variables in rows,
# hence the transpose.
# The diagonal is slightly above 1 — presumably because np.cov uses the n-1 estimator
# while the scaler normalizes with the population standard deviation.
cov_mat = np.cov(X_std.T)
cov_mat

array([[ 1.00671141, -0.11835884,  0.87760447,  0.82343066],
       [-0.11835884,  1.00671141, -0.43131554, -0.36858315],
       [ 0.87760447, -0.43131554,  1.00671141,  0.96932762],
       [ 0.82343066, -0.36858315,  0.96932762,  1.00671141]])

In [0]:
# Eigen-decomposition of the covariance matrix: eig_vals[i] belongs to the column eig_vecs[:, i].
# NOTE(review): cov_mat is symmetric, so np.linalg.eigh would be the more natural routine,
# but it may return the eigenvectors in a different order / with different signs, and the
# hard-coded sign flip of pca_2 further down depends on the current ones — confirm before switching.
eig_vals, eig_vecs = np.linalg.eig(cov_mat)

In [10]:
# Print eigenvalues and eigenvectors as two labelled sections separated by a blank line.
print(
    f'Wartości własne:\n{eig_vals}\n',
    f'Wektory własne:\n{eig_vecs}',
    sep='\n',
)

Wartości własne:
[2.93808505 0.9201649  0.14774182 0.02085386]

Wektory własne:
[[ 0.52106591 -0.37741762 -0.71956635  0.26128628]
 [-0.26934744 -0.92329566  0.24438178 -0.12350962]
 [ 0.5804131  -0.02449161  0.14212637 -0.80144925]
 [ 0.56485654 -0.06694199  0.63427274  0.52359713]]


In [11]:
# Pair every eigenvalue magnitude with its eigenvector (column i of eig_vecs, i.e. row i of
# the transpose) and order the pairs from the largest to the smallest eigenvalue.
# The sort key is the eigenvalue alone: sorting the raw (value, ndarray) tuples would fall
# through to comparing the arrays whenever two eigenvalues tie, which raises ValueError.
eig_pairs = sorted(
    zip(np.abs(eig_vals), eig_vecs.T),
    key=lambda pair: pair[0],
    reverse=True,
)
eig_pairs

[(2.9380850501999953,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9201649041624852,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.1477418210449476,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.020853862176463147,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

In [12]:
# Share of the total variance carried by each component, largest first.
total = sum(eig_vals)
eig_vals_descending = sorted(eig_vals, reverse=True)
explained_variance_ratio = [eig_val / total for eig_val in eig_vals_descending]
explained_variance_ratio

[0.729624454132999,
 0.22850761786701723,
 0.03668921889282867,
 0.005178709107155017]

In [13]:
# Running total of the variance shares: entry k is the variance kept by the first k+1 components.
cumulative_explained_variance = np.asarray(explained_variance_ratio).cumsum()
cumulative_explained_variance

array([0.72962445, 0.95813207, 0.99482129, 1.        ])

In [14]:
# Tabulate the per-component variance share next to its running total (one row per component).
results = pd.DataFrame({'explained_variance_ratio': explained_variance_ratio}).assign(
    cumulative=lambda frame: frame['explained_variance_ratio'].cumsum()
)
results

Unnamed: 0,explained_variance_ratio,cumulative
0,0.729624,0.729624
1,0.228508,0.958132
2,0.036689,0.994821
3,0.005179,1.0


In [15]:
# Bars: variance share of each component; line: running (cumulative) share.
# The title reports the number of components actually plotted (one per row of `results`)
# rather than a hard-coded count that did not match the chart.
fig = go.Figure(
    data=[
        go.Bar(x=results.index, y=results['explained_variance_ratio'], name='explained_variance_ratio'),
        go.Scatter(x=results.index, y=results['cumulative'], name='cumulative'),
    ],
    layout=go.Layout(title=f'PCA - {len(results)} components', width=700),
)
fig.show()

In [16]:
# Re-display the sorted (eigenvalue, eigenvector) pairs before selecting the leading ones.
eig_pairs

[(2.9380850501999953,
  array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654])),
 (0.9201649041624852,
  array([-0.37741762, -0.92329566, -0.02449161, -0.06694199])),
 (0.1477418210449476,
  array([-0.71956635,  0.24438178,  0.14212637,  0.63427274])),
 (0.020853862176463147,
  array([ 0.26128628, -0.12350962, -0.80144925,  0.52359713]))]

In [17]:
# Leading pair: the largest eigenvalue with its eigenvector.
eig_pairs[0]

(2.9380850501999953,
 array([ 0.52106591, -0.26934744,  0.5804131 ,  0.56485654]))

In [18]:
# Second pair: the second-largest eigenvalue with its eigenvector.
eig_pairs[1]

(0.9201649041624852,
 array([-0.37741762, -0.92329566, -0.02449161, -0.06694199]))

In [19]:
# Projection matrix: the eigenvectors of the two leading pairs placed side by side as columns.
W = np.hstack([vec.reshape(-1, 1) for _, vec in eig_pairs[:2]])
W

array([[ 0.52106591, -0.37741762],
       [-0.26934744, -0.92329566],
       [ 0.5804131 , -0.02449161],
       [ 0.56485654, -0.06694199]])

In [20]:
# Project the standardized data onto the two selected eigenvectors.
X_pca = X_std @ W
# Attach the species label, then mirror the second axis: an eigenvector's sign is arbitrary,
# and flipping pca_2 makes this table line up with the scikit-learn result shown further down.
pca_df_1 = (
    pd.DataFrame(X_pca, columns=['pca_1', 'pca_2'])
    .assign(**{'class': df['class']})
    .assign(pca_2=lambda frame: -frame['pca_2'])
)
pca_df_1

Unnamed: 0,pca_1,pca_2,class
0,-2.264703,0.480027,setosa
1,-2.080961,-0.674134,setosa
2,-2.364229,-0.341908,setosa
3,-2.299384,-0.597395,setosa
4,-2.389842,0.646835,setosa
...,...,...,...
145,1.870503,0.386966,virginica
146,1.564580,-0.896687,virginica
147,1.521170,0.269069,virginica
148,1.372788,1.011254,virginica


In [21]:
# Scatter of the manually computed components, coloured by species.
px.scatter(pca_df_1, x='pca_1', y='pca_2', color='class', width=700)

In [22]:
from sklearn.decomposition import PCA

# The same two-component reduction done by scikit-learn, laid out like the manual table
# (pca_1, pca_2, class) so the two can be compared side by side.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_std)
pca_df = pd.DataFrame(X_pca, columns=['pca_1', 'pca_2']).assign(**{'class': df['class']})
pca_df

Unnamed: 0,pca_1,pca_2,class
0,-2.264703,0.480027,setosa
1,-2.080961,-0.674134,setosa
2,-2.364229,-0.341908,setosa
3,-2.299384,-0.597395,setosa
4,-2.389842,0.646835,setosa
...,...,...,...
145,1.870503,0.386966,virginica
146,1.564580,-0.896687,virginica
147,1.521170,0.269069,virginica
148,1.372788,1.011254,virginica


In [23]:
# Scatter of the scikit-learn components, coloured by species.
px.scatter(pca_df, x='pca_1', y='pca_2', color='class', width=700)