<a href="https://colab.research.google.com/github/krakowiakpawel9/ml_course/blob/master/ul/09_pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn
Strona biblioteki: [https://scikit-learn.org](https://scikit-learn.org)  

Dokumentacja/User Guide: [https://scikit-learn.org/stable/user_guide.html](https://scikit-learn.org/stable/user_guide.html)

Podstawowa biblioteka do uczenia maszynowego w języku Python.

Aby zainstalować bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install scikit-learn
```
Aby zaktualizować do najnowszej wersji bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install --upgrade scikit-learn
```
Kurs stworzony w oparciu o wersję `0.22.1`

### Spis treści:
1. [Import bibliotek](#0)
2. [Wygenerowanie danych](#1)
3. [Wizualizacja danych](#2)
4. [Algorytm K-średnich](#3)
5. [Wizualizacja klastrów](#4)




### <a name='0'></a> Import bibliotek

In [0]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from sklearn.datasets import load_breast_cancer

np.set_printoptions(precision=4, suppress=True, edgeitems=5, linewidth=200)

In [2]:
raw_data = load_breast_cancer()
all_data = raw_data.copy()
all_data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [3]:
data = all_data['data']
target = all_data['target']

data

array([[  17.99  ,   10.38  ,  122.8   , 1001.    ,    0.1184, ...,    0.6656,    0.7119,    0.2654,    0.4601,    0.1189],
       [  20.57  ,   17.77  ,  132.9   , 1326.    ,    0.0847, ...,    0.1866,    0.2416,    0.186 ,    0.275 ,    0.089 ],
       [  19.69  ,   21.25  ,  130.    , 1203.    ,    0.1096, ...,    0.4245,    0.4504,    0.243 ,    0.3613,    0.0876],
       [  11.42  ,   20.38  ,   77.58  ,  386.1   ,    0.1425, ...,    0.8663,    0.6869,    0.2575,    0.6638,    0.173 ],
       [  20.29  ,   14.34  ,  135.1   , 1297.    ,    0.1003, ...,    0.205 ,    0.4   ,    0.1625,    0.2364,    0.0768],
       ...,
       [  21.56  ,   22.39  ,  142.    , 1479.    ,    0.111 , ...,    0.2113,    0.4107,    0.2216,    0.206 ,    0.0712],
       [  20.13  ,   28.25  ,  131.2   , 1261.    ,    0.0978, ...,    0.1922,    0.3215,    0.1628,    0.2572,    0.0664],
       [  16.6   ,   28.08  ,  108.3   ,  858.1   ,    0.0846, ...,    0.3094,    0.3403,    0.1418,    0.2218,    0.078

In [4]:
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
     

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_std = scaler.fit_transform(data)
data_std[:5]

array([[ 1.0971, -2.0733,  1.2699,  0.9844,  1.5685,  3.2835,  2.6529,  2.5325,  2.2175,  2.2557,  2.4897, -0.5653,  2.833 ,  2.4876, -0.214 ,  1.3169,  0.724 ,  0.6608,  1.1488,  0.9071,  1.8867,
        -1.3593,  2.3036,  2.0012,  1.3077,  2.6167,  2.1095,  2.2961,  2.7506,  1.937 ],
       [ 1.8298, -0.3536,  1.686 ,  1.9087, -0.827 , -0.4871, -0.0238,  0.5481,  0.0014, -0.8687,  0.4993, -0.8762,  0.2633,  0.7424, -0.6054, -0.6929, -0.4408,  0.2602, -0.8055, -0.0994,  1.8059,
        -0.3692,  1.5351,  1.8905, -0.3756, -0.4304, -0.1467,  1.0871, -0.2439,  0.2812],
       [ 1.5799,  0.4562,  1.5665,  1.5589,  0.9422,  1.0529,  1.3635,  2.0372,  0.9397, -0.398 ,  1.2287, -0.7801,  0.8509,  1.1813, -0.297 ,  0.815 ,  0.2131,  1.4248,  0.237 ,  0.2936,  1.5119,
        -0.024 ,  1.3475,  1.4563,  0.5274,  1.0829,  0.855 ,  1.955 ,  1.1523,  0.2014],
       [-0.7689,  0.2537, -0.5927, -0.7645,  3.2836,  3.4029,  1.9159,  1.4517,  2.8674,  4.9109,  0.3264, -0.1104,  0.2866, -0.2884,  0.68

In [6]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
data_pca = pca.fit_transform(data_std)
data_pca[:5]

array([[ 9.1928,  1.9486],
       [ 2.3878, -3.7682],
       [ 5.7339, -1.0752],
       [ 7.123 , 10.2756],
       [ 3.9353, -1.9481]])

In [7]:
pca_2 = pd.DataFrame(data={'pca_1': data_pca[:, 0], 'pca_2': data_pca[:, 1], 'class': target})

pca_2.replace(0, 'Benign', inplace=True)
pca_2.replace(1, 'Malignant', inplace=True)

pca_2.head()

Unnamed: 0,pca_1,pca_2,class
0,9.192837,1.948583,Benign
1,2.387802,-3.768172,Benign
2,5.733896,-1.075174,Benign
3,7.122953,10.275589,Benign
4,3.935302,-1.948072,Benign


In [8]:
results = pd.DataFrame(data={'explained_variance_ratio': pca.explained_variance_ratio_})
results['cumulative'] = results['explained_variance_ratio'].cumsum()
results

Unnamed: 0,explained_variance_ratio,cumulative
0,0.44272,0.44272
1,0.189712,0.632432


In [9]:
fig = go.Figure(data=[go.Bar(x=results.index, y=results['explained_variance_ratio'], name='explained_variance_ratio'),
                      go.Scatter(x=results.index, y=results['cumulative'], name='cumulative')],
                layout=go.Layout(title='PCA - 2 components', width=700))
fig.show()

In [10]:
px.scatter(pca_2, 'pca_1', 'pca_2', color=pca_2['class'], width=700)

In [11]:
pca = PCA(n_components=3)
data_pca = pca.fit_transform(data_std)
data_pca[:10]

array([[ 9.1928,  1.9486, -1.1232],
       [ 2.3878, -3.7682, -0.5293],
       [ 5.7339, -1.0752, -0.5517],
       [ 7.123 , 10.2756, -3.2328],
       [ 3.9353, -1.9481,  1.3898],
       [ 2.3802,  3.9499, -2.9349],
       [ 2.2389, -2.69  , -1.6399],
       [ 2.1433,  2.3402, -0.8719],
       [ 3.1749,  3.3918, -3.12  ],
       [ 6.3517,  7.7272, -4.3419]])

In [12]:
pca_3 = pd.DataFrame(data={'pca_1': data_pca[:, 0], 'pca_2': data_pca[:, 1], 'pca_3': data_pca[:, 2], 'class': target})

pca_3.replace(0, 'Benign', inplace=True)
pca_3.replace(1, 'Malignant', inplace=True)

pca_3.head()

Unnamed: 0,pca_1,pca_2,pca_3,class
0,9.192837,1.948583,-1.123162,Benign
1,2.387802,-3.768172,-0.529281,Benign
2,5.733896,-1.075174,-0.551749,Benign
3,7.122953,10.275589,-3.232782,Benign
4,3.935302,-1.948072,1.389778,Benign


In [13]:
results = pd.DataFrame(data={'explained_variance_ratio': pca.explained_variance_ratio_})
results['cumulative'] = results['explained_variance_ratio'].cumsum()
results

Unnamed: 0,explained_variance_ratio,cumulative
0,0.44272,0.44272
1,0.189712,0.632432
2,0.093932,0.726364


In [14]:
fig = go.Figure(data=[go.Bar(x=results.index, y=results['explained_variance_ratio'], name='explained_variance_ratio'),
                      go.Scatter(x=results.index, y=results['cumulative'], name='cumulative')],
                layout=go.Layout(title='PCA - 3 components', width=700))
fig.show()

In [16]:
px.scatter_3d(pca_3, x='pca_1', y='pca_2', z='pca_3', color='class', symbol='class', opacity=0.7, size_max=10, width=800)