<a href="https://colab.research.google.com/github/krakowiakpawel9/ml_course/blob/master/ul/07_pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn
Strona biblioteki: [https://scikit-learn.org](https://scikit-learn.org)  

Dokumentacja/User Guide: [https://scikit-learn.org/stable/user_guide.html](https://scikit-learn.org/stable/user_guide.html)

Podstawowa biblioteka do uczenia maszynowego w języku Python.

Aby zainstalować bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install scikit-learn
```
Aby zaktualizować do najnowszej wersji bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install --upgrade scikit-learn
```
Kurs stworzony w oparciu o wersję `0.22.1`

### Spis treści:
1. [Import bibliotek](#0)
2. [Wygenerowanie danych](#1)
3. [Wizualizacja danych](#2)
4. [Algorytm K-średnich](#3)
5. [Wizualizacja klastrów](#4)




### <a name='0'></a> Import bibliotek

In [0]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

sns.set()
np.random.seed(42)
np.set_printoptions(precision=6, suppress=True)

In [2]:
raw_data = load_iris()
all_data = raw_data.copy()
all_data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [3]:
df = pd.DataFrame(data=np.c_[all_data['data'], all_data['target']], columns=all_data['feature_names'] + ['target'])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [4]:
px.scatter_3d(df, x='petal length (cm)', y='petal width (cm)', z='sepal length (cm)', color='target', symbol='target', opacity=0.7, width=1000, height=800)

In [5]:
data = df[['petal length (cm)', 'petal width (cm)', 'sepal length (cm)']]
data.head()

Unnamed: 0,petal length (cm),petal width (cm),sepal length (cm)
0,1.4,0.2,5.1
1,1.4,0.2,4.9
2,1.3,0.2,4.7
3,1.5,0.2,4.6
4,1.4,0.2,5.0


In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data = scaler.fit_transform(data)
data[:10]

array([[-1.340227, -1.315444, -0.900681],
       [-1.340227, -1.315444, -1.143017],
       [-1.397064, -1.315444, -1.385353],
       [-1.283389, -1.315444, -1.506521],
       [-1.340227, -1.315444, -1.021849],
       [-1.169714, -1.05218 , -0.537178],
       [-1.340227, -1.183812, -1.506521],
       [-1.283389, -1.315444, -1.021849],
       [-1.340227, -1.315444, -1.748856],
       [-1.283389, -1.447076, -1.143017]])

In [7]:
from sklearn.decomposition import PCA

pca = PCA(random_state=42)
pca.fit_transform(data)

array([[-2.06036 , -0.298674, -0.059476],
       [-2.195981, -0.101727, -0.020166],
       [-2.365221,  0.080749, -0.024341],
       [-2.365794,  0.208165,  0.082282],
       [-2.128171, -0.200201, -0.039821],
       [-1.603256, -0.412703, -0.152053],
       [-2.323005,  0.262683, -0.043234],
       [-2.094552, -0.18573 ,  0.003663],
       [-2.535034,  0.390641,  0.078108],
       [-2.238771, -0.156245,  0.10535 ],
       [-1.82331 , -0.579624, -0.074957],
       [-2.196554,  0.025689,  0.086457],
       [-2.3402  , -0.072243,  0.081521],
       [-2.780109,  0.376712,  0.049343],
       [-1.652923, -1.016932, -0.284029],
       [-1.467062, -0.737067, -0.297986],
       [-1.737731, -0.470588, -0.32599 ],
       [-1.983952, -0.229685, -0.141509],
       [-1.476232, -0.777114, -0.128986],
       [-1.950333, -0.215214, -0.098024],
       [-1.756072, -0.550682,  0.012011],
       [-1.873925, -0.146225, -0.180057],
       [-2.533888,  0.135809, -0.135138],
       [-1.730279, -0.048293, -0.1

In [8]:
pca.explained_variance_ratio_

array([0.923247, 0.066471, 0.010282])

In [9]:
explained_variance = pd.DataFrame(data={'explained_variance_ratio': pca.explained_variance_ratio_})
explained_variance = explained_variance.reset_index().rename(columns={'index': 'principal_component'})
explained_variance

Unnamed: 0,principal_component,explained_variance_ratio
0,0,0.923247
1,1,0.066471
2,2,0.010282


In [10]:
px.bar(explained_variance, x='principal_component', y='explained_variance_ratio', width=700, height=400)

In [11]:
pca = PCA(n_components=2, random_state=42)
data_pca_2 = pca.fit_transform(data)
data_pca_2

array([[-2.06036 , -0.298674],
       [-2.195981, -0.101727],
       [-2.365221,  0.080749],
       [-2.365794,  0.208165],
       [-2.128171, -0.200201],
       [-1.603256, -0.412703],
       [-2.323005,  0.262683],
       [-2.094552, -0.18573 ],
       [-2.535034,  0.390641],
       [-2.238771, -0.156245],
       [-1.82331 , -0.579624],
       [-2.196554,  0.025689],
       [-2.3402  , -0.072243],
       [-2.780109,  0.376712],
       [-1.652923, -1.016932],
       [-1.467062, -0.737067],
       [-1.737731, -0.470588],
       [-1.983952, -0.229685],
       [-1.476232, -0.777114],
       [-1.950333, -0.215214],
       [-1.756072, -0.550682],
       [-1.873925, -0.146225],
       [-2.533888,  0.135809],
       [-1.730279, -0.048293],
       [-2.095698,  0.069102],
       [-2.060933, -0.171258],
       [-1.908117, -0.03328 ],
       [-1.958931, -0.382677],
       [-1.992549, -0.397148],
       [-2.264365,  0.124163],
       [-2.196554,  0.025689],
       [-1.670493, -0.441646],
       [

In [12]:
pca.explained_variance_ratio_

array([0.923247, 0.066471])

In [13]:
df['target']

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
145    2.0
146    2.0
147    2.0
148    2.0
149    2.0
Name: target, Length: 150, dtype: float64

In [20]:
data_pca = pd.DataFrame(data={'pca_1': data_pca_2[:, 0], 'pca_2': data_pca_2[:, 1], 'target': df['target']})
data_pca

Unnamed: 0,pca_1,pca_2,target
0,-2.060360,-0.298674,0.0
1,-2.195981,-0.101727,0.0
2,-2.365221,0.080749,0.0
3,-2.365794,0.208165,0.0
4,-2.128171,-0.200201,0.0
...,...,...,...
145,1.906692,0.124424,2.0
146,1.262579,0.213420,2.0
147,1.541846,0.114404,2.0
148,1.634876,0.645735,2.0


In [28]:
px.scatter(data_pca, x='pca_1', y='pca_2', color='target', width=800, height=600, color_continuous_midpoint=1.2)

In [16]:
pca = PCA(n_components=0.99, random_state=42)
data_pca = pca.fit_transform(data)
data_pca

array([[-2.06036 , -0.298674, -0.059476],
       [-2.195981, -0.101727, -0.020166],
       [-2.365221,  0.080749, -0.024341],
       [-2.365794,  0.208165,  0.082282],
       [-2.128171, -0.200201, -0.039821],
       [-1.603256, -0.412703, -0.152053],
       [-2.323005,  0.262683, -0.043234],
       [-2.094552, -0.18573 ,  0.003663],
       [-2.535034,  0.390641,  0.078108],
       [-2.238771, -0.156245,  0.10535 ],
       [-1.82331 , -0.579624, -0.074957],
       [-2.196554,  0.025689,  0.086457],
       [-2.3402  , -0.072243,  0.081521],
       [-2.780109,  0.376712,  0.049343],
       [-1.652923, -1.016932, -0.284029],
       [-1.467062, -0.737067, -0.297986],
       [-1.737731, -0.470588, -0.32599 ],
       [-1.983952, -0.229685, -0.141509],
       [-1.476232, -0.777114, -0.128986],
       [-1.950333, -0.215214, -0.098024],
       [-1.756072, -0.550682,  0.012011],
       [-1.873925, -0.146225, -0.180057],
       [-2.533888,  0.135809, -0.135138],
       [-1.730279, -0.048293, -0.1

In [17]:
data_pca_3d = pd.DataFrame(data={'pca_1': data_pca[:, 0], 'pca_2': data_pca[:, 1], 'pca_3': data_pca[:, 2], 'target': df['target']})
data_pca_3d

Unnamed: 0,pca_1,pca_2,pca_3,target
0,-2.060360,-0.298674,-0.059476,0.0
1,-2.195981,-0.101727,-0.020166,0.0
2,-2.365221,0.080749,-0.024341,0.0
3,-2.365794,0.208165,0.082282,0.0
4,-2.128171,-0.200201,-0.039821,0.0
...,...,...,...,...
145,1.906692,0.124424,-0.444239,2.0
146,1.262579,0.213420,-0.124459,2.0
147,1.541846,0.114404,-0.158833,2.0
148,1.634876,0.645735,-0.258997,2.0


In [18]:
px.scatter_3d(data_pca_3d, x='pca_1', y='pca_2', z='pca_3', color='target', symbol='target', opacity=0.7, width=700)