<a href="https://colab.research.google.com/github/intencjusz/machine-learning/blob/main/unsupervised/02_dimensionality_reduction/03_pca_wine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

np.set_printoptions(precision=4, suppress=True, edgeitems=5, linewidth=200)

In [2]:

df_raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
df = df_raw.copy()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:

data = df.iloc[:, 1:]
target = df.iloc[:, 0]
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:

target.value_counts()

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
2,71
1,59
3,48


Podział na zbiór treningowy i testowy

In [5]:


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')

X_train shape: (133, 13)
X_test shape: (45, 13)


Standaryzacja

In [6]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
X_train_std[:5]


array([[-0.1278, -0.6953,  0.5519, -0.5782, -0.3231,  0.3809,  0.4055, -0.86  , -0.161 , -0.4311,  0.5663,  1.4835,  0.9665],
       [-0.6496, -0.6863, -0.6087,  0.7862,  0.5457, -0.3828,  0.1253, -0.2281,  0.0966, -1.2408,  0.4356,  0.5423, -1.263 ],
       [-0.7103, -0.6683, -0.2219,  1.3668, -0.7909, -0.0328,  0.4856,  0.2459,  0.5945, -1.2148, -0.3049,  0.2895, -1.273 ],
       [ 0.0906, -1.3234, -2.2969, -1.1007, -0.9246, -0.4624,  0.0653, -1.018 , -0.161 , -0.141 ,  1.0019, -0.1319, -1.1066],
       [ 1.122 ,  2.3915, -0.468 ,  0.0605, -1.3256, -2.0058, -1.6257,  0.2459, -1.5004, -0.0111, -1.6554, -1.7473, -1.0301]])

PCA

In [7]:


from sklearn.decomposition import PCA

pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
X_train_pca.shape

(133, 3)

Wyjaśniona wariancja

In [8]:

results = pd.DataFrame(data={'explained_variance_ratio': pca.explained_variance_ratio_})
results['cumulative'] = results['explained_variance_ratio'].cumsum()
results['component'] = results.index + 1
results


Unnamed: 0,explained_variance_ratio,cumulative,component
0,0.347308,0.347308,1
1,0.185828,0.533136,2
2,0.122276,0.655412,3


In [9]:

fig = go.Figure(data=[go.Bar(x=results['component'], y=results['explained_variance_ratio'], name='explained variance ratio'),
                      go.Scatter(x=results['component'], y=results['cumulative'], name='cumulative explained variance')],
                layout=go.Layout(title=f'PCA - {pca.n_components_} components', width=950, template='plotly_dark'))
fig.show()


In [10]:

X_train_pca_df = pd.DataFrame(data=np.c_[X_train_pca, y_train], columns=['pca1', 'pca2', 'pca3', 'target'])
X_train_pca_df.head()

Unnamed: 0,pca1,pca2,pca3,target
0,1.789115,-0.370628,-0.013203,1.0
1,0.073935,-1.860432,0.617959,2.0
2,-0.266533,-1.843051,1.284835,2.0
3,0.4682,-1.967577,-2.413139,2.0
4,-4.159022,0.719552,-1.157137,3.0


In [11]:


px.scatter_3d(X_train_pca_df, x='pca1', y='pca2', z='pca3', color='target', template='plotly_dark', width=950)

In [12]:


X_train_pca[:5]

array([[ 1.7891, -0.3706, -0.0132],
       [ 0.0739, -1.8604,  0.618 ],
       [-0.2665, -1.8431,  1.2848],
       [ 0.4682, -1.9676, -2.4131],
       [-4.159 ,  0.7196, -1.1571]])

In [13]:

X_test_pca[:5]

array([[-0.2985, -1.803 ,  1.2284],
       [-1.3818, -0.8288, -0.8348],
       [-2.1951,  0.4928, -0.0503],
       [ 2.2639,  0.8285, -0.995 ],
       [ 2.1796, -1.7171, -0.046 ]])