<a href="https://colab.research.google.com/github/idziksmart/data-science/blob/master/PCA_wine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

np.set_printoptions(precision=4, suppress=True, edgeitems=5, linewidth=200)

# Zaladowanie danych

In [3]:
df_raw = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
df = df_raw.copy()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [4]:
data = df.iloc[:, 1:]
target = df.iloc[:, 0]
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
target.value_counts()

2    71
1    59
3    48
Name: 0, dtype: int64

# Podział na zbiór treningowy i testowy

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, target)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')

X_train shape: (133, 13)
X_test shape: (45, 13)


# Standaryzacja

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
X_train_std[:5]

array([[-0.3598,  1.1977, -0.0016,  0.6515,  0.3991, -0.9482, -0.7825, -1.54  , -1.2095,  0.0002, -0.837 , -1.823 , -0.4895],
       [ 0.426 , -0.4956, -0.7735, -0.7436, -0.4303,  0.1904,  0.198 , -0.7343, -0.3495, -0.4641,  0.2622,  0.241 ,  1.759 ],
       [-0.2681,  0.1243, -0.2823,  0.0315, -0.9832, -1.4525, -1.4658,  0.9578, -1.5468,  2.1666, -1.7987, -1.3489, -0.9163],
       [ 1.0022, -0.5419, -0.3876, -0.9296,  1.2285,  0.5158,  0.9012, -1.2177,  0.1057,  0.376 , -0.1958,  0.8406,  1.0205],
       [-0.3336, -0.4124, -0.563 , -0.1855, -0.9832, -0.1349,  0.5347, -0.8148,  0.3587, -0.4862,  0.9034,  0.757 , -0.1153]])

# PCA

In [8]:
from sklearn.decomposition import PCA

# robimy zwykle 3 zeby zobaczyc wykres ;)
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
X_train_pca.shape

(133, 3)

## Wyjaśniona wariancja

In [9]:
results = pd.DataFrame(data={'explained_variance_ratio': pca.explained_variance_ratio_})
results['cumulative'] = results['explained_variance_ratio'].cumsum()
results['component'] = results.index + 1
results

Unnamed: 0,explained_variance_ratio,cumulative,component
0,0.354853,0.354853,1
1,0.182509,0.537362,2
2,0.109844,0.647206,3


In [10]:

fig = go.Figure(data=[go.Bar(x=results['component'], y=results['explained_variance_ratio'], name='explained variance ratio'),
                      go.Scatter(x=results['component'], y=results['cumulative'], name='cumulative explained variance')],
                layout=go.Layout(title=f'PCA - {pca.n_components_} components', width=950, template='plotly_dark'))
fig.show()


In [11]:
X_train_pca_df = pd.DataFrame(data=np.c_[X_train_pca, y_train], columns=['pca1', 'pca2', 'pca3', 'target'])
X_train_pca_df.head()

Unnamed: 0,pca1,pca2,pca3,target
0,2.138127,0.081655,-0.092596,3.0
1,-1.26486,-0.201761,-1.492388,1.0
2,3.770079,0.690397,-1.308927,3.0
3,-2.1351,0.855426,-1.115507,1.0
4,-1.072424,-1.368625,-0.039715,2.0


In [12]:
px.scatter_3d(X_train_pca_df, x='pca1', y='pca2', z='pca3', color='target', template='plotly_dark', width=950)

In [13]:
X_train_pca[:5]

array([[ 2.1381,  0.0817, -0.0926],
       [-1.2649, -0.2018, -1.4924],
       [ 3.7701,  0.6904, -1.3089],
       [-2.1351,  0.8554, -1.1155],
       [-1.0724, -1.3686, -0.0397]])

In [14]:
X_test_pca[:5]

array([[-3.6933,  2.9162, -0.6967],
       [ 2.3859,  0.0733, -0.5299],
       [-1.614 , -1.0945,  2.0578],
       [ 2.6631,  1.841 , -0.2028],
       [ 0.4   , -1.8476,  2.4089]])