<h1 style="text-align:center;">Computational Intelligence</h1>
<h2 style="text-align:center;">Clustering</h2>
<h4 style="text-align:center;">by H. Naderan</h4>
<h5 style="text-align:center;">
Mechanical Engineering Department<br>
Amirkabir University of Technology
</h5>

***

### Importing data

In [None]:
import pandas as pd

# Load the seeds dataset from the CSV file next to this notebook
df = pd.read_csv('seeds.csv')

# Display a random sample of 10 observations (all columns, including the
# species label). The fixed random_state keeps the displayed rows identical
# on every Restart & Run All.
df.sample(10, random_state=42)

### Preprocessing data

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# Bring the first seven columns (the numeric features) onto a common scale
scaledFeatures = MinMaxScaler().fit_transform(df.iloc[:, 0:7])

# The species column is kept aside; the plots below use it to colour points
labels = df["species"]

### Get the principal components

In [None]:
# Fit a PCA model that keeps two principal components
pca = PCA(n_components=2)
pca.fit(scaledFeatures)

# Project the scaled features onto those two components and peek at the first rows
reducedFeatures = pca.transform(scaledFeatures)
reducedFeatures[:10]

In [None]:
import matplotlib.pyplot as plt

# Colour map shared by the scatter plots in this notebook
cmap = plt.cm.jet

# Scatter two of the scaled features against each other, coloured by species label
plt.scatter(scaledFeatures[:, 2], scaledFeatures[:, 4], alpha=0.6, c=labels, cmap=cmap)
# scaledFeatures was built from df.columns[0:7], so positions 2 and 4 carry
# the same names as the plotted columns
plt.xlabel(df.columns[2])
plt.ylabel(df.columns[4])
plt.title('Scaled features')
plt.show()

In [None]:
# Scatter the two PCA coordinates of every observation, coloured by species label
plt.scatter(
    reducedFeatures[:, 0],
    reducedFeatures[:, 1],
    c=labels,
    cmap=cmap,
    alpha=0.6,
)
plt.title('Data')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.show()

### Explained variance

In [None]:
# Share of the total variance explained by each component of the fitted PCA.
# NOTE: `pca` is rebound further down the notebook, so this shows the
# two-component model fitted on the scaled features only when the cells are
# run top to bottom.
pca.explained_variance_ratio_

In [None]:
# Same scaled features, but keeping three components; the fitted model is
# not stored, only its explained-variance ratios are displayed
(
    PCA(n_components=3)
    .fit(scaledFeatures)
    .explained_variance_ratio_
)

In [None]:
import numpy as np

# Explained-variance ratio of each of seven principal components
# (one per scaled feature column)
evr = PCA(n_components = 7).fit(scaledFeatures).explained_variance_ratio_

# x runs from 1 to len(evr) so every point lines up with the number of
# components kept, whatever length evr has
plt.plot(range(1, len(evr) + 1), np.cumsum(evr))
plt.xlabel("Number of principal components")
plt.ylabel("Cumulative explained variance")
plt.show()

In [None]:
# Running total of the explained-variance ratios, shown as numbers
evr.cumsum()

### Dataset with a large number of features

In [None]:
from sklearn.datasets import load_digits
# Load scikit-learn's bundled handwritten-digits dataset
digits = load_digits()
# Display the shape of the feature matrix (samples, features)
digits.data.shape

### 2D projection

In [None]:
# Project from 64 to 2 dimensions
pca = PCA(n_components=2)
projected = pca.fit_transform(digits.data)

# Shapes before and after the projection, one per line
print(digits.data.shape, projected.shape, sep='\n')

In [None]:
# Scatter the 2D projection of the digits, coloured by the target digit
plt.scatter(
    projected[:, 0],
    projected[:, 1],
    c=digits.target,
    cmap=cmap,
    alpha=0.5,
    edgecolor='none',
)
plt.xlabel('component 1')
plt.ylabel('component 2')
# Trailing semicolon suppresses the colorbar repr in the cell output
plt.colorbar();

In [None]:
# Fit a PCA with no component limit on the digits data
pca = PCA().fit(digits.data)

# Pass explicit x values starting at 1. With a single argument, plt.plot
# uses x = 0..N-1, which would shift the curve one component to the left
# relative to the "number of components" axis.
plt.plot(range(1, pca.n_components_ + 1),
         np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.grid()

In [None]:
def plot_digits(data):
    """Show up to the first 40 rows of ``data`` as 8x8 images in a 4x10 grid.

    Parameters
    ----------
    data : sequence of arrays (e.g. a 2-D NumPy array)
        Each row is reshaped to ``(8, 8)`` before display, so it must hold
        exactly 64 values.

    Notes
    -----
    The colour limits are fixed to ``(0, 16)`` for every image. When ``data``
    has fewer than 40 rows, the remaining panels are left empty instead of
    raising ``IndexError``.
    """
    fig, axes = plt.subplots(4, 10, figsize=(10, 4),
                             subplot_kw={'xticks':[], 'yticks':[]},
                             gridspec_kw=dict(hspace=0.1, wspace=0.1))
    # zip stops at the shorter input, so a short ``data`` never indexes past its end
    for ax, image in zip(axes.flat, data):
        ax.imshow(image.reshape(8, 8),
                  cmap='binary', interpolation='nearest',
                  clim=(0, 16))
# Show the first 40 original (noise-free) digit images
plot_digits(digits.data)

### PCA as a noise-filtering method

In [None]:
# Seed NumPy's global random generator so the noise is the same on every run
np.random.seed(42)

# One Gaussian draw per pixel, centred on the original pixel value with
# standard deviation 4
noisy = np.random.normal(loc=digits.data, scale=4)

plot_digits(noisy)

In [None]:
# A fractional n_components asks PCA to keep as many components as are
# needed to explain that share (here 50%) of the variance
pca = PCA(n_components=0.50)
pca.fit(noisy)

# Number of components that were actually kept
pca.n_components_

In [None]:
# Project the noisy digits onto the components kept by the PCA fitted on `noisy`
components = pca.transform(noisy)
# Map the reduced representation back to the original pixel space; whatever
# lay outside the kept components is not restored
filtered = pca.inverse_transform(components)
# Show the first 40 reconstructed digits
plot_digits(filtered)