# Principal Component Analysis (PCA)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Read in the data and perform basic exploratory analysis

In [None]:
# Load the wine data set from its raw GitHub URL and preview the first ten rows.
WINE_DATA_URL = 'https://raw.githubusercontent.com/hukim1112/MLDL/master/lecture3/wine.data.csv'
df = pd.read_csv(WINE_DATA_URL)
df.head(10)

#### Basic statistics

In [None]:
# Summary statistics for every column except the first one
# (presumably the first column is the 'Class' label — verify against the CSV).
df.iloc[:,1:].describe()

#### Boxplots by output labels/classes

In [None]:
# Draw one boxplot per column after the first, grouped by the 'Class' label.
feature_columns = df.columns[1:]
for feature in feature_columns:
    df.boxplot(column=feature, by='Class', figsize=(7, 4), fontsize=14)
    # pyplot targets the figure that boxplot() just created.
    plt.title("{}\n".format(feature), fontsize=16)
    plt.xlabel("Wine Class", fontsize=16)

**It can be seen that some features classify the wine labels pretty clearly.** For example, Alcalinity, Total Phenols, or Flavonoids produce boxplots with well-separated medians, which are clearly indicative of wine classes.

Below is an example of class separation using two variables.

In [None]:
# Scatter two raw features against each other, coloured by the class label.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    df['OD280/OD315 of diluted wines'],
    df['Flavanoids'],
    c=df['Class'],
    edgecolors='k',
    alpha=0.75,
    s=150,
)
ax.grid(True)
ax.set_title("Scatter plot of two features showing the \ncorrelation and class seperation", fontsize=15)
ax.set_xlabel("OD280/OD315 of diluted wines", fontsize=15)
ax.set_ylabel("Flavanoids", fontsize=15)
plt.show()

#### Are the features independent? Plot the correlation matrix

It can be seen that there is a good amount of correlation between the features, i.e. they are not independent of each other, as the Naive Bayes technique assumes. However, we will still go ahead and apply the classifier to see its performance.

In [None]:
def correlation_matrix(df):
    """Plot the pairwise correlation matrix of ``df`` as a labelled heat map.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column-wise correlations (``df.corr()``) are displayed.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    from matplotlib import pyplot as plt

    # Compute the correlation once and take the labels from the result so the
    # tick labels always match the rows/columns that are actually drawn.
    corr = df.corr()
    labels = corr.columns
    positions = list(range(len(labels)))

    fig = plt.figure(figsize=(16,12))
    ax1 = fig.add_subplot(111)
    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was removed in
    # Matplotlib 3.9; 30 is the number of discrete colour levels.
    cmap = plt.get_cmap('jet', 30)
    cax = ax1.imshow(corr, interpolation="nearest", cmap=cmap)
    ax1.grid(True)
    plt.title('Wine data set features correlation\n',fontsize=15)
    # Pin one tick per matrix cell before labelling; without fixed positions
    # the labels are attached to the auto-chosen ticks and end up misaligned.
    ax1.set_xticks(positions)
    ax1.set_yticks(positions)
    # Rotate the x labels so long feature names do not overlap.
    ax1.set_xticklabels(labels,fontsize=9,rotation=90)
    ax1.set_yticklabels(labels,fontsize=9)
    # Colorbar ticks every 0.1 across the valid correlation range [-1, 1].
    fig.colorbar(cax, ticks=[0.1*i for i in range(-10,11)])
    plt.show()

# Render the correlation heat map for the full frame (the 'Class' column is still included here).
correlation_matrix(df)

## Principal Component Analysis

### Data scaling
PCA requires scaling/normalization of the data to work properly

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler that is fitted on the feature columns below, before running PCA.
scaler = StandardScaler()

In [None]:
# Separate the class-label vector from the feature matrix.
y = df['Class']
X = df.drop(columns='Class')

In [None]:
# Fit the scaler on the features and replace X with the scaled values.
# NOTE: X is rebound here, so re-running this cell on its own feeds the
# already-scaled output back into the scaler instead of the raw features.
X = scaler.fit_transform(X)

In [None]:
# Re-attach the feature names to the scaled values. The names are derived by
# dropping 'Class' (mirroring how X was built) rather than by slicing off the
# first column, so they stay correct wherever 'Class' sits in the CSV.
dfx = pd.DataFrame(data=X,columns=df.columns.drop('Class'))

In [None]:
# Preview the first ten rows of the scaled feature frame.
dfx.head(10)

In [None]:
# Summary statistics of the scaled features (sanity check on the scaling step).
dfx.describe()

### PCA class import and analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
# n_components=None requests no reduction up front: all components are kept
# so that their explained variance can be inspected below.
pca = PCA(n_components=None)

In [None]:
# fit() returns the fitted estimator (scikit-learn convention), so despite its
# name dfx_pca is the same PCA object as `pca` — it is NOT transformed data.
dfx_pca = pca.fit(dfx)

#### Plot the _explained variance ratio_

In [None]:
# Explained-variance share of each principal component, numbered from 1.
variance_ratios = dfx_pca.explained_variance_ratio_
component_numbers = list(range(1, len(variance_ratios) + 1))

plt.figure(figsize=(10, 6))
plt.scatter(x=component_numbers, y=variance_ratios,
            s=200, alpha=0.75, c='orange', edgecolor='k')
plt.grid(True)
plt.title("Explained variance ratio of the \nfitted principal component vector\n", fontsize=25)
plt.xlabel("Principal components", fontsize=15)
plt.ylabel("Explained variance ratio", fontsize=15)
plt.xticks(component_numbers, fontsize=15)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# Cumulative explained variance: entry k is the share of the total variance
# captured by the first k+1 principal components.
sums = np.cumsum(pca.explained_variance_ratio_).tolist()
n_components = len(sums)
component_numbers = range(1, n_components + 1)

plt.figure(figsize=(10, 6))
plt.bar(x=component_numbers, height=sums)
# Reference line at the 80% level, extending one slot past the first and last bar.
plt.hlines(0.8, xmin=0, xmax=n_components + 1)
plt.title("Cumulative explained variance ratio\n", fontsize=20)
plt.xlabel("Number of principal components", fontsize=15)
plt.ylabel("Cumulative explained variance ratio", fontsize=15)
plt.xticks(component_numbers)

plt.show()

**The above plot means that the $1^{st}$ principal component explains about 36% of the total variance in the data and the $2^{nd}$ component explains a further 20%. Therefore, if we just consider the first two components, they together explain 56% of the total variance.**

### Showing better class separation using principal components

#### Transform the scaled data set using the fitted PCA object

In [None]:
# Project the scaled features with the PCA object fitted above.
dfx_trans = pca.transform(dfx)

#### Put it in a data frame

In [None]:
# Wrap the projected values in a DataFrame; no column names are given, so the
# columns get default integer labels (used as dfx_trans[0], dfx_trans[1] below).
dfx_trans = pd.DataFrame(dfx_trans)
dfx_trans.head(10)

#### Plot the first two columns of this transformed data set with the color set to original ground truth class label

In [None]:
# First two principal components, coloured by the ground-truth class label.
first_pc = dfx_trans[0]
second_pc = dfx_trans[1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(first_pc, second_pc, c=df['Class'], edgecolors='k', alpha=0.75, s=150)
ax.grid(True)
ax.set_title("Class separation using first two principal components\n", fontsize=20)
ax.set_xlabel("Principal component-1", fontsize=15)
ax.set_ylabel("Principal component-2", fontsize=15)
plt.show()

# K-means clustering

In [None]:
from sklearn.cluster import KMeans

# .loc slices by label and includes the end label, so ":4" selects the
# columns labelled 0 through 4, i.e. five principal components.
pc_subset = dfx_trans.loc[:, :4]

# Three clusters with a fixed seed; fit() returns the estimator itself.
kms = KMeans(n_clusters=3, random_state=2021)
kms = kms.fit(pc_subset)

In [None]:
# Assign each sample to its nearest fitted centroid, using the same
# label-sliced columns (0 through 4, end label included) as for fitting.
clustering_input = dfx_trans.loc[:, :4]
cluster = kms.predict(clustering_input)
cluster

In [None]:
# Cluster ids assigned to the samples whose ground-truth class is 1.
cluster[y==1]

In [None]:
# Cluster ids assigned to the samples whose ground-truth class is 2.
cluster[y==2]

In [None]:
# Cluster ids assigned to the samples whose ground-truth class is 3.
cluster[y==3]