<a href="https://colab.research.google.com/github/inamansari21/datascience/blob/main/PCA_assng_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

In [None]:
# Load the wine dataset from a CSV file located in the current working directory
wine=pd.read_csv('wine.csv')
# Show the loaded frame as the cell output
wine

In [None]:
# Count how many rows carry each distinct value of the 'Type' column
wine['Type'].value_counts()

In [None]:
# Keep every column except the first one
# NOTE(review): presumably the first column is the 'Type' label, leaving only features — confirm against wine.csv
wine2=wine.iloc[:,1:]
wine2

In [None]:
# (rows, columns) of the frame that remains after dropping the first column
wine2.shape

In [None]:
# Print a summary of wine2: column names, non-null counts and dtypes
wine2.info()


In [None]:
# Pull the contents of the wine2 frame out as a NumPy array
wine_ary = wine2.to_numpy()
wine_ary

In [None]:
# Standardize the numerical data with sklearn's scale()
# (with its default arguments each column is centered to zero mean and scaled to unit variance)
wine_norm=scale(wine_ary)
wine_norm

**PCA Implementation**

In [None]:
# Fit PCA on the scaled data and project the data onto the principal components
# n_components=13 keeps 13 components
# NOTE(review): presumably 13 equals the number of feature columns, so nothing is discarded at this step — confirm
pca=PCA(n_components=13)

# wine_pca holds the transformed data returned by fit_transform
wine_pca=pca.fit_transform(wine_norm)
wine_pca

In [None]:
# Principal axes in feature space found by PCA: one row per component, one column per input feature.
# Note: this is not the covariance matrix of the data (sklearn exposes that via pca.get_covariance()).
pca.components_

In [None]:
# Fraction of the total variance explained by each principal component
var=pca.explained_variance_ratio_
var

In [None]:
# Cumulative explained variance (in percent) after each principal component.
# Accumulate the exact ratios first and round once at the end, so that the
# per-component rounding errors do not add up in the running total.
var1 = np.round(np.cumsum(var) * 100, 2)
var1

In [None]:
# Cumulative explained variance versus the number of principal components kept.
# The x-axis is 1-based so that x = k reads as "the first k components"
# (plotting var1 alone would use the 0-based array index instead).
component_counts = np.arange(1, len(var1) + 1)
plt.plot(component_counts, var1, color='magenta', marker='o')
plt.xticks(component_counts)
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance (%)')
plt.title('Cumulative explained variance by PCA components')
plt.show()

In [None]:
# Final DataFrame: the 'Type' column side by side with the first three principal components
pc_scores = pd.DataFrame(wine_pca[:, :3], columns=['PC1', 'PC2', 'PC3'])
final_df = pd.concat([wine['Type'], pc_scores], axis=1)
final_df

In [None]:
# Visualization of the PCA result: the first two principal components plotted
# against each other, with the points coloured by the 'Type' column.
# (Passing only data=final_df is treated as wide-form data: every column,
# including 'Type' itself, is drawn against the row index.)
fig, ax = plt.subplots(figsize=(16, 12))
# A qualitative palette makes seaborn treat the hue values as discrete categories
sns.scatterplot(data=final_df, x='PC1', y='PC2', hue='Type', palette='deep', ax=ax)
ax.set_title('Wine samples on the first two principal components')
plt.show()

**Checking with other Clustering Algorithms**

**1. Hierarchical Clustering**

In [None]:
# Import Libraries
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize

In [None]:
# The data is already scaled, so the dendrogram is built directly from wine_norm
plt.figure(figsize=(10, 8))
# Hierarchical linkage of the samples using the 'complete' method
linkage_matrix = sch.linkage(wine_norm, method='complete')
dendrogram = sch.dendrogram(linkage_matrix)

In [None]:
# Create the hierarchical clustering model with 3 clusters.
# Ward linkage only supports Euclidean distances, which is also sklearn's default,
# so the distance argument is left out: the `affinity` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4 (renamed to `metric`), and passing it raises
# a TypeError on current releases.
hclusters = AgglomerativeClustering(n_clusters=3, linkage='ward')
hclusters

In [None]:
# Fit the hierarchical model and collect the cluster label of every row
cluster_ids = hclusters.fit_predict(wine_norm)
y = pd.DataFrame({'clustersid': cluster_ids})
# Number of rows assigned to each cluster
y['clustersid'].value_counts()

In [None]:
# Attach the hierarchical cluster labels to a copy of the original dataset
wine3 = wine.assign(clustersid=hclusters.labels_)
wine3

**2. K-Means Clustering**

In [None]:
# Import Libraries
from sklearn.cluster import KMeans

In [None]:
# The data is already normalized (standardized), so it can be clustered directly.
# Use an elbow graph to find the optimum number of clusters (K) over a range of K values.
# The K-means algorithm aims to choose centroids that minimise the inertia, i.e. the within-cluster sum-of-squares (WCSS) criterion.
# random_state can be any integer; keep the same value on every run so that the results don't change.

In [None]:
# Within-cluster sum of squares (inertia) for K = 1..5.
# n_init is pinned to 10 (the default before scikit-learn 1.4) so the curve does
# not depend on the installed version: newer releases default to n_init='auto',
# which runs a single initialisation for the default k-means++ init.
wcss=[]
for i in range(1,6):
    kmeans=KMeans(n_clusters=i,n_init=10,random_state=2)
    kmeans.fit(wine_norm)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow graph: WCSS against the number of clusters K, used to choose K
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(range(1, 6), wcss)
elbow_ax.set_title('Elbow Graph')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('WCSS')
plt.show()

**Build Cluster algorithm using K=3**

In [None]:
# Final K-Means model with K=3, fitted on the scaled data.
# n_init is pinned to 10 (the default before scikit-learn 1.4) so the resulting
# clusters do not depend on the installed version; random_state fixes the seed.
clusters3 = KMeans(n_clusters=3, n_init=10, random_state=30).fit(wine_norm)
clusters3


In [None]:
# Cluster label assigned to each row of wine_norm by the K=3 model
clusters3.labels_

In [None]:
# Attach the K-Means cluster labels to a copy of the original dataset
wine4 = wine.assign(clusters3id=clusters3.labels_)
wine4

In [None]:
# Number of rows assigned to each K-Means cluster
wine4['clusters3id'].value_counts()