In [101]:
from math import * 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
import matplotlib.pyplot as plt

In [102]:
iris = pd.read_csv("iris.data", header=None)
iris.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [103]:
# clean data by removing rows with blank values
def cleanData(data):
    data = data.replace('-', np.nan)
    data = data.replace('', np.nan)
    data = data.dropna()
    return data

In [104]:
def dropColumns(data):
    data = data.drop(data.columns[-1], axis=1)
    return data

In [105]:
def eigenDecomposition(cov):
    # finding eigen values & eigen vectors
    eigenValues, eigenVectors = np.linalg.eig(cov)
    return eigenValues, eigenVectors


def modifiedVariance(eigenValues):
    var = eigenValues / np.sum(eigenValues)
    return var


def bestVectors(n, variances, eigenVectors):
    features = []
    for _ in range(n):
        temp = np.argmax(variances)
        features.append(temp)
        variances[temp] = 0
    return eigenVectors[:, features]


def principalComponents(vectors, data):
    return np.dot(vectors.T, data.T).T

In [106]:
def plotPrincipalComponents(principalComponents, dataMatrix, results):
    plt.figure()
    plt.title("Reconstructed data")
    plt.plot(PC)
    plt.show()

In [107]:
# removing unecessary values
iris = cleanData(iris)
results = iris[4]
data = dropColumns(iris)
dataMatrix = data.values

# finding mean
means = np.mean(dataMatrix, axis=0)
# subtracting mean
dataMatrix = dataMatrix-means

# finding covariance
cov = np.cov(dataMatrix.T)

eigenValues, eigenVectors = eigenDecomposition((len(dataMatrix)-1)*cov)
variances = modifiedVariance(eigenValues)

# finding top vectors
vectors = bestVectors(3, variances, eigenVectors)
magnitude = []
for i in vectors:
    temp = []
    for j in i:
        temp.append(j**2)
        
    magnitude.append(sum(temp))

print("Top 3 vectors for PC 1: \n", vectors)
# plotting eigen values in sorted order
plt.figure()
plt.title("Eigen values")
plt.plot(magnitude)
plt.show()

Top 3 vectors for PC 1: 
 [[ 0.36158968 -0.65653988 -0.58099728]
 [-0.08226889 -0.72971237  0.59641809]
 [ 0.85657211  0.1757674   0.07252408]
 [ 0.35884393  0.07470647  0.54906091]]


In [108]:
PC1 = principalComponents(vectors, dataMatrix)
print("Principal components: \n", PC1)

plotPrincipalComponents(PC1, dataMatrix, results)

Principal components: 
 [[-2.68420713 -0.32660731 -0.02151184]
 [-2.71539062  0.16955685 -0.20352143]
 [-2.88981954  0.13734561  0.02470924]
 [-2.7464372   0.31112432  0.03767198]
 [-2.72859298 -0.33392456  0.0962297 ]
 [-2.27989736 -0.74778271  0.17432562]
 [-2.82089068  0.08210451  0.26425109]
 [-2.62648199 -0.17040535 -0.01580151]
 [-2.88795857  0.57079803  0.02733541]
 [-2.67384469  0.1066917  -0.1915333 ]
 [-2.50652679 -0.65193501 -0.069275  ]
 [-2.61314272 -0.02152063  0.10765035]
 [-2.78743398  0.22774019 -0.20032779]
 [-3.22520045  0.50327991  0.06841363]
 [-2.64354322 -1.1861949  -0.1445057 ]
 [-2.38386932 -1.34475434  0.28373066]
 [-2.6225262  -0.81808967  0.14531599]
 [-2.64832273 -0.31913667  0.03339425]
 [-2.19907796 -0.87924409 -0.11452146]
 [-2.58734619 -0.52047364  0.21957209]
 [-2.3105317  -0.39786782 -0.23369561]
 [-2.54323491 -0.44003175  0.21483637]
 [-3.21585769 -0.14161557  0.29961898]
 [-2.30312854 -0.10552268  0.04568004]
 [-2.35617109  0.03120959  0.12940758]
 

In [109]:
# finding top vectors
vectors = bestVectors(5, variances, eigenVectors)
magnitude = []
for i in vectors:
    temp = []
    for j in i:
        temp.append(j**2)
        
    magnitude.append(sum(temp))

print("Top 3 vectors for PC 2: \n", vectors)
# plotting eigen values in sorted order
plt.figure()
plt.title("Eigen values")
plt.plot(magnitude)
plt.show()

Top 3 vectors for PC 2: 
 [[ 0.31725455  0.36158968  0.36158968  0.36158968  0.36158968]
 [-0.32409435 -0.08226889 -0.08226889 -0.08226889 -0.08226889]
 [-0.47971899  0.85657211  0.85657211  0.85657211  0.85657211]
 [ 0.75112056  0.35884393  0.35884393  0.35884393  0.35884393]]


In [110]:
PC2 = principalComponents(vectors, dataMatrix)
print("Principal components: \n", PC1)

plotPrincipalComponents(PC2, dataMatrix, results)

Principal components: 
 [[-2.68420713 -0.32660731 -0.02151184]
 [-2.71539062  0.16955685 -0.20352143]
 [-2.88981954  0.13734561  0.02470924]
 [-2.7464372   0.31112432  0.03767198]
 [-2.72859298 -0.33392456  0.0962297 ]
 [-2.27989736 -0.74778271  0.17432562]
 [-2.82089068  0.08210451  0.26425109]
 [-2.62648199 -0.17040535 -0.01580151]
 [-2.88795857  0.57079803  0.02733541]
 [-2.67384469  0.1066917  -0.1915333 ]
 [-2.50652679 -0.65193501 -0.069275  ]
 [-2.61314272 -0.02152063  0.10765035]
 [-2.78743398  0.22774019 -0.20032779]
 [-3.22520045  0.50327991  0.06841363]
 [-2.64354322 -1.1861949  -0.1445057 ]
 [-2.38386932 -1.34475434  0.28373066]
 [-2.6225262  -0.81808967  0.14531599]
 [-2.64832273 -0.31913667  0.03339425]
 [-2.19907796 -0.87924409 -0.11452146]
 [-2.58734619 -0.52047364  0.21957209]
 [-2.3105317  -0.39786782 -0.23369561]
 [-2.54323491 -0.44003175  0.21483637]
 [-3.21585769 -0.14161557  0.29961898]
 [-2.30312854 -0.10552268  0.04568004]
 [-2.35617109  0.03120959  0.12940758]
 

In [111]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

model = GaussianNB()
X_train, X_test, Y_train,  Y_test = train_test_split(PC1, results.ravel(), test_size=0.2)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
print(classification_report(Y_test, predictions))

model = GaussianNB()
X_train, X_test, Y_train,  Y_test = train_test_split(PC2, results.ravel(), test_size=0.2)
model.fit(X_train, Y_train)

predictions = model.predict(X_test)
print(classification_report(Y_test, predictions))

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       0.77      1.00      0.87        10
 Iris-virginica       1.00      0.67      0.80         9

       accuracy                           0.90        30
      macro avg       0.92      0.89      0.89        30
   weighted avg       0.92      0.90      0.90        30

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       0.89      0.89      0.89         9
 Iris-virginica       0.90      0.90      0.90        10

       accuracy                           0.93        30
      macro avg       0.93      0.93      0.93        30
   weighted avg       0.93      0.93      0.93        30

