In [65]:
# Importing Libraries
import numpy as np  
import pandas as pd

In [66]:
# Load the Iris data from the UCI repository; the column labels are supplied
# explicitly through `names`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'Class',
]
dataset = pd.read_csv(url, names=names)

In [67]:
# Preview the first five rows to check how the columns were parsed
dataset.head(n=5)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [68]:
# Divide the dataset into a feature set (every column except 'Class') and the
# corresponding labels (the 'Class' column).
# `columns=` replaces the positional axis argument `drop('Class', 1)`, which
# pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
X = dataset.drop(columns='Class')
y = dataset['Class']

In [69]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [70]:
# Standardize the features before PCA. The scaler is fitted on the training
# split only and then applied as-is to the test split.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Keep the scaled (pre-PCA) arrays under their own names so that later cells
# can restart from them after X_train / X_test have been overwritten.
X_train_original = sc.fit_transform(X_train)
X_test_original = sc.transform(X_test)
X_train = X_train_original
X_test = X_test_original

In [71]:
# PCA is fitted on the feature set alone; the labels are never passed in,
# which is why it counts as an unsupervised technique.
from sklearn.decomposition import PCA

# n_components=None is the default: no limit is placed on the number of components
pca = PCA(n_components=None)
X_train = pca.fit_transform(X_train_original)
X_test = pca.transform(X_test_original)

In [72]:
# Fraction of the total variance explained by each principal component
explained_variance = pca.explained_variance_ratio_ 
print(explained_variance)

[0.72226528 0.23974795 0.03338117 0.0046056 ]


In [73]:
# Let's first try to use 1 principal component to train our algorithm.
# PCA was already imported by an earlier cell, so it is not imported again.
pca = PCA(n_components=1)
X_train = pca.fit_transform(X_train_original)
X_test = pca.transform(X_test_original)

In [74]:
# Train a shallow random forest on the current training features, then
# classify the test set.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the fitted estimator, so construction and training are chained
classifier = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)

y_pred = classifier.predict(X_test)

In [75]:
# Score the predictions: confusion matrix first, then overall accuracy
from sklearn.metrics import accuracy_score, confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy ' + str(accuracy_score(y_test, y_pred)))

[[11  0  0]
 [ 0 12  1]
 [ 0  1  5]]
Accuracy 0.9333333333333333


In [77]:
# Next, use 2 principal components to train our algorithm.
# PCA, RandomForestClassifier, confusion_matrix and accuracy_score were already
# imported by earlier cells, so they are not imported again here.
pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train_original)
X_test = pca.transform(X_test_original)

# Training
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Performance Evaluation
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy ' + str(accuracy_score(y_test, y_pred)))


[[11  0  0]
 [ 0 10  3]
 [ 0  2  4]]
Accuracy 0.8333333333333334


In [78]:
# Now use 3 principal components to train our algorithm.
# PCA, RandomForestClassifier, confusion_matrix and accuracy_score were already
# imported by earlier cells, so they are not imported again here.
pca = PCA(n_components=3)
X_train = pca.fit_transform(X_train_original)
X_test = pca.transform(X_test_original)

# Training
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Performance Evaluation
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy ' + str(accuracy_score(y_test, y_pred)))


[[11  0  0]
 [ 0 12  1]
 [ 0  1  5]]
Accuracy 0.9333333333333333


In [81]:
# Results with the full feature set: no PCA is applied in this cell, so the
# classifier is trained directly on the standardized features kept in
# X_train_original / X_test_original.
# RandomForestClassifier, confusion_matrix and accuracy_score were already
# imported by earlier cells, so they are not imported again here.
X_train = X_train_original
X_test = X_test_original

# Training
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Performance Evaluation
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy ' + str(accuracy_score(y_test, y_pred)))


[[11  0  0]
 [ 0 13  0]
 [ 0  2  4]]
Accuracy 0.9333333333333333
