In [1]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import KernelPCA
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
import time

### Import IRIS data

In [2]:
# Download the Iris CSV; the file has no header row, so pandas numbers the columns.
url = "https://www.cs.nmsu.edu/~hcao/teaching/cs487519/data/iris.data"
iris_data = pd.read_csv(url, header=None)
print(iris_data.shape)
# Every column except the last is a feature; the last column is the class label.
iris_x = iris_data.iloc[:, :-1].values
iris_y = iris_data.iloc[:, -1].values
# Stratified 70/30 train/test split with a fixed seed.
iris_x_train, iris_x_test, iris_y_train, iris_y_test = train_test_split(
    iris_x,
    iris_y,
    test_size=0.3,
    random_state=1,
    stratify=iris_y,
)

(150, 5)


### Standardize the datasets

In [3]:
# Fit the scaler on the training split only, then apply the same scaling to
# both splits so no test-set statistics are used during fitting.
sc = StandardScaler().fit(iris_x_train)
iris_x_train_std, iris_x_test_std = (
    sc.transform(split) for split in (iris_x_train, iris_x_test)
)

### Apply decision tree on data without any reduction in dimensionality

In [25]:
# Time the construction and fit of a decision tree on the standardized Iris features.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
tree_model.fit(iris_x_train_std, iris_y_train)
print(" Running time: %s seconds " % (time.perf_counter() - start_time))
# Predict the test dataset by decision tree model
iris_y_pred = tree_model.predict(iris_x_test_std)
# Predict the train dataset by decision tree model (for the train-vs-test comparison)
iris_y_train_pred = tree_model.predict(iris_x_train_std)

 Running time: 0.008025884628295898 seconds 


In [5]:
# Test-set metrics for the tree trained on the original Iris features.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(iris_y_test, iris_y_pred))
print('Precision: %f' % precision_score(iris_y_test, iris_y_pred, average="micro"))
print('Recall: %f' % recall_score(iris_y_test, iris_y_pred, average="micro"))
print('F1: %f' % f1_score(iris_y_test, iris_y_pred, average="micro"))

Accuracy: 0.977778
Precision: 0.977778
Recall: 0.977778
F1: 0.977778


In [6]:
# Training-set metrics for the tree trained on the original Iris features.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(iris_y_train, iris_y_train_pred))
print('Precision: %f' % precision_score(iris_y_train, iris_y_train_pred, average="micro"))
print('Recall: %f' % recall_score(iris_y_train, iris_y_train_pred, average="micro"))
print('F1: %f' % f1_score(iris_y_train, iris_y_train_pred, average="micro"))

Accuracy: 0.971429
Precision: 0.971429
Recall: 0.971429
F1: 0.971429


### Principal Component Analysis (PCA)

In [21]:
# Reduce the 4 standardized Iris features to 3 principal components.
# The projection is fitted on the training split and reused for the test split.
pca = PCA(n_components=3)
iris_x_train_pca = pca.fit_transform(iris_x_train_std)
iris_x_test_pca = pca.transform(iris_x_test_std)

In [22]:
# Time the construction and fit of a decision tree on the PCA-reduced Iris data.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
tree_model.fit(iris_x_train_pca, iris_y_train)
print(" Running time: %s seconds " % (time.perf_counter() - start_time))
# Predict the test dataset by decision tree model
iris_y_pca_pred = tree_model.predict(iris_x_test_pca)
# Predict the train dataset by decision tree model (for the train-vs-test comparison)
iris_y_train_pca_pred = tree_model.predict(iris_x_train_pca)

 Running time: 0.006083488464355469 seconds 


In [9]:
# Test-set metrics for the tree trained on PCA-reduced Iris data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(iris_y_test, iris_y_pca_pred))
print('Precision: %f' % precision_score(iris_y_test, iris_y_pca_pred, average="micro"))
print('Recall: %f' % recall_score(iris_y_test, iris_y_pca_pred, average="micro"))
print('F1: %f' % f1_score(iris_y_test, iris_y_pca_pred, average="micro"))

Accuracy: 0.977778
Precision: 0.977778
Recall: 0.977778
F1: 0.977778


In [10]:
# Training-set metrics for the tree trained on PCA-reduced Iris data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(iris_y_train, iris_y_train_pca_pred))
print('Precision: %f' % precision_score(iris_y_train, iris_y_train_pca_pred, average="micro"))
print('Recall: %f' % recall_score(iris_y_train, iris_y_train_pca_pred, average="micro"))
print('F1: %f' % f1_score(iris_y_train, iris_y_train_pca_pred, average="micro"))

Accuracy: 0.990476
Precision: 0.990476
Recall: 0.990476
F1: 0.990476


### Linear Discriminant Analysis

In [48]:
# Project the standardized Iris features onto a single discriminant axis.
# LDA is supervised, so fitting uses the training labels.
lda = LDA(n_components=1)
lda.fit(iris_x_train_std, iris_y_train)
iris_x_train_lda = lda.transform(iris_x_train_std)
iris_x_test_lda = lda.transform(iris_x_test_std)

In [51]:
# Time the construction and fit of a decision tree on the LDA-reduced Iris data.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
tree_model.fit(iris_x_train_lda, iris_y_train)
print("Running time:  %s seconds " % (time.perf_counter() - start_time))
# Predict the test dataset by decision tree model
iris_y_lda_pred = tree_model.predict(iris_x_test_lda)
# Predict the train dataset by decision tree model (for the train-vs-test comparison)
iris_y_train_lda_pred = tree_model.predict(iris_x_train_lda)

Running time:  0.0043544769287109375 seconds 


In [37]:
# Test-set metrics for the tree trained on LDA-reduced Iris data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(iris_y_test, iris_y_lda_pred))
print('Precision: %f' % precision_score(iris_y_test, iris_y_lda_pred, average="micro"))
print('Recall: %f' % recall_score(iris_y_test, iris_y_lda_pred, average="micro"))
print('F1: %f' % f1_score(iris_y_test, iris_y_lda_pred, average="micro"))

Accuracy: 0.955556
Precision: 0.955556
Recall: 0.955556
F1: 0.955556


In [38]:
# Training-set metrics for the tree trained on LDA-reduced Iris data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(iris_y_train, iris_y_train_lda_pred))
print('Precision: %f' % precision_score(iris_y_train, iris_y_train_lda_pred, average="micro"))
print('Recall: %f' % recall_score(iris_y_train, iris_y_train_lda_pred, average="micro"))
print('F1: %f' % f1_score(iris_y_train, iris_y_train_lda_pred, average="micro"))

Accuracy: 0.990476
Precision: 0.990476
Recall: 0.990476
F1: 0.990476


### Kernel PCA

In [72]:
# Non-linear reduction of the standardized Iris features to 2 components
# using a sigmoid kernel; the mapping is fitted on the training split only.
kpca = KernelPCA(
    n_components=2,
    kernel='sigmoid',
    gamma=1,
)
iris_x_train_kpca = kpca.fit_transform(X=iris_x_train_std)
iris_x_test_kpca = kpca.transform(X=iris_x_test_std)

In [73]:
# Time the construction and fit of a decision tree on the Kernel-PCA-reduced Iris data.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
tree_model.fit(iris_x_train_kpca, iris_y_train)
print("Running time:  %s seconds " % (time.perf_counter() - start_time))
# Predict the test dataset by decision tree model
iris_y_kpca_pred = tree_model.predict(iris_x_test_kpca)
# Predict the train dataset by decision tree model (for the train-vs-test comparison)
iris_y_train_kpca_pred = tree_model.predict(iris_x_train_kpca)

Running time:  0.00549769401550293 seconds 


In [74]:
# Test-set metrics for the tree trained on Kernel-PCA-reduced Iris data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(iris_y_test, iris_y_kpca_pred))
print('Precision: %f' % precision_score(iris_y_test, iris_y_kpca_pred, average="micro"))
print('Recall: %f' % recall_score(iris_y_test, iris_y_kpca_pred, average="micro"))
print('F1: %f' % f1_score(iris_y_test, iris_y_kpca_pred, average="micro"))

Accuracy: 0.911111
Precision: 0.911111
Recall: 0.911111
F1: 0.911111


In [75]:
# Training-set metrics for the tree trained on Kernel-PCA-reduced Iris data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(iris_y_train, iris_y_train_kpca_pred))
print('Precision: %f' % precision_score(iris_y_train, iris_y_train_kpca_pred, average="micro"))
print('Recall: %f' % recall_score(iris_y_train, iris_y_train_kpca_pred, average="micro"))
print('F1: %f' % f1_score(iris_y_train, iris_y_train_kpca_pred, average="micro"))

Accuracy: 0.971429
Precision: 0.971429
Recall: 0.971429
F1: 0.971429


### Analysis of the different dimensionality reduction techniques with a decision tree classifier on the IRIS data

Three different methods are applied for reducing dimensionality on IRIS data. Firstly, a decision tree classification is applied for the original data which has 4 features. Then, the performance of these techniques is analyzed by feeding the dimension reduced data to a decision tree classifier.

The running time of the decision tree classifier on the original data is about 0.008 seconds and the accuracy on test and train datasets is approximately 97%.

The original data, which has 4 dimensions, is converted to 3 dimensions with the help of PCA. In this method, the running time is 0.006 seconds, which is less than the previous method due to dimensionality reduction. This technique leads to losing some of the variance (information). The accuracy on the test dataset is the same as in the previous method (97.8%), while the accuracy on the training dataset increased and reached 99%. Precision, recall, and f1 also have the same values.

The next method is LDA, which reduces the data to 1 dimension, with 95% accuracy (precision, recall, and f1) on the test dataset and 99% accuracy (precision, recall, and f1) on the training dataset. The running time is 0.004 seconds. Here, the data has only one dimension, fewer than in the previous method, so the running time is shorter. The accuracy on the training dataset is the same as with PCA and better than applying a decision tree on the original dataset. The accuracy on the test data is a little lower than with the two previous methods.

The last method is KPCA, which has 2 components and the kernel is 'sigmoid' with gamma=1. The running time is 0.005 seconds and the accuracy (precision, recall, and f1) on test and train datasets is 91% and 97%, respectively. Here, the accuracy, precision, recall, and f1 are less than the previous methods. This technique is a non-linear dimensionality reduction that reduces 4-dimensional data into 2 dimensions.
 
All of these techniques are tuned to have better accuracy, precision, recall, and f1.

### Import MNIST data and extract a subset 

In [76]:
# Fetch MNIST as plain arrays (features X, labels Y).
X, Y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
# Extract a stratified 2% subset of the MNIST data to avoid memory problems;
# the held-out 2% part of the split is the subset used below.
X_train, mnist_x, y_train, mnist_y = train_test_split(
    X, Y, test_size=0.02, random_state=1, stratify=Y
)
print(mnist_x.shape)
# Split the subset into stratified train (70%) and test (30%) datasets
mnist_x_train, mnist_x_test, mnist_y_train, mnist_y_test = train_test_split(
    mnist_x, mnist_y, test_size=0.3, random_state=1, stratify=mnist_y
)

(1400, 784)


### Standardize the datasets

In [77]:
# Fit the scaler on the MNIST training split only, then apply the same
# scaling to both splits so no test-set statistics are used during fitting.
sc1 = StandardScaler().fit(mnist_x_train)
mnist_x_train_std, mnist_x_test_std = (
    sc1.transform(split) for split in (mnist_x_train, mnist_x_test)
)

### Apply decision tree on data without any reduction in dimensionality

In [78]:
# Time the construction and fit of a decision tree on the standardized MNIST pixels.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=1)
tree_model.fit(mnist_x_train_std, mnist_y_train)
print("Running time:  %s seconds " % (time.perf_counter() - start_time))
# Predict the test dataset by decision tree model
mnist_y_pred = tree_model.predict(mnist_x_test_std)
# Predict the train dataset by decision tree model (for the train-vs-test comparison)
mnist_y_train_pred = tree_model.predict(mnist_x_train_std)

Running time:  0.5965995788574219 seconds 


In [79]:
# Test-set metrics for the tree trained on the original MNIST features.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(mnist_y_test, mnist_y_pred))
print('Precision: %f' % precision_score(mnist_y_test, mnist_y_pred, average="micro"))
print('Recall: %f' % recall_score(mnist_y_test, mnist_y_pred, average="micro"))
print('F1: %f' % f1_score(mnist_y_test, mnist_y_pred, average="micro"))

Accuracy: 0.695238
Precision: 0.695238
Recall: 0.695238
F1: 0.695238


In [80]:
# Training-set metrics for the tree trained on the original MNIST features.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(mnist_y_train, mnist_y_train_pred))
print('Precision: %f' % precision_score(mnist_y_train, mnist_y_train_pred, average="micro"))
print('Recall: %f' % recall_score(mnist_y_train, mnist_y_train_pred, average="micro"))
print('F1: %f' % f1_score(mnist_y_train, mnist_y_train_pred, average="micro"))

Accuracy: 1.000000
Precision: 1.000000
Recall: 1.000000
F1: 1.000000


### Principal Component Analysis (PCA)

In [81]:
# Reduce the 784 standardized pixel features to 50 principal components.
# For an input of this size PCA's default solver policy may select the
# randomized SVD, which draws random numbers; fixing random_state (the same
# seed used elsewhere in this notebook) keeps the projection reproducible.
pca1 = PCA(n_components=50, random_state=1)
mnist_x_train_pca = pca1.fit_transform(mnist_x_train_std)
mnist_x_test_pca = pca1.transform(mnist_x_test_std)

In [82]:
# Time the construction and fit of a decision tree on the PCA-reduced MNIST data.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=1)
tree_model.fit(mnist_x_train_pca, mnist_y_train)
print("Running time:  %s seconds " % (time.perf_counter() - start_time))
# Predict the test dataset by decision tree model
mnist_y_pca_pred = tree_model.predict(mnist_x_test_pca)
# Predict the train dataset by decision tree model (for the train-vs-test comparison)
mnist_y_train_pca_pred = tree_model.predict(mnist_x_train_pca)

Running time:  0.2814319133758545 seconds 


In [83]:
# Test-set metrics for the tree trained on PCA-reduced MNIST data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(mnist_y_test, mnist_y_pca_pred))
print('Precision: %f' % precision_score(mnist_y_test, mnist_y_pca_pred, average="micro"))
print('Recall: %f' % recall_score(mnist_y_test, mnist_y_pca_pred, average="micro"))
print('F1: %f' % f1_score(mnist_y_test, mnist_y_pca_pred, average="micro"))

Accuracy: 0.714286
Precision: 0.714286
Recall: 0.714286
F1: 0.714286


In [84]:
# Training-set metrics for the tree trained on PCA-reduced MNIST data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(mnist_y_train, mnist_y_train_pca_pred))
print('Precision: %f' % precision_score(mnist_y_train, mnist_y_train_pca_pred, average="micro"))
print('Recall: %f' % recall_score(mnist_y_train, mnist_y_train_pca_pred, average="micro"))
print('F1: %f' % f1_score(mnist_y_train, mnist_y_train_pca_pred, average="micro"))

Accuracy: 1.000000
Precision: 1.000000
Recall: 1.000000
F1: 1.000000


### Linear Discriminant Analysis

In [96]:
# Project the standardized MNIST features onto 9 discriminant axes.
# LDA is supervised, so fitting uses the training labels.
lda1 = LDA(n_components=9)
lda1.fit(mnist_x_train_std, mnist_y_train)
mnist_x_train_lda = lda1.transform(mnist_x_train_std)
mnist_x_test_lda = lda1.transform(mnist_x_test_std)

In [97]:
# Time the construction and fit of a decision tree on the LDA-reduced MNIST data.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=1)
tree_model.fit(mnist_x_train_lda, mnist_y_train)
print("Running time:  %s seconds " % (time.perf_counter() - start_time))
# Predict the test dataset by decision tree model
mnist_y_lda_pred = tree_model.predict(mnist_x_test_lda)
# Predict the train dataset by decision tree model (for the train-vs-test comparison)
mnist_y_train_lda_pred = tree_model.predict(mnist_x_train_lda)

Running time:  0.025817155838012695 seconds 


In [98]:
# Test-set metrics for the tree trained on LDA-reduced MNIST data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(mnist_y_test, mnist_y_lda_pred))
print('Precision: %f' % precision_score(mnist_y_test, mnist_y_lda_pred, average="micro"))
print('Recall: %f' % recall_score(mnist_y_test, mnist_y_lda_pred, average="micro"))
print('F1: %f' % f1_score(mnist_y_test, mnist_y_lda_pred, average="micro"))

Accuracy: 0.502381
Precision: 0.502381
Recall: 0.502381
F1: 0.502381


In [99]:
# Training-set metrics for the tree trained on LDA-reduced MNIST data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(mnist_y_train, mnist_y_train_lda_pred))
print('Precision: %f' % precision_score(mnist_y_train, mnist_y_train_lda_pred, average="micro"))
print('Recall: %f' % recall_score(mnist_y_train, mnist_y_train_lda_pred, average="micro"))
print('F1: %f' % f1_score(mnist_y_train, mnist_y_train_lda_pred, average="micro"))

Accuracy: 1.000000
Precision: 1.000000
Recall: 1.000000
F1: 1.000000


### Kernel PCA

In [100]:
# Non-linear reduction of the standardized MNIST features to 9 components
# using a sigmoid kernel; the mapping is fitted on the training split only.
# With this many samples and fewer than 10 components, KernelPCA's default
# eigen-solver policy may select ARPACK, which starts from a random vector;
# fixing random_state (the same seed used elsewhere in this notebook) keeps
# the embedding reproducible across runs.
kpca1 = KernelPCA(n_components=9, kernel='sigmoid', gamma=20, random_state=1)
mnist_x_train_kpca = kpca1.fit_transform(mnist_x_train_std)
mnist_x_test_kpca = kpca1.transform(mnist_x_test_std)

In [101]:
# Time the construction and fit of a decision tree on the Kernel-PCA-reduced MNIST data.
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
tree_model = DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=1)
tree_model.fit(mnist_x_train_kpca, mnist_y_train)
print("Running time:  %s seconds " % (time.perf_counter() - start_time))
# Predict the test dataset by decision tree model
mnist_y_kpca_pred = tree_model.predict(mnist_x_test_kpca)
# Predict the train dataset by decision tree model (for the train-vs-test comparison)
mnist_y_train_kpca_pred = tree_model.predict(mnist_x_train_kpca)

Running time:  0.059204816818237305 seconds 


In [102]:
# Test-set metrics for the tree trained on Kernel-PCA-reduced MNIST data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(mnist_y_test, mnist_y_kpca_pred))
print('Precision: %f' % precision_score(mnist_y_test, mnist_y_kpca_pred, average="micro"))
print('Recall: %f' % recall_score(mnist_y_test, mnist_y_kpca_pred, average="micro"))
print('F1: %f' % f1_score(mnist_y_test, mnist_y_kpca_pred, average="micro"))

Accuracy: 0.671429
Precision: 0.671429
Recall: 0.671429
F1: 0.671429


In [103]:
# Training-set metrics for the tree trained on Kernel-PCA-reduced MNIST data.
# scikit-learn metrics take (y_true, y_pred) in that order; micro-averaged
# scores are unaffected by the order, so the printed values do not change.
print('Accuracy: %f' % accuracy_score(mnist_y_train, mnist_y_train_kpca_pred))
print('Precision: %f' % precision_score(mnist_y_train, mnist_y_train_kpca_pred, average="micro"))
print('Recall: %f' % recall_score(mnist_y_train, mnist_y_train_kpca_pred, average="micro"))
print('F1: %f' % f1_score(mnist_y_train, mnist_y_train_kpca_pred, average="micro"))

Accuracy: 0.962245
Precision: 0.962245
Recall: 0.962245
F1: 0.962245


### Analysis of the different dimensionality reduction techniques with a decision tree classifier on the MNIST data

Here, the decision tree classifier is used on MNIST data in 4 different ways.

The first method is applying DT on the original data which has 784 features.
The running time is about 0.59 seconds. The accuracy on the test dataset is 69% and on the training dataset is 100%.

The second method is reducing the data to 50 dimensions with the usage of PCA.
The running time is about 0.28 seconds. The accuracy of test and training datasets is 71% and 100%, respectively. Here, by reducing the data space from 784 to 50, we reach the higher accuracy in the test dataset without losing accuracy in the training dataset.

The third method is applying DT on reduced data with LDA which has 9 components.
The running time is about 0.02 seconds. The accuracy on the test dataset is 50% and on the training dataset is 100%. Here, the accuracy on the test data is lower than in the previous method, but the running time is much shorter. Note that this accuracy is reached using a 9-dimensional space, compared to the 784-dimensional space of the original data.

The last method is using KPCA for reducing the dimensions of data into 9 and then applying DT.
The running time is about 0.05 seconds. The accuracy of test and training datasets is 67% and 96%, respectively. Here, we reach good accuracy for the test dataset by using the reduced data which is converted to the 9 dimensions space.

The advantage of reducing the dimensionality of data is significant when the data is too big. Here, to solve the memory problem, a subset of the MNIST data is used. The train_test_split function is used to obtain a subset of the data with 1400 instances. This data originally has 784 features.

The comparison of these dimensionality reduction methods shows that the running time is directly related to the number of dimensions in the reduced space. That is, the lower the dimensionality of the space into which the original features are projected, the more the running time is reduced.

LDA focuses on finding a feature subspace that maximizes the separability between the groups. While Principal component analysis is an unsupervised dimensionality reduction technique, it ignores the class label. PCA focuses on capturing the direction of maximum variation in the data set.

Both LDA and PCA are linear transformation techniques. LDA is supervised whereas PCA is unsupervised; PCA ignores class labels. PCA is a technique that finds the directions of maximal variance. In contrast to PCA, LDA attempts to find a feature subspace that maximizes class separability. This means that the disparity between the data groups is modeled by LDA, while PCA does not detect such a disparity between groups. The factor analysis in PCA constructs the combinations of features based on disparities rather than similarities, as in LDA. The discriminant analysis in LDA is different from the factor analysis conducted in PCA, where eigenvalues, eigenvectors, and covariance matrices are used. In addition, the number of components of LDA is limited to the number of classes minus 1, but the maximum number of components in PCA is the number of features. Thus, for the MNIST dataset, the number of components in LDA is much smaller than in PCA. Kernel PCA is a non-linear transformation technique that uses the kernel method.