# Data Visualiztion Course Final Projet
> by Kfir Goldfarb

<a href="https://github.com/kggold4"><img src="images/github.png" width="25px" height="25px" align="left"></a>
<a href="https://www.linkedin.com/in/kfir-goldfarb/"><img src="images/in.png"  width="25px" height="25px" align="left"></a>
<a href="mailto:kfir.goldfarb@msmail.ariel.ac.il"><img src="images/email.png" width="25px" height="25px" align="left"></a>
<a href="https://www.youtube.com/channel/UCypEWlruyG_I5A48GqB5c6g"><img src="images/youtube.png" width="25px" height="25px" align="left"></a>
<a href="https://www.hackerrank.com/kggold4?hr_r=1"><img src="images/hackerrank.png" width="25px" height="25px" align="left"></a>
<a href="https://stackoverflow.com/users/14749277/kfir-goldfarb"><img src="images/stackoverflow.png" width="25px" height="25px" align="left"></a>


<br>

# Jupyter Notebook Number 2
<i>Dimensionality Reduction and PCA for Fashion MNIST</i>

In [24]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py
import warnings
warnings.filterwarnings('ignore')

# basic models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# sklearn
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

# ensemble
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

In [25]:
# import data:

# trains data
mnist_train = pd.read_csv("data/task_2/fashion-mnist_train.csv")
mnist_train = mnist_train.dropna()
X_train = mnist_train.iloc[:, 1:].values
y_train = mnist_train.iloc[:,0].values

# test data
mnist_test = pd.read_csv("data/task_2/fashion-mnist_test.csv")
mnist_test = mnist_test.dropna()

X_test = mnist_test.iloc[:,1:].values
y_test = mnist_test.iloc[:,0].values

# prints the data shape
print("shape of the train data is:", mnist_train.shape)
print("shape of the test data is: ", mnist_test.shape)

shape of the train data is: (60000, 785)
shape of the test data is:  (10000, 785)


In [30]:
print(y_train)
print(sum(y_train))

[2 9 6 ... 8 8 7]
270000


In [3]:
# first five test mnist:
# we can see that each row has 785 features
mnist_test.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,0,0,0,0,0,0,0,0,9,8,...,103,87,56,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,34,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,14,53,99,...,0,0,0,0,63,53,31,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,137,126,140,0,133,224,222,56,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# function that print the accuracy
def print_accuracy(clf, X_train, X_test, y_train, y_test):
    # fit the train data to the model
    clf.fit(X_train, y_train)

    # get the pridictions of X_test
    pridictions = clf.predict(X_test)

    # get the accuracy of the model
    accuracy = clf.score(X_test, y_test)

    # print the accuracy
    print('accuracy of the model is: {} %'.format("%.2f" % (accuracy * 100)))

In [5]:
# first five train mnist:
# we can see that each row has 785 features
mnist_train.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# seeing the mnist data widely
# we can see we have 10000 rows
mnist_test.describe()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.5,0.0004,0.0103,0.0521,0.077,0.2086,0.3492,0.8267,2.3212,5.4578,...,34.3208,23.0719,16.432,17.8706,22.86,17.7902,8.3535,2.5416,0.6295,0.0656
std,2.872425,0.024493,0.525187,2.494315,2.208882,4.669183,5.657849,8.591731,15.031508,23.359019,...,57.888679,49.049749,42.159665,44.140552,51.706601,45.128107,28.765769,16.417363,7.462533,1.93403
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,55.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,9.0,2.0,45.0,218.0,185.0,227.0,223.0,247.0,218.0,244.0,...,254.0,252.0,255.0,255.0,255.0,255.0,240.0,225.0,205.0,107.0


# 154 Component PCA:

In [14]:
pca_components = PCA(n_components=154)
X_transformed_train = pca_components.fit(X_train).transform(X_train)
print("shape of 2 component PCA test data is:", X_transformed.shape)

shape of 2 component PCA test data is: (60000, 154)


In [15]:
pca_components.explained_variance_ratio_

array([0.29011353, 0.17727668, 0.060227  , 0.04970272, 0.03839289,
       0.03470997, 0.02344044, 0.01913937, 0.01343605, 0.01310725,
       0.0098979 , 0.0091405 , 0.00765206, 0.00660427, 0.00609188,
       0.00591506, 0.00552046, 0.0053049 , 0.00459959, 0.00455534,
       0.00432823, 0.00406103, 0.00383469, 0.00374221, 0.00361877,
       0.0035091 , 0.00333586, 0.00318696, 0.00309619, 0.00295278,
       0.00276008, 0.00265339, 0.00262962, 0.0025768 , 0.0024756 ,
       0.00238413, 0.00230075, 0.00224349, 0.00219128, 0.00209739,
       0.00200786, 0.00197572, 0.00194214, 0.00183776, 0.00175782,
       0.00171549, 0.00168051, 0.00163924, 0.00162221, 0.00156038,
       0.00153962, 0.00150808, 0.00147632, 0.00142855, 0.00138375,
       0.00137028, 0.00130994, 0.00129029, 0.00127055, 0.00126082,
       0.00122874, 0.00120786, 0.00118017, 0.00116343, 0.00115288,
       0.00112007, 0.00111488, 0.00109532, 0.00106025, 0.00104929,
       0.00103988, 0.00102253, 0.00097361, 0.00095532, 0.00093

In [23]:
kmeans = KMeans(init="k-means++", n_clusters=2, n_init=35)
kmeans.fit(X_transformed)
print("score:", kmeans.score(X_transformed))



# X_transformed_test = pca_components.fit(X_test).transform(X_test)
# y_transformed_test = pca_components.fit(y_test).transform(y_test)

y_transformed_train = pca_components.fit(y_train).transform(y_train)


# x = kmeans.predict(X_transformed_test)
    
print(sum(x) / len(x))


score: -193593855416.6155


ValueError: Expected 2D array, got 1D array instead:
array=[2. 9. 6. ... 8. 8. 7.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [21]:
len(X_transformed_train)

60000

In [17]:
a_bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1)

# print accuracy
print_accuracy(a_bag_clf, X_transformed_train, X_test, y_train, y_test)

ValueError: Number of features of the model must match the input. Model n_features is 154 and input n_features is 784.