# Appendix

In [None]:
# Helper libraries
import numpy as np
import pandas as pd
from time import time
from collections import Counter

import matplotlib
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.cluster import homogeneity_score

In [None]:
import warnings


def warn(*args, **kwargs):
    """Accept any warning call and discard it without emitting anything."""
    return None


# Swap the stdlib entry point so every later warnings.warn(...) call is a no-op.
warnings.warn = warn

In [None]:
# Load the training CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
training_df = pd.read_csv("/Users/gurjy/Downloads/train.csv")


In [None]:
# Print the (rows, columns) shape of the loaded training frame.
print(training_df.shape)

In [None]:
# Detach the "label" column: pop() hands it back as a Series and removes it
# from training_df in place, leaving only the feature columns behind.
target = training_df.pop("label")

In [None]:
# Display the shape of the label Series split off from the training frame.
target.shape

In [None]:
X = training_df.values
# Scale every feature column to zero mean / unit variance.
X_std = StandardScaler().fit_transform(X)

# Eigen-decomposition of the covariance matrix of the scaled features.
mean_vec = np.mean(X_std, axis=0)
cov_mat = np.cov(X_std.T)
# A covariance matrix is symmetric, so use eigh: it guarantees real
# eigenvalues and orthonormal eigenvectors. The general-purpose eig() can
# return complex values for such matrices when eigenvalues are (near-)repeated,
# which breaks the sorting and the variance ratios computed below.
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

# (|eigenvalue|, eigenvector) tuples; eigenvector i is column i of eig_vecs.
eig_pairs = [(np.abs(val), eig_vecs[:, i]) for i, val in enumerate(eig_vals)]

# Order the pairs from the largest eigenvalue to the smallest.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Explained variance derived from the eigenvalues, in percent.
tot = sum(eig_vals)
var_exp = [(val / tot) * 100 for val in sorted(eig_vals, reverse=True)]  # per component
cum_var_exp = np.cumsum(var_exp)  # running total over components

In [None]:
# Same decomposition as the eig_vals / eig_pairs cell, kept under the
# eigvalues / eigpairs names; it rebinds var_exp and cum_var_exp.
X = training_df.values
X_std = StandardScaler().fit_transform(X)

mean_vec = np.mean(X_std, axis=0)
cov_mat = np.cov(X_std.T)
# The covariance matrix is symmetric: eigh returns real eigenvalues, whereas
# the general eig() may yield complex ones that cannot be sorted reliably.
eigvalues, eigvectors = np.linalg.eigh(cov_mat)

# (|eigenvalue|, eigenvector) tuples; eigenvector i is column i of eigvectors.
eigpairs = [(np.abs(val), eigvectors[:, i]) for i, val in enumerate(eigvalues)]

# Largest eigenvalue first.
eigpairs.sort(key=lambda pair: pair[0], reverse=True)

# Individual and cumulative explained variance, in percent.
tot = sum(eigvalues)
var_exp = [(val / tot) * 100 for val in sorted(eigvalues, reverse=True)]
cum_var_exp = np.cumsum(var_exp)

In [None]:
# Explained-variance curves: shows how many principal components are needed to
# describe the data in a lower dimension.
# One x position per component, taken from the data rather than a fixed 784 so
# the x and y arrays always have matching lengths.
component_index = list(range(len(var_exp)))

trace1 = go.Scatter(
    x=component_index,
    y=cum_var_exp,
    mode='lines+markers',
    name="'Cumulative Explained Variance'",
    line=dict(shape='spline', color='goldenrod'),
)
trace2 = go.Scatter(
    x=component_index,
    y=var_exp,
    mode='lines+markers',
    name="'Individual Explained Variance'",
    line=dict(shape='linear', color='black'),
)

# Both traces are appended to cell (1, 1) of the figure.
fig = tls.make_subplots(insets=[{'cell': (1,1), 'l': 0.7, 'b': 0.5}],
                          print_grid=True)
fig.append_trace(trace1, 1, 1)
fig.append_trace(trace2, 1, 1)

fig.layout.title = 'explained Variance plots'
fig.layout.xaxis = dict(range=[0, 800], title='Feature columns')
fig.layout.yaxis = dict(range=[0, 100], title='explained variance')

py.iplot(fig, filename='inset example')

In [None]:
# Fit a PCA that keeps 30 principal components of the standardised data.
pca = PCA(n_components=30)
pca.fit(X_std)

In [None]:
# Project the standardised data onto the components of the fitted PCA.
X_pca=pca.transform(X_std)

In [None]:
# Shape of the PCA-projected data.
X_pca.shape

In [None]:
# Shape of the standardised data before projection, for comparison.
X_std.shape

In [None]:
# Principal axes of the fitted PCA (scikit-learn stores one component per row),
# followed by their shape as the cell output.
eigenvectors=pca.components_
eigenvectors.shape

In [None]:
# Draw the first x_row * y_col principal axes as 28x28 images on one grid.
x_row = 4
y_col = 7

plt.figure(figsize=(17, 16))
for panel in range(1, x_row * y_col + 1):
    plt.subplot(x_row, y_col, panel)
    plt.imshow(eigenvectors[panel - 1].reshape(28, 28), cmap='twilight_shifted')
    plt.title(f'Eigenvector{panel}')
    # Empty tick tuples hide the axis ticks on every panel.
    plt.xticks(())
    plt.yticks(())
plt.show()

In [None]:
# Preview the first 70 training rows on a 7x10 grid, each titled with its label.
plt.figure(figsize=(12, 13))

for i in range(70):
    plt.subplot(7, 10, i + 1)
    # Positional lookup, so the title always belongs to the row drawn below
    # (which is also fetched with .iloc) whatever index the Series carries.
    plt.title(target.iloc[i])
    plt.imshow(training_df.iloc[i].values.reshape(28, 28), interpolation="none", cmap='binary')
    plt.xticks([])
    plt.yticks([])
plt.tight_layout()
# The original ended with a bare `plt.tight_layout` (no call), which did
# nothing except echo the function object as cell output; show the figure instead.
plt.show()

In [None]:
# Author's note: 140 principal components explain about 80% of the data.
# Standardise the pixel frame and project it onto 140 components.
Target = target
X_ = training_df
X_std_ = StandardScaler().fit_transform(X_)
pca_ = PCA(n_components=140)
X_140d = pca_.fit_transform(X_std_)

In [None]:
# Scatter of the first two columns of the 140-component projection,
# coloured by the true label.
marker_style = dict(
    size=8,
    color=Target,
    colorscale='Jet',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
    opacity=0.8,
)
trace = go.Scatter(
    x=X_140d[:, 0],
    y=X_140d[:, 1],
    # NOTE(review): str(Target) is the printed form of the whole label Series,
    # not a short trace name.
    name=str(Target),
    mode='markers',
    text=Target,
    showlegend=False,
    marker=marker_style,
)
data = [trace]

x_axis = dict(title='First principal direction', ticklen=5, zeroline=False)
y_axis = dict(title='Second principal direction', ticklen=5)
layout = go.Layout(
    title='PCA',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=True,
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='pca')

In [None]:
# See how well clustering does on the principal components:
# split the 140-component projection into 10 K-Means clusters.
# NOTE(review): no random_state is passed, so assignments can differ between runs.
kmeans = KMeans(n_clusters=10)
X_clustered140 = kmeans.fit_predict(X_140d)

In [None]:
# Same two leading columns of the 140-component projection, this time coloured
# by the K-Means cluster id.
marker_style = dict(
    size=8,
    color=X_clustered140,
    colorscale='Portland',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
)
tracekmeans = go.Scatter(
    x=X_140d[:, 0],
    y=X_140d[:, 1],
    mode="markers",
    showlegend=False,
    marker=marker_style,
)

x_axis = dict(title='first principal direction', ticklen=5, zeroline=False, gridwidth=2)
y_axis = dict(title='second principal component', ticklen=5, gridwidth=2)
layout = go.Layout(
    title='Kmeans clustering',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=True,
)

data = [tracekmeans]
fig1 = dict(data=data, layout=layout)
py.iplot(fig1, filename="svm")

In [None]:
# Wrap the true labels and the 140-component cluster ids in integer
# DataFrames so they can be cross-tabulated.
targeted_df = pd.DataFrame(Target, dtype=int)
x_clusters_df = pd.DataFrame(X_clustered140, columns=['Cluster'], dtype=int)

In [None]:
# Contingency table: rows are the true labels, columns the K-Means cluster ids.
pd.crosstab(targeted_df.label, x_clusters_df.Cluster)

In [None]:
# Three clustering metrics follow: homogeneity, silhouette and completeness.
# Homogeneity of the 140-component clustering against the true labels.
homogeneity_score(Target, X_clustered140)

In [None]:
# Silhouette score of the clustering, computed on the 140-component projection itself.
metrics.silhouette_score(X_140d, X_clustered140)

In [None]:
# Completeness of the 140-component clustering against the true labels.
metrics.completeness_score(Target, X_clustered140)

In [None]:
# Repeat the experiment with 319 components (and, further down, with 784).
# Standardise the pixel frame and project it onto 319 components.
Target = target
X_ = training_df
X_std_ = StandardScaler().fit_transform(X_)
pca_ = PCA(n_components=319)
X_319d = pca_.fit_transform(X_std_)

In [None]:
# Scatter of the first two columns of the 319-component projection,
# coloured by the true label.
marker_style = dict(
    size=8,
    color=Target,
    colorscale='Jet',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
    opacity=0.8,
)
trace = go.Scatter(
    x=X_319d[:, 0],
    y=X_319d[:, 1],
    # NOTE(review): str(Target) is the printed form of the whole label Series,
    # not a short trace name.
    name=str(Target),
    mode='markers',
    text=Target,
    showlegend=False,
    marker=marker_style,
)
data = [trace]

x_axis = dict(title='First principal direction', ticklen=5, zeroline=False)
y_axis = dict(title='Second principal direction', ticklen=5)
layout = go.Layout(
    title='PCA',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=True,
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='pca')

In [None]:
# Split the 319-component projection into 10 K-Means clusters.
# NOTE(review): no random_state is passed, so assignments can differ between runs.
kmeans = KMeans(n_clusters=10)
X_clustered319 = kmeans.fit_predict(X_319d)

In [None]:
# First two columns of the 319-component projection, coloured by the
# K-Means cluster id.
marker_style = dict(
    size=8,
    color=X_clustered319,
    colorscale='Portland',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
)
tracekmeans = go.Scatter(
    x=X_319d[:, 0],
    y=X_319d[:, 1],
    mode="markers",
    showlegend=False,
    marker=marker_style,
)

x_axis = dict(title='first principal direction', ticklen=5, zeroline=False, gridwidth=2)
y_axis = dict(title='second principal component', ticklen=5, gridwidth=2)
layout = go.Layout(
    title='Kmeans clustering',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=True,
)

data = [tracekmeans]
fig1 = dict(data=data, layout=layout)
py.iplot(fig1, filename="svm")

In [None]:
# Wrap the true labels and the 319-component cluster ids in integer
# DataFrames so they can be cross-tabulated.
targeted_df = pd.DataFrame(Target, dtype=int)
x_clusters_df = pd.DataFrame(X_clustered319, columns=['Cluster'], dtype=int)

In [None]:
# Contingency table: rows are the true labels, columns the K-Means cluster ids.
pd.crosstab(targeted_df.label, x_clusters_df.Cluster)

In [None]:
# Homogeneity of the 319-component clustering against the true labels.
homogeneity_score(Target, X_clustered319)

In [None]:
# Silhouette score of the clustering, computed on the 319-component projection itself.
metrics.silhouette_score(X_319d, X_clustered319)

In [None]:
# Completeness of the 319-component clustering against the true labels.
metrics.completeness_score(Target, X_clustered319)

In [None]:
# Standardise the pixel frame and project it onto 784 components.
Target = target
X_ = training_df
X_std_ = StandardScaler().fit_transform(X_)
pca_ = PCA(n_components=784)
X_784d = pca_.fit_transform(X_std_)

In [None]:
# Scatter of the first two columns of the 784-component projection,
# coloured by the true label.
marker_style = dict(
    size=8,
    color=Target,
    colorscale='Jet',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
    opacity=0.8,
)
trace = go.Scatter(
    x=X_784d[:, 0],
    y=X_784d[:, 1],
    # NOTE(review): str(Target) is the printed form of the whole label Series,
    # not a short trace name.
    name=str(Target),
    mode='markers',
    text=Target,
    showlegend=False,
    marker=marker_style,
)
data = [trace]

x_axis = dict(title='First principal direction', ticklen=5, zeroline=False)
y_axis = dict(title='Second principal direction', ticklen=5)
layout = go.Layout(
    title='PCA',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=True,
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='pca')

In [None]:
# Split the 784-component projection into 10 K-Means clusters.
# NOTE(review): no random_state is passed, so assignments can differ between runs.
kmeans = KMeans(n_clusters=10)
X_clustered784 = kmeans.fit_predict(X_784d)

In [None]:
# First two columns of the 784-component projection, coloured by the
# K-Means cluster id.
marker_style = dict(
    size=8,
    color=X_clustered784,
    colorscale='Portland',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
)
tracekmeans = go.Scatter(
    x=X_784d[:, 0],
    y=X_784d[:, 1],
    mode="markers",
    showlegend=False,
    marker=marker_style,
)

x_axis = dict(title='first principal direction', ticklen=5, zeroline=False, gridwidth=2)
y_axis = dict(title='second principal component', ticklen=5, gridwidth=2)
layout = go.Layout(
    title='Kmeans clustering',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=True,
)

data = [tracekmeans]
fig1 = dict(data=data, layout=layout)
py.iplot(fig1, filename="svm")

In [None]:
# Wrap the true labels and the 784-component cluster ids in integer
# DataFrames so they can be cross-tabulated.
targeted_df = pd.DataFrame(Target, dtype=int)
x_clusters_df = pd.DataFrame(X_clustered784, columns=['Cluster'], dtype=int)

In [None]:
# Contingency table: rows are the true labels, columns the K-Means cluster ids.
pd.crosstab(targeted_df.label, x_clusters_df.Cluster)

In [None]:
# Homogeneity of the 784-component clustering against the true labels.
homogeneity_score(Target, X_clustered784)

In [None]:
# Silhouette score of the clustering, computed on the 784-component projection itself.
metrics.silhouette_score(X_784d, X_clustered784)

In [None]:
# Completeness of the 784-component clustering against the true labels.
metrics.completeness_score(Target, X_clustered784)

In [None]:
# Redraws the 784-component cluster scatter from the same inputs as the
# earlier plotting cell (X_784d coloured by X_clustered784).
marker_style = dict(
    size=8,
    color=X_clustered784,
    colorscale='Portland',
    showscale=False,
    line=dict(width=2, color='rgb(255, 255, 255)'),
)
tracekmeans = go.Scatter(
    x=X_784d[:, 0],
    y=X_784d[:, 1],
    mode="markers",
    showlegend=False,
    marker=marker_style,
)

x_axis = dict(title='first principal direction', ticklen=5, zeroline=False, gridwidth=2)
y_axis = dict(title='second principal component', ticklen=5, gridwidth=2)
layout = go.Layout(
    title='Kmeans clustering',
    hovermode='closest',
    xaxis=x_axis,
    yaxis=y_axis,
    showlegend=True,
)

data = [tracekmeans]
fig1 = dict(data=data, layout=layout)
py.iplot(fig1, filename="svm")

In [None]:
# Baseline without PCA: run K-Means directly on the feature frame
# (no scaling, no projection).
# NOTE(review): no random_state is passed, so assignments can differ between runs.
kmeans = KMeans(n_clusters=10)
X_clustered = kmeans.fit_predict(training_df)

In [None]:
# Wrap the true labels and the raw-feature cluster ids in integer
# DataFrames so they can be cross-tabulated.
targeted_df = pd.DataFrame(Target, dtype=int)
x_clusters_df = pd.DataFrame(X_clustered, columns=['Cluster'], dtype=int)

In [None]:
# Contingency table: rows are the true labels, columns the K-Means cluster ids.
pd.crosstab(targeted_df.label, x_clusters_df.Cluster)

In [None]:
# Homogeneity of the raw-feature clustering against the true labels.
homogeneity_score(Target, X_clustered)

In [None]:
# Silhouette score of the raw-feature clustering. X_ was last bound to
# training_df, i.e. the same frame the clustering was fitted on.
metrics.silhouette_score(X_, X_clustered)

In [None]:
# Completeness of the raw-feature clustering against the true labels.
metrics.completeness_score(Target, X_clustered)

In [None]:
# Number of cluster assignments produced for the training frame.
len(X_clustered)

In [None]:
# Load the test CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
test_df = pd.read_csv("/Users/gurjy/Downloads/test.csv")

In [None]:
# Display the (rows, columns) shape of the test frame.
test_df.shape

In [None]:
# Rebind X to the test data as a plain NumPy array (it previously held the training values).
X = test_df.values

In [None]:
# K-Means cluster ids are arbitrary integers, so they cannot be exported as
# digit labels directly. Fit on the labelled training pixels, name each
# cluster after the digit that dominates it, then translate the test-set
# assignments into those digits.
kmeans = KMeans(10)
train_clusters = kmeans.fit_predict(training_df.values)

# Rows: cluster id, columns: true label -> most frequent label per cluster.
cluster_to_digit = pd.crosstab(train_clusters, target.values).idxmax(axis=1)

# X_clustered now holds one predicted label per test row instead of a raw
# cluster index; .loc looks each predicted cluster id up in the mapping.
X_clustered = cluster_to_digit.loc[kmeans.predict(X)].to_numpy()

In [None]:
# Number of entries produced for the test data (one per row of X).
len(X_clustered)

In [None]:
# 1-based ids, one per entry of X_clustered.
ID = list(range(1, len(X_clustered) + 1))

In [None]:
# Two-column output frame: the 1-based ids and the value assigned to each row.
# NOTE(review): confirm X_clustered holds digit labels rather than raw cluster indices before submitting.
df = pd.DataFrame({'ImageId' : ID, 'Label' : X_clustered})

In [None]:
# Write the output frame to CSV without the index column, then report the file name.
filename = 'MNIST Predictions 1.csv'
df.to_csv(filename, index=False)
print(f'Saved file: {filename}')