In [None]:
import pandas as pd

# Location of the raw Iris measurements in the UCI repository. The file has no
# header row, so the column names are attached after loading.
iris_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_columns = ['sepal_len', 'sepal_wid', 'petal_len', 'petal_wid', 'class']

df = pd.read_csv(iris_url, header=None, sep=',')
df.columns = iris_columns
# Discard rows in which every field is missing (e.g. an empty line at file-end).
df = df.dropna(how="all")
# Preview the first 5 rows
df.head(5)

In [14]:
# Separate the table into the feature matrix X (first four columns) and the
# class-label vector y (fifth column), both as plain arrays.
feature_frame, label_series = df.iloc[:, :4], df.iloc[:, 4]
X = feature_frame.values
y = label_series.values

In [None]:
import plotly.offline as py
# plotting histograms: one overlaid histogram per feature column, coloured by class
data = []
# Legend entries are switched on only for the last feature's traces, so each
# class name is listed once instead of once per subplot.
legend = {0: False, 1: False, 2: False, 3: True}
colors = {
    'Iris-setosa': '#0D76BF',
    'Iris-versicolor': '#00cc96',
    'Iris-virginica': '#EF553B',
}
for col in range(4):
    # Each feature column is drawn against its own x-axis: x1 ... x4.
    axis_id = 'x%s' % (col + 1)
    for key, hex_color in colors.items():
        in_class = y == key
        trace = {
            'type': 'histogram',
            'x': list(X[in_class, col]),
            'opacity': 0.75,
            'xaxis': axis_id,
            'marker': {'color': hex_color},
            'name': key,
            'showlegend': legend[col],
        }
        data.append(trace)

# The four x-axes are laid out side by side; all of them share the single y-axis.
layout = {
    'barmode': 'overlay',
    'xaxis': {'domain': [0, 0.25], 'title': 'sepal length (cm)'},
    'xaxis2': {'domain': [0.3, 0.5], 'title': 'sepal width (cm)'},
    'xaxis3': {'domain': [0.55, 0.75], 'title': 'petal length (cm)'},
    'xaxis4': {'domain': [0.8, 1], 'title': 'petal width (cm)'},
    'yaxis': {'title': 'count'},
    'title': 'Distribution of the different Iris flower features',
}
fig = {'data': data, 'layout': layout}
py.iplot(fig, filename='exploratory-vis-histogram')

In [None]:
import plotly.express as px
# Keep plotly's bundled Iris frame under its own name. Its schema differs from
# the UCI frame held in `df` (e.g. "sepal_length"/"species" here versus
# "sepal_len"/"class" there), so rebinding `df` would make any re-run of the
# cell that derives X and y from `df` silently read a different table.
iris_px = px.data.iris()
display(iris_px)
# Pairwise scatter plots of the four measurements, coloured by species.
fig = px.scatter_matrix(
    iris_px,
    dimensions=["sepal_width", "sepal_length", "petal_width", "petal_length"],
    color="species",
)
fig.show()

In [19]:
from sklearn.preprocessing import StandardScaler
# Fit a scaler on the feature matrix, then apply it to the same data to obtain
# the standardized features used by the following PCA steps.
scaler = StandardScaler()
X_std = scaler.fit(X).transform(X)

In [None]:
import numpy as np
# Covariance matrix computed by hand: centre the data on the per-feature mean,
# then form (X - mean)^T (X - mean) / (n - 1).
mean_vec = np.mean(X_std, axis=0)
centered = X_std - mean_vec
n_samples = X_std.shape[0]
cov_mat = centered.T.dot(centered) / (n_samples - 1)
print('Covariance matrix \n%s' % (cov_mat,))

In [None]:
# The step-by-step computation above was for demonstration purposes only;
# NumPy's cov function gives the equivalent result directly. rowvar=False
# tells it that the variables are in the columns of X_std.
print('NumPy covariance matrix: \n%s' % (np.cov(X_std, rowvar=False),))

In [None]:
# Eigendecomposition of the covariance matrix of the standardized features
# (variables are in the columns of X_std, hence rowvar=False).
cov_mat = np.cov(X_std, rowvar=False)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print('Eigenvectors \n%s' % (eig_vecs,))
print('\nEigenvalues \n%s' % (eig_vals,))

In [None]:
# Make a list of (eigenvalue magnitude, eigenvector) tuples; the eigenvector
# belonging to eig_vals[i] is column i of eig_vecs.
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
# Sort the tuples from high to low eigenvalue. The key restricts the comparison
# to the eigenvalue: a plain tuple sort falls back to comparing the eigenvector
# arrays whenever two eigenvalues are equal, which raises ValueError.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
# Visually confirm that the list is correctly sorted by decreasing
# eigenvalues
print('Eigenvalues in descending order:')
for pair in eig_pairs:
    print(pair[0])

In [None]:
import matplotlib.pyplot as plt
# Share of the total variance carried by each principal component (largest
# first), together with the running total.
tot = sum(np.abs(eig_vals))
var_exp = [magnitude / tot for magnitude in sorted(np.abs(eig_vals), reverse=True)]
cum_var_exp = np.cumsum(var_exp)

# Components are numbered from 1 on the x-axis.
component_ids = range(1, eig_vals.size + 1)
plt.bar(component_ids, var_exp, alpha=0.5, align='center', label='Individual')
plt.step(component_ids, cum_var_exp, where='mid', label='Cumulative')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
# Save before show(), while the current figure still holds the plot.
plt.savefig('./fig-pca-var-exp.png', dpi=300)
plt.show()

In [None]:
# Projection matrix W: the eigenvectors stored in the first two entries of
# eig_pairs (the two largest eigenvalues once eig_pairs is sorted in descending
# order), placed side by side as columns. column_stack takes the vector length
# from the data, so nothing here is tied to a four-feature data set.
matrix_w = np.column_stack((eig_pairs[0][1], eig_pairs[1][1]))
print('Matrix W:\n', matrix_w)

In [36]:
# Project the standardized samples onto the columns of the projection matrix W.
Y = X_std @ matrix_w

In [None]:
# Plot the result: the projected samples, one colour/marker pair per class.
colors = ['r', 'b', 'g']
markers = ['1', '2', '3']
for class_label, color, marker in zip(np.unique(y), colors, markers):
    # Rows of Y that belong to the current class.
    in_class = y == class_label
    plt.scatter(
        Y[in_class, 0],
        Y[in_class, 1],
        c=color,
        label=class_label,
        marker=marker,
    )
plt.title('Y')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend(loc='lower left')
plt.tight_layout()
# Save before show(), while the current figure still holds the plot.
plt.savefig('./fig-pca-z.png', dpi=300)
plt.show()