In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from scipy.io import arff

# Load the ARFF file; loadarff's return value is indexed below, with the
# record data taken from position 0.
data = arff.loadarff("datasets/jm1.arff")

# Build the frame and discard rows in which every column is missing
# (original author's note: this removes the empty line at the end of the file).
df = pd.DataFrame(data[0]).dropna(how="all")

# Preview the last few rows.
df.tail()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
10880,18.0,4.0,1.0,4.0,52.0,241.48,0.14,7.33,32.93,1770.86,...,13.0,0.0,2.0,0.0,10.0,15.0,30.0,22.0,7.0,b'false'
10881,9.0,2.0,1.0,2.0,30.0,129.66,0.12,8.25,15.72,1069.68,...,5.0,0.0,2.0,0.0,12.0,8.0,19.0,11.0,3.0,b'false'
10882,42.0,4.0,1.0,2.0,103.0,519.57,0.04,26.4,19.68,13716.72,...,29.0,1.0,10.0,0.0,18.0,15.0,59.0,44.0,7.0,b'false'
10883,10.0,1.0,1.0,1.0,36.0,147.15,0.12,8.44,17.44,1241.57,...,6.0,0.0,2.0,0.0,9.0,8.0,21.0,15.0,1.0,b'false'
10884,19.0,3.0,1.0,1.0,58.0,272.63,0.09,11.57,23.56,3154.67,...,13.0,0.0,2.0,1.0,12.0,14.0,31.0,27.0,5.0,b'false'


In [2]:
# Split the frame: every column except the last is a feature, the last
# column is the label.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

# Removing NaN from the dataset.
# NOTE: despite its name, `is_nan` is True for rows that contain NO NaN,
# i.e. it is the mask of rows to keep.
is_nan = (~np.isnan(X)).all(axis=1)
X, y = X[is_nan], y[is_nan]

# Standardize the features with scikit-learn's StandardScaler.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [3]:
# Sample covariance matrix of the standardized features:
#   cov = (X - mean)^T (X - mean) / (n_samples - 1)
mean_vec = np.mean(X_std, axis=0)

# Centre the data once and reuse it; the original expression built the same
# (n_samples x n_features) temporary twice.
centered = X_std - mean_vec
cov_mat = centered.T.dot(centered) / (X_std.shape[0] - 1)
print("Here are the covariance matrix:\n{}".format(cov_mat))
# Equivalent one-liner: np.cov(X_std.T)

Here are the covariance matrix:
[[ 1.00009192  0.81787012  0.51764654  0.78413234  0.88187137  0.90037275
  -0.28661835  0.6895928   0.49997217  0.75064658  0.90004446  0.75064658
   0.92199979  0.61292954  0.80364595  0.27814477  0.53673036  0.76469223
   0.88104782  0.86835442  0.83027038]
 [ 0.81787012  1.00009192  0.70173792  0.85966972  0.73087739  0.7599796
  -0.25284826  0.66915204  0.30309964  0.70960425  0.75973331  0.70960425
   0.80002227  0.3845919   0.53843027  0.20985717  0.64748981  0.70057843
   0.74423192  0.69614165  0.97204738]
 [ 0.51764654  0.70173792  1.00009192  0.63963125  0.46606646  0.44597172
  -0.23388371  0.43407605  0.21328659  0.31560104  0.4457623   0.31560104
   0.45467596  0.2943187   0.33828408  0.19097943  0.26302409  0.33706201
   0.47997957  0.43555821  0.78562538]
 [ 0.78413234  0.85966972  0.63963125  1.00009192  0.70247521  0.74326047
  -0.19771465  0.57540639  0.30974305  0.75779077  0.74307983  0.75779077
   0.77594361  0.35164084  0.54133353 

In [4]:
# Eigendecomposition of the covariance matrix.
# cov_mat is symmetric, so use `eigh` (the symmetric/Hermitian solver) rather
# than the general `eig`: it always returns real eigenvalues and orthonormal
# eigenvectors, whereas `eig` may return complex values or an arbitrary order
# when the matrix is close to singular.
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

# `eigh` returns eigenvalues in ascending order; flip both arrays so that the
# component with the largest variance comes first (column i of eig_vecs still
# belongs to eig_vals[i]). Eigenvector signs are arbitrary and may differ
# from those produced by `eig`.
eig_vals = eig_vals[::-1]
eig_vecs = eig_vecs[:, ::-1]

print("Here are the eigenvectors:\n{}".format(eig_vecs))
print("\nHere are the eigenvalues:\n{}".format(eig_vals))

Here are the eigenvectors:
[[ 2.53836537e-01 -3.57163180e-02  6.69727837e-03 -8.48310154e-02
  -3.38447324e-02 -1.66240008e-01  8.53509778e-02 -1.56778181e-01
  -1.22518798e-01  3.87278576e-01 -1.83870768e-01 -1.91367790e-01
  -6.83887347e-01 -3.94869778e-01  5.89252153e-02 -1.60625929e-02
  -5.68313526e-02  1.97915770e-02 -1.85049023e-03  3.19357690e-05
   1.56287765e-08]
 [ 2.31408216e-01 -3.21027213e-01 -1.43817348e-01  3.29475071e-02
  -4.41151941e-02  1.53174108e-01 -9.54195293e-02 -1.62664325e-01
   6.45062686e-03  2.77252718e-01 -1.13633778e-02 -3.29647646e-01
   3.31755659e-01 -7.88974422e-02  6.34961425e-02  3.88692293e-02
   6.71956766e-01 -3.96816477e-02  3.47753421e-03 -7.56489397e-04
   5.62473335e-08]
 [ 1.49636977e-01 -3.50760315e-01 -4.13147353e-01 -2.79890333e-01
  -3.64772953e-01  5.70327087e-03 -1.28405139e-01  1.39481984e-01
  -6.80547669e-02 -3.53126895e-01  4.42342932e-01  1.71741792e-01
  -2.77334105e-01 -1.64961332e-02  4.70533132e-03  2.57433237e-03
   6.919394

In [5]:
# Pair every eigenvalue magnitude with its eigenvector (column idx of eig_vecs).
eig_pairs = [(np.abs(val), eig_vecs[:, idx]) for idx, val in enumerate(eig_vals)]

# Sort by eigenvalue only, largest first. A plain `eig_pairs.sort()` compares
# whole tuples, so two equal eigenvalues would make Python compare the
# eigenvector arrays and raise "truth value of an array is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

print("Eigenvalues in descending order:")
for e in eig_pairs:
    print(e[0])

Eigenvalues in descending order:
13.579653175335618
1.6371977230739057
1.3096430990571084
0.9632330389403704
0.8299072979061671
0.7534020924655931
0.5874137750022079
0.4123335005963594
0.2735698728254324
0.2115016000627314
0.16094817253111762
0.12129591297834619
0.06076374688138924
0.03702038715322056
0.03563525663102484
0.015137792886306236
0.0087144866866817
0.003997989165318903
0.00029234853488995465
0.0002690557637177579
1.49180822186206e-14


In [6]:
# Explained variance: each eigenvalue as a percentage of the eigenvalue sum,
# taken from largest to smallest, followed by the running total.
tot = sum(eig_vals)

var_exp = []
for val in sorted(eig_vals, reverse=True):
    var_exp.append((val / tot) * 100)

cum_var_exp = np.cumsum(var_exp)
cum_var_exp

array([ 64.65907164,  72.45453472,  78.69035723,  83.27675964,
        87.22833593,  90.81563519,  93.61258655,  95.57589894,
        96.87849289,  97.88555079,  98.65190022,  99.22944672,
        99.5187713 ,  99.69504265,  99.86471875,  99.93679686,
        99.9782906 ,  99.99732689,  99.9987189 , 100.        ,
       100.        ])

In [7]:
# Constructing the eigenvector matrix W
# Number of leading components to keep: the 1-based position of the first
# cumulative explained-variance value above 90 (percent); stays 0 when no
# value exceeds the threshold.
count = next(
    (pos + 1 for pos, cum_pct in enumerate(cum_var_exp) if cum_pct > 90),
    0,
)

# Take the eigenvectors of the first `count` pairs and place them as the
# columns of W.
w = [vec for _, vec in eig_pairs[:count]]
matrix_w = np.array(w).T
print("Eigenvector matrix W:\n{}".format(matrix_w))

Eigenvector matrix W:
[[ 2.53836537e-01 -3.57163180e-02  6.69727837e-03 -8.48310154e-02
  -3.38447324e-02 -1.66240008e-01]
 [ 2.31408216e-01 -3.21027213e-01 -1.43817348e-01  3.29475071e-02
  -4.41151941e-02  1.53174108e-01]
 [ 1.49636977e-01 -3.50760315e-01 -4.13147353e-01 -2.79890333e-01
  -3.64772953e-01  5.70327087e-03]
 [ 2.21308249e-01 -3.31651834e-01 -1.92027412e-02 -5.34155757e-02
  -6.02994402e-02  8.28755309e-02]
 [ 2.61677172e-01  1.28018806e-01  4.05702713e-02 -1.18813131e-02
  -7.19727072e-02 -2.05924783e-02]
 [ 2.63913259e-01  6.77496298e-02  1.23163725e-01 -6.22260921e-03
  -4.07483813e-02 -8.98866848e-03]
 [-7.55499866e-02 -4.81252389e-02  5.88235574e-01 -3.71558440e-01
  -2.43060046e-01  4.39642270e-01]
 [ 2.20537515e-01  4.58766019e-02 -1.50144114e-01  1.89740374e-01
   2.18005395e-01 -2.06735952e-02]
 [ 1.58185147e-01  4.66163147e-01 -8.89433799e-02 -2.48601081e-02
  -3.97535123e-01  2.52402265e-01]
 [ 2.16977598e-01 -2.39319060e-01  3.63590342e-01  5.21277325e-02
   

In [8]:
# Project the standardized samples onto the retained eigenvectors:
# Y = X_std . W, one row per sample and one column per kept component.
Y = np.dot(X_std, matrix_w)
print("Matrix Y:\n{}".format(Y))
Y.shape

Matrix Y:
[[-1.73106453 -0.47299188  4.67591852 -3.6935559  -1.11559882  2.84075949]
 [-1.841925   -0.49541227  3.675028   -2.60728826 -0.93741823  1.91903637]
 [ 1.35545096  1.07705061 -0.37871597  0.18822207  0.23380654 -0.06106473]
 ...
 [ 0.051096    0.31215086 -0.28892175  0.87319329  0.64488597 -0.26318993]
 [-1.30116747 -0.00902139  0.20177378  0.23926051  0.21924915 -0.15574914]
 [-0.82669081  0.1706605  -0.13763329  0.14369627  0.56101096  0.15765472]]


(10880, 6)