In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import zscore

In [18]:
data=pd.read_csv("auto-mpg.csv")

In [19]:
data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [25]:
data=data.drop('car name',axis=1)

In [26]:
#hp
hpisdigit=pd.DataFrame(data.horsepower.str.isdigit())
data[hpisdigit['horsepower']==False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin


In [27]:
data=data.replace('?',np.nan)
data[hpisdigit['horsepower']==False]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin


In [28]:
medianfiller=lambda x:x.fillna(x.median())
data=data.apply(medianfiller,axis=0)
data['horsepower']=data['horsepower'].astype('float64')

In [30]:
x=data.drop(['mpg'],axis=1)
y=data[['mpg']]

In [31]:
s=StandardScaler()
x_std=s.fit_transform(x)

In [32]:
cov_matrix=np.cov(x_std.T)

In [33]:
print(cov_matrix)

[[ 1.00251889  0.95311615  0.84340357  0.89827376 -0.50669259 -0.34962425
  -0.56396033]
 [ 0.95311615  1.00251889  0.89803457  0.93517383 -0.54505356 -0.37109656
  -0.61094444]
 [ 0.84340357  0.89803457  1.00251889  0.86461476 -0.68831918 -0.41477495
  -0.45323458]
 [ 0.89827376  0.93517383  0.86461476  1.00251889 -0.41850885 -0.30733654
  -0.58248745]
 [-0.50669259 -0.54505356 -0.68831918 -0.41850885  1.00251889  0.28886274
   0.20639158]
 [-0.34962425 -0.37109656 -0.41477495 -0.30733654  0.28886274  1.00251889
   0.18111726]
 [-0.56396033 -0.61094444 -0.45323458 -0.58248745  0.20639158  0.18111726
   1.00251889]]


In [34]:
eigenvalues,eigenvectors=np.linalg.eig(cov_matrix)

In [35]:
print("Eigen vectors:",eigenvectors)

Eigen vectors: [[-0.43855264 -0.1144845   0.02689113  0.2430079   0.69425676 -0.45266412
  -0.21884163]
 [-0.45386128 -0.10580212  0.02465943  0.16082161  0.17706048  0.84999716
  -0.0499858 ]
 [-0.43755724  0.14144964  0.17639693  0.11774532 -0.5934737  -0.15543464
  -0.6059578 ]
 [-0.43219091 -0.20336634 -0.00434321  0.33489636 -0.33523145 -0.20773606
   0.71003121]
 [ 0.29772584 -0.48640295 -0.54872031  0.53485615 -0.12655828  0.02384225
  -0.26649758]
 [ 0.21488225 -0.63315133  0.73994607  0.02603235 -0.02044359  0.00996896
  -0.06503704]
 [ 0.29769303  0.52617954  0.34484761  0.70874202  0.07504026  0.06795576
   0.05504373]]


In [36]:
print("Eigen values:",eigenvalues)

Eigen values: [4.61375285 0.94263114 0.75056808 0.48239402 0.13258963 0.0336822
 0.06201432]


In [37]:
eig_pairs=[(eigenvalues[index],eigenvectors[:,index]) for index in range(len(eigenvalues))]

In [38]:
#sort
eig_pairs.sort()

In [39]:
print(eig_pairs)

[(0.03368220110313546, array([-0.45266412,  0.84999716, -0.15543464, -0.20773606,  0.02384225,
        0.00996896,  0.06795576])), (0.06201432270087935, array([-0.21884163, -0.0499858 , -0.6059578 ,  0.71003121, -0.26649758,
       -0.06503704,  0.05504373])), (0.1325896339928071, array([ 0.69425676,  0.17706048, -0.5934737 , -0.33523145, -0.12655828,
       -0.02044359,  0.07504026])), (0.4823940178801031, array([0.2430079 , 0.16082161, 0.11774532, 0.33489636, 0.53485615,
       0.02603235, 0.70874202])), (0.7505680761984068, array([ 0.02689113,  0.02465943,  0.17639693, -0.00434321, -0.54872031,
        0.73994607,  0.34484761])), (0.9426311428862403, array([-0.1144845 , -0.10580212,  0.14144964, -0.20336634, -0.48640295,
       -0.63315133,  0.52617954])), (4.613752847052039, array([-0.43855264, -0.45386128, -0.43755724, -0.43219091,  0.29772584,
        0.21488225,  0.29769303]))]


In [40]:
eig_pairs.reverse()

In [41]:
print(eig_pairs)

[(4.613752847052039, array([-0.43855264, -0.45386128, -0.43755724, -0.43219091,  0.29772584,
        0.21488225,  0.29769303])), (0.9426311428862403, array([-0.1144845 , -0.10580212,  0.14144964, -0.20336634, -0.48640295,
       -0.63315133,  0.52617954])), (0.7505680761984068, array([ 0.02689113,  0.02465943,  0.17639693, -0.00434321, -0.54872031,
        0.73994607,  0.34484761])), (0.4823940178801031, array([0.2430079 , 0.16082161, 0.11774532, 0.33489636, 0.53485615,
       0.02603235, 0.70874202])), (0.1325896339928071, array([ 0.69425676,  0.17706048, -0.5934737 , -0.33523145, -0.12655828,
       -0.02044359,  0.07504026])), (0.06201432270087935, array([-0.21884163, -0.0499858 , -0.6059578 ,  0.71003121, -0.26649758,
       -0.06503704,  0.05504373])), (0.03368220110313546, array([-0.45266412,  0.84999716, -0.15543464, -0.20773606,  0.02384225,
        0.00996896,  0.06795576]))]


In [42]:
eigenvalues_sorted=[eig_pairs[index][0] for index in range(len(eigenvalues))]

In [43]:
eigenvectors_sorted=[eig_pairs[index][1] for index in range(len(eigenvalues))]

In [44]:
print("Eigenvalues in descending order:",eigenvalues)

Eigenvalues in descending order: [4.61375285 0.94263114 0.75056808 0.48239402 0.13258963 0.0336822
 0.06201432]


In [45]:
p_reduce=np.array(eigenvectors_sorted[0:7])

In [46]:
x_std_4_dim=np.dot(x_std,p_reduce.T)

In [47]:
prj_data=pd.DataFrame(x_std_4_dim)

In [None]:
x_train,x_test,y_train,y_test=train_test_split(prj_data,y,test_size=0.30,random_state=1)