In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as sklearnPCA
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('.\electric_motor.csv')

In [3]:
print(data.head()) #Data information 

         Brand  AccelSec  TopSpeedKmH  RangeKm  EfficiencyWhKm  FastChargeKmH  \
0       Tesla        4.6          233      450             161          940.0   
1  Volkswagen       10.0          160      270             167          250.0   
2    Polestar        4.7          210      400             181          620.0   
3         BMW        6.8          180      360             206          560.0   
4       Honda        9.5          145      170             168          190.0   

   Seats  PriceEuro  RapidCharge  
0      5      55480            1  
1      5      30000            1  
2      5      56440            1  
3      5      68040            1  
4      4      32997            1  


In [4]:
print(data.info)

<bound method DataFrame.info of            Brand  AccelSec  TopSpeedKmH  RangeKm  EfficiencyWhKm  \
0         Tesla        4.6          233      450             161   
1    Volkswagen       10.0          160      270             167   
2      Polestar        4.7          210      400             181   
3           BMW        6.8          180      360             206   
4         Honda        9.5          145      170             168   
..           ...       ...          ...      ...             ...   
98       Nissan        7.5          160      330             191   
99         Audi        4.5          210      335             258   
100      Nissan        5.9          200      325             194   
101      Nissan        5.1          200      375             232   
102       Byton        7.5          190      400             238   

     FastChargeKmH  Seats  PriceEuro  RapidCharge  
0            940.0      5      55480            1  
1            250.0      5      30000           

In [6]:
print(data.describe())

         AccelSec  TopSpeedKmH     RangeKm  EfficiencyWhKm  FastChargeKmH  \
count  103.000000   103.000000  103.000000      103.000000      98.000000   
mean     7.396117   179.194175  338.786408      189.165049     456.734694   
std      3.017430    43.573030  126.014444       29.566839     201.262897   
min      2.100000   123.000000   95.000000      104.000000     170.000000   
25%      5.100000   150.000000  250.000000      168.000000     275.000000   
50%      7.300000   160.000000  340.000000      180.000000     440.000000   
75%      9.000000   200.000000  400.000000      203.000000     560.000000   
max     22.400000   410.000000  970.000000      273.000000     940.000000   

            Seats      PriceEuro  RapidCharge  
count  103.000000     103.000000   103.000000  
mean     4.883495   55811.563107     1.048544  
std      0.795834   34134.665280     0.215963  
min      2.000000   20129.000000     1.000000  
25%      5.000000   34429.500000     1.000000  
50%      5.000000 

In [10]:
data.columns

Index(['Brand', 'AccelSec', 'TopSpeedKmH', 'RangeKm', 'EfficiencyWhKm',
       'FastChargeKmH', 'Seats', 'PriceEuro', 'RapidCharge'],
      dtype='object')

In [12]:
electric_column_data_list = list(data.columns)
print(electric_column_data_list)
print('\n')

['Brand', 'AccelSec', 'TopSpeedKmH', 'RangeKm', 'EfficiencyWhKm', 'FastChargeKmH', 'Seats', 'PriceEuro', 'RapidCharge']




In [14]:
# Finding null columns
data.isnull().any()

Brand             False
AccelSec          False
TopSpeedKmH       False
RangeKm           False
EfficiencyWhKm    False
FastChargeKmH      True
Seats             False
PriceEuro         False
RapidCharge       False
dtype: bool

In [29]:
print(data['FastChargeKmH'].mean())
#Filling mean value to the missing value 
data['FastChargeKmH'].fillna(data['FastChargeKmH'].mean(),inplace=True)

456.734693877551


In [30]:
# Is any null value present in column
data['FastChargeKmH'].isnull().any()

False

In [31]:
data['FastChargeKmH'].tail(25)

78     360.000000
79     810.000000
80     470.000000
81     480.000000
82     456.734694
83     380.000000
84     290.000000
85     330.000000
86     740.000000
87     470.000000
88     540.000000
89     440.000000
90     510.000000
91     456.734694
92     320.000000
93     500.000000
94     330.000000
95     470.000000
96     220.000000
97     420.000000
98     440.000000
99     540.000000
100    440.000000
101    450.000000
102    480.000000
Name: FastChargeKmH, dtype: float64

In [35]:
data=data.drop(['Brand'],axis=1)

In [36]:
data.columns

Index(['AccelSec', 'TopSpeedKmH', 'RangeKm', 'EfficiencyWhKm', 'FastChargeKmH',
       'Seats', 'PriceEuro', 'RapidCharge'],
      dtype='object')

In [37]:
data.columns.isnull().any()

False

In [40]:
input_column_list = list(set(electric_column_data_list)-set(['Brand']))
input_column_list

['Seats',
 'AccelSec',
 'FastChargeKmH',
 'RapidCharge',
 'PriceEuro',
 'TopSpeedKmH',
 'RangeKm',
 'EfficiencyWhKm']

In [41]:
output_list = list(['RapidCharge'])
print(output_list)

['RapidCharge']


In [44]:
scaler = StandardScaler()
data[input_column_list]=scaler.fit_transform(data[input_column_list])
print(round(data,2))

     AccelSec  TopSpeedKmH  RangeKm  EfficiencyWhKm  FastChargeKmH  Seats  \
0       -0.93         1.24     0.89           -0.96           2.47   0.15   
1        0.87        -0.44    -0.55           -0.75          -1.06   0.15   
2       -0.90         0.71     0.49           -0.28           0.84   0.15   
3       -0.20         0.02     0.17            0.57           0.53   0.15   
4        0.70        -0.79    -1.35           -0.72          -1.37  -1.12   
..        ...          ...      ...             ...            ...    ...   
98       0.03        -0.44    -0.07            0.06          -0.09   0.15   
99      -0.96         0.71    -0.03            2.34           0.43   0.15   
100     -0.50         0.48    -0.11            0.16          -0.09   0.15   
101     -0.76         0.48     0.29            1.46          -0.03   0.15   
102      0.03         0.25     0.49            1.66           0.12   0.15   

     PriceEuro  RapidCharge  
0        -0.01        -0.23  
1        -0.76 

In [48]:
# Computing covariance matrix
input_data = data[input_column_list]
covariance_matrix = input_data.cov()
print(round(covariance_matrix,2))

                Seats  AccelSec  FastChargeKmH  RapidCharge  PriceEuro  \
Seats            1.01     -0.18           0.08        -0.43       0.02   
AccelSec        -0.18      1.01          -0.62         0.52      -0.63   
FastChargeKmH    0.08     -0.62           1.01         0.00       0.64   
RapidCharge     -0.43      0.52           0.00         1.01      -0.20   
PriceEuro        0.02     -0.63           0.64        -0.20       1.01   
TopSpeedKmH      0.13     -0.79           0.75        -0.25       0.84   
RangeKm          0.30     -0.68           0.64        -0.41       0.68   
EfficiencyWhKm   0.30     -0.39           0.31        -0.11       0.40   

                TopSpeedKmH  RangeKm  EfficiencyWhKm  
Seats                  0.13     0.30            0.30  
AccelSec              -0.79    -0.68           -0.39  
FastChargeKmH          0.75     0.64            0.31  
RapidCharge           -0.25    -0.41           -0.11  
PriceEuro              0.84     0.68            0.40  
Top

In [53]:
# Computing eigen value and eigenvector
eig_val,eig_vec = np.linalg.eig(covariance_matrix.to_numpy())
len(eig_val)
eig_pairs = [(np.abs(eig_val[i]),eig_vec[:,i])for i in range(len(eig_val))]

In [55]:
# Absolute Value
eig_pairs.sort(key = lambda x: x[0],reverse=True)
print('Eigen value in descending order\n')
for i in eig_pairs:
    print(i[0])


Eigen value in descending order

4.235923439548401
1.4196853662162119
0.913583389364097
0.6069057972628527
0.3639516500978206
0.25971535644721644
0.1819772391411559
0.0966891344712704


In [56]:
# Setting Threshold value is 95%
threshold = 0.95

In [57]:
# Computing number of PC's required to capture specific variance
print('Explain Variance in Percentage \n')
total_variance = 0.0
count = 0
eig_sum = np.sum(eig_val)

Explain Variance in Percentage 



In [59]:
for i,j in enumerate (eig_pairs):
    variance_explained = (j[0]/eig_sum).real
    print('eigenvalue {}: {}'.format(i+1, (j[0]/eig_sum).real*100))
    total_variance = total_variance+variance_explained
    count = count+1
    if(total_variance>threshold):
        break
print(total_variance)

eigenvalue 1: 52.43497461576899
eigenvalue 2: 17.57377516432688
eigenvalue 3: 11.308920596497307
eigenvalue 4: 7.512668849612976
eigenvalue 5: 4.505226736647776
eigenvalue 6: 3.214923101652434
0.9655048906450636


In [65]:
len(eig_vec)
count


6

In [67]:
# Select required Pc's based on count projection matrix w=d*k
reduced_dimension = np.zeros((len(eig_vec),count))
for i   in range(count):
    reduced_dimension[:,i] = eig_pairs[i][1]

In [69]:
# Projecting the scale data into reduced space
projected_data = data[input_column_list].to_numpy().dot(reduced_dimension)
projected_dataframe = pd.DataFrame(projected_data, columns=['PC1','PC2'])
projected_dataframe_with_class_info = pd.concat([projected_dataframe, data['RapidChargeKmH']],axis=1)

ValueError: Shape of passed values is (103, 6), indices imply (103, 2)