## Hamoye ML Project Electric Cars - Team Prophet v3

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Raw electric-car dataset.
# NOTE(review): absolute local Windows path — the notebook only runs on a machine with this exact layout.
csv_path = 'D:\\People\\Amit\\Hamoye\\project\\electric_cars_with-outlier.csv'
df = pd.read_csv(csv_path)

In [3]:
# Number of missing values in each column.
df.isna().sum()

vehicle_name         0
model                0
battery              0
seats                0
acceleration         0
top_speed            0
distance             0
efficiency           0
fast_charge          5
price_pounds        45
price_germany       13
price_netherland    25
dtype: int64

In [4]:
# Five rows have no fast_charge value (see the null counts above); those gaps
# are stored as NaN and are set to zero here.
df['fast_charge'] = df['fast_charge'].fillna(0)

In [5]:
# Column dtypes after the fast_charge fill.
df.dtypes

vehicle_name         object
model                object
battery             float64
seats                 int64
acceleration        float64
top_speed             int64
distance              int64
efficiency            int64
fast_charge         float64
price_pounds        float64
price_germany       float64
price_netherland    float64
dtype: object

In [6]:
# Sanity check: fast_charge should have no missing values left.
df['fast_charge'].isna().sum()

0

In [25]:
## Filling null values in price_germany using price_pounds, falling back to price_netherland
# Vectorised, and written back through a single column assignment. The earlier
# row-by-row `df['price_germany'][i] = ...` is chained assignment: pandas warns
# about it (SettingWithCopyWarning) and, with copy-on-write enabled, it does not
# update `df` at all. It also rebuilt `df.isna()` on every iteration.
pounds = df['price_pounds']
netherland = df['price_netherland']

# A source price is usable only when it is present and non-zero.
# NOTE(review): dividing by 1.17 yields a value below the pound price, whereas
# rows carrying both prices in the df.head() output have price_germany above
# price_pounds — confirm the conversion direction.
from_pounds = (pounds / 1.17).round().where(pounds.notna() & (pounds != 0))
# price_netherland is carried over unconverted, only rounded.
from_netherland = netherland.round().where(netherland.notna() & (netherland != 0))

# Existing price_germany values are kept; gaps take the pound-based estimate
# first, then the Netherlands price; rows with neither source stay null.
df['price_germany'] = df['price_germany'].fillna(from_pounds.fillna(from_netherland))

df.isnull().sum()

vehicle_name          0
model                 0
battery               0
seats                 0
acceleration          0
top_speed             0
distance              0
efficiency            0
fast_charge           0
price_pounds         45
price_germany         0
price_netherland     25
price_usd             0
vehicle_name_code     0
dtype: int64

In [8]:
# Column dtypes after the price_germany fill.
df.dtypes

vehicle_name         object
model                object
battery             float64
seats                 int64
acceleration        float64
top_speed             int64
distance              int64
efficiency            int64
fast_charge         float64
price_pounds        float64
price_germany       float64
price_netherland    float64
dtype: object

In [9]:
#### Derive the USD price column from price_germany
# NOTE(review): confirm that 1.18 is the intended rate and that dividing
# (rather than multiplying) is the right direction for this conversion.
usd_estimate = df['price_germany'] / 1.18
df['price_usd'] = usd_estimate.round()

In [10]:
# Preview the first rows, now including price_usd.
df.head()

Unnamed: 0,vehicle_name,model,battery,seats,acceleration,top_speed,distance,efficiency,fast_charge,price_pounds,price_germany,price_netherland,price_usd
0,Tesla,Model 3 Standard Range Plus LFP,52.5,5,5.6,225,350,150,560.0,40990.0,43560.0,49990.0,36915.0
1,Mercedes,EQS 580 4MATIC,107.8,5,4.3,210,610,177,800.0,115000.0,135529.0,140000.0,114855.0
2,Tesla,Model Y Long Range Dual Motor,70.0,7,5.0,217,410,171,520.0,54000.0,59965.0,65010.0,50818.0
3,Kia,EV6 GT,77.4,5,3.5,260,395,196,920.0,58295.0,65990.0,63595.0,55924.0
4,Tesla,Model 3 Long Range Dual Motor,70.0,5,4.4,233,455,154,570.0,48490.0,41444.0,,35122.0


In [11]:
## Encode the brand name (vehicle_name) as an integer category code
brand_categories = df["vehicle_name"].astype('category')
df["vehicle_name_code"] = brand_categories.cat.codes

In [12]:
# Preview the first rows, now including vehicle_name_code.
df.head()

Unnamed: 0,vehicle_name,model,battery,seats,acceleration,top_speed,distance,efficiency,fast_charge,price_pounds,price_germany,price_netherland,price_usd,vehicle_name_code
0,Tesla,Model 3 Standard Range Plus LFP,52.5,5,5.6,225,350,150,560.0,40990.0,43560.0,49990.0,36915.0,33
1,Mercedes,EQS 580 4MATIC,107.8,5,4.3,210,610,177,800.0,115000.0,135529.0,140000.0,114855.0,20
2,Tesla,Model Y Long Range Dual Motor,70.0,7,5.0,217,410,171,520.0,54000.0,59965.0,65010.0,50818.0,33
3,Kia,EV6 GT,77.4,5,3.5,260,395,196,920.0,58295.0,65990.0,63595.0,55924.0,14
4,Tesla,Model 3 Long Range Dual Motor,70.0,5,4.4,233,455,154,570.0,48490.0,41444.0,,35122.0,33


## Features and Target

In [13]:
# Model inputs: the numeric vehicle specifications.
feature_columns = [
    'battery',
    'seats',
    'acceleration',
    'top_speed',
    'distance',
    'efficiency',
    'fast_charge',
]
features = df[feature_columns]
# Regression target: the derived USD price.
y = df['price_usd']

## Normalizing the data using MinMax Scaler

In [14]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every feature column and rebuild a DataFrame with the original
# column names (fit_transform itself returns a bare array).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test-set minima/maxima inform the scaling — consider fitting on the
# training rows only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(features)
x = pd.DataFrame(scaled_values, columns=features.columns)

## Train Test Split


In [15]:
## Train Test split
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.3, random_state=101)
x_train, x_test, y_train, y_test = split_parts

## Linear Regression


In [16]:
# Import Linear Regression Model
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and training are chained
linear_model = LinearRegression().fit(x_train, y_train)
# predictions for the held-out rows
pred_1 = linear_model.predict(x_test)

In [17]:
# Import Metrics for evaluation
from sklearn import metrics

In [18]:
# Evaluation of the linear-regression predictions on the held-out rows.
# MSE is computed once and reused for the RMSE line.
mse_1 = metrics.mean_squared_error(y_test, pred_1)
linear_scores = {
    'Mean Absolute Error (MAE)': metrics.mean_absolute_error(y_test, pred_1),
    'Mean Square Error (MSE)': mse_1,
    'Root Mean Square Error (RMSE)': np.sqrt(mse_1),
    'R2 score': metrics.r2_score(y_test, pred_1),
    'Explained Variance score': metrics.explained_variance_score(y_test, pred_1),
}
for label, score in linear_scores.items():
    print(label, score)

Mean Absolute Error (MAE) 15364.976071038482
Mean Square Error (MSE) 533655163.1166564
Root Mean Square Error (RMSE) 23100.97753595411
R2 score 0.48044712829934066
Explained Variance score 0.48795903007103747


## Random Forest

In [19]:
#Import Random Forest
from sklearn.ensemble import RandomForestRegressor
# A forest is built from random bootstrap samples and feature subsets, so
# without a seed every run produces a different model and different metrics.
# The seed matches the one used for the train/test split to keep the notebook
# reproducible under Restart & Run All.
rfc = RandomForestRegressor(n_estimators=300, random_state=101)
rfc.fit(x_train,y_train)
# predictions for the held-out rows
pred_2 = rfc.predict(x_test)

In [20]:
# Evaluation of the random-forest predictions on the held-out rows.
# MSE is computed once and reused for the RMSE line.
mse_2 = metrics.mean_squared_error(y_test, pred_2)
forest_scores = {
    'Mean Absolute Error (MAE)': metrics.mean_absolute_error(y_test, pred_2),
    'Mean Square Error (MSE)': mse_2,
    'Root Mean Square Error (RMSE)': np.sqrt(mse_2),
    'R2 score': metrics.r2_score(y_test, pred_2),
    'Explained Variance score': metrics.explained_variance_score(y_test, pred_2),
}
for label, score in forest_scores.items():
    print(label, score)

Mean Absolute Error (MAE) 9753.805686940837
Mean Square Error (MSE) 271542808.2819617
Root Mean Square Error (RMSE) 16478.55601325437
R2 score 0.735632941301245
Explained Variance score 0.7402825984290665


In [21]:
# First rows of the unscaled feature frame, for comparison with the scaled `x`.
features.head()

Unnamed: 0,battery,seats,acceleration,top_speed,distance,efficiency,fast_charge
0,52.5,5,5.6,225,350,150,560.0
1,107.8,5,4.3,210,610,177,800.0
2,70.0,7,5.0,217,410,171,520.0
3,77.4,5,3.5,260,395,196,920.0
4,70.0,5,4.4,233,455,154,570.0


In [22]:
# First rows of the min-max scaled features.
x.head()

Unnamed: 0,battery,seats,acceleration,top_speed,distance,efficiency,fast_charge
0,0.195308,0.428571,0.172414,0.355401,0.291429,0.259887,0.397163
1,0.496999,0.428571,0.108374,0.303136,0.588571,0.412429,0.567376
2,0.29078,0.714286,0.142857,0.327526,0.36,0.378531,0.368794
3,0.331151,0.428571,0.068966,0.477352,0.342857,0.519774,0.652482
4,0.29078,0.428571,0.1133,0.383275,0.411429,0.282486,0.404255


In [23]:
#Xgboost Model
from xgboost import XGBRegressor
# Hyper-parameters gathered in one place so they are easy to read and tune.
xgb_params = dict(
    n_estimators=1000,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(x_train, y_train)
# predictions for the held-out rows
pred_6 = xgb.predict(x_test)

# Same evaluation as for the other models; MSE is computed once and reused for RMSE.
mse_6 = metrics.mean_squared_error(y_test, pred_6)
xgb_scores = {
    'Mean Absolute Error (MAE)': metrics.mean_absolute_error(y_test, pred_6),
    'Mean Square Error (MSE)': mse_6,
    'Root Mean Square Error (RMSE)': np.sqrt(mse_6),
    'R2 score': metrics.r2_score(y_test, pred_6),
    'Explained Variance score': metrics.explained_variance_score(y_test, pred_6),
}
for label, score in xgb_scores.items():
    print(label, score)

Mean Absolute Error (MAE) 9533.327592329546
Mean Square Error (MSE) 256950657.8594525
Root Mean Square Error (RMSE) 16029.680528926723
R2 score 0.7498394817421278
Explained Variance score 0.7542700205788013


In [24]:
# Show the unscaled feature frame (the rendered output abbreviates the middle rows).
features

Unnamed: 0,battery,seats,acceleration,top_speed,distance,efficiency,fast_charge
0,52.5,5,5.6,225,350,150,560.0
1,107.8,5,4.3,210,610,177,800.0
2,70.0,7,5.0,217,410,171,520.0
3,77.4,5,3.5,260,395,196,920.0
4,70.0,5,4.4,233,455,154,570.0
...,...,...,...,...,...,...,...
176,45.0,8,12.1,130,185,243,250.0
177,45.0,9,12.1,130,185,243,250.0
178,45.0,8,12.1,130,180,250,240.0
179,45.0,9,12.1,130,175,257,230.0
