In [1]:
import pandas as pd
import numpy as np

# Agenda
## 1. [Importing Data](#importing-data)
## 2. Data Cleaning (preprocessing)
## 3. [Train-Test Split](#train-test-split)
## 4. [Feature Engineering](#feature-engineering)
## 5. [Data Encoding](#data-encoding)
## 6. [Scaling](#scaling)
## 7. [Train-Test Split](#train-test-split)
## 8. [Model Selection](#model-selection)


# Importing Data

In [2]:
# Import the data
bm_df = pd.read_csv("dataset/bmw.csv")
bm_df.head(10)

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,5 Series,2014,11200,Automatic,67068,Diesel,125,57.6,2.0
1,6 Series,2018,27000,Automatic,14827,Petrol,145,42.8,2.0
2,5 Series,2016,16000,Automatic,62794,Diesel,160,51.4,3.0
3,1 Series,2017,12750,Automatic,26676,Diesel,145,72.4,1.5
4,7 Series,2014,14500,Automatic,39554,Diesel,160,50.4,3.0
5,5 Series,2016,14900,Automatic,35309,Diesel,125,60.1,2.0
6,5 Series,2017,16000,Automatic,38538,Diesel,125,60.1,2.0
7,2 Series,2018,16250,Manual,10401,Petrol,145,52.3,1.5
8,4 Series,2017,14250,Manual,42668,Diesel,30,62.8,2.0
9,5 Series,2016,14250,Automatic,36099,Diesel,20,68.9,2.0


# Remove columns with one unique value

In [3]:
# Remove columns with one unique value
unique_values = bm_df.nunique()
unique_values # No need to remove any entire column since there is no single unique value column

model             24
year              25
price           3777
transmission       3
mileage         8086
fuelType           5
tax               38
mpg              102
engineSize        17
dtype: int64

# Removing and Checking missing data percentage and Imputing if required

In [7]:
# Check for missing data
missing_values = [" ", "", None]
# Convert "", " " and None to NaN
bm_df.replace(missing_values, np.nan, inplace=True)
# Check for the NaN values
bm_df.isna().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [8]:
# Another approach to see missing data
bm_df.isin(missing_values).mean().sort_values(ascending=False) * 100 # To get percentage of missing values/

model           0.0
year            0.0
price           0.0
transmission    0.0
mileage         0.0
fuelType        0.0
tax             0.0
mpg             0.0
engineSize      0.0
dtype: float64

# Train-Test Split

In [20]:
# Separating Target Variable / Column / Feature
X = bm_df.drop(columns=["price"])
y = bm_df["price"]

# Feature Engineering

In [21]:
# Feature Engineering - adding a new feature called car_type
# Dictionary of BMW models and their corresponding car types
car_type = {
    "1 Series": "hatchback",
    "2 Series": "coupe",
    "3 Series": "sedan",
    "4 Series": "coupe",
    "5 Series": "sedan",
    "6 Series": "coupe",
    "7 Series": "sedan",
    "8 Series": "coupe",
    "X1": "suv",
    "X2": "suv",
    "X3": "suv",
    "X4": "suv",
    "X5": "suv",
    "X6": "suv",
    "X7": "suv",
    "Z3": "convertible",
    "Z4": "convertible",
    "i3": "electric",
    "i8": "electric",
    "M2": "sports",
    "M3": "sports",
    "M4": "sports",
    "M5": "sports",
    "M6": "sports"
}
X["model"] = X["model"].str.strip()
X["carType"] = X["model"].map(car_type)
X

Unnamed: 0,model,year,transmission,mileage,fuelType,tax,mpg,engineSize,carType
0,5 Series,2014,Automatic,67068,Diesel,125,57.6,2.0,sedan
1,6 Series,2018,Automatic,14827,Petrol,145,42.8,2.0,coupe
2,5 Series,2016,Automatic,62794,Diesel,160,51.4,3.0,sedan
3,1 Series,2017,Automatic,26676,Diesel,145,72.4,1.5,hatchback
4,7 Series,2014,Automatic,39554,Diesel,160,50.4,3.0,sedan
...,...,...,...,...,...,...,...,...,...
10776,X3,2016,Automatic,40818,Diesel,150,54.3,2.0,suv
10777,5 Series,2016,Automatic,42947,Diesel,125,60.1,2.0,sedan
10778,3 Series,2017,Manual,25468,Petrol,200,42.8,2.0,sedan
10779,1 Series,2014,Automatic,45000,Diesel,30,64.2,2.0,hatchback


In [22]:
# Lets check the percentages of the carTypes
X["carType"].value_counts(normalize=True) * 100

carType
sedan          33.438457
suv            22.734440
coupe          21.992394
hatchback      18.263612
sports          1.947871
convertible     1.066691
electric        0.556535
Name: proportion, dtype: float64

# Data Encoding - One Hot Encoding

In [23]:
# Data Encoding - Convert categorical variables into dummy/indicator variables
X = pd.get_dummies(X, drop_first=True, dtype=int)
X

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_2 Series,model_3 Series,model_4 Series,model_5 Series,model_6 Series,...,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,carType_coupe,carType_electric,carType_hatchback,carType_sedan,carType_sports,carType_suv
0,2014,67068,125,57.6,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,2018,14827,145,42.8,2.0,0,0,0,0,1,...,0,0,0,1,1,0,0,0,0,0
2,2016,62794,160,51.4,3.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,2017,26676,145,72.4,1.5,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2014,39554,160,50.4,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,2016,40818,150,54.3,2.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
10777,2016,42947,125,60.1,2.0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
10778,2017,25468,200,42.8,2.0,0,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
10779,2014,45000,30,64.2,2.0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


# Scaling

In [24]:
# Scaling Data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Creat a scaled df
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
X

Unnamed: 0,year,mileage,tax,mpg,engineSize,model_2 Series,model_3 Series,model_4 Series,model_5 Series,model_6 Series,...,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,carType_coupe,carType_electric,carType_hatchback,carType_sedan,carType_sports,carType_suv
0,0.750000,0.313399,0.215517,0.111971,0.303030,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.916667,0.069281,0.250000,0.080163,0.303030,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.833333,0.293427,0.275862,0.098646,0.454545,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.875000,0.124650,0.250000,0.143778,0.227273,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.750000,0.184828,0.275862,0.096497,0.454545,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10776,0.833333,0.190735,0.258621,0.104879,0.303030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10777,0.833333,0.200683,0.215517,0.117344,0.303030,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10778,0.875000,0.119005,0.344828,0.080163,0.303030,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
10779,0.750000,0.210277,0.051724,0.126155,0.303030,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Train-Test Split

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

# Model Selection - Which model to Use? - Here are 3 Examples

### Basic Convention

0. Import ML Algorithm and Import Metric of Evaluation of our choice
1. Create a model object
2. Fit the model object to our data (Training model with data)
3. Create predictions with our newly trained model
4. Measure efficacy of your algorithm using your metric

## Multiple Linear Regression

In [30]:
# Metric used: Mean Absolute Error (MAE) - average of absolute error to the true value - can be above or below true value

# 0 import ML algorithm and metric
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# 1. Create a model object
linear_regressor = LinearRegression()
# 2. Fit the model object to our data
linear_regressor.fit(X_train, y_train)

# 3. Create predictions with our newly trained model
linear_prediction = linear_regressor.predict(X_test)

# 4. Measure efficacy of your algorithm using your metric
mae_regression = mean_absolute_error(y_test, linear_prediction)
mae_regression

2800.707715133531

In [39]:
# Check average price
print("Average price of car: ", y_test.mean())
# Check % of error
percent_error = mae_regression/y_test.mean() * 100
print("Percentage of error: ", percent_error.round(2), "%")

Average price of car:  22728.95103857567
Percentage of error:  12.32 %


## Random Forest - Ensemble algorithm

In [40]:
# 0. Imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# 1. Create a model object
rf_regressor = RandomForestRegressor(n_estimators=1000, random_state=123) # Creates 1000 trees
# 2. Fit the model object to our data
rf_regressor.fit(X_train, y_train)
# 3. Create predictions with our newly trained model
rf_prediction = rf_regressor.predict(X_test)
# 4. Measure efficacy of your algorithm using your metric
mae_rf = mean_absolute_error(y_test, rf_prediction)
mae_rf

1575.6679109049815

In [41]:
# Check % of error
percent_error = mae_rf/y_test.mean() * 100
print("Percentage of error: ", percent_error.round(2), "%")

Percentage of error:  6.93 %


## Boosting

In [42]:
# 0. Imports
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
# 1. Create a model object
boost_model = XGBRegressor()
# 2. Fit the model object to our data
boost_model.fit(X_train, y_train)
# 3. Create predictions with our newly trained model
boost_prediction = boost_model.predict(X_test)
# 4. Measure efficacy of your algorithm using your metric
mae_boost = mean_absolute_error(y_test, boost_prediction)
mae_boost

1540.4756669856674

In [43]:
# Check % of error
percent_error = mae_boost / y_test.mean() * 100
print("Percentage of error: ", percent_error.round(2), "%")

Percentage of error:  6.78 %


# Hyperparameter Tuning

In [54]:
# Using GridSearchCV methodology
from sklearn.model_selection import GridSearchCV

n_estimators = [1500, 1600]
max_features = ['log2', 'sqrt']
max_depth = [80, 90]
min_samples_split = [5]
min_samples_leaf = [1]
bootstrap = [True]

# Create the parameter grid
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}
print(random_grid)


{'n_estimators': [1500, 1600], 'max_features': ['log2', 'sqrt'], 'max_depth': [80, 90], 'min_samples_split': [5], 'min_samples_leaf': [1], 'bootstrap': [True]}


In [55]:
# Using random grid to search for best hyperparameters
rf = RandomForestRegressor()
rf_random = GridSearchCV(estimator=rf, param_grid=random_grid, n_jobs=-1, cv=3, verbose=2)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [58]:
best_params = rf_random.best_params_
best_params

{'bootstrap': True,
 'max_depth': 80,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 1600}

In [59]:
# Run Random forrest again with tuning
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# 1. Create a model object
rf_regressor = RandomForestRegressor(**best_params) # Creates 1000 trees
# 2. Fit the model object to our data
rf_regressor.fit(X_train, y_train)
# 3. Create predictions with our newly trained model
rf_prediction = rf_regressor.predict(X_test)
# 4. Measure efficacy of your algorithm using your metric
mae_rf = mean_absolute_error(y_test, rf_prediction)
mae_rf

1540.8909100277137

In [60]:
# Check % of error
percent_error = mae_rf/y_test.mean() * 100
print("Percentage of error: ", percent_error.round(2), "%")

Percentage of error:  6.78 %
