### Dataset: https://archive.ics.uci.edu/ml/datasets/automobile

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, KFold

In [86]:
# Column headers, in file order; they are supplied to read_csv explicitly via `names`.
COLUMN_NAMES = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-rate', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]
# NOTE(review): absolute, machine-specific location -- adjust when running elsewhere.
DATA_FILE = "C://Users//johng//Desktop//data//imports-85.data"
cars = pd.read_csv(DATA_FILE, names=COLUMN_NAMES, encoding="utf-8")

In [31]:
# Preview the first rows to check that the supplied column names line up with the data.
cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [21]:
# Inspect dtypes and non-null counts. '?' placeholders are strings, so they are
# counted as non-null here and keep the affected columns at object dtype.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

# Numeric columns: symboling, wheel-base, length, width, height, curb-weight, engine-size, compression-rate, city-mpg, highway-mpg
# Target column: price (currently an object column, so it must be converted to a numeric dtype before modelling)

In [87]:
# Numeric feature columns to keep, plus the 'price' target.
clean_df = [
    'normalized-losses', 'wheel-base', 'length', 'width', 'height',
    'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-rate',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]
# .copy() makes `cars` an independent frame, so the column assignments in
# later cells do not trigger SettingWithCopyWarning.
cars = cars[clean_df].copy()
cars.shape

(205, 15)

In [40]:
# List the distinct values in 'normalized-losses' and how often each occurs,
# to check for non-numeric placeholders.
cars['normalized-losses'].value_counts()

161    11
91      8
150     7
128     6
134     6
104     6
95      5
102     5
103     5
74      5
85      5
168     5
94      5
65      5
106     4
122     4
148     4
118     4
93      4
101     3
125     3
137     3
154     3
83      3
115     3
119     2
87      2
194     2
197     2
108     2
89      2
164     2
158     2
145     2
192     2
188     2
81      2
110     2
113     2
129     2
153     2
107     1
78      1
186     1
231     1
77      1
98      1
121     1
90      1
142     1
256     1
Name: normalized-losses, dtype: int64

In [92]:
# Treat every '?' placeholder as missing in all columns *before* dropping rows,
# so a single run of this cell removes each incomplete row. (Dropping first and
# converting 'bore' afterwards would leave NaNs behind in 'bore'.)
# The chain returns a new frame instead of mutating in place, which keeps the
# cell idempotent on re-run.
cars = (
    cars.replace("?", np.nan)
        # Same two columns this cell has always cast to float; the remaining
        # object columns are converted in a later cell.
        .astype({"normalized-losses": float, "bore": float})
        # Remove every row that has a missing value in any column.
        .dropna(axis=0)
)

In [104]:
# Confirm the row count and dtypes after cleaning.
# NOTE(review): this cell's execution count (104) is later than that of the
# conversion cell placed after it (103), so the saved output already shows the
# converted dtypes -- consider moving this cell below that one.
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160 entries, 3 to 204
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   normalized-losses  160 non-null    float64
 1   wheel-base         160 non-null    float64
 2   length             160 non-null    float64
 3   width              160 non-null    float64
 4   height             160 non-null    float64
 5   curb-weight        160 non-null    int64  
 6   engine-size        160 non-null    int64  
 7   bore               160 non-null    float64
 8   stroke             160 non-null    float64
 9   compression-rate   160 non-null    float64
 10  horsepower         160 non-null    int64  
 11  peak-rpm           160 non-null    int64  
 12  city-mpg           160 non-null    int64  
 13  highway-mpg        160 non-null    int64  
 14  price              160 non-null    int64  
dtypes: float64(8), int64(7)
memory usage: 20.0 KB


In [103]:
# Convert the remaining string-typed columns to numeric dtypes.
# pd.to_numeric chooses int64 or float64 per column and raises a ValueError
# on any value it cannot parse, so leftover placeholders fail loudly here.
for column in ["stroke", "horsepower", "peak-rpm", "price"]:
    cars[column] = pd.to_numeric(cars[column])

In [105]:
# Preview the cleaned, fully numeric frame.
cars.head()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-rate,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950
4,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450
6,158.0,105.8,192.7,71.4,55.7,2844,136,3.19,3.4,8.5,110,5500,19,25,17710
8,158.0,105.8,192.7,71.4,55.9,3086,131,3.13,3.4,8.3,140,5500,17,20,23875
10,192.0,101.2,176.8,64.8,54.3,2395,108,3.5,2.8,8.8,101,5800,23,29,16430


## Normalize columns: (x - mean) / standard deviation

In [106]:
# Standardize each column: subtract its mean, then divide by its standard deviation.
centered_cars = cars - cars.mean()
normalized_cars = centered_cars / cars.std()
# Maintain the 'price' column on its original scale in the normalized data frame.
normalized_cars['price'] = cars['price'].copy()