In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

For this dataset, we have 398 rows and 9 columns, all numeric except car name and horsepower. <br>
We drop Car name since it is a column that can work like an "ID" and we don't believe it is important for the model. <br>
For horsepower we transform the column into a float and turn the missing values '?' into nan, which are then imputed with SimpleImputer, using then mean. <br>
Since we will use this data for a RandomForest Model, scaling the data is not a necessary step.


In [None]:
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
# read data 
data = pd.read_csv('auto-mpg.data', sep='\\s+', header=None, names=columns, quotechar='"')

# drop car_name - Like id
data.drop(["car_name"], axis=1, inplace=True)

data.dtypes #horsepower is object but should be float

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
dtype: object

In [24]:
# Replace '?' with NaN
data['horsepower']=pd.to_numeric(data['horsepower'], errors='coerce')
data[data.isnull().any(axis=1)]


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
32,25.0,4,98.0,,2046.0,19.0,71,1
126,21.0,6,200.0,,2875.0,17.0,74,1
330,40.9,4,85.0,,1835.0,17.3,80,2
336,23.6,4,140.0,,2905.0,14.3,80,1
354,34.5,4,100.0,,2320.0,15.8,81,2
374,23.0,4,151.0,,3035.0,20.5,82,1


In [38]:
imp_median =SimpleImputer(strategy='median', missing_values=np.nan)
data_clean=pd.DataFrame(imp_median.fit_transform(data), columns=data.columns)
data_clean

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0
...,...,...,...,...,...,...,...,...
393,27.0,4.0,140.0,86.0,2790.0,15.6,82.0,1.0
394,44.0,4.0,97.0,52.0,2130.0,24.6,82.0,2.0
395,32.0,4.0,135.0,84.0,2295.0,11.6,82.0,1.0
396,28.0,4.0,120.0,79.0,2625.0,18.6,82.0,1.0


In [39]:
pd.concat([data_clean,data]).drop_duplicates(keep=False)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
32,25.0,4.0,98.0,93.5,2046.0,19.0,71.0,1.0
126,21.0,6.0,200.0,93.5,2875.0,17.0,74.0,1.0
330,40.9,4.0,85.0,93.5,1835.0,17.3,80.0,2.0
336,23.6,4.0,140.0,93.5,2905.0,14.3,80.0,1.0
354,34.5,4.0,100.0,93.5,2320.0,15.8,81.0,2.0
374,23.0,4.0,151.0,93.5,3035.0,20.5,82.0,1.0
32,25.0,4.0,98.0,,2046.0,19.0,71.0,1.0
126,21.0,6.0,200.0,,2875.0,17.0,74.0,1.0
330,40.9,4.0,85.0,,1835.0,17.3,80.0,2.0
336,23.6,4.0,140.0,,2905.0,14.3,80.0,1.0


In [41]:
data_clean.model_year.value_counts()

model_year
73.0    40
78.0    36
76.0    34
82.0    31
75.0    30
80.0    29
79.0    29
81.0    29
70.0    29
71.0    28
72.0    28
77.0    28
74.0    27
Name: count, dtype: int64

In [42]:
X = data_clean.drop(['mpg'], axis=1)
y = data_clean['mpg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)