# AutoMPG
The task is to predict the MPG for different cars based on a selected set of properties about the cars. It is a regression task, but the main challenge is data preparation.

In [1]:
# Common imports
import sklearn
import numpy as np
import os
import pandas as pd

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [2]:
# Load the dataset
filepath = "datasets/AutoMPG/auto-mpg.data"
data = pd.read_fwf(filepath, header=None);
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,"""chevrolet chevelle malibu"""
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,"""buick skylark 320"""
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,"""plymouth satellite"""
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,"""amc rebel sst"""
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,"""ford torino"""


In [3]:
# Remove the last column which is non-numerical
data = data.drop(8, axis=1)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       398 non-null    float64
 1   1       398 non-null    int64  
 2   2       398 non-null    float64
 3   3       398 non-null    object 
 4   4       398 non-null    float64
 5   5       398 non-null    float64
 6   6       398 non-null    int64  
 7   7       398 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [4]:
# Rename columns
data = data.rename(columns={0: 'mpg', 1: 'cylinders', 2: 'displacement', 3: 'hp', 4: 'weight', 5: 'acc', 6: 'year', 7: 'origin'})
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   hp            398 non-null    object 
 4   weight        398 non-null    float64
 5   acc           398 non-null    float64
 6   year          398 non-null    int64  
 7   origin        398 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 25.0+ KB


In [10]:
# Remove rows where horsepower is ?
data = data[data.hp != '?']
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   hp            392 non-null    object 
 4   weight        392 non-null    float64
 5   acc           392 non-null    float64
 6   year          392 non-null    int64  
 7   origin        392 non-null    int64  
dtypes: float64(4), int64(3), object(1)
memory usage: 27.6+ KB


In [15]:

from sklearn.preprocessing import OneHotEncoder
from sklearn. preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Do not touch the target attribute
targetAttributes = ["mpg"]

# Standardize the features except 'origin'.
scaledAttributes = ["cylinders", "displacement", "hp", "weight", "acc", "year"]

# Convert the origin attribute to 3 binary attributes using 1hot encoding
originAttributes = ["origin"]

fullPipeline = ColumnTransformer([
    ("target", 'passthrough', targetAttributes),
    ("scaled", StandardScaler(), scaledAttributes),
    ("onehot", OneHotEncoder(sparse=False), originAttributes)
])

dataPrepared = fullPipeline.fit_transform(data)
dataPrepared

array([[18.        ,  1.48394702,  1.07728956, ...,  1.        ,
         0.        ,  0.        ],
       [15.        ,  1.48394702,  1.48873169, ...,  1.        ,
         0.        ,  0.        ],
       [18.        ,  1.48394702,  1.1825422 , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [32.        , -0.86401356, -0.56847897, ...,  1.        ,
         0.        ,  0.        ],
       [28.        , -0.86401356, -0.7120053 , ...,  1.        ,
         0.        ,  0.        ],
       [31.        , -0.86401356, -0.72157372, ...,  1.        ,
         0.        ,  0.        ]])

In [16]:
dataPrepared.shape

(392, 10)