In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

# Silence pandas' chained-assignment warning entirely.
pd.set_option('mode.chained_assignment', None)
# Print NumPy arrays with 3 decimals and without scientific notation.
np.set_printoptions(suppress=True, precision=3)

In [3]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

# Show which TensorFlow version this notebook is running against.
print(tf.__version__)

Visualize the model's training progress

In [4]:
def plot_loss(history):
  """Plot training and validation loss curves on the current axes.

  Args:
    history: Object whose ``history`` attribute is a mapping containing
      the ``'loss'`` and ``'val_loss'`` series (as returned by ``model.fit``).
  """
  ax = plt.gca()
  for series_name in ('loss', 'val_loss'):
    ax.plot(history.history[series_name], label=series_name)
  # Fixed y-range so curves from different models are drawn on the same scale.
  ax.set_ylim([0, 10])
  ax.set_xlabel('Epoch')
  ax.set_ylabel('Error [MPG]')
  ax.legend()
  ax.grid(True)

# The Auto MPG dataset
The dataset is available from the UCI [Machine Learning Repository](http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data).

## Get the data
First download and import the dataset using pandas:

In [5]:
import numpy as np

# Location of the raw fixed-width Auto MPG file.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"

# Column names and the character width of each column, in file order.
names = [
    "mpg",
    "cylinders",
    "displacement",
    "horsepower",
    "weight",
    "acceleration",
    "model year",
    "origin",
    "car name",
]
widths = [7, 4, 10, 10, 11, 7, 4, 4, 30]

# Read the file; "?" marks a missing value in the source data.
X_full = pd.read_fwf(url, widths=widths, names=names, na_values=['?'])
# Work on a copy so the raw frame stays available.
X = X_full.copy()
X.tail()

## Data Correlation

Correlation map to see how features are correlated with each other and with "mpg".

In [6]:
# Correlate numeric columns only: the string column "car name" cannot be
# correlated, and pandas >= 2.0 raises instead of silently dropping it.
corr_matrix = X_full.select_dtypes(include="number").corr()

# Draw the heatmap on an explicit axes so the figure size applies to it.
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_matrix, vmax=0.9, square=True, ax=ax)

Look at some correlation values in a list format.

In [7]:
# Sort the correlations with "mpg" once, then show both ends of the ranking.
mpg_corr_sorted = corr_matrix["mpg"].sort_values(ascending=False)
print(mpg_corr_sorted[:5], '\n')
print(mpg_corr_sorted[-5:])

As we can see, the columns "displacement" and "weight" are strongly negatively correlated with "mpg".

We can therefore guess that, when predicting "mpg" from a single column, "displacement" and "weight" will predict better than the others.

# Split features from labels

Separate the target value—the "label"—from the features. This label is the value that you will train the model to predict.

In [8]:
# Remove rows with missing target, separate target from predictors.
# Everything is derived from the untouched X_full frame without in-place
# mutation, so this cell can be re-run safely (an in-place drop of "mpg"
# would raise KeyError on the second run).
X_labeled = X_full.dropna(axis=0, subset=["mpg"])

y = X_labeled["mpg"]
X = X_labeled.drop(columns=["mpg"])

## Split the data into training and test sets

Now, split the dataset into a training set and a test set. You will use the test set in the final evaluation of your models.

In [9]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

## Clean the data

Identify columns with missing values

In [10]:
# Non-null count per column; a column with a lower count than the others has missing values.
X.count()

The column "horsepower" contains missing values.

Use SimpleImputer to replace the missing values with the mean value and store the result in a new column named "hp".

In [11]:
from sklearn.impute import SimpleImputer

# Imputation: learn the mean from the training split only.
# SimpleImputer.fit returns the imputer itself, so fitting the same object a
# second time on the validation split would overwrite the training statistics
# and impute BOTH splits with the validation mean (data leakage).
hp_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
hp_imputer.fit(X_train[["horsepower"]])

# Apply the training-set mean to both splits and store the result as "hp".
X_train["hp"] = hp_imputer.transform(X_train[["horsepower"]]).ravel()
X_valid["hp"] = hp_imputer.transform(X_valid[["horsepower"]]).ravel()

In [12]:
# Missing-value count per column, first for the training split and then for validation.
for split_frame in (X_train, X_valid):
    print(split_frame.isna().sum())

The "origin" column is categorical, not numeric. 

So the next step is to one-hot encode the values in the column with OneHotEncoder class from scikit-learn.

In [13]:
# Preview the last rows of the training features, including the "origin" column.
X_train.tail()

In [14]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to the categorical "origin" column.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the new keyword first and fall back for older releases.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only; reuse the learned categories for validation.
# The encoder returns a plain array, so restore the original row index.
OH_cols_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[["origin"]]), index=X_train.index
)
OH_cols_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[["origin"]]), index=X_valid.index
)

# Add one-hot encoded columns (named 0, 1, 2) to the other features.
# Any encoded columns left over from a previous run of this cell are dropped
# first, so re-running does not create duplicate columns.
X_train = pd.concat(
    [X_train.drop(columns=OH_cols_train.columns, errors="ignore"), OH_cols_train],
    axis=1,
)
X_valid = pd.concat(
    [X_valid.drop(columns=OH_cols_valid.columns, errors="ignore"), OH_cols_valid],
    axis=1,
)

In [15]:
# Check that the one-hot columns were appended to the training features.
X_train.tail()

In [16]:
# Check that the one-hot columns were appended to the validation features.
X_valid.tail()

Select features we will use to train the model

In [17]:
# Numeric inputs (with imputed "hp" instead of raw "horsepower") ...
numeric_feature_names = [
    "cylinders",
    "displacement",
    "weight",
    "acceleration",
    "model year",
    "hp",
]
# ... plus the integer-named one-hot columns produced for "origin".
origin_onehot_names = [0, 1, 2]

features = numeric_feature_names + origin_onehot_names
X_train[features].tail()

# Normalization

It is good practice to normalize features that use different scales and ranges.

The ```tf.keras.layers.Normalization``` is a clean and simple way to add feature normalization into your model.

# Linear Regression

## Linear regression with one variable

Begin with a single-variable linear regression to predict "mpg" 
from "cylinders", "displacement", "weight", "acceleration", "model year", "hp", one by one.

In [18]:
%%time
def linear_regression_single(column, epochs=100, learning_rate=0.1):
    """Train a one-input linear regression from ``X_train[column]`` to ``y_train``.

    The model is a ``Normalization`` layer adapted to the training values of
    ``column`` followed by a single ``Dense(1)`` unit. It is trained with Adam
    on mean absolute error, and ``fit`` holds out 20% of the rows it is given
    through ``validation_split``.

    Args:
        column: Name of the ``X_train`` column used as the only input.
        epochs: Number of training epochs (default 100).
        learning_rate: Adam learning rate (default 0.1).

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Training values of the chosen column as a NumPy array.
    feature = np.array(X_train[column])
    # Scalar normalizer (axis=None) whose statistics come from that column.
    feature_normalizer = layers.Normalization(input_shape=[1,], axis=None)
    feature_normalizer.adapt(feature)

    # Normalization followed by one linear unit.
    model = tf.keras.Sequential([
        feature_normalizer,
        layers.Dense(units=1)
    ])

    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        loss="mean_absolute_error"
    )

    history = model.fit(
        X_train[column], y_train, epochs=epochs, verbose=0, validation_split=0.2
    )
    print("Finish ", column)

    return model, history

# Validation loss per model, keyed by "<column>_model".
test_results = {}
single_input_columns = ["cylinders", "displacement", "weight", "acceleration", "model year", "hp"]
for f in single_input_columns:
    feature_model, history = linear_regression_single(f)
    result_key = f + "_model"
    test_results[result_key] = feature_model.evaluate(X_valid[f], y_valid, verbose=0)

Consistent with our earlier guess, the "displacement" and "weight" models achieve a lower mean absolute error than the others.

In [19]:
test_results

## Linear regression with multiple inputs

In [34]:
%%time
def linear_regression_multi(columns, epochs=100, learning_rate=0.1):
    """Train a multi-input linear regression from ``X_train[columns]`` to ``y_train``.

    The model is a per-feature ``Normalization`` layer (``axis=-1``) adapted to
    the training values of ``columns`` followed by a single ``Dense(1)`` unit.
    It is trained with Adam on mean absolute error, and ``fit`` holds out 20%
    of the rows it is given through ``validation_split``.

    Args:
        columns: List of ``X_train`` column names used as inputs.
        epochs: Number of training epochs (default 100).
        learning_rate: Adam learning rate (default 0.1).

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Training values of the selected columns as a 2-D NumPy array.
    # Named differently from the notebook-level `features` list to avoid shadowing it.
    feature_matrix = np.array(X_train[columns])
    # Normalizer with separate statistics per input column.
    features_normalizer = layers.Normalization(axis=-1)
    features_normalizer.adapt(feature_matrix)

    # Normalization followed by one linear unit.
    model = tf.keras.Sequential([
        features_normalizer,
        layers.Dense(units=1)
    ])

    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        loss="mean_absolute_error"
    )

    history = model.fit(
        X_train[columns], y_train, epochs=epochs, verbose=0, validation_split=0.2
    )

    return model, history


# Train on all selected features, then record the loss on the validation split.
multi_model, history = linear_regression_multi(features)
multi_model_loss = multi_model.evaluate(X_valid[features], y_valid, verbose=0)
test_results["multi_model"] = multi_model_loss

# Regression with a deep neural network (DNN)
These models will contain a few more layers than the linear model:

- The normalization layer, as before (here `features_normalizer`, adapted to the selected input columns).

- Two hidden, non-linear, Dense layers with the ReLU (relu) activation function nonlinearity.

- A linear Dense single-output layer.

In [23]:
%%time
def dnn_regression_multi(columns, epochs=100, learning_rate=0.1, hidden_units=(64, 64)):
    """Train a small fully connected network from ``X_train[columns]`` to ``y_train``.

    The model is a per-feature ``Normalization`` layer (``axis=-1``) adapted to
    the training values of ``columns``, one ReLU ``Dense`` layer per entry in
    ``hidden_units``, and a final linear ``Dense(1)`` output. It is trained
    with Adam on mean absolute error, and ``fit`` holds out 20% of the rows it
    is given through ``validation_split``.

    Args:
        columns: List of ``X_train`` column names used as inputs.
        epochs: Number of training epochs (default 100).
        learning_rate: Adam learning rate (default 0.1).
            NOTE(review): 0.1 is kept for backward compatibility; it is large
            for a multi-layer network, so consider passing a smaller value.
        hidden_units: Width of each hidden layer, in order (default ``(64, 64)``).

    Returns:
        Tuple ``(model, history)``: the fitted Keras model and the object
        returned by ``model.fit``.
    """
    # Training values of the selected columns as a 2-D NumPy array.
    # Named differently from the notebook-level `features` list to avoid shadowing it.
    feature_matrix = np.array(X_train[columns])
    # Normalizer with separate statistics per input column.
    features_normalizer = layers.Normalization(axis=-1)
    features_normalizer.adapt(feature_matrix)

    # Normalization -> ReLU hidden layers -> single linear output.
    hidden_layers = [layers.Dense(width, activation='relu') for width in hidden_units]
    model = tf.keras.Sequential([
        features_normalizer,
        *hidden_layers,
        layers.Dense(units=1)
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mean_absolute_error"
    )

    history = model.fit(
        X_train[columns], y_train, epochs=epochs, verbose=0, validation_split=0.2
    )

    return model, history


# Train the DNN on all selected features, then record the loss on the validation split.
dnn_model, history = dnn_regression_multi(features)
dnn_model_loss = dnn_model.evaluate(X_valid[features], y_valid, verbose=0)
test_results["dnn_model"] = dnn_model_loss

# Performance

Since all models have been trained, you can review their test set performance.

The "dnn_model" has the lowest mean absolute error, which indicates it is the best of the models trained here.

In [36]:
# One row per entry in test_results; each value is what that model's evaluate() call returned.
pd.DataFrame(test_results, index=['Mean absolute error [mpg]']).T

In [28]:
# Predicted mpg for every validation row, flattened to one dimension.
predictions = dnn_model.predict(X_valid[features]).flatten()

# Square plot of predicted vs. true values with a diagonal reference line.
lims = [0, 50]
a = plt.axes(aspect='equal')
a.scatter(y_valid, predictions)
a.set_xlabel('True Values [MPG]')
a.set_ylabel('Predictions [MPG]')
a.set_xlim(lims)
a.set_ylim(lims)
_ = a.plot(lims, lims)

In [29]:
# Distribution of signed errors (prediction minus true value) on the validation split.
error = predictions - y_valid
hist_ax = plt.gca()
hist_ax.hist(error, bins=25)
hist_ax.set_xlabel('Prediction Error [MPG]')
_ = hist_ax.set_ylabel('Count')

### Save it for later use

In [30]:
# Persist the trained model under the relative path 'dnn_model'.
# NOTE(review): the on-disk format (and whether a path without a file extension is accepted)
# presumably depends on the installed Keras version — confirm against the environment.
dnn_model.save('dnn_model')

### Reload the model

In [35]:
# Load the model back from the path it was saved to.
reloaded = tf.keras.models.load_model('dnn_model')

# Score the reloaded model on the validation split, like every other entry in
# test_results, so its row is directly comparable with "dnn_model"
# (scoring it on the training split would not be).
test_results['reloaded'] = reloaded.evaluate(
    X_valid[features], y_valid, verbose=0)

pd.DataFrame(test_results, index=['Mean absolute error [mpg]']).T