In [18]:
import torch
from torch.nn.functional import one_hot
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn as nn

import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

# Vehicle Fuel Efficiency Prediction

##### Data preprocessing

In [2]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']

# get data
df = pd.read_csv(url, names=column_names, na_values='?', comment='\t', sep=" ", skipinitialspace=True)

In [3]:
# remove incomplete rows
df = df.dropna()
df = df.reset_index(drop=True)

In [4]:
# train test set
df_train, df_test = train_test_split(df, train_size=0.8, random_state=1)

# get some stats on the train set
train_stats = df_train.describe().transpose()

In [34]:
# standardize the numerical features
numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

for col_name in numeric_column_names:
    # get the mean and std from the stats (use the train set mean and std for the test set)
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']

    # standardize
    df_train_norm.loc[:, col_name] = (df_train_norm.loc[:, col_name] - mean) / std
    df_test_norm.loc[:, col_name] = (df_test_norm.loc[:, col_name] - mean) / std

# standardized data
df_train_norm.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


In [35]:
# bucket the year feature into 4 buckets, <73, <76, <79, else
boundaries = torch.tensor([73, 76, 79])

# conver the feature into pytorch tensor
year_tensor = torch.tensor(df_train_norm['Model Year']. values)
# bucket it
df_train_norm['Model Year Bucketed'] = torch.bucketize(year_tensor, boundaries, right=True)

# same for the test set
year_tensor = torch.tensor(df_test_norm['Model Year']. values)
df_test_norm['Model Year Bucketed'] = torch.bucketize(year_tensor, boundaries, right=True)

numeric_column_names.append('Model Year Bucketed')

In [36]:
# one hot encode the categorical feature
number_of_origins = len(set(df_train_norm['Origin']))

# encode feature
origin_encoded = one_hot(torch.from_numpy(df_train_norm['Origin'].values) % number_of_origins)

# create training tensor by concating the numerical feature tensor with one hot encoded tensor
x_train_numeric = torch.tensor(df_train_norm[numeric_column_names].values)
x_train = torch.cat([x_train_numeric, origin_encoded], 1).float()

# create test set tensor
origin_encoded = one_hot(torch.from_numpy(df_test_norm['Origin'].values) % number_of_origins)
x_test_numeric = torch.tensor(df_test_norm[numeric_column_names].values)
x_test = torch.cat([x_test_numeric, origin_encoded], 1).float()

In [42]:
# create target tensor
y_train = torch.tensor(df_train_norm['MPG'].values).float()
y_test = torch.tensor(df_test_norm['MPG'].values).float()

In [38]:
# create dataset and dataloader
train_ds = TensorDataset(x_train, y_train)

batch_size = 8
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

##### Model and train

In [39]:
# define the model dimensions
hidden_units = [8, 4]  # 2 hidden layers, 8 and 4 units each
input_size = x_train.shape[1]

# define the model layers
all_layers = []
for hidden_unit in hidden_units:
    # Linear layer
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    # activation function
    all_layers.append(nn.ReLU())
    # next layer's input size = this layer's output size
    input_size = hidden_unit

# output layer, 1 node for continuous output
all_layers.append(nn.Linear(hidden_units[-1], 1))

# create model
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [40]:
loss_fn = nn.MSELoss()  # MSE for regression
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

num_epochs = 200
log_epochs = 20

torch.manual_seed(1)

# train
for epoch in range(num_epochs):
    loss_hist_train = 0

    for x_batch, y_batch in train_dl:
        # forward predict
        pred = model(x_batch)[:, 0]
        # calculate error
        loss = loss_fn(pred, y_batch)
        # backward calculate gradient
        loss.backward()
        # update parameters using gradient
        optimizer.step()
        # reset gradient to 0
        optimizer.zero_grad()

        # keep track of error
        loss_hist_train += loss.item()

    if epoch % log_epochs == 0:
        print(f'Epoch {epoch} Loss {loss_hist_train/len(train_dl):.4f}')

Epoch 0 Loss 536.1047
Epoch 20 Loss 8.4361
Epoch 40 Loss 7.8695
Epoch 60 Loss 7.1891
Epoch 80 Loss 6.7064
Epoch 100 Loss 6.7603
Epoch 120 Loss 6.3107
Epoch 140 Loss 6.6884
Epoch 160 Loss 6.7549
Epoch 180 Loss 6.2029


In [43]:
# evaluate on test set
with torch.no_grad():
    pred = model(x_test.float())[:, 0]
    loss = loss_fn(pred, y_test)

print(f'Test MSE: {loss.item():.4f}')
print(f'Test MAE: {nn.L1Loss()(pred, y_test).item():.4f}')

Test MSE: 9.5907
Test MAE: 2.1177
