In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras

In [5]:
# Optional download of the Auto MPG data file from the UCI repository.
# Left disabled: the next cell reads a local copy at '../data/auto-mpg.data'.
# dataset_path = keras.utils.get_file('auto-mpg.data',
#                                   'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data')

Downloading data from http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data


In [7]:
# Parse the local Auto MPG data file into a DataFrame.
dataset_path = '../data/auto-mpg.data'
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]
raw_dataset = pd.read_csv(
    dataset_path,
    names=column_names,        # column labels are supplied here
    sep=' ',                   # fields are space-separated ...
    skipinitialspace=True,     # ... and spaces following a delimiter are skipped
    comment='\t',              # the remainder of a line after a tab is not parsed
    na_values='?',             # '?' entries are read as NaN
)
# Work on a copy so the parsed frame stays available unchanged.
dataset = raw_dataset.copy()

In [8]:
# Preview the first rows of the parsed data.
dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [9]:
# Count the missing (NaN) values in each column.
dataset.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      6
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [10]:
# Drop every row that contains a missing value, then re-count the NaNs per
# column to confirm none remain. `dataset` is rebound to the filtered frame.
dataset = dataset.dropna()
dataset.isna().sum()

MPG             0
Cylinders       0
Displacement    0
Horsepower      0
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [11]:
# Handle the categorical column: Origin codes 1, 2 and 3 denote the
# production regions USA, Europe and Japan respectively.
# NOTE: pop() removes 'Origin' from `dataset` in place, so re-running this
# cell without rebuilding `dataset` first raises KeyError.
origin = dataset.pop('Origin')
# Add one 0.0/1.0 indicator column per region (three columns in total).
for origin_code, country in ((1, 'USA'), (2, 'Europe'), (3, 'Japan')):
    dataset[country] = (origin == origin_code) * 1.0
dataset.tail()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
393,27.0,4,140.0,86.0,2790.0,15.6,82,1.0,0.0,0.0
394,44.0,4,97.0,52.0,2130.0,24.6,82,0.0,1.0,0.0
395,32.0,4,135.0,84.0,2295.0,11.6,82,1.0,0.0,0.0
396,28.0,4,120.0,79.0,2625.0,18.6,82,1.0,0.0,0.0
397,31.0,4,119.0,82.0,2720.0,19.4,82,1.0,0.0,0.0


In [12]:
# Split into training and test sets: the training set is a random 80% sample
# of the rows; random_state fixes the seed so the split is reproducible.
train_dataset = dataset.sample(frac=0.8, random_state=0)
train_dataset.shape

(314, 10)

In [13]:
# The test set is every row whose index label was not sampled into the
# training set.
test_dataset = dataset.drop(train_dataset.index)
test_dataset.shape

(78, 10)

In [14]:
# Shape of the full frame before the split; the train and test row counts
# should add up to its row count.
dataset.shape

(392, 10)

In [15]:
# Label column: separate the MPG target from the features.
# pop() removes 'MPG' from each frame in place, so re-running this cell
# without rebuilding the splits first raises KeyError.
train_labels = train_dataset.pop('MPG')
test_labels = test_dataset.pop('MPG')
train_labels.shape, test_labels.shape

((314,), (78,))

In [16]:
# Shapes of the feature frames now that the MPG column has been removed.
train_dataset.shape, test_dataset.shape

((314, 9), (78, 9))

In [21]:
# Summary statistics of the training features
# (one column per feature, one row per statistic).
train_stats = train_dataset.describe()
train_stats 

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
count,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0
mean,5.477707,195.318471,104.869427,2990.251592,15.559236,75.898089,0.624204,0.178344,0.197452
std,1.699788,104.331589,38.096214,843.898596,2.78923,3.675642,0.485101,0.383413,0.398712
min,3.0,68.0,46.0,1649.0,8.0,70.0,0.0,0.0,0.0
25%,4.0,105.5,76.25,2256.5,13.8,73.0,0.0,0.0,0.0
50%,4.0,151.0,94.5,2822.5,15.5,76.0,1.0,0.0,0.0
75%,8.0,265.75,128.0,3608.0,17.2,79.0,1.0,0.0,0.0
max,8.0,455.0,225.0,5140.0,24.8,82.0,1.0,1.0,1.0


In [22]:
# Transposed feature statistics: one row per feature, one column per
# statistic (count, mean, std, ...), so train_stats['mean'] and
# train_stats['std'] return one value per feature.
# Recomputed from train_dataset instead of transposing train_stats onto
# itself: a self-referential transpose flips back on every re-run of this
# cell, after which the train_stats['mean'] / train_stats['std'] lookups fail.
train_stats = train_dataset.describe().transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Cylinders,314.0,5.477707,1.699788,3.0,4.0,4.0,8.0,8.0
Displacement,314.0,195.318471,104.331589,68.0,105.5,151.0,265.75,455.0
Horsepower,314.0,104.869427,38.096214,46.0,76.25,94.5,128.0,225.0
Weight,314.0,2990.251592,843.898596,1649.0,2256.5,2822.5,3608.0,5140.0
Acceleration,314.0,15.559236,2.78923,8.0,13.8,15.5,17.2,24.8
Model Year,314.0,75.898089,3.675642,70.0,73.0,76.0,79.0,82.0
USA,314.0,0.624204,0.485101,0.0,0.0,1.0,1.0,1.0
Europe,314.0,0.178344,0.383413,0.0,0.0,0.0,0.0,1.0
Japan,314.0,0.197452,0.398712,0.0,0.0,0.0,0.0,1.0


In [23]:
# Per-feature means: a Series with one entry per feature.
train_stats['mean']

Cylinders          5.477707
Displacement     195.318471
Horsepower       104.869427
Weight          2990.251592
Acceleration      15.559236
Model Year        75.898089
USA                0.624204
Europe             0.178344
Japan              0.197452
Name: mean, dtype: float64

In [28]:
# Preview the first rows of the training features before normalization.
train_dataset.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
146,4,90.0,75.0,2125.0,14.5,74,1.0,0.0,0.0
282,4,140.0,88.0,2890.0,17.3,79,1.0,0.0,0.0
69,8,350.0,160.0,4456.0,13.5,72,1.0,0.0,0.0
378,4,105.0,63.0,2125.0,14.7,82,1.0,0.0,0.0
331,4,97.0,67.0,2145.0,18.0,80,0.0,0.0,1.0


In [27]:
# Intermediate check of the centering step. Subtracting the Series of means
# from the DataFrame matches the Series' index against the DataFrame's column
# labels, so each feature column has its own mean subtracted.
t = train_dataset - train_stats['mean']
t.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
146,-1.477707,-105.318471,-29.869427,-865.251592,-1.059236,-1.898089,0.375796,-0.178344,-0.197452
282,-1.477707,-55.318471,-16.869427,-100.251592,1.740764,3.101911,0.375796,-0.178344,-0.197452
69,2.522293,154.681529,55.130573,1465.748408,-2.059236,-3.898089,0.375796,-0.178344,-0.197452
378,-1.477707,-90.318471,-41.869427,-865.251592,-0.859236,6.101911,0.375796,-0.178344,-0.197452
331,-1.477707,-98.318471,-37.869427,-845.251592,2.440764,4.101911,-0.624204,-0.178344,0.802548


In [29]:
# Standardize the dataset
def norm(x, stats=None):
    """Z-score standardize feature columns.

    Subtracts each feature's mean and divides by its standard deviation.

    Args:
        x: DataFrame of features whose column labels match the index of
            ``stats``.
        stats: Per-feature statistics with ``'mean'`` and ``'std'`` columns,
            indexed by feature name (for example
            ``df.describe().transpose()``). When omitted, the notebook-level
            ``train_stats`` is used, so every split is scaled with the same
            training-set statistics.

    Returns:
        The standardized DataFrame. When the columns of ``x`` match the
        index of ``stats`` it has the same shape and labels as ``x``.
    """
    if stats is None:
        # Backward-compatible default: fall back to the global statistics.
        stats = train_stats
    mean = stats['mean']
    std = stats['std']
    # A constant feature has std == 0, and dividing by it would produce
    # NaN/inf. Scale such columns by 1 instead so they come out as zeros.
    safe_std = std.where(std != 0, 1.0)
    return (x - mean) / safe_std

# Normalize both splits. norm() scales with the statistics in train_stats
# (computed from the training features), so the test set is scaled with
# training-set statistics rather than its own.
normed_train_data = norm(train_dataset)
normed_test_data = norm(test_dataset)