# 1. 导入数据集

In [3]:
from tensorflow import keras
import numpy as np
import pandas as pd

# Download the Auto MPG (car fuel-efficiency) dataset; get_file returns the
# path of the downloaded file.
dataset_path = keras.utils.get_file(
    fname="auto-mpg.data",
    origin="http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
)

## 处理数据
dataset_path: 这是 CSV 文件的路径，指定了要读取的数据文件的位置。

names=column_names: 这是一个可选参数，用于指定列名。如果数据文件中不包含列名，或者希望使用其他列名，就可以通过 column_names 列表来指定列名。

na_values="?": 这个参数告诉 Pandas 把所有出现的问号字符 ? 视为缺失值（NaN）。

comment='\t': 这个参数指定注释字符，默认值为 None（即不识别任何注释字符）。这里设置为制表符 \t：每一行中从该字符开始直到行尾的内容都会被忽略，而不是整行被丢弃。在 auto-mpg.data 中，车名位于制表符之后，因此车名这一列不会被读入数据框。

sep=" ": 这个参数指定了字段之间的分隔符，这里是空格符号。这意味着数据文件中的数据是用空格分隔的。

skipinitialspace=True: 这个参数告诉 Pandas 去除分隔符后的空白字符。默认情况下，该参数为 False，但在这里设置为 True，以便去除字段之间可能存在的额外空格。

In [4]:
# Column names assigned to the file's fields; the last three are
# acceleration, model year and origin.
column_names = [
    'MPG', 'Cylinders', 'Displacement', 'Horsepower',
    'Weight', 'Acceleration', 'Model Year', 'Origin',
]
raw_dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values="?",
    comment='\t',
    sep=" ",
    skipinitialspace=True,
)
# Keep the raw frame untouched and work on a copy.
dataset = raw_dataset.copy()
# Preview the first few rows.
dataset.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [5]:
# Per-column count of missing values before cleaning. A notebook cell only
# displays its last expression, so this intermediate result must be printed
# explicitly or it is silently discarded.
print(dataset.isna().sum())
dataset = dataset.dropna()  # drop every row that contains a missing value
dataset.isna().sum()  # count again to confirm that no missing values remain

MPG             0
Cylinders       0
Displacement    0
Horsepower      0
Weight          0
Acceleration    0
Model Year      0
Origin          0
dtype: int64

In [6]:
# One-hot encode the categorical 'Origin' column, whose codes 1, 2 and 3
# stand for the place of origin: USA, Europe and Japan respectively.
# pop() removes the column from the frame and returns it.
origin = dataset.pop('Origin')
# Add one 0.0/1.0 indicator column per origin code.
for origin_code, country in ((1, 'USA'), (2, 'Europe'), (3, 'Japan')):
    dataset[country] = (origin == origin_code) * 1.0
dataset.tail()  # inspect the last few rows of the updated table

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,USA,Europe,Japan
393,27.0,4,140.0,86.0,2790.0,15.6,82,1.0,0.0,0.0
394,44.0,4,97.0,52.0,2130.0,24.6,82,0.0,1.0,0.0
395,32.0,4,135.0,84.0,2295.0,11.6,82,1.0,0.0,0.0
396,28.0,4,120.0,79.0,2625.0,18.6,82,1.0,0.0,0.0
397,31.0,4,119.0,82.0,2720.0,19.4,82,1.0,0.0,0.0


## 切分数据集

In [7]:
# Sample 80% of the rows (fixed random_state) as the training set ...
train_dataset = dataset.sample(frac=0.8, random_state=0)
# ... and keep every row that was not sampled as the test set.
test_dataset = dataset.drop(index=train_dataset.index)

In [8]:
# Remove the 'MPG' (fuel efficiency) column from both splits and keep it as
# the ground-truth label Y.
train_labels, test_labels = train_dataset.pop('MPG'), test_dataset.pop('MPG')

In [9]:
# Summary statistics of the training inputs X.
train_stats = train_dataset.describe()
print(train_stats)

# Transpose so that each feature is a row and each statistic
# ('mean', 'std', ...) is a column.
train_stats = train_stats.T

        Cylinders  Displacement  Horsepower       Weight  Acceleration  \
count  314.000000    314.000000  314.000000   314.000000    314.000000   
mean     5.477707    195.318471  104.869427  2990.251592     15.559236   
std      1.699788    104.331589   38.096214   843.898596      2.789230   
min      3.000000     68.000000   46.000000  1649.000000      8.000000   
25%      4.000000    105.500000   76.250000  2256.500000     13.800000   
50%      4.000000    151.000000   94.500000  2822.500000     15.500000   
75%      8.000000    265.750000  128.000000  3608.000000     17.200000   
max      8.000000    455.000000  225.000000  5140.000000     24.800000   

       Model Year         USA      Europe       Japan  
count  314.000000  314.000000  314.000000  314.000000  
mean    75.898089    0.624204    0.178344    0.197452  
std      3.675642    0.485101    0.383413    0.398712  
min     70.000000    0.000000    0.000000    0.000000  
25%     73.000000    0.000000    0.000000    0.000000

## 标准化数据

In [10]:

def norm(x):
    """Standardize ``x`` with the training-set statistics.

    Subtracts the 'mean' entry and divides by the 'std' entry of the
    module-level ``train_stats`` table, field by field.
    """
    feature_mean = train_stats['mean']
    feature_std = train_stats['std']
    centered = x - feature_mean
    return centered / feature_std

In [11]:
# Standardize the training and the test inputs with the same norm() helper.
normed_train_data, normed_test_data = norm(train_dataset), norm(test_dataset)

In [12]:
# Print the (inputs, labels) shapes of the training split, then the test split.
for split_x, split_y in (
    (normed_train_data, train_labels),
    (normed_test_data, test_labels),
):
    print(split_x.shape, split_y.shape)


(314, 9) (314,)
(78, 9) (78,)


(314, 9) (314,) # 训练集共 314 行，输入特征长度为 9,标签用一个标量表示
(78, 9) (78,) # 测试集共 78 行，输入特征长度为 9,标签用一个标量表示

## 构建数据集对象

In [13]:
import tensorflow as tf

# Build a Dataset of (features, label) pairs from the underlying arrays,
# shuffle it with a buffer of 100 elements and group it into batches of 32.
train_db = (
    tf.data.Dataset
    .from_tensor_slices((normed_train_data.values, train_labels.values))
    .shuffle(100)
    .batch(32)
)

# 创建网络

In [14]:
from tensorflow.keras import layers


class Network(keras.Model):
    """Regression network made of three fully connected layers."""

    def __init__(self):
        super().__init__()
        # Two 64-unit ReLU layers followed by a single-unit layer without
        # an activation.
        self.fc1 = layers.Dense(64, activation='relu')
        self.fc2 = layers.Dense(64, activation='relu')
        self.fc3 = layers.Dense(1)

    def call(self, inputs, training=None, mask=None):
        """Pass ``inputs`` through fc1, fc2 and fc3 in order.

        ``training`` and ``mask`` are accepted but not used.
        """
        hidden = self.fc1(inputs)
        hidden = self.fc2(hidden)
        return self.fc3(hidden)

# 训练与测试

In [15]:
# Optimizer used by the training loop; 0.001 is the learning rate.
optimizer = tf.keras.optimizers.RMSprop(0.001)

# Instantiate the network and create its weights via build(): the 4 is an
# arbitrary batch size, the 9 is the length of the input feature vector.
model = Network()
model.build(input_shape=(4, 9))
model.summary()  # print the layer/parameter overview

Model: "network"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                multiple                  640       
_________________________________________________________________
dense_1 (Dense)              multiple                  4160      
_________________________________________________________________
dense_2 (Dense)              multiple                  65        
Total params: 4,865
Trainable params: 4,865
Non-trainable params: 0
_________________________________________________________________


Using matplotlib backend: <object object at 0x0000021DB53849E0>


In [21]:
%matplotlib
import matplotlib.pyplot as plt
from tensorflow import losses
train_loss = []  # mean training MSE of each epoch, in epoch order
epoch_batch = 200  # number of epochs (full passes over the training set)
for epoch in range(epoch_batch):
    epoch_losses = []  # MSE of every batch seen in this epoch
    for step, (x, y) in enumerate(train_db):  # one pass over the training set
        # The gradient tape records the forward pass so that gradients can be
        # computed afterwards.
        with tf.GradientTape() as tape:
            # The network outputs shape (batch, 1) while the labels have shape
            # (batch,). Without dropping the trailing axis the two broadcast
            # to (batch, batch) inside the loss, i.e. every prediction would
            # be compared against every label of the batch.
            out = tf.squeeze(model(x), axis=-1)
            loss = tf.reduce_mean(losses.MSE(y, out))  # mean squared error
        # MAE is only a monitoring metric, so it is computed outside the tape.
        mae_loss = tf.reduce_mean(losses.MAE(y, out))
        epoch_losses.append(float(loss))
        if step % 10 == 0:  # print the training error every 10 steps
            print(epoch, step, float(loss))
        # Compute the gradients and update the weights.
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
    if epoch_losses:  # guard against an empty dataset
        train_loss.append(sum(epoch_losses) / len(epoch_losses))

Using matplotlib backend: TkAgg
