In [224]:
import torch
import numpy as np
import torch.nn as nn
import pandas as pd
import torch.utils.data as Data
%matplotlib inline
'''
Kaggle题目:https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview
参考https://tangshusen.me/Dive-into-DL-PyTorch/#/chapter03_DL-basics/3.16_kaggle-house-price
'''

'\nKaggle题目:https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview\n参考https://tangshusen.me/Dive-into-DL-PyTorch/#/chapter03_DL-basics/3.16_kaggle-house-price\n'

In [225]:
train_data = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('./house-prices-advanced-regression-techniques/test.csv')

train_data.shape, test_data.shape  #train_data比test_data多一个价格

((1460, 81), (1459, 80))

In [226]:
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))  #将训练集和测试集去掉id后拼接 (2919, 79),
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index    #数据非数字的属性名list
# 对连续数值的特征做标准化,对于缺失的特征值将其替换成该特征的均值。
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
all_features[numeric_features] = all_features[numeric_features].fillna(0)  #标准化后每个数值特征均值为0，直接用0替换缺失值

# 将离散数值转成指示特征 i.e 去掉MSZoning特征，并新加两个特征MSZoning_RL和MSZoning_RM,其值为0或1
# dummy_na=True将缺失值也当作合法的特征值并为其创建指示特征
all_features = pd.get_dummies(all_features, dummy_na=True)  # (2919, 331)

#获取处理后的训练集和测试集
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float).view(-1, 1)

In [227]:
class Net(nn.Module):
    def __init__(self, input_size, output_size):
        super(Net, self).__init__()
        self.linear = nn.Linear(input_size, output_size)
        
    def forward(self, X):
        outputs = self.linear(X)
        return outputs

#对数均方根误差,用来评价模型
def log_rmse(net, features, labels):
    with torch.no_grad():
        # 将小于1的值设成1，使得取对数时数值更稳定
        clipped_preds = torch.max(net(features), torch.tensor(1.0))
        criterion = nn.MSELoss()
        rmse = torch.sqrt(2 * criterion(clipped_preds.log(), labels.log()).mean())
    return rmse.item()

In [228]:
#K折交叉验证,返回第i折交叉验证时所需要的训练和验证数据
def get_k_flod_data(k, X, y):
    assert k > 1
    fold_size = X.shape[0] // k   #1460/5 = 292
    X_train, y_train = None, None
    for i in range(k):
        start_idx, end_idx = i * fold_size, (i + 1) * fold_size
        idx = slice(start_idx, end_idx)
        X_valid, y_valid = X[idx, :], y[idx]
        X_train, y_train = torch.cat((X[:start_idx], X[end_idx:]), dim=0), torch.cat((y[:start_idx], y[end_idx:]), dim=0)
        yield (X_train, y_train, X_valid, y_valid)


# def k_flod(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
#     train_loss_sum, vaild_loss_sum = 0, 0
#     for i in range(5):
#         data = get_k_flod_data(k, i ,X_train, y_train)
#         model = Net()
#         train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
#                                    weight_decay, batch_size)
#         train_loss_sum += train_ls[-1]
#         vaild_loss_sum += valid_ls[-1]
#         if i == 0:
#             d2l.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
#                          range(1, num_epochs + 1), valid_ls,
#                          ['train', 'valid'])
#         print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))
#     return train_l_sum/k, valid_l_sum/k
        
  
# k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
# train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
# print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))

In [229]:
batch_size, input_size, output_size, k = 64, train_features.shape[1], 1, 5
model = Net(input_size, output_size)
optimizer = torch.torch.optim.Adam(model.parameters(), lr=5)

train_loss_sum, valid_loss_sum = 0, 0
k_data = get_k_flod_data(k, train_features, train_labels)
for i in range(k):
    train_features, train_labels, valid_features, valid_labels = next(k_data)
    
    train_loss, valid_loss = [], []
    for epoch in range(100):
        train_dataset = Data.TensorDataset(train_features, train_labels)
        train_iter = Data.DataLoader(train_dataset, batch_size, shuffle=True)
        for X, Y in train_iter:
            optimizer.zero_grad()
            outputs = model(X)
            loss = criterion(outputs, Y)
            loss.backward()
            optimizer.step()
        train_loss.append(log_rmse(model, train_features, train_labels))
        if valid_labels is not None:
            valid_loss.append(log_rmse(model, valid_features, valid_labels))
    train_loss_sum += train_loss[-1]
    valid_loss_sum += valid_loss[-1]
    print('fold %d, train rmse %f, valid rmse %f' % (i, train_loss[-1], valid_loss[-1]))
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_loss_sum / k, valid_loss_sum / k))

fold 0, train rmse 0.240801, valid rmse 0.222089
fold 1, train rmse 0.200362, valid rmse 0.214741
fold 2, train rmse 0.181916, valid rmse 0.194018
fold 3, train rmse 0.182988, valid rmse 0.176709
fold 4, train rmse 0.170414, valid rmse 0.207701
5-fold validation: avg train rmse 0.195296, avg valid rmse 0.203052


In [230]:
preds = model(test_features).detach().numpy()
test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
submission.to_csv('./submission.csv', index=False)

In [231]:
''' 完成 '''

' 完成 '