<a href="https://colab.research.google.com/github/li199-code/d2l-pytorch/blob/main/house_price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch import nn

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [None]:
# Training hyperparameters: mini-batch size, optimizer learning rate, and the
# number of passes over the training set.
# NOTE(review): lr=5 is unusually large for Adam on an MLP trained on raw
# prices -- confirm this value is intentional.
batch_size, lr, epochs = 64, 5, 100

In [None]:
# Read the training and test CSV files (in that order) from the mounted Drive.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/data/kaggle_house_pred_train.csv',
        '/content/drive/MyDrive/data/kaggle_house_pred_test.csv',
    )
)

In [None]:
# Stack the feature columns of both splits so preprocessing is applied to them
# together. The first column is dropped from each frame and the last column is
# additionally dropped from the training frame (presumably the Id and the
# SalePrice label columns -- confirm against the CSV layout).
train_inputs = train_data.iloc[:, 1:-1]
test_inputs = test_data.iloc[:, 1:]
all_features = pd.concat([train_inputs, test_inputs], axis=0)

In [None]:
# Standardize every numeric column to zero mean and unit variance. The
# statistics are computed over the train and test rows together; if the test
# data were not available, the mean and standard deviation could be computed
# from the training data alone.
# Numeric columns are selected explicitly with select_dtypes rather than as
# "every dtype that is not object", so categorical, datetime or string-dtype
# columns are never standardized by mistake.
numeric_features = all_features.select_dtypes(include='number').columns
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# After standardization each column has mean 0, so filling missing values
# with 0 is equivalent to imputing the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [None]:
# Handle text (categorical) columns with one-hot encoding; dummy_na=True adds
# an indicator column for missing values.
# dtype=float keeps the whole frame numeric: recent pandas versions emit bool
# indicator columns by default, which makes the frame's `.values` an object
# array that torch.tensor cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)

In [None]:
# Split the jointly preprocessed frame back into its training and test rows
# (the training rows come first) and convert everything to float32 tensors.
# to_numpy(dtype='float32') forces a homogeneous numeric array even when the
# frame mixes float and bool columns; a plain `.values` would then be an
# object array, which torch.tensor rejects.
n_train = train_data.shape[0]
train_features = torch.tensor(
    all_features.iloc[:n_train].to_numpy(dtype='float32'), dtype=torch.float32)
test_features = torch.tensor(
    all_features.iloc[n_train:].to_numpy(dtype='float32'), dtype=torch.float32)
# Labels are reshaped to a single column so they line up with the network's
# one-unit output.
train_labels = torch.tensor(
    train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)

In [None]:
def load_array(data, batch_size, train=True):
  """Build a DataLoader over a collection of tensors.

  Args:
    data: iterable of tensors, unpacked into a TensorDataset.
    batch_size: number of samples per mini-batch.
    train: when True the samples are shuffled on every pass.

  Returns:
    A DataLoader that yields one tuple of tensor slices per batch.
  """
  tensor_dataset = torch.utils.data.TensorDataset(*data)
  loader = DataLoader(tensor_dataset, batch_size=batch_size, shuffle=train)
  return loader


# Shuffled mini-batch iterator over the (features, labels) training pairs.
trainloader = load_array(
    data=(train_features, train_labels),
    batch_size=batch_size,
)


In [None]:
# Network: one hidden layer of 512 ReLU units with dropout, followed by a
# single linear output unit. The input width is taken from the feature tensor.
in_features = train_features.size(1)
net = nn.Sequential(
  nn.Linear(in_features=in_features, out_features=512),
  nn.ReLU(),
  nn.Dropout(p=0.2),
  nn.Linear(in_features=512, out_features=1),
)

# Move the parameters to the selected device; `model` is the handle used by
# the cells that follow.
model = net.to(device)

In [None]:
# Mean squared error, used both as the training loss and inside log_rmse.
loss_fn = nn.MSELoss()
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: callable mapping ``features`` to predictions.
        features: input tensor passed to ``net``.
        labels: target tensor; its logarithm is taken, so the values are
            expected to be positive.

    Returns:
        The metric as a Python float.

    The computation runs under ``torch.no_grad()`` because the result is only
    read out via ``.item()``, so no autograd graph needs to be built.
    NOTE(review): ``net`` is called in whatever train/eval mode it is
    currently in -- callers that use dropout should call ``net.eval()`` first.
    """
    with torch.no_grad():
        # To further stabilize the value when taking the logarithm, set
        # predictions smaller than 1 to 1.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(loss_fn(torch.log(clipped_preds),
                                  torch.log(labels)))
    return rmse.item()

# Adam over all model parameters, using the learning rate configured above.
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

In [None]:
def train(trainloader, model, loss_fn, optimizer):
  """Run one optimization pass over ``trainloader``.

  Args:
    trainloader: DataLoader yielding ``(x, y)`` batches; its ``dataset`` must
      support ``len()``.
    model: the ``nn.Module`` being trained; put into training mode here.
    loss_fn: callable ``loss_fn(pred, y)`` returning a scalar tensor.
    optimizer: optimizer bound to ``model``'s parameters.

  Every 10th batch the current loss and the number of samples processed so
  far are printed.
  """
  # Take the dataset size and the target device from the arguments instead of
  # notebook globals, so the function works regardless of cell order.
  size = len(trainloader.dataset)
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device("cpu")
  model.train()
  current = 0
  for batch, (x, y) in enumerate(trainloader):
    x, y = x.to(target_device), y.to(target_device)
    pred = model(x)
    l = loss_fn(pred, y)
    l.backward()
    optimizer.step()
    # Clear the gradients so they do not accumulate into the next batch.
    optimizer.zero_grad()

    # A running count stays correct when the final batch is smaller than the
    # others; (batch + 1) * len(x) would under-report in that case.
    current += len(x)
    if batch%10==0:
      loss_num = l.item()
      print(f"loss: {loss_num:>7f}  [{current:>5d}/{size:>5d}]")


In [None]:
# Train for the configured number of epochs, printing a banner before each one.
for t in range(epochs):
  banner = f"Epoch {t+1}\n-------------------------------"
  print(banner)
  train(trainloader=trainloader, model=model, loss_fn=loss_fn, optimizer=optimizer)

# Persist the whole module object (not just its state_dict) to Drive.
checkpoint_path = '/content/drive/MyDrive/checkpoint/model.pth'
torch.save(model, checkpoint_path)
print('Done!')

Epoch 1
-------------------------------
loss: 48022446080.000000  [   64/ 1460]
loss: 15442082816.000000  [  704/ 1460]
loss: 5581964288.000000  [ 1344/ 1460]
Epoch 2
-------------------------------
loss: 5643421696.000000  [   64/ 1460]
loss: 1503381632.000000  [  704/ 1460]
loss: 1047202304.000000  [ 1344/ 1460]
Epoch 3
-------------------------------
loss: 1267903616.000000  [   64/ 1460]
loss: 3767652608.000000  [  704/ 1460]
loss: 2014125824.000000  [ 1344/ 1460]
Epoch 4
-------------------------------
loss: 2221456384.000000  [   64/ 1460]
loss: 1353030144.000000  [  704/ 1460]
loss: 486593600.000000  [ 1344/ 1460]
Epoch 5
-------------------------------
loss: 1240350336.000000  [   64/ 1460]
loss: 1069307648.000000  [  704/ 1460]
loss: 672020224.000000  [ 1344/ 1460]
Epoch 6
-------------------------------
loss: 1280180224.000000  [   64/ 1460]
loss: 1072667392.000000  [  704/ 1460]
loss: 1244533248.000000  [ 1344/ 1460]
Epoch 7
-------------------------------
loss: 2614052352.0

In [None]:
test_features = test_features.to(device)

# The checkpoint holds a fully pickled nn.Module, so it must be loaded with
# weights_only=False (recent PyTorch versions default to True and refuse to
# unpickle arbitrary objects). Unpickling can execute arbitrary code -- only
# load checkpoints you created yourself.
# map_location places the model on the same device as the inputs, even if the
# checkpoint was saved from a different one.
model = torch.load('/content/drive/MyDrive/checkpoint/model.pth',
                   map_location=device, weights_only=False)
# Evaluation mode disables dropout so predictions are deterministic.
model.eval()

# Inference needs no gradients; .cpu() is required before .numpy() whenever
# the model runs on a GPU.
with torch.no_grad():
  preds = model(test_features).cpu().numpy()
# Reformat the predictions for export to Kaggle.
test_data['SalePrice'] = pd.Series(preds.reshape(-1, 1)[:,0])
submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
submission.to_csv('submission.csv', index=False)