-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_house.py
100 lines (71 loc) · 3.15 KB
/
train_house.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import torch
from torch import nn, optim
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from MLP import mlp
batchsz = 73
lr = 5
epochs = 50
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')
# 先联合,然后对每一列归一化
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
# 将不是字符串的列的索引取出来
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
lambda x: (x - x.mean()) / (x.std())
)
# 这一列虽然是数字,但要当字符串数据处理
all_features['MSSubClass'] = all_features['MSSubClass'].astype(str)
pd.get_dummies(all_features['MSSubClass'], prefix='MSSubClass')
# 对缺失值填充零
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# 离散化 举个例子,假设特征MSZoning里面有两个不同的离散值RL和RM,
# 那么这一步转换将去掉MSZoning特征,
# 并新加两个特征MSZoning_RL和MSZoning_RM,其值为0或1。
# 如果一个样本原来在MSZoning里的值为RL,那么有MSZoning_RL=1且MSZoning_RM=0。
all_features = pd.get_dummies(all_features, dummy_na=True)
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[: int(n_train * 0.8)].values, dtype=torch.float)
valid_features = torch.tensor(all_features[int(n_train * 0.8):n_train].values, dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
train_labels = torch.tensor(
train_data[: int(n_train * 0.8)].SalePrice.values, dtype=torch.float).view(-1, 1)
valid_labels = torch.tensor(
train_data[int(n_train * 0.8):n_train].SalePrice.values, dtype=torch.float).view(-1, 1)
train_dataset = TensorDataset(train_features, train_labels)
train_iters = DataLoader(train_dataset, batch_size=batchsz, shuffle=True)
valid_dataset = TensorDataset(valid_features, valid_labels)
valid_iters = DataLoader(valid_dataset, batch_size=batchsz)
net = mlp()
print(net)
criteon = nn.MSELoss()
optimizer = optim.Adam(params=net.parameters(), lr=lr)
for epoch in range(epochs):
for batch_id, (x, y) in enumerate(train_iters):
logits = net(x)
loss = criteon(logits, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch_id % 1000 == 0:
print('epoch:[{}/{}], loss:{:.4f}'.format(epoch, epochs, loss))
net.eval()
with torch.no_grad():
correct = 0
total = 0
for batch_id, (x, y) in enumerate(valid_iters):
logits = net(x)
correct += ((logits.log() - y.log()) ** 2).sum().float().item()
total += len(y)
rmse = np.sqrt(correct / total)
print('rmse:', rmse)
preds = net(test_features).detach().numpy()
test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
submission.to_csv('./data/submission_mlp.csv', index=False)
"""
参考 https://tangshusen.me/Dive-into-DL-PyTorch/#/chapter03_DL-basics/3.16_kaggle-house-price
结果: 0.13470 效果一般, 不如Adaboosting的0.12296
"""