In [1]:
import torch
import pandas as pd
import torch.nn as nn
from torch.nn import functional as F
from d2l import torch as d2l
from tqdm import tqdm
import numpy as np
from torch.utils import data

NUM_SAVE = 50
net_list = "in->256->64"

class MLP(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.layer1 = nn.Linear(in_features,256)
        self.layer2 = nn.Linear(256,64)
        self.out = nn.Linear(64,1)
        
    def forward(self, X):
        X = F.relu(self.layer1(X))
        X = F.relu(self.layer2(X))
        return self.out(X)

  warn(f"Failed to load image Python extension: {e}")


In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
test_data = pd.read_csv('./test.csv/test.csv')
train_data = pd.read_csv('./train.csv/train.csv')
print("train_data and test_data shape",train_data.shape,test_data.shape)

# 去掉冗余数据
redundant_cols = ['Address', 'Summary', 'City', 'State']
for c in redundant_cols:
    del test_data[c], train_data[c]
    
# 数据预处理
large_vel_cols = ['Lot', 'Total interior livable area', 'Tax assessed value', 'Annual tax amount', 'Listed Price', 'Last Sold Price']
for c in large_vel_cols:
    train_data[c] = np.log(train_data[c]+1)
    if c!='Sold Price':
        test_data[c] = np.log(test_data[c]+1)

# 把train和test去除id后放一起，train也要去掉label
all_features = pd.concat((train_data.iloc[:,2:],test_data.iloc[:,1:]))

# 时间数据赋日期格式
all_features['Listed On'] = pd.to_datetime(all_features['Listed On'], format="%Y-%m-%d")
all_features['Last Sold On'] = pd.to_datetime(all_features['Last Sold On'], format="%Y-%m-%d")



train_data and test_data shape (47439, 41) (31626, 40)


In [3]:
for in_object in all_features.dtypes[all_features.dtypes=='object'].index:
    print(in_object.ljust(20),len(all_features[in_object].unique()))

Type                 174
Heating              2660
Cooling              911
Parking              9913
Bedrooms             278
Region               1259
Elementary School    3568
Middle School        809
High School          922
Flooring             1740
Heating features     1763
Cooling features     596
Appliances included  11290
Laundry features     3031
Parking features     9695


In [4]:
# 查询数字列 ->缺失数据赋0 -> 归一化
numeric_features = all_features.dtypes[all_features.dtypes == 'float64'].index
all_features = all_features.fillna(method='bfill', axis=0).fillna(0)
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))

features = list(numeric_features)
features.extend(['Type','Bedrooms'])   # 加上类别数相对较少的Type, ,'Cooling features'
all_features = all_features[features]

print('before one hot code',all_features.shape)
all_features = pd.get_dummies(all_features,dummy_na=True)
all_features.shape
print('after one hot code',all_features.shape)

n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
print('train feature shape:', train_features.shape)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
print('test feature shape:', test_features.shape)
train_labels = torch.tensor(train_data['Sold Price'].values.reshape(-1, 1), dtype=torch.float)
print('train label shape:', train_labels.shape)

before one hot code (79065, 19)
after one hot code (79065, 470)
train feature shape: torch.Size([47439, 470])
test feature shape: torch.Size([31626, 470])
train label shape: torch.Size([47439, 1])


In [5]:
criterion = nn.MSELoss()
in_features = train_features.shape[1]
net = MLP(in_features).to(device)

def load_array(data_arrays, batch_size, is_train=True):  #@save
    """Construct a PyTorch data iterator."""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

def log_rmse(net, features, labels):
    # 为了在取对数时进一步稳定该值，将小于1的值设置为1
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(criterion(torch.log(clipped_preds),
                           torch.log(labels)))
    return rmse.item()

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = load_array((train_features, train_labels), batch_size)
    # 这里使用的是Adam优化算法
    optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate, weight_decay = weight_decay)
    for epoch in tqdm(range(num_epochs)):
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = net(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
        record_loss = log_rmse(net.to('cpu'), train_features, train_labels)
        train_ls.append(record_loss)
        if (epoch%NUM_SAVE==0 and epoch!=0) or (epoch==num_epochs-1):
            torch.save(net.state_dict(),'checkpoint_'+str(epoch))
            print('save checkpoints on:', epoch, 'rmse loss value is:', record_loss)
        del X, y
        net.to(device)
    return train_ls, test_ls

k, num_epochs, lr, weight_decay, batch_size = 5, 2000, 0.005, 0.05, 256

print("network:",net)

network: MLP(
  (layer1): Linear(in_features=470, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=1, bias=True)
)


In [6]:
train_ls, valid_ls = train(net, train_features,train_labels,None,None, num_epochs, lr, weight_decay, batch_size)

# 使用现有训练好的net
net.to('cpu')
# 将网络应用于测试集。
preds = net(test_features).detach().numpy()

# 将其重新格式化以导出到Kaggle
test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
submission.to_csv('submission.csv', index=False)

  3%|▎         | 51/2000 [01:23<46:10,  1.42s/it]  

save checkpoints on: 50 rmse loss value is: 0.4068301320075989


  5%|▌         | 101/2000 [02:33<46:21,  1.46s/it]

save checkpoints on: 100 rmse loss value is: 0.3262149393558502


  8%|▊         | 151/2000 [03:55<47:31,  1.54s/it]  

save checkpoints on: 150 rmse loss value is: 0.30652928352355957


 10%|█         | 201/2000 [05:10<45:04,  1.50s/it]

save checkpoints on: 200 rmse loss value is: 0.2943722903728485


 13%|█▎        | 251/2000 [06:24<41:05,  1.41s/it]

save checkpoints on: 250 rmse loss value is: 0.28244319558143616


 15%|█▌        | 301/2000 [07:34<44:31,  1.57s/it]

save checkpoints on: 300 rmse loss value is: 0.23231539130210876


 18%|█▊        | 351/2000 [08:47<53:21,  1.94s/it]

save checkpoints on: 350 rmse loss value is: 0.23217058181762695


 20%|██        | 401/2000 [10:00<33:39,  1.26s/it]

save checkpoints on: 400 rmse loss value is: 0.23195071518421173


 23%|██▎       | 451/2000 [11:03<33:03,  1.28s/it]

save checkpoints on: 450 rmse loss value is: 0.23214292526245117


 25%|██▌       | 501/2000 [12:09<32:14,  1.29s/it]

save checkpoints on: 500 rmse loss value is: 0.23262768983840942


 28%|██▊       | 551/2000 [13:16<31:29,  1.30s/it]

save checkpoints on: 550 rmse loss value is: 0.23222635686397552


 30%|███       | 601/2000 [14:26<33:23,  1.43s/it]

save checkpoints on: 600 rmse loss value is: 0.23290172219276428


 33%|███▎      | 651/2000 [15:36<34:11,  1.52s/it]

save checkpoints on: 650 rmse loss value is: 0.23215247690677643


 35%|███▌      | 701/2000 [16:55<33:17,  1.54s/it]

save checkpoints on: 700 rmse loss value is: 0.25127217173576355


 38%|███▊      | 751/2000 [18:03<28:54,  1.39s/it]

save checkpoints on: 750 rmse loss value is: 0.24770860373973846


 40%|████      | 801/2000 [19:10<26:26,  1.32s/it]

save checkpoints on: 800 rmse loss value is: 0.270944744348526


 43%|████▎     | 851/2000 [20:17<26:18,  1.37s/it]

save checkpoints on: 850 rmse loss value is: 0.2693876624107361


 45%|████▌     | 901/2000 [21:25<24:14,  1.32s/it]

save checkpoints on: 900 rmse loss value is: 0.3222208321094513


 48%|████▊     | 951/2000 [22:33<23:24,  1.34s/it]

save checkpoints on: 950 rmse loss value is: 0.3559408187866211


 50%|█████     | 1001/2000 [23:40<22:15,  1.34s/it]

save checkpoints on: 1000 rmse loss value is: 0.3474811613559723


 53%|█████▎    | 1051/2000 [24:47<21:10,  1.34s/it]

save checkpoints on: 1050 rmse loss value is: 0.3563760221004486


 55%|█████▌    | 1101/2000 [25:56<21:21,  1.43s/it]

save checkpoints on: 1100 rmse loss value is: 0.3634069263935089


 58%|█████▊    | 1151/2000 [27:03<19:05,  1.35s/it]

save checkpoints on: 1150 rmse loss value is: 0.3617018163204193


 60%|██████    | 1201/2000 [28:12<18:39,  1.40s/it]

save checkpoints on: 1200 rmse loss value is: 0.38692983984947205


 63%|██████▎   | 1251/2000 [29:21<16:54,  1.35s/it]

save checkpoints on: 1250 rmse loss value is: 0.3491191267967224


 65%|██████▌   | 1301/2000 [30:28<15:40,  1.35s/it]

save checkpoints on: 1300 rmse loss value is: 0.4020913541316986


 68%|██████▊   | 1351/2000 [31:37<15:23,  1.42s/it]

save checkpoints on: 1350 rmse loss value is: 0.4170035719871521


 70%|███████   | 1401/2000 [32:46<13:41,  1.37s/it]

save checkpoints on: 1400 rmse loss value is: 0.429593026638031


 73%|███████▎  | 1451/2000 [33:54<12:10,  1.33s/it]

save checkpoints on: 1450 rmse loss value is: 0.4947836697101593


 75%|███████▌  | 1501/2000 [35:02<11:19,  1.36s/it]

save checkpoints on: 1500 rmse loss value is: 0.467001348733902


 78%|███████▊  | 1551/2000 [36:10<10:03,  1.34s/it]

save checkpoints on: 1550 rmse loss value is: 0.44803768396377563


 80%|████████  | 1601/2000 [37:20<09:16,  1.39s/it]

save checkpoints on: 1600 rmse loss value is: 0.4435217082500458


 83%|████████▎ | 1651/2000 [38:28<07:40,  1.32s/it]

save checkpoints on: 1650 rmse loss value is: 0.5132977366447449


 85%|████████▌ | 1701/2000 [39:41<06:54,  1.39s/it]

save checkpoints on: 1700 rmse loss value is: 0.48308485746383667


 88%|████████▊ | 1751/2000 [40:49<05:31,  1.33s/it]

save checkpoints on: 1750 rmse loss value is: 0.4900437891483307


 90%|█████████ | 1801/2000 [41:57<04:31,  1.36s/it]

save checkpoints on: 1800 rmse loss value is: 0.42743757367134094


 93%|█████████▎| 1851/2000 [43:05<03:24,  1.37s/it]

save checkpoints on: 1850 rmse loss value is: 0.4374784827232361


 95%|█████████▌| 1901/2000 [44:12<02:13,  1.34s/it]

save checkpoints on: 1900 rmse loss value is: 0.43157845735549927


 98%|█████████▊| 1951/2000 [45:20<01:07,  1.37s/it]

save checkpoints on: 1950 rmse loss value is: 0.38938966393470764


100%|██████████| 2000/2000 [46:34<00:00,  1.40s/it]

save checkpoints on: 1999 rmse loss value is: 0.40403980016708374





In [7]:
net.to('cpu')
preds = net(test_features).detach().numpy()
# 将其重新格式化以导出到Kaggle
test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
submission.to_csv('submission.csv', index=False)

In [10]:
# 读取已有 继续进行训练
k, num_epochs, lr, weight_decay, batch_size = 5, 500, 0.0005, 0.08, 256
net.load_state_dict(torch.load('checkpoint_1999'))
print("network:",net)
net.to(device)
train_ls, valid_ls = train(net, train_features,train_labels,None,None, num_epochs, lr, weight_decay, batch_size)
net.to('cpu')
preds = net(test_features).detach().numpy()
# 将其重新格式化以导出到Kaggle
test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
submission.to_csv('submission.csv', index=False)

network: MLP(
  (layer1): Linear(in_features=470, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=64, bias=True)
  (out): Linear(in_features=64, out_features=1, bias=True)
)


 10%|█         | 51/500 [01:11<10:57,  1.46s/it]

save checkpoints on: 50 rmse loss value is: 0.3810369074344635


 20%|██        | 101/500 [02:23<08:58,  1.35s/it]

save checkpoints on: 100 rmse loss value is: 0.3947613835334778


 30%|███       | 151/500 [03:31<07:53,  1.36s/it]

save checkpoints on: 150 rmse loss value is: 0.37567299604415894


 40%|████      | 201/500 [04:40<07:00,  1.41s/it]

save checkpoints on: 200 rmse loss value is: 0.37090811133384705


 50%|█████     | 251/500 [05:50<06:34,  1.58s/it]

save checkpoints on: 250 rmse loss value is: 0.3759949803352356


 60%|██████    | 301/500 [07:01<04:35,  1.39s/it]

save checkpoints on: 300 rmse loss value is: 0.36404433846473694


 70%|███████   | 351/500 [08:18<03:40,  1.48s/it]

save checkpoints on: 350 rmse loss value is: 0.3752710223197937


 80%|████████  | 401/500 [09:40<02:06,  1.28s/it]

save checkpoints on: 400 rmse loss value is: 0.36337804794311523


 90%|█████████ | 451/500 [10:44<01:03,  1.29s/it]

save checkpoints on: 450 rmse loss value is: 0.3765175938606262


100%|██████████| 500/500 [11:47<00:00,  1.42s/it]

save checkpoints on: 499 rmse loss value is: 0.36313241720199585





In [11]:
# 读取网络参数应用于测试集
net = []
net = MLP(test_features.shape[1])
net.load_state_dict(torch.load('checkpoint_250'))
net.to('cpu')
preds = net(test_features).detach().numpy()
# 将其重新格式化以导出到Kaggle
test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
submission.to_csv('submission.csv', index=False)

In [12]:
print(len(all_features['Type'].unique()))
print(len(all_features['Heating'].unique()))
print(len(all_features['Cooling'].unique()))
print(len(all_features['Parking'].unique()))
print(len(all_features['Bedrooms'].unique()))
print(len(all_features['Region'].unique()))
print(len(all_features['Elementary School'].unique()))
print(len(all_features['Middle School'].unique()))
print(len(all_features['High School'].unique()))
print(len(all_features['Flooring'].unique()))
print(len(all_features['Heating features'].unique()))
print(len(all_features['Cooling features'].unique()))
print(len(all_features['Appliances included'].unique()))
print(len(all_features['Laundry features'].unique()))
print(len(all_features['Parking features'].unique()))
print(len(all_features['City'].unique()))

KeyError: 'Type'