In [1]:
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Scratch check of the slicing idiom used by the helper below (kept commented
# out for reference only):
#
# def get_tensorloader(self, tensors, train, indices=(0, None)):
#     tensors = tuple(a[indices] for a in tensors)
#     dataset = torch.utils.data.TensorDataset(*tensors)
#     return torch.utils.data.DataLoader(dataset, self.batch_size, shuffle=train)

# Five random samples: 2 features each, 1 target each.
x = torch.randn(5, 2)
y = torch.randn(5, 1)
print(x, y)

tensors = (x, y)
indices = slice(0, None)  # the "take everything" slice; not applied below

# Keep only the first two rows of every tensor, then wrap them in a dataset.
data = tuple(t[0:2] for t in tensors)
dataset = torch.utils.data.TensorDataset(*data)
len(dataset)

tensor([[ 1.3351,  0.6800],
        [ 0.9710, -0.2405],
        [-0.1830,  2.1845],
        [-0.7318, -1.7615],
        [-0.8214, -0.1406]]) tensor([[ 0.6006],
        [-0.4472],
        [ 0.5624],
        [-0.2903],
        [ 1.1012]])


2

In [253]:
# Show how DataLoader splits 5 samples into mini-batches of 2 (the last batch
# holds the single remaining sample).
x, y = torch.randn(5, 3), torch.randn(5, 1)
tensors = (x, y)
dataset = torch.utils.data.TensorDataset(*tensors)
loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)
# Use dedicated loop names so the full tensors `x` and `y` are not overwritten
# by the last mini-batch once the loop finishes.
for x_batch, y_batch in loader:
    print(x_batch.shape, y_batch.shape)

torch.Size([2, 3]) torch.Size([2, 1])
torch.Size([2, 3]) torch.Size([2, 1])
torch.Size([1, 3]) torch.Size([1, 1])


In [275]:
from d2l_common import download, extract, DataModule


class KaggleHouse(DataModule):
    """House-price data module.

    When no ``train`` frame is supplied, the raw train/test CSVs are read from
    ``{self.root}/house_predict``; call ``preprocess()`` afterwards to build
    ``self.train`` / ``self.val``. Alternatively, already-preprocessed frames
    can be passed in directly (this is how the cross-validation folds are built).
    """

    def __init__(self, batch_size, train=None, val=None):
        """Store the batch size and either the given frames or the raw CSVs.

        Args:
            batch_size: mini-batch size used by ``get_dataloader``.
            train: optional preprocessed training frame (with the label column).
            val: optional preprocessed validation frame.
        """
        super().__init__()
        # NOTE(review): presumably this stores batch_size/train/val as
        # attributes, which is what makes `self.train` readable below —
        # confirm against d2l_common.
        self.save_hyperparameters()
        self.batch_size = batch_size
        if self.train is None:
            self.raw_train = pd.read_csv(
                f'{self.root}/house_predict/train.csv')
            self.raw_val = pd.read_csv(f'{self.root}/house_predict/test.csv')

    def preprocess(self):
        """Build ``self.train`` and ``self.val`` from the raw CSV frames.

        Train and test features are concatenated so that standardisation and
        one-hot encoding are computed over the same columns; the result is
        then split back by row count.
        """
        label = 'SalePrice'
        train_drop = self.raw_train.drop(columns=['Id', label])
        val_drop = self.raw_val.drop(columns=['Id'])
        features = pd.concat((train_drop, val_drop))
        numeric_columns = features.dtypes[features.dtypes != 'object'].index
        # Standardise numeric columns to zero mean / unit variance.
        features[numeric_columns] = features[numeric_columns].apply(
            lambda x: (x-x.mean())/x.std())
        # Missing numeric values are replaced by the (post-standardisation)
        # column mean.
        features[numeric_columns] = features[numeric_columns].fillna(
            features[numeric_columns].mean())
        # Request float dummies explicitly: pandas >= 2.0 defaults to bool
        # columns, and a mixed float/bool frame yields an object array from
        # `.values`, which torch.tensor cannot convert.
        features = pd.get_dummies(features, dummy_na=True, dtype=float)
        # save the data
        n_train = self.raw_train.shape[0]
        self.train = features[:n_train].copy()
        self.train[label] = self.raw_train[label]
        self.val = features[n_train:].copy()

    def get_dataloader(self, train):
        """Return a DataLoader over (features, log-price) pairs.

        Args:
            train: if truthy, iterate ``self.train`` in shuffled order;
                otherwise iterate ``self.val`` in its stored order.

        Returns:
            A ``torch.utils.data.DataLoader``, or ``None`` when the selected
            frame has no ``SalePrice`` column (e.g. the unlabeled test set).
        """
        label = 'SalePrice'
        data = self.train if train else self.val
        if label not in data:
            return

        def get_tensor(x):
            # Cast through NumPy first so bool/integer columns are accepted.
            return torch.tensor(x.values.astype('float32'))

        # logarithm of prices
        feature_set = get_tensor(data.drop(columns=[label]))
        label_set = get_tensor(data[label]).log().view(-1, 1)
        dataset = torch.utils.data.TensorDataset(feature_set, label_set)
        # Build the loader once, and shuffle only when training so that
        # validation batches are deterministic.
        return torch.utils.data.DataLoader(
            dataset, batch_size=self.batch_size, shuffle=bool(train))

In [338]:
from d2l_common import Module


class KaggleHouseLinearReg(Module):
    """Regression model for the house-price data.

    Current architecture: BatchNorm1d -> ReLU -> ReLU -> LazyLinear(1).
    The ``hidden_dims_*`` arguments are accepted but not used by the layers
    built here. The two back-to-back ReLUs compute the same thing as a single
    one; both are kept so the layer indices inside ``self.net`` stay the same.
    """

    def __init__(self, input_dim, hidden_dims_1, hidden_dims_2, hidden_dims_3, lr=0.01):
        """Create the network.

        Args:
            input_dim: number of input features (size of the BatchNorm layer).
            hidden_dims_1, hidden_dims_2, hidden_dims_3: currently unused.
            lr: learning rate, stored on ``self.lr``.
        """
        super().__init__()
        self.lr = lr
        layers = [
            nn.BatchNorm1d(input_dim),
            nn.ReLU(),
            nn.ReLU(),
            # Output layer; its input size is inferred on the first forward pass.
            nn.LazyLinear(1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the network and return its output."""
        return self.net(x)

    def loss(self, y_hat, y):
        """Mean squared error between predictions ``y_hat`` and targets ``y``."""
        return F.mse_loss(y_hat, y)

    def validate_step(self, batch):
        """Return ``(predictions, loss)`` for a batch of ``(inputs, targets)``."""
        inputs, targets = batch[0], batch[1]
        y_hat = self.forward(inputs)
        return y_hat, self.loss(y_hat, targets)

In [339]:
from d2l_common import Trainer, LinearRegression


def k_fold_data(data, k):
    """Split ``data.train`` into ``k`` train/validation folds.

    Fold ``i`` holds out the rows labelled ``i*fold_size`` up to (but not
    including) ``(i+1)*fold_size`` for validation and trains on the rest,
    where ``fold_size = len(data.train) // k``. Rows beyond ``k*fold_size``
    are therefore never held out.

    Args:
        data: object exposing a ``train`` DataFrame and a ``batch_size``.
        k: number of folds.

    Returns:
        A list of ``k`` ``KaggleHouse`` instances, one per fold.
    """
    fold_size = data.train.shape[0] // k
    held_out_rows = [
        range(fold * fold_size, (fold + 1) * fold_size) for fold in range(k)
    ]
    return [
        KaggleHouse(
            data.batch_size,
            data.train.drop(index=rows),
            data.train.loc[rows],
        )
        for rows in held_out_rows
    ]


def k_fold(trainer, data, k, lr):
    """Train one model per cross-validation fold and report the mean loss.

    Args:
        trainer: object providing ``fit(model, data)`` and a ``validate_loss``
            sequence whose last entry is read after each fit.
        data: preprocessed data module whose ``train`` frame includes the
            label column.
        k: number of folds.
        lr: learning rate passed to every fold's model.

    Returns:
        The list of trained models, one per fold.
    """
    val_loss, models = [], []
    # Every column of data.train except the label is an input feature.
    input_dim = data.train.shape[1] - 1
    for data_fold in k_fold_data(data, k):
        # Use the caller's learning rate instead of a hard-coded 0.01.
        model = KaggleHouseLinearReg(input_dim, 32, 8, 4, lr=lr)
        trainer.fit(model, data_fold)
        val_loss.append(trainer.validate_loss[-1])
        models.append(model)
    print(f'average validation log mse = {sum(val_loss)/len(val_loss)}')
    return models

In [340]:
# Load and preprocess the data, then run 5-fold cross-validation with
# 10 epochs per fold.
data = KaggleHouse(batch_size=32)
data.preprocess()

trainer = Trainer(max_epochs=10)
models = k_fold(trainer, data, k=5, lr=0.01)



complete 0 epoch train_loss=0.8621996641159058 validate_loss=0.26994261145591736
complete 1 epoch train_loss=0.796772301197052 validate_loss=0.4769635796546936
complete 2 epoch train_loss=2.5857760906219482 validate_loss=0.34998905658721924
complete 3 epoch train_loss=2.976158618927002 validate_loss=1.5294902324676514
complete 4 epoch train_loss=1.1240923404693604 validate_loss=20.864328384399414
complete 5 epoch train_loss=0.42627713084220886 validate_loss=0.08749411255121231
complete 6 epoch train_loss=1.536794662475586 validate_loss=0.4486249089241028
complete 7 epoch train_loss=1.781087875366211 validate_loss=0.27042052149772644
complete 8 epoch train_loss=0.921359658241272 validate_loss=51.17634201049805
complete 9 epoch train_loss=0.4380706250667572 validate_loss=0.2503223419189453
complete 0 epoch train_loss=0.8199349045753479 validate_loss=0.4335361123085022
complete 1 epoch train_loss=1.0760200023651123 validate_loss=0.6367863416671753
complete 2 epoch train_loss=1.36874592304

1. average validation log mse = 42.32970695495605 (MSE loss + nn.Linear(1))
2. average validation log mse = 19.539384269714354 (MSE loss + ReLU + dropout + nn.Linear(1))

In [17]:
# Concatenating two 1-element tensors along dim 0 gives one 2-element tensor.
x = [torch.randn(1) for _ in range(2)]
torch.cat(x, dim=0)

tensor([-0.6453,  0.5900])

In [341]:
import numpy as np
# Ensemble the fold models on the unlabeled test features and write the
# Kaggle submission file.
# Cast through NumPy so bool/integer dummy columns are accepted by torch.
test_features = torch.tensor(data.val.values.astype('float32'))
preds = []
# Inference only: no gradient tracking is needed.
with torch.no_grad():
    for model in models:
        # Evaluation mode makes BatchNorm use the running statistics learned
        # during training instead of the statistics of the test batch.
        model.eval()
        preds.append(model(test_features))
# The models predict log-prices: exponentiate each prediction, then average
# across the models (one column per model).
ensemble_preds = torch.exp(torch.cat(preds, dim=1)).mean(dim=1)
submission = pd.DataFrame(
    {'Id': data.raw_val['Id'], 'SalePrice': ensemble_preds.detach().numpy()})
# exp() can overflow to +inf; such entries are written as 0.0.
submission = submission.replace([np.inf], 0.0)
submission.to_csv('submission.csv', index=False)

In [219]:
# Fresh, un-preprocessed data module: only the raw CSV frames are available.
data = KaggleHouse(batch_size=64)
# data.preprocess()
raw_shapes = (data.raw_train.shape, data.raw_val.shape)
raw_shapes
# data.train.shape,data.val.shape
# data.raw_train.drop(columns=['Id'])
# data.raw_train.loc[0:4,['SalePrice','Id']]
# data.raw_train.drop(columns=['SalePrice','Id'])

((1460, 81), (1459, 80))

In [220]:
# First four rows of the raw test set.
data.raw_val.head(4)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal


In [221]:
# First four rows of the raw training set, showing only the first four and
# the last three columns.
preview_columns = [0, 1, 2, 3, -3, -2, -1]
data.raw_train.iloc[:4, preview_columns]

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,WD,Normal,208500
1,2,20,RL,80.0,WD,Normal,181500
2,3,60,RL,68.0,WD,Normal,223500
3,4,70,RL,60.0,WD,Abnorml,140000


In [222]:
# Step-by-step version of the feature concatenation: drop the id (and, for the
# training set, the label), then stack train and test rows.
label = 'SalePrice'
t1 = data.raw_train.drop(columns=['Id', label])
# Not displayed: only a cell's final expression is rendered.
t1.iloc[:4, [0, 1, 2, 3]]
val_features = data.raw_val.drop(columns=['Id'])
features = pd.concat((t1, val_features))
features.iloc[:4]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml


In [223]:
# Standardise every non-object column, then fill the remaining gaps with the
# column mean of the standardised values.
column_dtypes = features.dtypes
numeric_features = column_dtypes[column_dtypes != 'object'].index


def _standardize(column):
    """Shift a column to zero mean and scale it by its standard deviation."""
    return (column - column.mean()) / column.std()


features[numeric_features] = features[numeric_features].apply(_standardize)
standardized = features[numeric_features]
features[numeric_features] = standardized.fillna(standardized.mean())
features.iloc[:4]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.06732,RL,-0.184443,-0.217841,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,0.157619,WD,Normal
1,-0.873466,RL,0.458096,-0.072032,Pave,,Reg,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,-0.446848,-0.602858,WD,Normal
2,0.06732,RL,-0.055935,0.137173,Pave,,IR1,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,1.026577,0.157619,WD,Normal
3,0.302516,RL,-0.398622,-0.078371,Pave,,IR1,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,-1.363335,WD,Abnorml
