In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import math
import random

In [2]:
# Use the GPU when one is visible; otherwise fall back to the CPU so that every
# later `.to(device)` call still works on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Read the training and test tables from the local data directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [4]:
# Stack the predictor columns of both tables row-wise. The first column of each
# table is dropped (presumably the Id column — confirm), and for the training
# table the last column is dropped too, because it is used as the label later.
df_features = pd.concat(
    [df_train.iloc[:, 1:-1], df_test.iloc[:, 1:]],
    axis=0,
)
# Store MSSubClass as a categorical so it is one-hot encoded rather than
# treated as a numeric quantity.
df_features['MSSubClass'] = pd.Categorical(df_features['MSSubClass'])

In [7]:
# Standardise every numeric column to zero mean and unit standard deviation.
# The statistics are computed over all rows currently in df_features.
numerical_features = df_features.select_dtypes(include=np.number).columns
df_features[numerical_features] = df_features[numerical_features].apply(lambda x: (x - x.mean()) / x.std())
# After standardisation each column's mean is ~0, so missing values are filled
# with (approximately) zero.
df_features[numerical_features] = df_features[numerical_features].apply(lambda x: x.fillna(x.mean()))
# One-hot encode the non-numeric columns, with an extra indicator column for NaN.
# The dummy dtype is pinned to a float type: recent pandas versions default to
# bool dummies, which would turn the frame's `.values` into an object array that
# cannot be converted to a tensor.
df_features = pd.get_dummies(df_features, dummy_na=True, dtype=np.float32)

In [8]:
# The first n_train rows of df_features come from the training table and the
# remaining rows from the test table.
n_train = df_train.shape[0]
# Convert explicitly to float32 NumPy arrays before building tensors, so the
# result does not depend on the frame's mixed column dtypes or on how torch
# interprets a pandas object passed in directly.
train_features = torch.tensor(df_features.iloc[:n_train].to_numpy(dtype=np.float32), dtype=torch.float32).to(device)
test_features = torch.tensor(df_features.iloc[n_train:].to_numpy(dtype=np.float32), dtype=torch.float32).to(device)
# Labels are the last column of the training table, shaped (n_train, 1).
train_labels = torch.tensor(df_train.iloc[:, -1].to_numpy(dtype=np.float32), dtype=torch.float32).unsqueeze(1).to(device)

In [9]:
# Training criterion: mean squared error averaged over all elements.
loss = nn.MSELoss(reduction='mean')

In [10]:
def log_rmse(y_hat, y_label):
    """Root-mean-squared error between the logs of predictions and labels.

    Predictions are clamped to a minimum of 1 before the logarithm is taken, so
    outputs below 1 (including negative ones) contribute ``log(1) == 0`` instead
    of NaN or -inf.

    Args:
        y_hat: tensor of predictions.
        y_label: tensor of targets with exactly the same shape as ``y_hat``.

    Returns:
        A scalar tensor holding ``sqrt(mean((log(clamp(y_hat, 1)) - log(y_label)) ** 2))``.

    Raises:
        AssertionError: if the two tensors do not have the same shape.
    """
    assert y_hat.shape == y_label.shape
    clipped_preds = torch.clamp(y_hat, min=1)
    # Compute the MSE directly instead of going through the notebook-level
    # `loss` object, so this stays an RMSE even if the training criterion is
    # swapped for something else.
    squared_error = nn.functional.mse_loss(torch.log(clipped_preds), torch.log(y_label))
    return torch.sqrt(squared_error)

In [37]:
def get_net(hyperparams):
    """Assemble a fully connected regression network on the notebook's device.

    The input width is taken from ``train_features.shape[1]``. For each of the
    ``hyperparams['n_layers']`` hidden layers, the width is read from
    ``hyperparams['layer_<i>']`` and the layer is followed by ReLU and
    Dropout(0.1). A final linear layer maps to a single output.

    Args:
        hyperparams: mapping with key ``'n_layers'`` and one ``'layer_<i>'``
            entry for every hidden layer index below ``n_layers``.

    Returns:
        An ``nn.Sequential`` moved to ``device``.
    """
    fan_in = train_features.shape[1]
    modules = []
    for idx in range(hyperparams['n_layers']):
        width = hyperparams[f'layer_{idx}']
        modules.extend((nn.Linear(fan_in, width), nn.ReLU(), nn.Dropout(0.1)))
        # The next layer consumes what this one produced.
        fan_in = width
    # Output head: one regression value per row.
    modules.append(nn.Linear(fan_in, 1))
    return nn.Sequential(*modules).to(device)

In [38]:
def train(net, X_train, y_train, X_val, y_val, hyperparams):
    """Train ``net`` with Adam and yield the log-RMSE after every epoch.

    This is a generator: nothing runs until it is iterated, and each step of
    the iteration performs one full pass over the training data.

    Args:
        net: the module to optimise (updated in place).
        X_train, y_train: training features and targets.
        X_val, y_val: validation features and targets; pass ``X_val=None`` to
            skip validation.
        hyperparams: mapping providing ``'batch_size'``, ``'lr'``,
            ``'weight_decay'`` and ``'n_epoch'``.

    Yields:
        ``(train_loss, val_loss)`` as Python floats, once per epoch.
        ``val_loss`` is NaN when ``X_val`` is None.
    """
    net.train()
    # NOTE(review): batches are drawn in stored row order (no shuffle) — confirm
    # this is intended.
    train_dataloader = DataLoader(TensorDataset(X_train, y_train), batch_size=hyperparams['batch_size'])
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=hyperparams['lr'],
                                 weight_decay=hyperparams['weight_decay'])
    for _ in range(hyperparams['n_epoch']):
        for X, y in train_dataloader:
            optimizer.zero_grad()
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()

        # Measure the metrics in eval mode (dropout disabled) and without
        # autograd bookkeeping, so the reported numbers are deterministic for
        # the current weights and no graph is built for the full data set.
        net.eval()
        with torch.no_grad():
            train_loss = log_rmse(net(X_train), y_train).item()
            val_loss = float('nan')
            if X_val is not None:
                val_loss = log_rmse(net(X_val), y_val).item()
        # Restore training mode before handing control back to the caller, so
        # the next epoch (and the network's state on exit) matches the original.
        net.train()
        yield train_loss, val_loss
       


In [39]:
def split_k_fold(X, y, k, i):
    """Return the train/validation split for fold ``i`` of ``k``.

    Rows are cut into consecutive folds of ``ceil(n / k)`` rows (the last fold
    may be shorter). Fold ``i`` becomes the validation part and everything
    before and after it is concatenated into the training part.

    Args:
        X: feature tensor, indexed by row along dimension 0.
        y: target tensor with the same number of rows as ``X``.
        k: number of folds.
        i: zero-based index of the fold to hold out.

    Returns:
        ``(X_train, y_train, X_val, y_val)``.
    """
    n_rows = X.shape[0]
    fold_size = (n_rows + k - 1) // k  # ceiling division
    lo, hi = i * fold_size, (i + 1) * fold_size

    X_val, y_val = X[lo:hi], y[lo:hi]
    X_train = torch.cat((X[:lo], X[hi:]), dim=0)
    y_train = torch.cat((y[:lo], y[hi:]), dim=0)
    return X_train, y_train, X_val, y_val
    
    
def k_fold(k, X, y, hyperparams, report=None):
    """Run k-fold cross-validation and return the mean final losses.

    For every fold a fresh network is built with ``get_net`` and trained with
    ``train``; the losses yielded for the last epoch are recorded.

    Args:
        k: number of folds.
        X: feature tensor.
        y: target tensor with the same number of rows as ``X``.
        hyperparams: mapping forwarded to ``get_net`` and ``train``.
        report: optional callable invoked as ``report(train_loss, val_loss)``
            after every epoch of every fold.

    Returns:
        ``(mean_train_loss, mean_val_loss)`` averaged over the ``k`` folds.

    Raises:
        AssertionError: if ``X`` and ``y`` have a different number of rows.
    """
    assert X.shape[0] == y.shape[0], "Features count must match targets count!"
    train_loss_list = []
    val_loss_list = []
    for i in range(k):
        print(f"Training on fold {i+1}/{k}.")
        X_train, y_train, X_val, y_val = split_k_fold(X, y, k, i)
        net = get_net(hyperparams)
        # Reset per fold: if training yields nothing (e.g. zero epochs) the
        # fold is recorded as NaN instead of raising NameError or silently
        # reusing the previous fold's numbers.
        train_loss = val_loss = float('nan')
        for train_loss, val_loss in train(net, X_train, y_train, X_val, y_val, hyperparams):
            if report:
                report(train_loss, val_loss)
        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)
    return sum(train_loss_list) / k, sum(val_loss_list) / k


In [46]:
# Discrete search space: each key maps to the list of candidate values from
# which one is drawn per trial. All three layer-width entries share the same
# candidates.
param_grid = {
    'n_epoch': list(range(100, 1000, 200)),
    'lr': [1e-2, 5e-2, 1e-1, 5e-1, 1],
    'weight_decay': [0],
    'batch_size': [4, 8, 16, 32, 64, 128],
    'n_layers': [0, 1, 2],
    **{
        f'layer_{idx}': [8, 16, 32, 64, 128, 256, 512, 1024]
        for idx in range(3)
    },
}

In [47]:
def random_search(param_grid, max_iter, func):
    """Evaluate randomly sampled parameter combinations and return the best one.

    Args:
        param_grid: mapping from parameter name to a sequence of candidates;
            one candidate per key is drawn uniformly with ``random.choice``.
        max_iter: number of combinations to evaluate (must be positive).
        func: callable taking the sampled parameter dict and returning a score
            where lower is better.

    Returns:
        The ``(score, params)`` tuple with the lowest score. A NaN score is
        ranked as the worst possible value.

    Raises:
        ValueError: if ``max_iter`` is not positive.
    """
    if max_iter <= 0:
        raise ValueError("max_iter must be a positive integer")

    def _rank(entry):
        # NaN compares False against everything, which makes min() depend on
        # the order of the results; map it to +inf so a diverged run can never
        # be selected over a finite score.
        score = entry[0]
        if isinstance(score, float) and math.isnan(score):
            return math.inf
        return score

    results = []
    for i in range(max_iter):
        params = {name: random.choice(choices) for name, choices in param_grid.items()}
        print(f"random_search {i}/{max_iter}: {params}")
        results.append((func(params), params))
        print(f"result={results[-1]}")
    return min(results, key=_rank)

In [None]:
val_size = 100

# Draw 50 random configurations and score each one by its mean validation loss
# (element [1] of k_fold's result) over 5-fold cross-validation. The value bound
# to optim_params is the (score, params) tuple returned by random_search.
optim_params = random_search(
    param_grid,
    50,
    lambda params: k_fold(5, train_features, train_labels, params)[1],
)

random_search 0/50: {'n_epoch': 500, 'lr': 1, 'weight_decay': 0, 'batch_size': 64, 'n_layers': 0, 'layer_0': 8, 'layer_1': 32, 'layer_2': 256}
Training on fold 1/5.
Training on fold 2/5.
Training on fold 3/5.
Training on fold 4/5.
Training on fold 5/5.
result=(0.16167492270469666, {'n_epoch': 500, 'lr': 1, 'weight_decay': 0, 'batch_size': 64, 'n_layers': 0, 'layer_0': 8, 'layer_1': 32, 'layer_2': 256})
random_search 1/50: {'n_epoch': 100, 'lr': 0.5, 'weight_decay': 0, 'batch_size': 16, 'n_layers': 2, 'layer_0': 256, 'layer_1': 32, 'layer_2': 64}
Training on fold 1/5.
Training on fold 2/5.
Training on fold 3/5.
Training on fold 4/5.
Training on fold 5/5.
result=(0.27145659625530244, {'n_epoch': 100, 'lr': 0.5, 'weight_decay': 0, 'batch_size': 16, 'n_layers': 2, 'layer_0': 256, 'layer_1': 32, 'layer_2': 64})
random_search 2/50: {'n_epoch': 300, 'lr': 0.01, 'weight_decay': 0, 'batch_size': 128, 'n_layers': 1, 'layer_0': 256, 'layer_1': 512, 'layer_2': 256}
Training on fold 1/5.
Training o

In [47]:
# Hyper-parameters for the final fit. get_net() requires the mapping (it reads
# 'n_layers'), and train() takes it as one positional argument, so it is
# defined first and passed through as-is.
# 'n_layers': 0 builds a single linear layer.
# NOTE(review): the original cell did not specify the architecture — confirm
# that a plain linear model is what is wanted here.
hyperparams = {'n_epoch': 1000, 'lr': 1e-4, 'weight_decay': 0, 'batch_size': 32, 'n_layers': 0}
net = get_net(hyperparams)
# Hold out the last n_val training rows for validation.
n_val = 100
for x in enumerate(train(net,
                         train_features[:n_train - n_val],
                         train_labels[:n_train - n_val],
                         train_features[n_train - n_val:],
                         train_labels[n_train - n_val:],
                         hyperparams)):
    # x is (epoch_index, (train_loss, val_loss)).
    print(x)

(0, (10.614036560058594, 10.598791122436523))
(1, (8.883131980895996, 8.876664161682129))
(2, (7.626389503479004, 7.618443012237549))
(3, (6.703399181365967, 6.694027423858643))
(4, (5.990160942077637, 5.986666202545166))
(5, (5.418361663818359, 5.409488677978516))
(6, (4.939798355102539, 4.933335304260254))
(7, (4.527947425842285, 4.51456880569458))
(8, (4.17134428024292, 4.164086818695068))
(9, (3.854180097579956, 3.84677791595459))
(10, (3.570713996887207, 3.5633010864257812))
(11, (3.312119960784912, 3.305535078048706))
(12, (3.0788021087646484, 3.075195550918579))
(13, (2.863812208175659, 2.8588831424713135))
(14, (2.666095733642578, 2.6600594520568848))
(15, (2.4821789264678955, 2.472972869873047))
(16, (2.3119187355041504, 2.3045547008514404))
(17, (2.152277708053589, 2.1459381580352783))
(18, (2.0035881996154785, 1.995829701423645))
(19, (1.8649582862854004, 1.8540334701538086))
(20, (1.7332539558410645, 1.7271838188171387))
(21, (1.6100177764892578, 1.6069083213806152))
(22, (

(170, (0.14967967569828033, 0.1354752480983734))
(171, (0.1502765715122223, 0.1318531483411789))
(172, (0.14940840005874634, 0.13160192966461182))
(173, (0.14974811673164368, 0.1363149881362915))
(174, (0.1503974050283432, 0.13601556420326233))
(175, (0.14827851951122284, 0.13025197386741638))
(176, (0.1510130614042282, 0.13135337829589844))
(177, (0.14935517311096191, 0.13105134665966034))
(178, (0.14818672835826874, 0.13276153802871704))
(179, (0.14858727157115936, 0.13053616881370544))
(180, (0.1490163505077362, 0.1309434175491333))
(181, (0.148776575922966, 0.13167648017406464))
(182, (0.14670662581920624, 0.135333850979805))
(183, (0.14687147736549377, 0.13628685474395752))
(184, (0.14733991026878357, 0.12947745621204376))
(185, (0.14623481035232544, 0.12962937355041504))
(186, (0.1477605104446411, 0.13204261660575867))
(187, (0.14713935554027557, 0.1265329271554947))
(188, (0.14688098430633545, 0.12805695831775665))
(189, (0.14719292521476746, 0.13472315669059753))
(190, (0.14637

(337, (0.1350402683019638, 0.1183372214436531))
(338, (0.13460110127925873, 0.1205957904458046))
(339, (0.13498517870903015, 0.12198849022388458))
(340, (0.13555815815925598, 0.12020358443260193))
(341, (0.13426043093204498, 0.12422292679548264))
(342, (0.1343948245048523, 0.1266438513994217))
(343, (0.1335206925868988, 0.125282883644104))
(344, (0.1343250870704651, 0.1220850870013237))
(345, (0.13598115742206573, 0.12316282093524933))
(346, (0.13381828367710114, 0.12175865471363068))
(347, (0.13570360839366913, 0.12277618795633316))
(348, (0.13396990299224854, 0.1188008040189743))
(349, (0.13479915261268616, 0.11944686621427536))
(350, (0.1349100023508072, 0.11785400658845901))
(351, (0.13443709909915924, 0.12467286735773087))
(352, (0.13345180451869965, 0.12030261009931564))
(353, (0.1351187378168106, 0.12306944280862808))
(354, (0.13400913774967194, 0.12112759798765182))
(355, (0.13313357532024384, 0.12424323707818985))
(356, (0.1340281069278717, 0.11423176527023315))
(357, (0.13329

(503, (0.13084261119365692, 0.12305353581905365))
(504, (0.1301598697900772, 0.1268826425075531))
(505, (0.1311524510383606, 0.12136250734329224))
(506, (0.1300942599773407, 0.1228402629494667))
(507, (0.130087211728096, 0.12247052043676376))
(508, (0.13152438402175903, 0.1211697906255722))
(509, (0.12998667359352112, 0.11885762959718704))
(510, (0.13089677691459656, 0.12154413759708405))
(511, (0.1299089938402176, 0.12309067696332932))
(512, (0.12941966950893402, 0.11839841306209564))
(513, (0.1311429888010025, 0.12051635980606079))
(514, (0.13028407096862793, 0.12609682977199554))
(515, (0.1303754299879074, 0.12040304392576218))
(516, (0.1299138069152832, 0.12297055125236511))
(517, (0.13017158210277557, 0.11696259677410126))
(518, (0.12920841574668884, 0.11909850686788559))
(519, (0.13006120920181274, 0.1257249414920807))
(520, (0.12934857606887817, 0.12008117139339447))
(521, (0.13110226392745972, 0.1250232458114624))
(522, (0.13148710131645203, 0.12222753465175629))
(523, (0.12970

(669, (0.12922784686088562, 0.1231498196721077))
(670, (0.12966440618038177, 0.12346407771110535))
(671, (0.1290391981601715, 0.12762700021266937))
(672, (0.13004620373249054, 0.12567569315433502))
(673, (0.12852968275547028, 0.12646420300006866))
(674, (0.12952525913715363, 0.12447530031204224))
(675, (0.12770026922225952, 0.1253529191017151))
(676, (0.1289338618516922, 0.12630991637706757))
(677, (0.12794679403305054, 0.12233620136976242))
(678, (0.12853243947029114, 0.1227756217122078))
(679, (0.12828025221824646, 0.12848088145256042))
(680, (0.12873832881450653, 0.12602819502353668))
(681, (0.12862969934940338, 0.1256185621023178))
(682, (0.12743832170963287, 0.12534864246845245))
(683, (0.12827037274837494, 0.12162789702415466))
(684, (0.12903907895088196, 0.12663866579532623))
(685, (0.1295699179172516, 0.12975072860717773))
(686, (0.12971943616867065, 0.12881995737552643))
(687, (0.12873989343643188, 0.12786895036697388))
(688, (0.1290304958820343, 0.12669146060943604))
(689, (0

(836, (0.128822922706604, 0.12575817108154297))
(837, (0.1291469782590866, 0.12544900178909302))
(838, (0.12878373265266418, 0.13014526665210724))
(839, (0.1284789890050888, 0.12575508654117584))
(840, (0.12840048968791962, 0.13063360750675201))
(841, (0.1289428025484085, 0.1269470602273941))
(842, (0.1282067447900772, 0.1301804631948471))
(843, (0.12997367978096008, 0.12825192511081696))
(844, (0.12891213595867157, 0.13121819496154785))
(845, (0.12834931910037994, 0.12779945135116577))
(846, (0.12981292605400085, 0.1234624832868576))
(847, (0.12841184437274933, 0.12339168787002563))
(848, (0.1277088224887848, 0.13228009641170502))
(849, (0.12853261828422546, 0.1326359659433365))
(850, (0.12677516043186188, 0.12481365352869034))
(851, (0.12879249453544617, 0.12945157289505005))
(852, (0.12898026406764984, 0.12549564242362976))
(853, (0.12774373590946198, 0.12957628071308136))
(854, (0.12793172895908356, 0.12441132217645645))
(855, (0.12690725922584534, 0.12490379810333252))
(856, (0.12

In [None]:
# Flatten the network's first parameter tensor into a plain list of floats.
params = list(net.parameters())[0].detach().reshape(-1).tolist()
# Pairing weights with column names only makes sense when there is exactly one
# weight per feature column.
assert len(params) == len(df_features.columns)
list(zip(df_features.columns, params))

In [None]:
# Put the network in evaluation mode, then run the test features through it
# without recording gradients and bring the result back as a NumPy array.
net.eval()
with torch.no_grad():
    pred = net(test_features).cpu().numpy()

In [None]:
# Attach the predictions to the test table as a new 'SalePrice' column.
# pred comes straight from the network output, whose last layer has a single
# output unit, so it is expected to hold one value per test row.
df_test['SalePrice'] = pred

In [None]:
# Keep only the identifier and the predicted price, then write them to disk
# without the DataFrame index.
submission = df_test.loc[:, ['Id', 'SalePrice']]
submission.to_csv('submission.csv', index=False)

In [None]:
# Summary statistics of the submission columns, as a quick sanity check.
submission.describe()