## Loading data

In [1]:
import pickle
import numpy as np
from os.path import join
# Directory that holds the preprocessed pickle files.
dir_path = './sdml_final_1'
# Names of the preprocessed files: test features, training bid data, training win data.
input_list = [
    'prep_final_test_win_feature.pickle',
    'prep_final_train_bid_data.pickle',
    'prep_final_train_win_data.pickle',
]

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
win_path = join(dir_path, 'prep_final_train_win_data.pickle')
bid_path = join(dir_path, 'prep_final_train_bid_data.pickle')

with open(win_path, 'rb') as f:
    win_dic = pickle.load(f)
with open(bid_path, 'rb') as f:
    bid_dic = pickle.load(f)
    


## Model

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# eps = 10**-8
# eps = 10**-2

class Regression_model(nn.Module):
    """Feed-forward regressor over six embedded categorical inputs plus width/height.

    Each categorical input (app, device, imp, imp_posi, weekdays, hours) is
    embedded into ``dm1`` dimensions.  The embeddings are concatenated with
    ``width`` and ``height`` and passed through four
    Linear -> ReLU -> BatchNorm1d -> Dropout stages of width ``dm2``; a final
    linear layer maps to a single output per row.
    """

    def __init__(self, app_dim, device_dim, imp_dim, imp_posi_dim, dm1, dm2, drop):
        """Build the embedding tables, the hidden stack and the output layer.

        Args:
            app_dim, device_dim, imp_dim, imp_posi_dim: number of rows in the
                corresponding embedding tables.
            dm1: embedding dimension shared by all six tables.
            dm2: width of every hidden layer.
            drop: dropout probability used on the input and after each stage.
        """
        super(Regression_model, self).__init__()
        # One embedding table per categorical feature; weekdays and hours use
        # fixed table sizes of 7 and 24.
        self.app = nn.Embedding(app_dim, dm1)
        self.device = nn.Embedding(device_dim, dm1)
        self.imp = nn.Embedding(imp_dim, dm1)
        self.imp_posi = nn.Embedding(imp_posi_dim, dm1)
        self.weekdays = nn.Embedding(7, dm1)
        self.hours = nn.Embedding(24, dm1)

        self.linears = nn.ModuleList()
        self.norms = nn.ModuleList()
        self.drop = nn.Dropout(drop)

        # The first layer consumes six embeddings plus two extra columns
        # (width and height); every later layer maps dm2 -> dm2.
        in_features = 6 * dm1 + 2
        for _ in range(4):
            self.linears.append(nn.Linear(in_features, dm2))
            self.norms.append(nn.BatchNorm1d(dm2))
            in_features = dm2

        self.out = nn.Linear(dm2, 1)

    def forward(self, width, height, app, device, imp, imp_posi, weekdays, hours):
        """Return one prediction per row.

        The categorical index tensors are embedded and have dimension 1
        squeezed away before concatenation with ``width`` and ``height`` along
        the last axis (assumes every input is shaped (batch, 1) -- TODO confirm
        against the data pipeline).
        """
        lookups = (
            (self.app, app),
            (self.device, device),
            (self.imp, imp),
            (self.imp_posi, imp_posi),
            (self.weekdays, weekdays),
            (self.hours, hours),
        )
        embedded = [table(index).squeeze(1) for table, index in lookups]

        hidden = self.drop(torch.cat([width, height] + embedded, dim=-1))
        # Plain stack without residual connections.
        for linear, norm in zip(self.linears, self.norms):
            hidden = self.drop(norm(F.relu(linear(hidden))))
        return self.out(hidden)

## Data Generator

In [3]:
def split_dic(rate, dic):
    """Randomly split every array in ``dic`` into a training and a validation part.

    Args:
        rate: fraction of rows assigned to the validation split;
            ``int(N * rate)`` rows go to validation and the rest to training.
        dic: mapping of name -> NumPy array.  Must contain the key 'price',
            and every array must have the same number of rows as 'price'.

    Returns:
        ``(train_dic, val_dic)``: two dicts with the same keys as ``dic``,
        holding disjoint, row-aligned subsets of the input arrays.

    Raises:
        AssertionError: if any array's row count differs from 'price'.

    The permutation comes from NumPy's global RNG, so seed ``np.random``
    beforehand for a reproducible split.
    """
    N = len(dic['price'])

    for k in dic.keys():
        assert dic[k].shape[0] == N, 'column %r has %d rows, expected %d' % (k, dic[k].shape[0], N)

    idx = np.random.permutation(N)
    c = int(N * rate)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('split to %d : %d' % (c, N - c))
    train_idx, val_idx = idx[c:], idx[:c]
    # Index on the first axis only so 1-D arrays work as well as 2-D ones.
    train_dic = {k: dic[k][train_idx] for k in dic.keys()}
    val_dic = {k: dic[k][val_idx] for k in dic.keys()}
    return train_dic, val_dic

def batch_boostrap_generator(dic, batch):
    """Yield shuffled mini-batches from ``dic`` forever.

    Each pass draws a fresh random permutation of the rows (NumPy global RNG)
    and yields consecutive slices of at most ``batch`` rows; the last slice of
    a pass may be shorter.  After a pass the rows are reshuffled.

    Args:
        dic: mapping of name -> 2-D NumPy array with the keys 'width',
            'height', 'app_type', 'device_type', 'imp_type', 'imp_position',
            'weekdays', 'hours' and 'price'.
        batch: maximum number of rows per yielded batch.

    Yields:
        Tuple ``(width, height, app, device, imp, imp_posi, weekdays, hours,
        price)`` of row-aligned arrays.

    Raises:
        ValueError: if ``dic['price']`` is empty (the generator would otherwise
            loop forever without yielding anything).
    """
    # Column order of the yielded tuple.
    columns = ('width', 'height', 'app_type', 'device_type', 'imp_type',
               'imp_position', 'weekdays', 'hours', 'price')
    while True:
        N = len(dic['price'])
        if N == 0:
            raise ValueError('cannot draw batches from an empty dataset')
        rand_idx = np.random.permutation(N)
        # Ceiling division on the ``batch`` argument (not a notebook global).
        n_batches = (N + batch - 1) // batch
        for i in range(n_batches):
            # Slicing past the end is clamped, so the last batch is simply shorter.
            idx = rand_idx[i * batch:(i + 1) * batch]
            yield tuple(dic[name][idx, :] for name in columns)


## Training

In [12]:
import numpy as np
from itertools import count
from tqdm import tqdm
from collections import deque

def win_loss(x, y):
    """Return the element-wise squared difference between ``x`` and ``y``."""
    residual = x - y
    return torch.pow(residual, 2)
def calc_loss(data, use_cuda):
    """Run the notebook-level ``model`` on one batch and return the per-row loss.

    Args:
        data: iterable of nine NumPy arrays in the order
            (width, height, app, device, imp, imp_posi, weekdays, hours, price).
            int32 arrays become ``LongTensor`` and float32 arrays become
            ``FloatTensor``; any other dtype is passed through unconverted.
        use_cuda: when true, every element is moved to the GPU with ``.cuda()``.

    Returns:
        ``win_loss(prediction, price)``, i.e. the un-reduced squared error.

    Note: this function reads the global ``model`` rather than taking it as an
    argument.
    """
    tensors = []
    for arr in data:
        if arr.dtype == np.int32:
            arr = torch.LongTensor(arr)
        elif arr.dtype == np.float32:
            arr = torch.FloatTensor(arr)
        if use_cuda:
            arr = arr.cuda()
        tensors.append(arr)

    width, height, app, device, imp, imp_posi, weekdays, hours, price = tensors
    prediction = model(width, height, app, device, imp, imp_posi, weekdays, hours)
    return win_loss(prediction, price)

def train(model, opt, train_g, val_g, use_cuda):
    """Run one optimisation step on a win batch, then score one validation batch.

    Args:
        model: the network being trained.  ``calc_loss`` evaluates the
            notebook-level ``model``, so this must be that same object.
        opt: optimizer over the model's parameters.
        train_g: ``(win_generator, bid_generator)`` for training data.
        val_g: ``(win_generator, bid_generator)`` for validation data.
        use_cuda: forwarded to ``calc_loss``; move batches to the GPU when true.

    Returns:
        ``(loss1, val_loss)``: mean squared error on the training batch and on
        the validation batch, both as 0-dim tensors.
    """
    train_win_g, train_bid_g = train_g
    # The bid batch is drawn but not used by this price-only step; the draw is
    # kept so both generators advance exactly as they did before.
    win_data, bid_data = next(train_win_g), next(train_bid_g)

    # price phase
    train_sq_err = calc_loss(win_data, use_cuda)
    loss1 = torch.mean(train_sq_err)
    opt.zero_grad()
    loss1.backward()
    opt.step()

    # Validation must run in eval mode: in training mode dropout stays active
    # and BatchNorm updates its running statistics from the validation batch.
    val_win_g, _ = val_g
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            val_sq_err = calc_loss(next(val_win_g), use_cuda)
            val_loss = torch.mean(val_sq_err)
    finally:
        # Restore whatever mode the caller had set.
        model.train(was_training)
    return loss1, val_loss
    
# ---- hyper-parameters ----
batch_size = 8192
dm1 = 32      # embedding dimension
dm2 = 128     # hidden layer width
drop = 0.05   # dropout probability

# ---- train / validation split (rate 0.1 goes to validation) ----
train_win_dic, val_win_dic = split_dic(0.1, win_dic)
train_bid_dic, val_bid_dic = split_dic(0.1, bid_dic)

# ---- endless batch generators, as (win, bid) pairs ----
train_g = (
    batch_boostrap_generator(train_win_dic, batch_size),
    batch_boostrap_generator(train_bid_dic, batch_size),
)
val_g = (
    batch_boostrap_generator(val_win_dic, batch_size),
    batch_boostrap_generator(val_bid_dic, batch_size),
)
# Keep the notebook-level names bound to the validation generators, as before.
win_g, bid_g = val_g

# ---- model and optimizer ----
# Positional sizes are the app, device, imp and imp_posi embedding table sizes.
model = Regression_model(3, 5, 3, 10, dm1, dm2, drop)
opt = torch.optim.Adam(model.parameters())

# ---- bookkeeping used by the training loop ----
distribution = torch.distributions.normal.Normal
train_q = deque(maxlen=100)   # most recent training losses
val_q = deque(maxlen=100)     # most recent validation losses
c = 0                         # early-stopping counter

# ---- device placement ----
use_cuda = torch.cuda.is_available()
if use_cuda:
    model = model.cuda()


split to 1858597 : 16727378
split to 1645380 : 14808421


## Regression

In [16]:
# Number of mini-batches per pass over the training win data (ceiling division).
N = len(train_win_dic['price'])
ran = N // batch_size + 1 if N % batch_size != 0 else N // batch_size
pre_loss = 0     # smoothed validation loss at the previous check
pre_train = 0    # smoothed training loss at the previous check
stop_training = False
model.train()
for epoch in count():
    print('epoch start : %d' % epoch)
    with tqdm(total=ran) as pbar:
        for i in range(ran):
            loss, val_loss = train(model, opt, train_g, val_g, use_cuda)
            train_q.append(loss.item())
            val_q.append(val_loss.item())
            # Report means over the bounded queues rather than raw per-batch values.
            loss = np.mean(train_q)
            val_loss = np.mean(val_q)
            pbar.set_postfix_str('loss : %.5f, val loss : %.5f' % (loss, val_loss))
            pbar.update(1)
            if i % 10 == 0:
                # Count consecutive checks where validation loss rose while
                # training loss fell; any other outcome resets the counter.
                if val_loss > pre_loss and loss < pre_train:
                    c += 1
                else:
                    c = 0
                pre_loss = val_loss
                pre_train = loss
                if c > 5 and val_loss < 50:
                    c = 0
                    print('end of training')
                    torch.save(model, './best_linear.tar')
                    # Stop cleanly instead of relying on an undefined name
                    # raising NameError to abort the cell.
                    stop_training = True
                    break
    if stop_training:
        break


  0%|          | 6/2042 [00:00<00:55, 36.91it/s, loss : 2785753.69750, val loss : 2884365.93250]

epoch start : 0


 92%|█████████▏| 1878/2042 [00:56<00:04, 33.35it/s, loss : 2813062.61250, val loss : 2919036.46500]


KeyboardInterrupt: 

  "type " + obj.__name__ + ". It won't be checked "


## Regression

## Testing

In [19]:
import numpy as np
from itertools import count
from tqdm import tqdm
from collections import deque

def test_generator(dic, batch):
    """Yield consecutive, unshuffled batches from ``dic`` exactly once.

    Args:
        dic: mapping of name -> 2-D NumPy array with the keys 'id', 'width',
            'height', 'app_type', 'device_type', 'imp_type', 'imp_position',
            'weekdays', 'hours' and 'price'.
        batch: maximum number of rows per yielded batch (must be positive).

    Yields:
        Tuple ``(id, width, height, app, device, imp, imp_posi, weekdays,
        hours, price)`` of row-aligned arrays, in original row order.  The last
        batch may hold fewer than ``batch`` rows; an empty dataset yields
        nothing.
    """
    # Column order of the yielded tuple.
    columns = ('id', 'width', 'height', 'app_type', 'device_type', 'imp_type',
               'imp_position', 'weekdays', 'hours', 'price')
    N = len(dic['price'])
    row_order = np.arange(N)
    # Step by the ``batch`` argument (not a notebook global); slicing past the
    # end is clamped, so the final batch is simply shorter.
    for start in range(0, N, batch):
        idx = row_order[start:start + batch]
        yield tuple(dic[name][idx, :] for name in columns)
        
def test(model, data, use_cuda):
    data = list(data)
    for i, v in enumerate(data):
        if v.dtype == np.int32:
            data[i] = torch.LongTensor(v)
        elif v.dtype == np.float32:
            data[i] = torch.FloatTensor(v)
        if use_cuda:
            data[i] = data[i].cuda()
        
    id, width, height, app, device, imp, imp_posi, weekdays, hours, price = data
    out = model(width, height, app, device, imp, imp_posi, weekdays, hours)
    return id, out



dir_path = './sdml_final_1'
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(join(dir_path, 'prep_final_test_win_feature.pickle'), 'rb') as f:
    test_dic = pickle.load(f)
batch_size = 8192
# Number of batches, used only as the progress-bar total (ceiling division).
N = len(test_dic['price'])
ran = N // batch_size + 1 if N % batch_size != 0 else N // batch_size

test_g = test_generator(test_dic, batch_size)
# Put the inputs on whichever device the model actually lives on, rather than
# assuming a GPU is present.
use_cuda = next(model.parameters()).is_cuda
model.eval()
with open('./submission.csv', 'w') as f_out:
    f_out.write('id,win_price\n')
    with tqdm(total=ran) as pbar:
        with torch.no_grad():
            for data in test_g:
                row_ids, out = test(model, data, use_cuda)
                # Convert each batch to Python lists once, instead of calling
                # .item() (one device sync) for every single row.
                batch_ids = row_ids[:, 0].tolist()
                batch_preds = out[:, 0].tolist()
                f_out.writelines('%d,%f\n' % pair for pair in zip(batch_ids, batch_preds))
                pbar.update(1)
print('done')

100%|██████████| 1233/1233 [04:28<00:00,  4.59it/s]

done





In [46]:
# Sanity check: test-set row count, number of whole batches, and the row count
# that one extra batch would cover.
n_test_rows = len(test_dic['price'])
print(n_test_rows)
N = n_test_rows // batch_size
print(N)
print((N + 1) * batch_size)


10093506
1232
10100736
