## loading data

In [1]:
import pickle
import numpy as np
from os.path import join
dir_path = './sdml_final_1'
input_list  = ['prep_final_test_win_feature.pickle',
               'prep_final_train_bid_data.pickle',
               'prep_final_train_win_data.pickle']
with open(join(dir_path, 'prep_final_train_win_data.pickle'), 'rb') as f:
    win_dic = pickle.load(f)
with open(join(dir_path, 'prep_final_train_bid_data.pickle'), 'rb') as f:
    bid_dic = pickle.load(f)
    

    


## data preprocessing

In [2]:
def trim_max(dic):
    price = dic['price']
    threshold = np.percentile(price, 80)
#     threshold = 800
#     print threshold
    valid_idx = np.array([i for i in range(price.shape[0]) if price[i, 0] < threshold])
    ret = {k: v[valid_idx, :] for k, v in dic.items()}
    return ret

win_dic = trim_max(win_dic)
bid_dic = trim_max(bid_dic)
print 'trimming done'

trimming done


## model

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Regression_model(nn.Module):
    def __init__(self, app_dim, device_dim, imp_dim, imp_posi_dim, dm1, dm2, drop):
        super(Regression_model, self).__init__()
        self.app = nn.Embedding(app_dim, dm1)
        self.device = nn.Embedding(device_dim, dm1)
        self.imp = nn.Embedding(imp_dim, dm1)
        self.imp_posi = nn.Embedding(imp_posi_dim, dm1)
        self.weekdays = nn.Embedding(7, dm1)
        self.hours = nn.Embedding(24, dm1)
        self.linears = nn.ModuleList()
        self.norms = nn.ModuleList()
        self.drop = nn.Dropout(drop)
        for i in range(10):
            if i == 0:
                linear = nn.Linear(6*dm1 + 2, dm2)
            else:
                linear = nn.Linear(dm2, dm2)
            self.linears.append(linear)
            self.norms.append(nn.BatchNorm1d(dm2))
        self.out = nn.Linear(dm2, 1)
        self.sigma = nn.Linear(dm2, 1)
        

    def forward(self, width, height, app, device, imp, imp_posi, weekdays, hours):
        app = self.app(app).squeeze(1)
        device = self.device(device).squeeze(1)
        imp = self.imp(imp).squeeze(1)
        imp_posi = self.imp_posi(imp_posi).squeeze(1)
        weekdays = self.weekdays(weekdays).squeeze(1)
        hours = self.hours(hours).squeeze(1)
        x = torch.cat([width, height, app, device, imp, imp_posi, weekdays, hours], dim=-1)
        x = self.drop(x)
        for i, (norm, linear) in enumerate(zip(self.norms, self.linears)):
            if i == 0:
                x = F.relu(linear(x))
            else:
                x = F.relu(linear(x)) + x
            x = norm(x)
            x = self.drop(x)
        y = self.out(x)
        sigma = F.relu(self.sigma(x))
        sigma = torch.clamp(sigma, 5, 50)
        return y, sigma

## Data Generator

In [10]:
def split_dic(rate, dic):
    N = len(dic['price'])
    
    for k in dic.keys():
        assert dic[k].shape[0] == N
#         print 'categorical %s, dim size : %d' % (k, np.max(dic[k]+1))
    idx = np.random.permutation(N)
    c = int(N*rate)
    print 'split to %d : %d' % (c, N-c)
    train_idx, val_idx = idx[c:], idx[:c]
    train_dic = {k: dic[k][train_idx, :] for k in dic.keys()}
    val_dic = {k: dic[k][val_idx, :] for k in dic.keys()}
    return train_dic, val_dic

def permutation_generator(N):
    while True:
        rand_idx = np.random.permutation(N)
        for i in rand_idx:
            yield i
            
def batch_boostrap_generator(dic, batch):
    N = len(dic['price'])
    G = permutation_generator(N)
    while True:
        idx = np.array([next(G) for i in range(batch)])
        app = dic['app_type'][idx, :]
        device = dic['device_type'][idx, :]
        imp = dic['imp_type'][idx, :]
        imp_posi = dic['imp_position'][idx, :]
        weekdays = dic['weekdays'][idx, :]
        hours = dic['hours'][idx, :]
        width = dic['width'][idx, :]
        height = dic['height'][idx, :]
        price = dic['price'][idx, :]
        yield (width, height, app, device, imp, imp_posi, weekdays, hours, price)


## Training

In [11]:
import numpy as np
from itertools import count
from tqdm import tqdm
from collections import deque

def win_loss(x, y, sigma, distribution):
    if distribution is torch.distributions.gumbel.Gumbel:
        z = (y-x) / sigma
        loss = z+torch.clamp(torch.exp(-z), 10**-8, 10**8) + torch.log(sigma)
#         print 'los', loss[0,0], z[0, 0], torch.clamp(torch.exp(-z), 10**-8, 10**8)[0, 0], torch.log(sigma)[0,0], x[0, 0]
        return loss
    else:
        d = distribution(0, sigma)
        loss = -d.log_prob(y-x)    
        return loss
def bid_loss(x, y, sigma, distribution):
#     if distribution is torch.distributions.gumbel.Gumbel:
#         z = (y-x) / sigma
#         loss = -torch.exp(-z)
#         return loss
# #     else:
    
    d = distribution(0, sigma)
    z = 1-d.cdf(y-x)
    z = torch.clamp(z, 10**-8, 1.)
    loss = -torch.log(z)
#     print 'bid', loss[0,0]
    return loss
#     return 0
def cuda_data(data):
    data = list(data)
    for i, v in enumerate(data):
        if v.dtype == np.int32:
            data[i] = torch.LongTensor(v)
        elif v.dtype == np.float32:
            data[i] = torch.FloatTensor(v)
        if use_cuda:
            data[i] = data[i].cuda()
    return data

def calc_train_loss(model, data, is_win, distribution, train_price, use_cuda):
    if use_cuda:
        data = cuda_data(data)
    
    width, height, app, device, imp, imp_posi, weekdays, hours, price = data
    with torch.no_grad():
        model.eval()
        out, sigma = model(width, height, app, device, imp, imp_posi, weekdays, hours)
    model.train()
    if train_price:
        out, _ = model(width, height, app, device, imp, imp_posi, weekdays, hours)
    else:
        _, sigma = model(width, height, app, device, imp, imp_posi, weekdays, hours)
#     print out[0,0], sigma[0,0], price[0,0 ]
    if is_win:
        loss = win_loss(out, price, sigma, distribution)
    else:
        loss = bid_loss(out, price, sigma, distribution)
    return loss

def calc_val_loss(model, data, is_win, distribution, use_cuda):
    if use_cuda:
        data = cuda_data(data)
    width, height, app, device, imp, imp_posi, weekdays, hours, price = data
    model.eval()
    with torch.no_grad():
        out, sigma = model(width, height, app, device, imp, imp_posi, weekdays, hours)
        if is_win:
            loss = win_loss(out, price, sigma, distribution)
        else:
            loss = bid_loss(out, price, sigma, distribution)
    return loss

def train(model, opt, train_g, val_g, distribution, train_price, use_cuda):
    price_opt, sigma_opt = opt
    train_opt = price_opt if train_price else sigma_opt
    win_g, bid_g = train_g
    win_data, bid_data = next(win_g), next(bid_g)
    win_loss = calc_train_loss(model, win_data, True, distribution, train_price, use_cuda)
    bid_loss = calc_train_loss(model, bid_data, False, distribution, train_price, use_cuda)
    loss = torch.mean(win_loss + bid_loss)
    train_opt.zero_grad()
    loss.backward()
    train_opt.step()
#     validation
    with torch.no_grad():
        model.eval()
        win_g, bid_g = val_g
        win_loss = calc_val_loss(model, next(win_g), True, distribution, use_cuda)
        bid_loss = calc_val_loss(model, next(bid_g), False, distribution, use_cuda)
        val_loss = torch.mean(win_loss + bid_loss)
    return loss, val_loss
    
batch_size = 8192
train_win_dic, val_win_dic = split_dic(0.1, win_dic)
train_bid_dic, val_bid_dic = split_dic(0.1, bid_dic)

win_g = batch_boostrap_generator(train_win_dic, batch_size)
bid_g = batch_boostrap_generator(train_bid_dic, batch_size)
train_g = (win_g, bid_g)

win_g = batch_boostrap_generator(val_win_dic, batch_size)
bid_g = batch_boostrap_generator(val_bid_dic, batch_size)
val_g = (win_g, bid_g)


dm1 = 64
dm2 = 512
drop = 0.15
model = Regression_model(3, 5, 3, 10, dm1, dm2, drop)

price_params = []
sigma_params = []
for m in model.modules():
    for p in m.parameters():
        if m != model.out:
            sigma_params.append(p)
        if m != model.sigma:
            price_params.append(p)
# price_opt = torch.optim.Adadelta(price_params)
# sigma_opt = torch.optim.Adadelta(sigma_params)
price_opt = torch.optim.Adam(price_params)
sigma_opt = torch.optim.Adam(sigma_params)
opt = price_opt, sigma_opt

# distribution = torch.distributions.normal.Normal
distribution = torch.distributions.gumbel.Gumbel
train_q = deque(maxlen=100)
val_q = deque(maxlen=100)
c = 0

use_cuda = torch.cuda.is_available()
if use_cuda:
    model = model.cuda()
    


split to 1486686 : 13380177
split to 1316304 : 11846736


## Cencored  regression

In [None]:
def dump_log(model, n_iter, loss, val_loss, log_file_stream, tmp_model_path):
    log_text = '%.7d<split>%.5f<split>%.5f\n' % (n_iter, loss, val_loss)
    log_file_stream.write(log_text)
    if n_iter % 10 == 0 :
        log_file_stream.flush()
        torch.save(model, tmp_model_path)



N = len(train_win_dic['price'])
train_num = 1
a = 2*train_num*batch_size
ran = N // a + 1 if N % a != 0 else N // a
pre_loss = 0
pre_train = 0 
model.train()
it = 0
with open('log.txt', 'w') as log_stream:
    for epoch in count():
        print 'epoch start : %d' % epoch
        with tqdm(total=ran) as pbar:
            for i in range(ran):
                loss_list = []
                val_loss_list = [] 
    #             price phase
                for i in range(train_num):
                    loss, val_loss = train(model, opt, train_g, val_g, distribution,
                                           train_price=True, use_cuda=use_cuda)
                    loss_list.append(loss.item())
                    val_loss_list.append(val_loss.item())

    #             sigma phase
                for i in range(train_num):
                    loss, val_loss = train(model, opt, train_g, val_g, distribution,
                                           train_price=False, use_cuda=use_cuda)
                    loss_list.append(loss.item())
                    val_loss_list.append(val_loss.item())
                loss = np.mean(loss_list)
                val_loss = np.mean(val_loss_list)

                train_q.append(loss)
                val_q.append(val_loss)
                loss = np.mean(train_q)
                val_loss = np.mean(val_q)
                pbar.set_postfix_str('loss : %.5f, val loss : %.5f' % (loss, val_loss))

                pbar.update(1)
                if i % 10 == 0:
                    if val_loss > pre_loss and loss < pre_train:
                        c += 1
                    else:
                        c = 0
                    pre_loss = val_loss
                    pre_train = loss
                    if c > 5 and val_loss < 50:
                        c = 0
                        print 'end of training'
                        torch.save(model, './only.tar')
                        weffwe
    #             log
                it += 1
#                 dump_log(model, it, loss, val_loss, log_stream, 'tmp_only.tar')

  0%|          | 0/817 [00:00<?, ?it/s]

epoch start : 0


100%|██████████| 817/817 [03:29<00:00,  2.60it/s, loss : 62.85095, val loss : 63.02411]
  0%|          | 0/817 [00:00<?, ?it/s]

epoch start : 1


100%|██████████| 817/817 [03:29<00:00,  2.55it/s, loss : 63.53315, val loss : 63.06644]
  0%|          | 0/817 [00:00<?, ?it/s]

epoch start : 2


100%|██████████| 817/817 [03:30<00:00,  2.90it/s, loss : 62.35412, val loss : 62.40845]
  0%|          | 0/817 [00:00<?, ?it/s]

epoch start : 3


100%|██████████| 817/817 [03:29<00:00,  2.88it/s, loss : 62.71861, val loss : 62.66890]
  0%|          | 0/817 [00:00<?, ?it/s]

epoch start : 4


 33%|███▎      | 269/817 [01:08<02:22,  3.84it/s, loss : 62.10911, val loss : 62.32947]

In [None]:
# torch.save(model, './best_trim.tar')
# torch.save(model, './best_bak.tar')


## Testing

In [None]:
import numpy as np
from itertools import count
from tqdm import tqdm
from collections import deque

def test_generator(dic, batch):
    rand_idx = np.arange(len(dic['price']))
    N = len(dic['price'])
    ran = N // batch_size + 1 if N % batch_size != 0 else N // batch_size
    for i in range(ran):
        idx = rand_idx[i*batch: (i+1)*batch] if (i+1)*batch < len(rand_idx) \
        else rand_idx[i*batch:]
        id = dic['id'][idx, :]
        app = dic['app_type'][idx, :]
        device = dic['device_type'][idx, :]
        imp = dic['imp_type'][idx, :]
        imp_posi = dic['imp_position'][idx, :]
        weekdays = dic['weekdays'][idx, :]
        hours = dic['hours'][idx, :]
        width = dic['width'][idx, :]
        height = dic['height'][idx, :]
        price = dic['price'][idx, :]
        yield (id, width, height, app, device, imp, imp_posi, weekdays, hours, price)
        
def test(model, data, use_cuda):
    data = list(data)
    for i, v in enumerate(data):
        if v.dtype == np.int32:
            data[i] = torch.LongTensor(v)
        elif v.dtype == np.float32:
            data[i] = torch.FloatTensor(v)
        if use_cuda:
            data[i] = data[i].cuda()
        
    id, width, height, app, device, imp, imp_posi, weekdays, hours, price = data
    out, sigma = model(width, height, app, device, imp, imp_posi, weekdays, hours)
    return id, out



dir_path = './sdml_final_1'
with open(join(dir_path, 'prep_final_test_win_feature.pickle'), 'rb') as f:
    test_dic = pickle.load(f)
batch_size = 8192
N = len(test_dic['price'])
ran = N // batch_size + 1 if N % batch_size != 0 else N // batch_size
    
test_g = test_generator(test_dic, batch_size)
use_cuda = True
model.eval()
with open('./submission.csv', 'w') as f_out:
    f_out.write('id,win_price\n')
    with tqdm(total=ran) as pbar:
        with torch.no_grad():
            for i in range(ran):
                data = next(test_g)
                id, out = test(model, data, use_cuda)
                for j in range(out.shape[0]):
                    v = out[j,0].item()
                    f_out.write('%d,%f\n' % (id[j, 0].item(), v))
                pbar.update(1)
print 'done'

## Comined Testing

In [None]:
import numpy as np
from itertools import count
from tqdm import tqdm
from collections import deque

def test_generator(dic, batch):
    rand_idx = np.arange(len(dic['price']))
    N = len(dic['price'])
    ran = N // batch_size + 1 if N % batch_size != 0 else N // batch_size
    for i in range(ran):
        idx = rand_idx[i*batch: (i+1)*batch] if (i+1)*batch < len(rand_idx) \
        else rand_idx[i*batch:]
        id = dic['id'][idx, :]
        app = dic['app_type'][idx, :]
        device = dic['device_type'][idx, :]
        imp = dic['imp_type'][idx, :]
        imp_posi = dic['imp_position'][idx, :]
        weekdays = dic['weekdays'][idx, :]
        hours = dic['hours'][idx, :]
        width = dic['width'][idx, :]
        height = dic['height'][idx, :]
        price = dic['price'][idx, :]
        yield (id, width, height, app, device, imp, imp_posi, weekdays, hours, price)
        
def combined_test(m1, m2, data, use_cuda):
    data = list(data)
    for i, v in enumerate(data):
        if v.dtype == np.int32:
            data[i] = torch.LongTensor(v)
        elif v.dtype == np.float32:
            data[i] = torch.FloatTensor(v)
        if use_cuda:
            data[i] = data[i].cuda()
        
    id, width, height, app, device, imp, imp_posi, weekdays, hours, price = data
    out1, sigma = m1(width, height, app, device, imp, imp_posi, weekdays, hours)
    out2, sigma = m2(width, height, app, device, imp, imp_posi, weekdays, hours)
    out = out1*0.5 + out2*0.5
    return id, out



dir_path = './sdml_final_1'
with open(join(dir_path, 'prep_final_test_win_feature.pickle'), 'rb') as f:
    test_dic = pickle.load(f)
batch_size = 8192
N = len(test_dic['price'])
ran = N // batch_size + 1 if N % batch_size != 0 else N // batch_size
    
test_g = test_generator(test_dic, batch_size)
use_cuda = True
m1 = torch.load('./best.tar')
m2 = torch.load('./best_win.tar')
m1.eval()
m2.eval()
with open('./submission.csv', 'w') as f_out:
    f_out.write('id,win_price\n')
    with tqdm(total=ran) as pbar:
        with torch.no_grad():
            for i in range(ran):
                data = next(test_g)
                id, out = combined_test(m1, m2, data, use_cuda)
                for j in range(out.shape[0]):
                    v = out[j,0].item()
                    f_out.write('%d,%f\n' % (id[j, 0].item(), v))
                pbar.update(1)
print 'done'

In [None]:
print len(test_dic['price'])
N = len(test_dic['price']) // batch_size
print N
print (N+1)*batch_size
