## Rating csv


In [1]:
import pandas as pd
import datetime
import numpy as  np
from tqdm import tqdm
np.random.seed(1337)
with open('./kaggle/rating_train.csv', 'r') as f:
    ls = f.readlines()[1:]
u_map = {}

dates = []
foods = []
users = []
    

with tqdm(total=len(ls)) as pbar:
    for l in ls:
        date_str, user, food = l.strip().split(',')
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        user, food = int(user), int(food)
        if user not in u_map:
            u_map[user] = []
        u_map[user].append( (date, food) )
        
        dates.append(date)
        foods.append(food)
        users.append(user)
        pbar.update(1)
        

        
food_map = {f:i for i, f in enumerate(set(foods))}
user_map = {u:i for i, u in enumerate(set(users))}



100%|██████████| 2681494/2681494 [00:20<00:00, 128825.37it/s]


### Generate training data and label

In [2]:
### ALS embeddingimport numpy as  np
from scipy.sparse import csr_matrix

with tqdm(total=len(u_map)) as pbar:
    
#     split 1/10 for future
    Y_map = {}
    pairs = []
    pseudo_history_map = {} # for training and validation
    for user,v in u_map.items():
        history = v
        dates_history = [d for d,f in history]
        min_date, max_date = min(dates_history), max(dates_history)
        date_list = sorted(set(dates_history))
        rest_date_set = set(date_list[-7:])
        
        
        pseudo_history_map[user] = [(d,f) for d,f in history if d not in rest_date_set]
#         calculate the Y
        y = np.zeros([1, len(food_map)])
        for d,f in history:
            if d in rest_date_set:
                y[0, food_map[f]] = 1
            
            else:
                pair = (user_map[user], food_map[f])
                pairs.append(pair)
        Y_map[user] = y
        pbar.update(1)
        
    rows = []
    cols = []
    pairs = set(pairs)
    for pair in pairs:
        u, f = pair
        rows.append(u)
        cols.append(f)
    data = np.ones([len(rows),])
    R = csr_matrix((data, (rows,cols)), shape=(len(user_map),len(food_map)))
        
    
print len(Y_map), R.shape
        
    



100%|██████████| 2608/2608 [00:02<00:00, 1259.06it/s]

2608 (2608, 5532)





### ALS embedding Training

In [3]:
import implicit

# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=32, use_gpu=True, iterations=100)
user_items = R.transpose()
model.fit(user_items)
print model.user_factors.shape
print R.shape


100%|██████████| 100.0/100 [00:00<00:00, 181.88it/s]

(2608, 32)
(2608, 5532)





### Transformer with ALS embedding Training

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# import Transformer
import numpy as np
from constants import FOOD_NUM, USER_NUM
class Net(nn.Module):

    def __init__(self, dm, p_drop):
        super(Net, self).__init__()
        self.drop = nn.Dropout(p_drop)
        self.food_emb = Food_embedding(FOOD_NUM, dm, 2, p_drop, activation_fn=F.relu)
        self.user_emb = nn.Embedding(USER_NUM, dm)
        self.summary_weight = nn.Parameter(torch.FloatTensor(1, dm))
        self.summary_linear = nn.Linear(3*dm, dm)
        nn.init.xavier_normal_(self.summary_weight)
        self.linears = nn.ModuleList([nn.Linear(dm, dm) for i in range(2)])
        self.output_linear = nn.Linear(dm, 1)

    def forward(self, history, user, target):
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        
        K = history
        K = self.food_emb(K)
        target = self.to_ont_hot(FOOD_NUM, target)
        target = self.food_emb(target)
        
        
        batch, K_len, d = K.size()
        Q = self.summary_weight.repeat(batch,1).view(batch, 1, -1)
        att_out = self.attention(Q, K)
        a = self.user_emb(user).view((batch, -1))
        x = torch.cat([att_out, a, target], dim=-1)
        x = F.selu(self.summary_linear(x))
        for lay in self.linears:
            x = F.selu(lay(x))
            x = self.drop(x)
        x = self.output_linear(x)
        return torch.sigmoid(x)
    def to_ont_hot(self, one_hot_dim, indices):
        batch, _ = indices.size()
        one_hot = torch.zeros(batch, one_hot_dim, requires_grad=False)
        one_hot.scatter_(1, indices.cpu(), 1)
        return one_hot.cuda()
    def attention(self, Q,K):
        assert len(Q.shape) == 3 and len(K.shape) == 3
        e = torch.bmm(Q,K.permute(0,2,1))
        
        att = F.softmax(e, dim=-1)
        out = torch.bmm(att, K).squeeze(1)
        return out
        
class Food_embedding(nn.Module):
    def __init__(self, c_in, dm, layer_num, p_drop, activation_fn=F.selu):
        super(Food_embedding, self).__init__()
        self.activation_fn = activation_fn
        self.drop = nn.Dropout(p_drop)
        assert layer_num >= 1
        self.first_linear = nn.Linear(c_in, dm)
        self.linears = nn.ModuleList([nn.Linear(dm, dm) for i in range(layer_num-1)])
        

    def forward(self, x):
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        x = self.first_linear(x)
        x = self.activation_fn(x)
        x = self.drop(x)
        for lay in self.linears:
            x = self.drop(self.activation_fn(lay(x)))
        return x
    
batch = 7
dm = 128
K = torch.rand([batch, 18, FOOD_NUM]).cuda()
user = torch.randint(USER_NUM, [batch, 1], dtype=torch.int64).cuda()
target = torch.randint(FOOD_NUM, [batch, 1], dtype=torch.int64).cuda()

model = Net(dm, 0.1).cuda()
o = model(K, user, target)
# print t
print(o.size())
# print o

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(model))




torch.Size([7, 1])
1141121


#### prepare sample generator


In [5]:
from constants import MAX_SEQ_LEN

val_num = len(pseudo_history_map) // 10
idx = np.random.permutation(len(pseudo_history_map))
train_idx, val_idx = idx[val_num:], idx[:val_num]
train_u_map = {k:pseudo_history_map[k] for k in pseudo_history_map.keys()[val_num:]}
val_u_map = {k:pseudo_history_map[k] for k in pseudo_history_map.keys()[:val_num]}
def batch_boostrap_generator(batch_size, u_map, food_map, Y_map, max_history_len, flip):
    G = boostrap_generator(u_map, food_map, Y_map, max_history_len, flip)
    while True:
        X = []
        pos_Y = []
        neg_Y = []
        U = []
        for i in range(batch_size):
            x, pos_y, neg_y, u = next(G)
            x = np.pad(x, ((0,max_history_len-x.shape[0]),(0,0)), 'constant', constant_values=0)
            X.append(np.expand_dims(x, axis=0))
            pos_Y.append(np.expand_dims(pos_y, axis=0))
            neg_Y.append(np.expand_dims(neg_y, axis=0))
            U.append(np.expand_dims(u, axis=0))
        yield np.vstack(X), np.vstack(pos_Y), np.vstack(neg_Y), np.vstack(U)
def boostrap_generator(u_map, food_map, Y_map, max_history_len, flip):
    while True:
        keys = u_map.keys()
        for user_idx in np.random.permutation(len(u_map)):
            user = keys[user_idx]
            X = np.zeros([max_history_len, len(food_map)])
            Y = Y_map[user].flatten()
            history = u_map[user]
            ds = np.array([d for d,f in history])
            fs = np.array([f for d,f in history])
            sorted_idx = np.flip(np.argsort(ds), axis=-1) if flip else np.argsort(ds)
            ds = ds[sorted_idx]
            fd = fs[sorted_idx]
            
            date_idx = 0
            now_date = ds[0]
            for food, date in zip(fs,ds):
                if date != now_date:
                    date_idx+=1
                    now_date = date
                X[date_idx, food_map[food]] = 1
            
    #         positive sample
            idx = np.random.permutation(len(Y))
            pos_i = neg_i = -1
            for i in idx:
                if Y[i] == 1 and pos_i == -1:
                    pos_i = i
                if Y[i] == 0 and neg_i == -1:
                    neg_i = i
                if pos_i != -1 and neg_i != -1:
                    break
            yield X, np.array([pos_i]), np.array([neg_i]), torch.LongTensor([user_map[user],])
            
    

G = batch_boostrap_generator(32, train_u_map, food_map, Y_map, max_history_len=MAX_SEQ_LEN, flip=True)
val_G = batch_boostrap_generator(32//2, val_u_map, food_map, Y_map, max_history_len=MAX_SEQ_LEN, flip=True)

x, pos_y, neg_y, u = next(G)
print x.shape, pos_y.shape, neg_y.shape, u.shape
x, pos_y, neg_y, u = next(val_G)
print x.shape, pos_y.shape, neg_y.shape, u.shape

G2 = boostrap_generator(train_u_map, food_map, Y_map, max_history_len=MAX_SEQ_LEN, flip=True)
x, pos_y, neg_y, u = next(G2)
print x.shape, pos_y.shape, neg_y.shape, u.shape




(32, 165, 5532) (32, 1) (32, 1) (32, 1)
(16, 165, 5532) (16, 1) (16, 1) (16, 1)
(165, 5532) (1,) (1,) torch.Size([1])


In [6]:
from collections import deque
from tqdm import tqdm as tqdm

import time
def dump_log(model, n_iter, loss, acc, val_loss, val_acc, log_file_stream, tmp_model_path):
    log_text = '%.7d<split>%.5f<split>%.5f<split>%.5f<split>%.5f\n' % (n_iter, loss, acc, val_loss, val_acc)
    log_file_stream.write(log_text)
    if n_iter % 10 == 0 :
        log_file_stream.flush()
        torch.save(model, tmp_model_path)
def normal_acc(pred, label):
    mask = label.type(torch.uint8)
    buf = pred == mask
    buf = buf.masked_select(mask)
    
    acc = torch.sum(buf).item() / float(torch.sum(mask).item())
    return acc
def rev_mask(m):
    out = torch.ones_like(m, dtype=torch.uint8, requires_grad=False)
    out.masked_fill_(m, 0)
    return out
def normal_loss(criterion, output, y):
    loss = criterion(output*10., y*10.)
    label = y.type(torch.uint8)
    
    
    dim = label.shape[-1]
    sum_1s = torch.sum(label)
    loss_mask = y.clone()*(dim-sum_1s)

    loss_mask.masked_fill_(rev_mask(label), sum_1s)
    return torch.mean(loss_mask*loss)
def foo(output, y):
    output = output[0,:].flatten()
    y = y[0,:].flatten()
    pos_i = neg_i = -1
    for i in np.random.permutation(len(y)):
        if y[i] == 1 and pos_i == -1:
            pos_i = i
        if y[i] == 0 and neg_i == -1:
            neg_i = i
        if pos_i != -1 and neg_i != -1:
            break
    return output[pos_i], output[neg_i]
        
acc_q = deque(maxlen=1000)
loss_q = deque(maxlen=1000)
val_acc_q = deque(maxlen=1000)
val_loss_q = deque(maxlen=1000)
t = time.time()
best_acc  = 0
best_loss = float('inf')

epochs = 100
batch_size = 32
G = batch_boostrap_generator(batch_size, train_u_map, food_map, Y_map, max_history_len=MAX_SEQ_LEN, flip=False)
val_G = batch_boostrap_generator(batch_size, val_u_map, food_map, Y_map, max_history_len=MAX_SEQ_LEN, flip=False)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
print 'start training.'
with open('log_shit.txt', 'a') as f:
    with open('best_shit.txt', 'w') as best_log:
        for e in range(epochs):
            iters = len(food_map) * len(user_map)
            with tqdm(total=iters) as pbar:
                for it in range(iters):
                    optimizer.zero_grad()
                    model.train()
                    x, pos_y, neg_y, u  = next(G)
    #                 print 'x1', np.sum(x[0,:])
                    x = torch.FloatTensor(x).cuda()
                    pos_y = torch.LongTensor(pos_y).cuda()
                    neg_y = torch.LongTensor(neg_y).cuda()
                    u = torch.LongTensor(u).cuda()
                    x.requires_grad_(False)
                    pos_y.requires_grad_(False)
                    neg_y.requires_grad_(False)
                    u.requires_grad_(False)
#                     positive
                    output = model(x, u, pos_y)
                    a = output[0,0].item()
                    pred = output > 0.5
                    label = torch.ones_like(output).cuda()
                    pos_loss = criterion(output, label)
                    
                    acc = torch.sum(pred == label.type(torch.uint8)).item() / float(output.shape[0])
                    acc_q.append(acc)
#                     negative
                    output = model(x, u, neg_y)
                    b = output[0,0].item()
                    
                    pred = output > 0.5
                    label = torch.zeros_like(output).cuda()
                    neg_loss = criterion(output, label)
                    
                    acc = torch.sum(pred == label.type(torch.uint8)).item() / float(output.shape[0])
                    acc_q.append(acc)
                    
                    loss = pos_loss + neg_loss
                    loss.backward()
                    
                    optimizer.step()
                    with torch.no_grad():
                        model.eval()
                        x, pos_y, neg_y, u  = next(val_G)
                        x = torch.FloatTensor(x).cuda()
                        pos_y = torch.LongTensor(pos_y).cuda()
                        neg_y = torch.LongTensor(neg_y).cuda()
                        u = torch.LongTensor(u).cuda()
    #                     positive
                        output = model(x, u, pos_y)
                        c = output[0,0].item()
                    
                        pred = output > 0.5
                        label = torch.ones_like(output).cuda()
                        pos_loss = criterion(output, label)

                        acc = torch.sum(pred == label.type(torch.uint8)).item() / float(output.shape[0])
                        val_acc_q.append(acc)
    #                     negative
                        output = model(x, u, neg_y)
                        d = output[0,0].item()
                    
                        pred = output > 0.5
                        label = torch.zeros_like(output).cuda()
                        neg_loss = criterion(output, label)

                        acc = torch.sum(pred == label.type(torch.uint8)).item() / float(output.shape[0])
                        val_acc_q.append(acc)

                        val_loss = pos_loss + neg_loss
                    acc = np.mean(acc_q)
                    val_acc = np.mean(val_acc_q)

                    pbar.set_postfix_str('acc : %.3f, val_acc : %.3f, loss : %.3f, val_loss : %.3f \t %.3f, %.3f, %.3f, %.3f' % (acc, val_acc, loss.item(), val_loss.item(), a,b,c,d), refresh=False)
                    pbar.update(batch_size)
                    dump_log(model, (it+1)*batch_size, loss, acc, val_loss, val_acc, f, './tmp_shit.pt')
                    if val_acc > best_acc and it > 100:
                        torch.save(model, './best_shit.pt')
                        best_acc = val_acc
                        best_log.write('%d\t%.5f\n' % ((it+1)*batch_size, best_acc))

# Train model
print("Optimization Finished!")
# print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

  0%|          | 0/14427456 [00:00<?, ?it/s]

start training.


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  0%|          | 62912/14427456 [26:18<126:51:26, 31.45it/s, acc : 0.800, val_acc : 0.796, loss : 0.705, val_loss : 0.990 	 0.992, 0.117, 0.976, 0.475]


KeyboardInterrupt: 

In [None]:
a = torch.FloatTensor([0,0,1,1,0,0])
torch.

In [None]:
a = torch.FloatTensor([0,0,1,1,0,0])
b = torch.ByteTensor([0,0,0,1,0,1])
a.requires_grad_(False)
print a.clone().requires_grad

### testing with Transformer model

In [7]:
from constants import MAX_SEQ_LEN

def batch_boostrap_generator(batch_size, u_map, food_map, Y_map, max_history_len, flip):
    G = boostrap_generator(u_map, food_map, Y_map, max_history_len, flip)
    while True:
        X = []
        pos_Y = []
        neg_Y = []
        for i in range(batch_size):
            x, pos_y, neg_y = next(G)
            x = np.pad(x, ((0,max_history_len-x.shape[0]),(0,0)), 'constant', constant_values=0)
            X.append(np.expand_dims(x, axis=0))
            pos_Y.append(np.expand_dims(pos_y, axis=0))
            neg_Y.append(np.expand_dims(neg_y, axis=0))
        yield np.vstack(X), np.vstack(pos_Y), np.vstack(neg_Y) 

        
flip = True       
model = torch.load('./best_shit.pt')
model.eval()
batch_size = 256
food_buf = torch.LongTensor(np.arange(len(food_map))).view(1,-1).repeat(batch_size, 1).cuda()
answer_sheet = np.zeros([len(u_map), len(food_map)])

with torch.no_grad():
    with tqdm(total=len(u_map)*len(food_map)) as pbar:
        keys = u_map.keys()
        ran = range((len(keys) // batch_size) +1) if len(keys) % batch_size != 0 else range(len(keys) // batch_size)
        for i in ran:
            a = i*batch_size
            b = (i+1)*batch_size if (i+1)*batch_size < len(keys) else len(keys)
            X = []
            for user in keys[a:b]:
                x = np.zeros([MAX_SEQ_LEN, len(food_map)])
                history = u_map[user]
                ds = np.array([d for d,f in history])
                fs = np.array([f for d,f in history])
                sorted_idx = np.flip(np.argsort(ds), axis=-1) if flip else np.argsort(ds)
                ds = ds[sorted_idx]
                fd = fs[sorted_idx]

                date_idx = 0
                now_date = ds[0]
                for food, date in zip(fs,ds):
                    if date != now_date:
                        date_idx+=1
                        now_date = date
                    x[date_idx, food_map[food]] = 1
                X.append(np.expand_dims(x,axis=0))
            
            X = torch.FloatTensor(np.vstack(X)).cuda()
            for food_idx  in range(len(food_map)):
                target = food_buf[:b-a, food_idx:food_idx+1]
                output = model(target, X)
                answer_sheet[a:b, food_idx:food_idx+1] =  output.cpu()
                pbar.update(batch_size)
                
np.save('./output_sheet', answer_sheet)
print answer_sheet.shape
print'Done'

rev_food_map = {v:k for k,v in food_map.items()}
k=20
a = ''
buf = []
with open('predict_shit.csv', 'w') as f:
    f.write('userid,foodid\n')
    for user in u_map.keys():
        s = ''
        i = user_map[user]
        for food_idx in reversed(np.argsort(answer_sheet[i,:])[-k:]):
            s += ' %d' % rev_food_map[food_idx]
        f.write('%d,%s\n' % (user, s) )
        buf.append(a == s)
        a = s
print buf
print 'done'

    
            


  0%|          | 0/14427456 [00:00<?, ?it/s]


TypeError: forward() takes exactly 4 arguments (3 given)

In [None]:
from constants import MAX_SEQ_LEN

print 'V2'        
flip = True       
model = torch.load('./best_v2.pt')
model.eval()
batch_size = 256
food_buf = torch.LongTensor(np.arange(len(food_map))).view(1,-1).repeat(batch_size, 1).cuda()
answer_sheet = np.zeros([len(u_map), len(food_map)])

with torch.no_grad():
    with tqdm(total=len(u_map)*len(food_map)) as pbar:
        keys = u_map.keys()
        ran = range((len(keys) // batch_size) +1) if len(keys) % batch_size != 0 else range(len(keys) // batch_size)
        for i in ran:
            a = i*batch_size
            b = (i+1)*batch_size if (i+1)*batch_size < len(keys) else len(keys)
            X = []
            for user in keys[a:b]:
                x = np.zeros([MAX_SEQ_LEN, len(food_map)])
                history = u_map[user]
                ds = np.array([d for d,f in history])
                fs = np.array([f for d,f in history])
                sorted_idx = np.flip(np.argsort(ds), axis=-1) if flip else np.argsort(ds)
                ds = ds[sorted_idx]
                fd = fs[sorted_idx]

                date_idx = 0
                now_date = ds[0]
                for food, date in zip(fs,ds):
                    if date != now_date:
                        date_idx+=1
                        now_date = date
                    x[date_idx, food_map[food]] = 1
                X.append(np.expand_dims(x,axis=0))
            
            X = torch.FloatTensor(np.vstack(X)).cuda()
            for food_idx  in range(len(food_map)):
                target = food_buf[:b-a, food_idx:food_idx+1]
                output = model(target, X)
                answer_sheet[a:b, food_idx:food_idx+1] =  output.cpu()
                pbar.update(batch_size)
                
np.save('./output_sheet2', answer_sheet)
print answer_sheet.shape
print'Done'

rev_food_map = {v:k for k,v in food_map.items()}
k=20
a = ''
buf = []
with open('predict2.csv', 'w') as f:
    f.write('userid,foodid\n')
    for i,user in enumerate(u_map.keys()):
        s = ''
        for food_idx in reversed(np.argsort(answer_sheet[i,:])[-k:]):
            s += ' %d' % rev_food_map[food_idx]
        f.write('%d,%s\n' % (user, s) )
        buf.append(a == s)
        a = s
print buf
print 'done'

    
            


In [None]:
i = 0 
a = [b for b in reversed(np.argsort(answer_sheet[i,:])[-k:])]
print a

# i=1
a = [b for b in reversed(np.argsort(answer_sheet[i,:])[-k:])]
print a



In [None]:
print answer_sheet[0,:]
print answer_sheet[1,:]

In [None]:
# print acc_q
# print output.shape
# c = output < 0.5
# print output < 0.5

print pred.shape
print label.shape
print torch.sum(pred == label.type(torch.uint8)).item() / float(output.shape[0])

In [None]:
a = torch.tensor([[0.49]])
b = torch.zeros_like(a)
print a.shape
print a > 0.5
c = a > 0.5
print b,c
print b.type(torch.uint8) == c


In [None]:
salnj;kvahjk
# print u_map[6]
buf  = [] 
def ck(ds):
    l = max(ds) - min(ds)
    min_d = min(ds)
    
#     d_list = [(d-min_d) for d in ds]
#     d_list = sorted(set(d_list))
    d_list = sorted(set(ds))
#     print l, len(d_list)
    for i,d in enumerate(d_list):
        if i == 0:
            continue
        d_ = d_list[i-1]
        if (d - d_).days.real != 1:
            d_diff = (d - d_).days.real
            buf.append(d_diff)
#             print d_diff
#             assert d_diff < 10
def cf(ds):
    l = max(ds) - min(ds)
    min_d = min(ds)
    
#     d_list = [(d-min_d) for d in ds]
#     d_list = sorted(set(d_list))
    d_list = sorted(set(ds))
    
#     print l, len(d_list)
    for i,d in enumerate(d_list):
        if i == 0:
            continue
        d_ = d_list[i-1]
        if (d - d_).days.real != 1:
            d_diff = (d - d_).days.real
            buf.append(d_diff)
#             print d_diff
#             assert d_diff < 10
                        
    
# for i in [6,8,12]:
with tqdm(total=len(u_map)) as pbar:
    buf = []
    buf2 = []
    for i in u_map.keys():
        ds = [d for d,f in u_map[i]]
        threshold = ((max(ds) - min(ds)).days.real * (4./5.))
        min_d = min(ds)
        fl = []
        m = {}
        for d,f in u_map[i]:
            if (d - min_d).days.real > threshold:
                fl.append(f)
            if d not in m:
                m[d] = []
            m[d].append(f)
        for k,v in m.items():    
            buf2.append(len(set(v)))
#         print len(u_map[i]), len(fl), len(set(fl))
        buf.append(len(set(fl)))
        #     print i, min(ds), max(ds), len(ds)
#         ck(ds)
        pbar.update(1)
# d6 = [d for d,f in u_map[6]]
# ck(d6)
    
#     print ck(ds)
# print u_map[33]
print np.mean(buf), np.std(buf)
print np.mean(buf2), np.std(buf2)


In [None]:


print min(dates), max(dates),  max(dates)- min(dates)


In [None]:
print min(dates)
m = min(dates)
print max(dates)
a = dates[0]
print dir(a - min(dates))
print a
# def normalize_date(min_date, max_date, date):
# for k in u_map.keys()[:10]:
#     print len(u_map[k])
ds = [(d-m).total_seconds() / (60*60*24) for d in dates]
# import numpy as np
print np.mean(ds)
print np.min(ds), np.max(ds)

In [None]:
import keras
from os.path import join
import os
from bs4 import BeautifulSoup as BS
from constants import MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
from keras.preprocessing.sequence import pad_sequences
import numpy as np
np.random.seed(1337)
def quote_title_abstract(xml_path):
    with open(xml_path, 'r') as f:
        data = f.read()
    soup = BS(data)
    title, abstract = soup.find('title').text, soup.find('abstract').text
    return title.strip(), abstract.strip()

# text preprocessing
data_path = join('./','kaggle/')
xml_dir = join(data_path, 't2-doc')
xml_list = [f for f in os.listdir(xml_dir) if f.endswith('.xml')]
# print(len(xml_list))


texts = []

for xml in xml_list:
    path = join(xml_dir,xml)
    title, abstract = quote_title_abstract(path)
    text = title + '' + abstract
    texts.append(text)
#     texts.append(title)
#     texts.append(abstract)
print('read all %d xml files.' % len(xml_list))
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                                   lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
xml_id_map = {}
for i,xml in enumerate(xml_list):
    node_id = int(xml.replace('.xml',''))
    xml_id_map[node_id] = data[i,:]


print('Preparing embedding matrix.')
embeddings_index = {}
# with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r', encoding='utf8') as f:
with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('done')