In [1]:
import os
import time
import pickle
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import hamming_loss

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
MAX_LENGTH = 25

In [3]:
data = pickle.load(open(os.path.join('../data/clean/data_clean_100k.res'), 'rb'))

In [4]:
data['source_len']= data.source.str.len()

In [5]:
d_data_selected = data[data['source_len'] == MAX_LENGTH]

In [6]:
d_data_selected.reset_index(drop=True, inplace=True)

In [7]:
idx2char = dict((idx, c) for idx, c in enumerate("abcdefghijklmnopqrstuvwxyz", 1))
char2idx = dict((c, idx) for idx, c in enumerate("abcdefghijklmnopqrstuvwxyz", 1))

In [8]:
idx2char[0] = '<UNK>'
char2idx['<UNK>'] = 0

In [9]:
def get_flag_space(sentence):
    
    no_space = []
    flag_space = []
    sentence = str(sentence)
    for char in sentence: 
        if char != ' ':
            no_space.append(char)
            flag_space.append('0')
        elif char == ' ':
            flag_space[-1] = '1'
            
    no_space = ''.join(no_space)
    flag_space = ''.join(flag_space)
    
    return flag_space

def get_total_space(flag):
    return sum(np.array(list(flag)).astype(int))

In [10]:
def get_source_char_idx(source):
    return [char2idx[char_] for char_ in list(str(source))]

In [11]:
d_data_selected['flag_space'] = d_data_selected['target'].apply(get_flag_space)
d_data_selected['flag_space_sum'] = d_data_selected['flag_space'].map(get_total_space)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
d_data_selected['source_array'] = d_data_selected['source'].map(get_source_char_idx)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
d_data_selected

Unnamed: 0,source,target,source_len,flag_space,flag_space_sum,source_array
0,dibuangtersentuhjubysudah,dibuang tersentuh juby sudah,25,0000001000000001000100000,3,"[4, 9, 2, 21, 1, 14, 7, 20, 5, 18, 19, 5, 14, ..."
1,pintardikenalharikirilaga,pintar dikenal hari kiri laga,25,0000010000001000100010000,4,"[16, 9, 14, 20, 1, 18, 4, 9, 11, 5, 14, 1, 12,..."
2,tirautamajelassesidisawit,tira utama jelas sesi di sawit,25,0001000010000100010100000,5,"[20, 9, 18, 1, 21, 20, 1, 13, 1, 10, 5, 12, 1,..."
3,padasebagaimenyamakanpetr,pada sebagai menyamakan petr,25,0001000000100000000010000,3,"[16, 1, 4, 1, 19, 5, 2, 1, 7, 1, 9, 13, 5, 14,..."
4,demikiansementaraaslilaga,demikian sementara asli laga,25,0000000100000000100010000,3,"[4, 5, 13, 9, 11, 9, 1, 14, 19, 5, 13, 5, 14, ..."
5,jugabandungapbnleveldalam,juga bandung apbn level dalam,25,0001000000100010000100000,4,"[10, 21, 7, 1, 2, 1, 14, 4, 21, 14, 7, 1, 16, ..."
6,laininsurancewidiadetelah,lain insurance widiade telah,25,0001000000001000000100000,3,"[12, 1, 9, 14, 9, 14, 19, 21, 18, 1, 14, 3, 5,..."
7,memainkanagarinipertanian,memainkan agar ini pertanian,25,0000000010001001000000000,3,"[13, 5, 13, 1, 9, 14, 11, 1, 14, 1, 7, 1, 18, ..."
8,polantastetapiharapanlalu,polantas tetapi harapan lalu,25,0000000100000100000010000,3,"[16, 15, 12, 1, 14, 20, 1, 19, 20, 5, 20, 1, 1..."
9,dirilisadakurangberjangka,dirilis ada kurang berjangka,25,0000001001000001000000000,3,"[4, 9, 18, 9, 12, 9, 19, 1, 4, 1, 11, 21, 18, ..."


In [14]:
class Dataset():
    def __init__(self, data):
        row, col = data.shape
        train = data.loc[:int(row*.8)]
        test = data.loc[int(row*.8):]
        train.reset_index(drop=True, inplace=True)
        test.reset_index(drop=True, inplace=True)
        
        self.lookup = {
            'train': (train, len(train)),
            'test': (test, len(test))
        }
        
        self.set_split('train')
        
    def set_split(self, split = 'train'):
        self.data, self.length = self.lookup[split]
    
    def __getitem__(self, index):
        X = self.data.loc[index, 'source_array']
        X = torch.LongTensor(X)
        
        
        y = np.array(list(self.data.loc[index, 'flag_space'])).astype(int)
        y = torch.Tensor(y).squeeze(0)
        
        return {'x': X,
               'y': y}
    
    def __len__(self):
        return self.length

In [15]:
class Classifier(nn.Module):
    def __init__(self, embedding_dim = 20):
        super(Classifier, self).__init__()
        
        self.embed = nn.Embedding(len(char2idx), embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, 256)
        
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 16)
        self.fc5 = nn.Linear(16, 1)
        
        self.relu6 = nn.ReLU6()
        
        
    def forward(self, input_, apply_sigmoid=False):
        
        X = self.embed(torch.LongTensor(input_))
        X, _ = self.lstm(X)
        X = self.fc1(X)
        X = self.fc2(X)
        X = self.fc3(X)
        X = self.fc4(X)
        X = self.fc5(X)
        
        if apply_sigmoid:
            X = torch.sigmoid(X)
        
        y_pred = X.squeeze(2)    
        return y_pred

In [16]:
def compute_accuracy(y_true, y_pred):
    y_true = y_true.long().numpy()
    y_pred = (y_pred > 0.5).long().numpy()
    try:
#         hamming_score = hamming_loss(y_true, y_pred)
#         return 1 - hamming_score
        return (y_true == y_pred).all(axis = 1).mean()
    except:
        print("y_true", y_true, "y_pred", y_pred)

In [17]:
dataset = Dataset(d_data_selected)
classifier = Classifier()

In [18]:
loss_func = nn.BCEWithLogitsLoss(reduction= 'sum')
optimizer = optim.Adam(classifier.parameters(), lr = 0.001, weight_decay=0.002)

In [19]:
history_dict ={
    'acc_train': [],
    'acc_test': [],
    'loss_train': [],
    'loss_test': []
}

In [21]:
for epoch in range(100):
    
    running_loss = 0
    running_acc = 0
    running_loss_val = 0
    running_acc_val = 0
    
    start = time.time()
    
    classifier.train()
    dataset.set_split('train')
    data_generator = DataLoader(dataset=dataset, batch_size=2, shuffle=False)
    for batch_index, batch_dict in enumerate(data_generator, 1):
        optimizer.zero_grad()
        
        y_pred = classifier(batch_dict['x'])
        
        loss_train = loss_func(y_pred, batch_dict['y'])
        loss_item = loss_train.item()
        running_loss += (loss_item - running_loss) / batch_index
        
        loss_train.backward()
        
        accuracy_score = compute_accuracy(batch_dict['y'], y_pred)
        running_acc += (accuracy_score - running_acc) / batch_index
        
        optimizer.step()
        break
        
    classifier.eval()
    dataset.set_split('test')
    data_generator = DataLoader(dataset=dataset, batch_size=2, shuffle=False)
    for batch_index, batch_dict in enumerate(data_generator, 1):
        
        y_pred = classifier(batch_dict['x'])
        
        loss_train_val = loss_func(y_pred, batch_dict['y'])
        loss_item_val = loss_train_val.item()
        running_loss_val += (loss_item_val - running_loss_val) / batch_index
        
        accuracy_score_val = compute_accuracy(batch_dict['y'], y_pred)
        running_acc_val += (accuracy_score_val - running_acc_val) / batch_index
        break
        
    history_dict['acc_train'].append(running_acc)
    history_dict['acc_test'].append(running_acc_val)
    history_dict['loss_train'].append(running_loss)
    history_dict['loss_test'].append(running_loss_val)
    
    print("{:.2f} sec | epoch {} loss train: {:.2f} accuracy train: {:.2f} loss val {:.2f} accuracy val {:.2f}".format(
        time.time() - start, epoch, running_loss, running_acc, running_loss_val, running_acc_val
    ))

0.05 sec | epoch 0 loss train: 24.71 accuracy train: 0.00 loss val 23.86 accuracy val 0.00
0.02 sec | epoch 1 loss train: 23.23 accuracy train: 0.00 loss val 22.70 accuracy val 0.00
0.02 sec | epoch 2 loss train: 21.63 accuracy train: 0.00 loss val 21.71 accuracy val 0.00
0.01 sec | epoch 3 loss train: 20.01 accuracy train: 0.00 loss val 21.11 accuracy val 0.00
0.02 sec | epoch 4 loss train: 18.49 accuracy train: 0.00 loss val 21.11 accuracy val 0.00
0.01 sec | epoch 5 loss train: 17.22 accuracy train: 0.00 loss val 21.81 accuracy val 0.00
0.01 sec | epoch 6 loss train: 16.26 accuracy train: 0.00 loss val 23.18 accuracy val 0.00
0.01 sec | epoch 7 loss train: 15.58 accuracy train: 0.00 loss val 24.99 accuracy val 0.00
0.01 sec | epoch 8 loss train: 15.01 accuracy train: 0.00 loss val 26.98 accuracy val 0.00
0.01 sec | epoch 9 loss train: 14.37 accuracy train: 0.00 loss val 28.92 accuracy val 0.00
0.01 sec | epoch 10 loss train: 13.55 accuracy train: 0.00 loss val 30.69 accuracy val 0.0

In [22]:
input_ = torch.LongTensor([[1,2,3], [2,1,5]])
classifier.embed(input_)

tensor([[[ 4.8660e-01, -1.4273e+00, -3.9109e-01,  4.0494e-01, -9.1139e-01,
           8.0464e-02,  5.7828e-01, -1.4386e+00,  1.2356e+00, -5.3794e-01,
           1.3258e+00,  7.9231e-01, -9.2269e-01,  1.8320e+00, -1.6268e+00,
           2.9007e-01,  6.1937e-01,  5.1852e-01, -1.4722e+00,  1.8639e-01],
         [ 1.2861e+00,  1.2190e+00,  1.8201e+00,  7.4567e-01,  5.8241e-01,
           1.1132e+00,  1.2453e+00, -4.1103e-01,  1.1133e-01, -7.0772e-01,
          -1.8388e+00, -3.8734e-01, -4.4801e-01,  8.9586e-01, -4.5137e-02,
           6.1876e-01,  8.5535e-01, -3.4143e-01, -5.3053e-01,  1.8461e-01],
         [ 5.4632e-01, -2.5116e-01, -1.0225e-01, -2.4044e+00,  3.1527e-03,
           3.0873e-01,  4.9218e-01, -5.3592e-02, -5.3087e-01, -4.3339e-01,
          -6.5787e-01, -1.2070e-04,  5.0195e-01,  2.1899e-01,  1.5569e-01,
          -5.7083e-03, -5.9674e-01,  4.5279e-01,  7.7093e-01, -7.3761e-01]],

        [[ 1.2861e+00,  1.2190e+00,  1.8201e+00,  7.4567e-01,  5.8241e-01,
           1.1132e+0

In [23]:
classifier.forward(torch.LongTensor(d_data_selected.loc[:3,'source_array']))

tensor([[-1.7449e+01, -8.3835e+00, -1.6090e+01, -2.2077e+01, -3.8136e+01,
         -2.4636e+01,  1.2522e+01, -2.7113e+01, -3.2307e+01, -2.0129e+01,
         -2.3182e+01, -3.2307e+01, -2.4636e+01, -2.7113e+01, -2.2077e+01,
          9.8977e-03, -8.7397e+00, -2.2077e+01, -1.6090e+01,  8.0307e+00,
         -2.3182e+01, -2.2077e+01, -1.7449e+01, -3.8136e+01,  9.8974e-03],
        [-9.3182e+00, -2.2133e+01, -2.8868e+01, -5.1900e+01, -6.2045e+01,
          7.5745e+00, -2.0811e+01, -3.1158e+01, -3.2465e+01, -1.7134e+01,
         -1.4039e+01, -5.6891e+01,  1.7777e+01, -2.3491e+01, -6.3611e+01,
         -3.1712e+01,  1.0811e+01, -4.1056e+01, -1.1687e+01, -1.4991e+01,
          7.8552e+00, -3.4989e+01, -4.6718e+01, -8.8989e+00, -4.8360e+01],
        [-3.6548e+01, -3.4705e+01,  8.1868e+00, -8.4629e+01, -5.8451e+01,
          1.6760e+01, -5.5066e+01, -3.0558e+01, -6.8844e+01, -2.7102e+00,
          9.8869e+00, -3.9163e+01,  3.2254e+00, -5.2668e+01, -6.6486e+01,
         -3.2217e+01, -1.7773e+01, -

In [24]:
y_pred

tensor([[-17.4492,  -8.3835, -38.1356, -24.6365, -27.1131, -38.1356, -20.1295,
         -38.1356, -24.6365,   8.0307, -38.1356, -23.1820, -32.3073,  -2.1816,
         -32.3073, -20.1295, -27.1131,  -8.3835,   2.2755, -32.3073, -10.4547,
         -38.1356,  -8.4017, -38.1356, -24.6365],
        [ -9.3182, -51.3102, -40.8702, -13.8582, -31.9202, -62.0448,  -1.5324,
         -27.2404,   8.8628,  -0.8441, -42.7364, -17.7895, -16.1475, -13.7734,
         -39.5886,  25.2888, -11.6049, -51.3102,  11.8879, -29.8764, -24.1528,
         -44.2644, -35.6312, -27.2404,  11.8173]], grad_fn=<SqueezeBackward1>)

In [25]:
batch_dict['y']

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0.]])

In [26]:
loss_func(y_pred, batch_dict['y'], )

tensor(208.6944, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [None]:
plt.plot(history_dict['loss_train'])
plt.plot(history_dict['loss_test'])
plt.ylim(0.0, 1.0)
plt.savefig('../reports/20190916.png')

In [None]:
data = dataset.__getitem__(0)

In [None]:
y = classifier(data['x'].unsqueeze(0))

In [None]:
y > 0.5

In [None]:
data['y']