experiment 1

In [25]:
import os
import time
import pickle
import numpy as np
import matplotlib.pyplot as plt
import datetime
from tqdm import tqdm

from sklearn.metrics import hamming_loss

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [2]:
tanggal = datetime.datetime.today().strftime('%Y-%m-%d')

In [3]:
MAX_LENGTH = 25

In [4]:
data = pickle.load(open(os.path.join('../data/clean/data_clean_100k.res'), 'rb'))

In [5]:
d_data = data

In [6]:
d_data['source_len']= d_data.source.str.len()

In [7]:
d_data_selected = d_data[d_data['source_len'] == MAX_LENGTH]

In [8]:
d_data_selected.reset_index(drop=True, inplace=True)

In [9]:
idx2char = dict((idx, c) for idx, c in enumerate("abcdefghijklmnopqrstuvwxyz", 1))
char2idx = dict((c, idx) for idx, c in enumerate("abcdefghijklmnopqrstuvwxyz", 1))

In [10]:
idx2char[0] = '<UNK>'
char2idx['<UNK>'] = 0

In [11]:
def char_vectorizer(list_inputs, char_indices):
    x = np.zeros((len([list_inputs]), MAX_LENGTH, len(char_indices)))
    for i, input_ in enumerate([list_inputs]):
        for t, char in enumerate(input_):
            x[i, t, char_indices[char]] = 1
    
    return x

In [12]:
def get_flag_space(sentence):
    
    no_space = []
    flag_space = []
    sentence = str(sentence)
    for char in sentence: 
        if char != ' ':
            no_space.append(char)
            flag_space.append('0')
        elif char == ' ':
            flag_space[-1] = '1'
            
    no_space = ''.join(no_space)
    flag_space = ''.join(flag_space)
    
    return flag_space

In [13]:
def char_vectorizer(list_inputs, char_indices):
    x = np.zeros((len([list_inputs]), MAX_LENGTH, len(char_indices)))
    for i, input_ in enumerate([list_inputs]):
        for t, char in enumerate(input_):
            try:
                x[i, t, char_indices[char]] = 1
            except:
                x[i, t, 0] = 1
    
    return x

In [14]:
def flag_space_to_list(flag):
    return np.array(list(flag)).astype(int)

In [15]:
d_data_selected['flag_space'] = d_data_selected['target'].apply(get_flag_space)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
d_data_selected.loc[:, 'matrix'] = d_data_selected.loc[:, 'source'].apply(char_vectorizer, args=(char2idx,))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [17]:
d_data_selected['flag_space_array'] = d_data_selected.flag_space.apply(flag_space_to_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
d_data_selected['flag_space_sum'] = d_data_selected.flag_space_array.apply(np.sum)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
class Dataset():
    def __init__(self, data):
        row, col = data.shape
        train = data.loc[:int(row*.8)]
        test = data.loc[int(row*.8):]
        train.reset_index(drop=True, inplace=True)
        test.reset_index(drop=True, inplace=True)
        
        self.lookup = {
            'train': (train, len(train)),
            'test': (test, len(test))
        }
        
        self.set_split('train')
        
    def set_split(self, split = 'train'):
        self.data, self.length = self.lookup[split]
    
    def __getitem__(self, index):
        X = self.data.loc[index, 'matrix']
        X = torch.Tensor(X).squeeze(0)
        
        y = np.array(list(self.data.loc[index, 'flag_space'])).astype(int)
        y = torch.Tensor(y).squeeze(0)
        
        return {'x': X,
               'y': y}
    
    def __len__(self):
        return self.length

In [20]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.lstm = nn.LSTM(len(char2idx), 256)
        
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 16)
        self.fc5 = nn.Linear(16, 1)
        
    def forward(self, input_, apply_sigmoid=False):
        
        y_pred, _ = self.lstm(input_)
        y_pred, _ = self.lstm(input_, _)
        y_pred = self.fc1(y_pred)
        y_pred = self.fc2(y_pred)
        y_pred = self.fc3(y_pred)
        y_pred = self.fc4(y_pred)
        y_pred = self.fc5(y_pred)
        
        if apply_sigmoid:
            y_pred = torch.sigmoid(y_pred)
        
        y_pred = y_pred.squeeze(2)    
        return y_pred

In [21]:
def compute_accuracy(y_true, y_pred):
    y_true = y_true.long().numpy()
    y_pred = (y_pred > 0.5).long().numpy()
    try:
#         hamming_score = hamming_loss(y_true, y_pred)
#         return 1 - hamming_score
        return (y_true == y_pred).all(axis = 1).mean()
    except:
        print("y_true", y_true, "y_pred", y_pred)

In [27]:
dataset = Dataset(d_data_selected)
classifier = Classifier()

In [28]:
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr = 0.001, weight_decay=0.002)

In [29]:
history_dict ={
    'acc_train': [],
    'acc_test': [],
    'loss_train': [],
    'loss_test': []
}

In [30]:
try:
    for epoch in tqdm(range(100)):

        running_loss = 0
        running_acc = 0
        running_loss_val = 0
        running_acc_val = 0

        start = time.time()

        classifier.train()
        dataset.set_split('train')
        data_generator = DataLoader(dataset=dataset, batch_size=512, shuffle=True)
        for batch_index, batch_dict in tqdm(enumerate(data_generator, 1)):
            optimizer.zero_grad()

            y_pred = classifier(batch_dict['x'])

            loss_train = loss_func(y_pred, batch_dict['y'])
            loss_item = loss_train.item()
            running_loss += (loss_item - running_loss) / batch_index

            loss_train.backward()

            accuracy_score = compute_accuracy(batch_dict['y'], y_pred)
            running_acc += (accuracy_score - running_acc) / batch_index

            optimizer.step()

        classifier.eval()
        dataset.set_split('test')
        data_generator = DataLoader(dataset=dataset, batch_size=512, shuffle=True)
        for batch_index, batch_dict in tqdm(enumerate(data_generator, 1)):

            y_pred = classifier(batch_dict['x'])

            loss_train_val = loss_func(y_pred, batch_dict['y'])
            loss_item_val = loss_train_val.item()
            running_loss_val += (loss_item_val - running_loss_val) / batch_index

            accuracy_score_val = compute_accuracy(batch_dict['y'], y_pred)
            running_acc_val += (accuracy_score_val - running_acc_val) / batch_index

        history_dict['acc_train'].append(running_acc)
        history_dict['acc_test'].append(running_acc_val)
        history_dict['loss_train'].append(running_loss)
        history_dict['loss_test'].append(running_loss_val)

        print("{:.2f} sec | epoch {} loss train: {:.2f} accuracy train: {:.2f} loss val {:.2f} accuracy val {:.2f}".format(
            time.time() - start, epoch, running_loss, running_acc, running_loss_val, running_acc_val
        ))
except KeyboardInterrupt:
    print("exit loop")


  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

0it [00:00, ?it/s]

1it [00:08,  8.01s/it]

2it [00:16,  8.03s/it]

3it [00:24,  8.04s/it]

4it [00:32,  8.05s/it]

5it [00:40,  8.07s/it]

6it [00:48,  8.02s/it]

7it [00:56,  8.04s/it]

8it [01:04,  8.05s/it]

9it [01:13,  8.26s/it]

10it [01:21,  8.32s/it]

11it [01:30,  8.59s/it]

12it [01:40,  8.84s/it]

13it [01:50,  9.16s/it]

14it [02:00,  9.46s/it]

15it [02:09,  9.31s/it]

16it [02:18,  9.29s/it]

17it [02:28,  9.49s/it]

18it [02:38,  9.62s/it]

19it [02:48,  9.63s/it]

20it [02:57,  9.68s/it]

21it [03:06,  9.50s/it]

22it [03:15,  9.28s/it]

23it [03:24,  9.09s/it]

24it [03:32,  8.92s/it]

25it [03:41,  8.78s/it]

26it [03:49,  8.60s/it]

27it [03:57,  8.49s/it]

28it [04:06,  8.45s/it]

29it [04:14,  8.54s/it]

30it [04:23,  8.56s/it]

31it [04:32,  8.65s/it]

32it [04:41,  8.75s/it]

33it [04:49,  8.58s/it]

34it [04:57,  8.54s/it]

35it [05:06,  8.5

exit loop


In [None]:
plt.plot(history_dict['loss_train'])
plt.plot(history_dict['loss_test'])
plt.ylim(0.0, 1.0)
plt.savefig('../reports/{}.png'.format(tanggal))

pickle.dump(open("../reports{}.pkl".format(tanggal), 'wb'))