In [1]:
import os
import pickle
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [2]:
MAX_LENGTH = 10

In [3]:
data = pickle.load(open(os.path.join('../data/clean/data_clean_100k.res'), 'rb'))

In [4]:
d_data = data

In [5]:
d_data['source_len']= d_data.source.str.len()

In [6]:
d_data_selected = d_data[d_data['source_len'] == MAX_LENGTH]

In [7]:
d_data_selected.reset_index(drop=True, inplace=True)

In [8]:
idx2char = dict((idx, c) for idx, c in enumerate("abcdefghijklmnopqrstuvwxyz", 1))
char2idx = dict((c, idx) for idx, c in enumerate("abcdefghijklmnopqrstuvwxyz", 1))

In [9]:
idx2char[0] = '<UNK>'
char2idx['<UNK>'] = 0

In [10]:
def char_vectorizer(list_inputs, char_indices):
    x = np.zeros((len([list_inputs]), MAX_LENGTH, len(char_indices)))
    for i, input_ in enumerate([list_inputs]):
        for t, char in enumerate(input_):
            x[i, t, char_indices[char]] = 1
    
    return x

In [11]:
def get_flag_space(sentence):
    
    no_space = []
    flag_space = []
    sentence = str(sentence)
    for char in sentence: 
        if char != ' ':
            no_space.append(char)
            flag_space.append('0')
        elif char == ' ':
            flag_space[-1] = '1'
            
    no_space = ''.join(no_space)
    flag_space = ''.join(flag_space)
    
    return flag_space

In [12]:
def char_vectorizer(list_inputs, char_indices):
    x = np.zeros((len([list_inputs]), MAX_LENGTH, len(char_indices)))
    for i, input_ in enumerate([list_inputs]):
        for t, char in enumerate(input_):
            try:
                x[i, t, char_indices[char]] = 1
            except:
                x[i, t, 0] = 1
    
    return x

In [13]:
d_data_selected['flag_space'] = d_data_selected['target'].apply(get_flag_space)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
d_data_selected.loc[:, 'matrix'] = d_data_selected.loc[:, 'source'].apply(char_vectorizer, args=(char2idx,))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [116]:
class Dataset():
    def __init__(self, data):
        row, col = data.shape
        train = data.loc[:int(row*.8)]
        test = data.loc[int(row*.8):]
        train.reset_index(drop=True, inplace=True)
        test.reset_index(drop=True, inplace=True)
        
        self.lookup = {
            'train': (train, len(train)),
            'test': (test, len(test))
        }
        
        self.set_split('train')
        
    def set_split(self, split = 'train'):
        self.data, self.length = self.lookup[split]
    
    def __getitem__(self, index):
        X = self.data.loc[index, 'matrix']
        X = torch.Tensor(X).squeeze(0)
        
        y = np.array(list(self.data.loc[index, 'flag_space'])).astype(int)
        y = torch.Tensor(y).squeeze(0)
        
        return {'x': X,
               'y': y}
    
    def __len__(self):
        return self.length

In [137]:
class Classifier(nn.Module):
    def __init__(self):
        super(Classifier, self).__init__()
        self.lstm = nn.LSTM(len(char2idx), 256)
        
        self.fc = nn.Linear(256, 1)
        
    def forward(self, input_, apply_sigmoid=False):
        
        y_pred, _ = self.lstm(input_)
        y_pred = self.fc(y_pred)
        
        if apply_sigmoid:
            y_pred = torch.sigmoid(y_pred)
        
        y_pred = y_pred.squeeze(2)    
        return y_pred

In [138]:
def accuracy(y_true, y_pred):
    pass

In [139]:
dataset = Dataset(d_data_selected)
classifier = Classifier()

In [140]:
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr = 0.001)

In [141]:
running_loss = 0
running_acc = 0
running_loss_val = 0
running_acc_val = 0

In [142]:
for epoch in range(10):
    classifier.eval()
    data_generator = DataLoader(dataset=dataset, batch_size=2, shuffle=True)
    for batch_index, batch_dict in enumerate(data_generator, 1):
        optimizer.zero_grad()
        
        y_pred = classifier(batch_dict['x'])
        
        loss_train = loss_func(y_pred, batch_dict['y'])
        loss_item = loss_train.item()
        
        accuracy_score = accuracy(batch_dict['y'], y_pred)
        
        running_loss += (running_loss - loss_item) / batch_index
        
        loss_train.backward()
        
        optimizer.step()
        break
    break

In [156]:
y_pred = (y_pred > 0.5).long()

In [159]:
col, row = y_pred.shape

In [160]:
col * row

20

In [152]:
batch_dict['y'].long()

tensor([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [163]:
torch.eq((y_pred > 0.5).long(), batch_dict['y'].long()).sum().item() / (col * row)

0.85