In [28]:
%matplotlib inline
import importlib
from pytorch_utils_oh_2 import *

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [27]:
MODEL_SAVE_PATH = 'category_simple_7_minibatches'

In [None]:
import pytorch_utils_oh_2; importlib.reload(pytorch_utils_oh_2); from pytorch_utils_oh_2 import *;

# Data loading

In [4]:
all_data = pickle.load(open("data/en_train_fixed_1.pkl", "rb" ))
all_data_sentence_index = all_data.set_index('sentence_id')

In [5]:
all_data.sample(2)

Unnamed: 0,sentence_id,token_id,class,before,after,class_org
8934765,675150,4,NOT_CHANGED,hepatitis,hepatitis,PLAIN
5025258,384265,4,NOT_CHANGED,man,man,PLAIN


In [7]:
categories_all = sorted(all_data["class"].unique())
print(categories_all)
print(len(categories_all))
categories_index = dict((c, i) for i, c in enumerate(categories_all))

['ELECTRONIC', 'LETTERS', 'NOT_CHANGED', 'NUMBERS', 'PLAIN', 'VERBATIM']
6


### Utils stuff

In [8]:
chars_normal, chars_normal_index = load_characters_pkl('data/en_features/chars_normal.pkl')
print(''.join(chars_normal))

<SOS><EOS>☒ !"#$%&'(),-./0123456789:;ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~£¥ª²³µº¼½¾éɒʻˈΩμ—€⅓⅔⅛


In [29]:
common_words, common_words_index = load_common_words_10k()
len(common_words)
common_words[0:10]

8192

['<EOS>', '<SOS>', '<UNK>', '<0000>', '<SAMPLE>', '.', ',', 'the', '"', 'of']

### More balanced sample

In [30]:
all_data.groupby("class")["class"].count()

class
ELECTRONIC        4964
LETTERS         152986
NOT_CHANGED    9258648
NUMBERS         448172
PLAIN            36472
VERBATIM         16950
Name: class, dtype: int64

In [31]:
balanced_data_classes_select = list(all_data.groupby('class'))

balanced_data_accessed_counter = 0 
balanced_data_length = 0
def balanced_data_randomize(max_len=20000):
    global balanced_data, balanced_data_length, balanced_data_accessed_counter
    balanced_data = pd.concat([v.sample(min(max_len, len(v))) for k, v in balanced_data_classes_select])
    balanced_data_length = len(balanced_data)
    balanced_data_accessed_counter = 0

def balanced_data_sample_row():
    global balanced_data_accessed_counter
    global balanced_data_last_sample
    balanced_data_accessed_counter += 1
    if balanced_data_accessed_counter/balanced_data_length > 0.2:
        balanced_data_randomize()
    balanced_data_last_sample = balanced_data.iloc[random.randint(1, balanced_data_length-1)]
    return balanced_data_last_sample
    
balanced_data_randomize()

In [32]:
balanced_data.groupby("class")["class"].count()

class
ELECTRONIC      4964
LETTERS        20000
NOT_CHANGED    20000
NUMBERS        20000
PLAIN          20000
VERBATIM       16950
Name: class, dtype: int64

### Samples

In [43]:
def get_random_sample():
    sample_row = balanced_data_sample_row()
    sentence_id = sample_row['class']

    rows = all_data_sentence_index.loc[sample_row['sentence_id']]
    befores = list(rows.before)
    token_id_idx = list(rows['token_id']).index(sample_row['token_id'])
    #befores[token_id_idx] = SAMPLE_WORD_TOKEN
    
    befores = [simple_tokeniser(w) for w in befores]
    befores[token_id_idx] = [SAMPLE_WORD_TOKEN]
    befores = np.concatenate(befores)
    
    return sample_row['before'], sample_row['after'], sample_row['class'], befores
            
def tmp():
    s_bef, s_aft, s_class, s_sentence = get_random_sample()
    print(s_class, ':', s_bef, '->', s_aft)
    print(' '.join(s_sentence))
    wt = words_to_tensor(list(s_sentence), common_words_index)
    print(' '.join([common_words[w] for w in wt.topk(1)[1][0].numpy()[:,0]]))
tmp()

PLAIN : - -> to
virtue 2001 , p . 161 <SAMPLE> 162 .
<UNK> 2001 , p . 161 <SAMPLE> 162 . <EOS>


# Model

In [44]:
class CategorizeRNN(nn.Module):
    def __init__(self, output_size, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_dropout=0, chars_dropout=0, words_layers=1, chars_layers=1):
        super(CategorizeRNN, self).__init__()
        
        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size
        
        batch_first = False

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 dropout=words_dropout, batch_first=batch_first, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                 dropout=chars_dropout, batch_first=batch_first, bidirectional=True)
        
        self.lin_output = nn.Linear(words_hidden_size+chars_hidden_size, output_size)
        
    def forward(self, words_tensor, string_tensor, init_hidden = True):
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        
        all_outputs_words, hidden_words = self.rnn_words(words_tensor, hidden_words)
        output_words = all_outputs_words[:, -1]
        
        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]
        
        output = torch.cat((output_words, output_chars), 1)
        
        #output = self.lin_1(output)
        output = self.lin_output(output)
        output = F.log_softmax(output)
        
        return output

    def init_hidden(self):
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
        var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [45]:
model = CategorizeRNN(len(categories_all), 
                      words_input_size=len(common_words), chars_input_size=len(chars_normal),
                      words_hidden_size=128, chars_hidden_size=128,
                      words_layers=2, chars_layers=2,
                      words_dropout=0.2, chars_dropout=0.2)
model = model.cuda()

model_training = ModelTraining(MODEL_SAVE_PATH, [model])

model

Save path: data/models/category_simple_7_minibatches


CategorizeRNN (
  (rnn_words): LSTM(8192, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (rnn_chars): LSTM(104, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (lin_output): Linear (256 -> 6)
)

### Testing batching

In [246]:
s_bef, s_aft, s_class, s_sentence = sample = get_random_sample()


'ltd'

In [248]:
def pad(tensor, length):
    return torch.cat([tensor, tensor.new(length - tensor.size(0), *tensor.size()[1:]).zero_()])

In [300]:
sbefs_org = ['aaaaa', 'bb']

In [282]:
sbefs = [string_to_tensor(s_bef, chars_normal_index).view(1,104,-1) for s_bef in sbefs]

In [293]:
sbefs[0].squeeze().unsqueeze(1).size()

torch.Size([6, 1, 104])

In [294]:
sbefs = [t.squeeze().unsqueeze(1) for t in sbefs]

In [295]:
[t.size() for t in sbefs]

[torch.Size([6, 1, 104]), torch.Size([3, 1, 104])]

In [298]:
max_length = sbefs[0].size()[0]
sbefs_padded = [pad(t, max_length) for t in sbefs]
#torch.cat(sbefs_padded)

In [299]:
[t.size() for t in sbefs_padded]

[torch.Size([6, 1, 104]), torch.Size([6, 1, 104])]

In [334]:
torch.zeros(10).new(1,0,0).size()

torch.Size([1])

In [329]:
def pad(tensor, length):
    return tensor.new(1, length - tensor.size(1), *tensor.size()[2:]).zero_()
    return torch.cat([tensor, tensor.new(1, length - tensor.size(1), *tensor.size()[2:]).zero_()])

In [303]:
sbefs = [string_to_tensor(s_bef, chars_normal_index) for s_bef in sbefs_org]

In [332]:
[t.size() for t in sbefs]

[torch.Size([1, 6, 104]), torch.Size([1, 3, 104])]

In [313]:
sbefs[0].squeeze().size()

torch.Size([6, 104])

In [314]:
max_length = sbefs[0].size()[1]
sbefs_padded = [pad(t.squeeze(), max_length) for t in sbefs]
[t.size() for t in sbefs_padded]

[torch.Size([6, 104]), torch.Size([6, 104])]

In [330]:
max_length = sbefs[0].size()[1]
sbefs_padded = [pad(t, max_length) for t in sbefs]
[t.size() for t in sbefs_padded]

[torch.Size([1]), torch.Size([1, 3, 104])]

In [280]:
tmp, hidden_chars = model.init_hidden()

In [270]:
model.rnn_chars(Variable(sbefs_padded), hidden_chars)

RuntimeError: Variable data has to be a tensor, but got list

In [58]:
def test_model_n_samples(model, n_samples=2):
    #words_t_arr = []; string_t_arr = []
    
    samples = [get_random_sample() for _ in range(2)]
    sorted(samples, key=lambda a: -len(a[0]))

    for _ in range(n_samples):
        s_bef, s_aft, s_class, s_sentence = sample = get_random_sample()
        

        words_t_arr.append(words_to_tensor(s_sentence, common_words_index))
        string_t_arr.append(string_to_tensor(s_bef, chars_normal_index))
    words_t = torch.cat(words_t_arr)
    #string_t = torch.cat(string_t_arr)
    return words_t
    
    words_t = Variable(words_t).cuda()
    string_t = Variable(string_t).cuda()
    
    output = model(words_t, string_t)
    guess = category_from_output(output, categories_all)

    return output, guess, (s_class, categories_index[s_class])#, sample
    
tmp = test_model_n_samples(model)
tmp

RuntimeError: inconsistent tensor sizes at /opt/conda/conda-bld/pytorch_1503970438496/work/torch/lib/TH/generic/THTensorMath.c:2709

### Accuracy

In [None]:
def test_model_single_sample(model):
    s_bef, s_aft, s_class, s_sentence = sample = get_random_sample()
    
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    output = model(words_t, string_t)
    guess = category_from_output(output, categories_all)

    return output, guess, (s_class, categories_index[s_class]), sample
    
tmp = test_model_single_sample(model)
tmp

In [None]:
def print_local_wrong_predictions(max_results=10):
    arr = get_some_wrong_predictions(model, test_model_single_sample, max_iterations=10000, max_results=max_results)
    for sample, predict, output in arr:
        s_bef, s_aft, s_class, s_sentence = sample
        m = torch.nn.functional.softmax(output).max().data[0]
        print("{:>11} -> {:>11} ({:0.3}) || {:>8} -> {} == {}".format(s_class, predict[0], m, s_bef, s_aft, ' '.join(s_sentence), ))

In [None]:
print_local_wrong_predictions(max_results=1)

### Training

In [None]:
def train(model, s_category, s_sentence, s_bef, loss_function, optimizer):
    
    category_tensor = Variable(torch.LongTensor([categories_index[s_category]])).cuda()
    
    words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    output = model(words_t, string_t)
    loss = loss_function(output, category_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return output, loss.data[0]

In [None]:
def train_iterations(n_iters=100000, lr=0.001, print_every=10000, plot_every=1000):

    start = time.time()
    model.train()

    current_loss = 0
    current_loss_iter = 0

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.NLLLoss()
    
    for iteration in range(1, n_iters + 1):
        model_training.iterations += 1
        
        s_bef, s_aft, s_class, s_sentence = get_random_sample()
        
        result, loss = train(model=model, s_category=s_class, s_sentence=s_sentence,
                             s_bef=s_bef, optimizer=optimizer, loss_function=loss_function)

        current_loss += loss
        current_loss_iter += 1

        # Print iter number, loss, name and guess
        if iteration % print_every == 0:
            guess, guess_i = category_from_output(result, categories_all)
            correct = '✓' if guess == s_class else "✗ {}".format(s_class)
            print("{:>6d} {:>4.0%} ({:>8}) {:>7.3f}   | {:>6.2f}: {} -> {} ({})".format(
                      model_training.iterations, iteration/n_iters, time_since(start),
                      current_loss/current_loss_iter, loss,
                      s_bef, guess, correct))

        # Add current loss avg to list of losses
        if iteration % plot_every == 0:
            model_training.losses.append(current_loss / plot_every)
            model_training.learning_rates.append(lr)
            current_loss = 0
            current_loss_iter = 0
            
        if model_training.iterations % 50000 == 0 or model_training.iterations == 10:
            model_training.save_models()
            acc = test_model_accuracy(model, test_model_single_sample)
            model_training.accuracy.append(acc)
    
    # test_model_accuracy(model, n_sample=10000)

In [None]:
train_iterations(n_iters=50, print_every=9, lr=0.0001)

In [None]:
train_iterations(n_iters=(1000-model_training.iterations), print_every=500, lr=0.0001)

In [None]:
train_iterations(n_iters=10000, lr=0.0001, print_every=1000)

In [None]:
train_iterations(n_iters=89000, print_every=10000)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=100000, print_every=10000)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=100000, lr=0.0001, print_every=10000)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=100000, lr=0.001, print_every=10000)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=300000, lr=0.0001, print_every=25000)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=300000, lr=0.0001, print_every=10000)

In [None]:
print_local_wrong_predictions()

In [None]:
train_iterations(n_iters=300000, lr=0.00005, print_every=10000)

In [None]:
print_local_wrong_predictions()

# Results

In [None]:
plt.figure()
plt.plot(model_training.losses)

In [None]:
plt.figure()
plt.plot(model_training.accuracy)

In [None]:
%%time
plot_category_confusion_matrix(model, categories_all, test_model_single_sample, n_confusion=50000)

In [None]:
%%time
plot_category_confusion_matrix(model, categories_all, test_model_single_sample,
                               n_confusion=50000, remove_diagonal=True)

In [None]:
%%time
plot_category_confusion_matrix(model, categories_all, test_model_single_sample,
                               n_confusion=50000, remove_diagonal=True)