In [3]:
%matplotlib inline
import importlib
from pytorch_utils_oh_2 import *

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


In [4]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [5]:
import pytorch_utils_oh_2; importlib.reload(pytorch_utils_oh_2); from pytorch_utils_oh_2 import *;

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


# Data loading

In [6]:
chars_normal, chars_normal_index = load_characters_pkl('data/en_features/chars_normal.pkl')
common_words, common_words_index = load_common_words_10k()
chars_with_changes = pickle.load(open('data/en_features/chars_with_changes.pkl', "rb"))
chars_with_no_changes_re = re.compile("[^{}]".format(''.join(chars_with_changes)))
chars_with_single_output_dict = pickle.load(open('data/en_features/chars_with_single_output_dict.pkl', "rb"));

In [7]:
test_data_org = pd.read_csv('data/en_test.csv', keep_default_na=False)
len(test_data_org)

1088564

In [8]:
test_data = test_data_org.copy()
test_data_sentence_index = test_data.set_index('sentence_id')

In [9]:
test_data.sample(2)

Unnamed: 0,sentence_id,token_id,before
53612,3434,3,SchoolWaconda
881019,56651,19,release


In [10]:
chars_with_single_output_dict

{'+': 'plus w',
 '=': 'equals',
 '>': 'greater than',
 '×': 'times',
 'Α': 'alpha',
 'Β': 'beta',
 'Γ': 'gamma',
 'Δ': 'delta',
 'Ε': 'epsilon',
 'Ζ': 'zeta',
 'Η': 'eta',
 'Θ': 'theta',
 'Ι': 'iota',
 'Κ': 'kappa',
 'Λ': 'lambda',
 'Μ': 'mu',
 'Ν': 'nu',
 'Ξ': 'xi',
 'Ο': 'omicron',
 'Π': 'pi',
 'Ρ': 'rho',
 'Σ': 'sigma',
 'Τ': 'tau',
 'Υ': 'upsilon',
 'Φ': 'phi',
 'Χ': 'chi',
 'Ψ': 'psi',
 'α': 'alpha',
 'β': 'beta',
 'γ': 'gamma',
 'δ': 'delta',
 'ε': 'epsilon',
 'ζ': 'zeta',
 'η': 'eta',
 'θ': 'theta',
 'ι': 'iota',
 'κ': 'kappa',
 'λ': 'lambda',
 'ν': 'nu',
 'ξ': 'xi',
 'ο': 'omicron',
 'π': 'pi',
 'ρ': 'rho',
 'ς': 'sigma',
 'σ': 'sigma',
 'τ': 'tau',
 'υ': 'upsilon',
 'φ': 'phi',
 'χ': 'chi',
 'ψ': 'psi',
 'ω': 'omega',
 '⅝': 'five eighths',
 '⅞': 'seven eighths'}

In [11]:
def manual_pre_checks(before):
    if chars_with_no_changes_re.search(before): #contains chars that were never changed
        return before 
    if before in chars_with_single_output_dict:
        return chars_with_single_output_dict[before]
    return False

In [12]:
manual_pre_checks('yes')
manual_pre_checks('Ε')
manual_pre_checks('利')

False

'epsilon'

'利'

In [14]:
manual_pre_checks('ς')

'sigma'

# Models

### Categorizer

In [10]:
categories_all = ['NOT_CHANGED', 'NUMBERS', 'LETTERS', 'PLAIN', 'VERBATIM', 'ELECTRONIC']
categories_index = dict((c, i) for i, c in enumerate(categories_all))

In [11]:
class CategorizeRNN(nn.Module):
    def __init__(self, output_size, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_dropout=0, chars_dropout=0, words_layers=1, chars_layers=1):
        super(CategorizeRNN, self).__init__()
        
        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 dropout=words_dropout, batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                 dropout=chars_dropout, batch_first=True, bidirectional=True)
        
        self.lin_output = nn.Linear(words_hidden_size+chars_hidden_size, output_size)
        
    def forward(self, words_tensor, string_tensor, init_hidden = True):
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        
        all_outputs_words, hidden_words = self.rnn_words(words_tensor, hidden_words)
        output_words = all_outputs_words[:, -1]
        
        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]
        
        output = torch.cat((output_words, output_chars), 1)
        
        #output = self.lin_1(output)
        output = self.lin_output(output)
        output = F.log_softmax(output)
        
        return output

    def init_hidden(self):
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
        var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [12]:
m_categorize = CategorizeRNN(len(categories_all), 
                              words_input_size=len(common_words), chars_input_size=len(chars_normal),
                              words_hidden_size=128, chars_hidden_size=128,
                              words_layers=2, chars_layers=2,
                              words_dropout=0.2, chars_dropout=0.2)
m_categorize = m_categorize.cuda()

m_categorize

m_categorize.load_state_dict(torch.load('data/models/category_6_mod_data_common_words/400000_CategorizeRNN'))

CategorizeRNN (
  (rnn_words): LSTM(8192, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (rnn_chars): LSTM(104, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (lin_output): Linear (256 -> 6)
)

In [170]:
def categorize(s_bef, s_sentence):
    model = m_categorize
    #words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = words_to_tensor(sentence_arr_tokenize(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    output = model(words_t, string_t)
    guess = category_from_output(output, categories_all)
    confidence = torch.nn.functional.softmax(output).topk(1)[0].data[0][0]
    return guess[0], confidence

categorize('hello', ['<SAMPLE> it\'s me'])
categorize('hello', 'Hello welcome to <SAMPLE>'.split(' '))

('NOT_CHANGED', 0.9998573064804077)

('NOT_CHANGED', 0.9991723299026489)

### Numbers

In [58]:
number_words = pickle.load(open('data/models/numbers_gen_4_mod_data_1/number_words.pkl', 'rb'))
number_words_index = dict((c, i) for i, c in enumerate(number_words))
len(number_words_index)
def number_words_to_tensor(words, include_eos=True):
    return words_to_tensor(words, words_lookup_index=number_words_index, include_eos=include_eos)
number_words_onehot_sos = number_words_to_tensor([SOS_TOKEN], include_eos=False)

511

In [48]:
class NumEncoderRNN(nn.Module):
    def __init__(self, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_layers=1, chars_layers=1):
        super(NumEncoderRNN, self).__init__()
        
        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                batch_first=True, bidirectional=True)
        
    def forward(self, word_vectors, string_tensor, hidden = None, init_hidden = True):
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        
        all_outputs_words, hidden_words = self.rnn_words(word_vectors, hidden_words)
        output_words = all_outputs_words[:, -1]
        
        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]
        
        output = torch.cat((output_words, output_chars), 1)
        
        return output

    def init_hidden(self):
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        
        var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
        var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [49]:
num_encoder_rnn = NumEncoderRNN(words_input_size=len(common_words), chars_input_size=len(chars_normal),
                         words_hidden_size=128, chars_hidden_size=128,
                         words_layers=2, chars_layers=2).cuda()
num_encoder_rnn

NumEncoderRNN (
  (rnn_words): LSTM(8192, 64, num_layers=2, batch_first=True, bidirectional=True)
  (rnn_chars): LSTM(104, 64, num_layers=2, batch_first=True, bidirectional=True)
)

In [57]:
class NumDecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(NumDecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.rnn = nn.GRU(input_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=False)
                         # LSTM would require own hidden included
        
        self.lin_out = nn.Linear(hidden_size, input_size)
        #self.softmax = nn.LogSoftmax()

    def forward(self, char, hidden):
        #char = char.view(1,1,-1)
        #hidden = hidden.view(1,1,-1)
        output, hidden = self.rnn(char, hidden)
        output = output[:, -1] # view(1,-1)
        output = self.lin_out(output)
        output = F.log_softmax(output)
        return output, hidden

num_decoder_rnn = NumDecoderRNN(input_size=len(number_words), hidden_size=128*2, n_layers=1)
num_decoder_rnn = num_decoder_rnn.cuda()
num_decoder_rnn

NumDecoderRNN (
  (rnn): GRU(511, 256, batch_first=True)
  (lin_out): Linear (256 -> 511)
)

In [62]:
num_encoder_rnn.load_state_dict(torch.load('data/models/numbers_gen_4_mod_data_1/2250000_EncoderRNN'))
num_decoder_rnn.load_state_dict(torch.load('data/models/numbers_gen_4_mod_data_1/2250000_DecoderRNN'))

In [181]:
def num_model_eval(s_bef, s_sentence):
    #words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = words_to_tensor(sentence_arr_tokenize(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    encoder_output = num_encoder_rnn(words_t, string_t)
    
    encoder_output = encoder_output.view(1,1,-1)
    
    decoder_hidden = encoder_output
    decoder_input = Variable(number_words_onehot_sos).cuda()

    decoded_output = []
    max_length = 20
    
    decoder_confidences = []
    
    for _ in range(max_length):
        decoder_output, decoder_hidden = num_decoder_rnn(decoder_input, decoder_hidden)
        #return decoder_output

        topv, topi = decoder_output.data.topk(1)
        decoder_confidences.append(topv[0][0])
        word_index = topi[0][0]
        word = number_words[word_index] # Use own prediction as next input
                
        if word == EOS_TOKEN:
            break

        decoded_output.append(word)
        
        decoder_input = number_words_to_tensor([word], include_eos=False)
        decoder_input = Variable(decoder_input).cuda()
    
    output = ' '.join(decoded_output)
    return output, np.sum(decoder_confidences)
    
num_model_eval('4200', ['he', 'was', '<SAMPLE>', 'years', 'old'])
num_model_eval('25200€', ['the', 'price', 'is', '<SAMPLE>'])
num_model_eval('25200€', ['my', 'phone', 'is', '<SAMPLE>'])


('four thousand two hundred', -0.065897941589355469)

('twenty five thousand two o', -2.3529701232910156)

('twenty five thousand two hundred two', -2.8845462799072266)

In [199]:
def numbers(s_bef, sentence_words):
    return num_model_eval(s_bef, sentence_words)

## Whole rnn

In [205]:
chars_after = pickle.load(open('data/en_features/chars_after_1.pkl', 'rb'))
chars_after_index = dict((c, i) for i, c in enumerate(chars_after))
def after_string_to_tensor(word, include_eos=True):
    return string_to_tensor(word, chars_index=chars_after_index, include_eos=include_eos)
after_string_to_tensor('abcé').shape
after_string_to_tensor('abcé')[0, -2, -1] == 1

torch.Size([1, 5, 32])

True

In [207]:
whole_onehot_sos = after_string_to_tensor([SOS_TOKEN], include_eos=False)
whole_onehot_sos.size()

torch.Size([1, 1, 32])

In [201]:
class WholeEncoderRNN(nn.Module):
    def __init__(self, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_layers=1, chars_layers=1):
        super(WholeEncoderRNN, self).__init__()
        
        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                batch_first=True, bidirectional=True)
        
    def forward(self, word_vectors, string_tensor, hidden = None, init_hidden = True):
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        
        all_outputs_words, hidden_words = self.rnn_words(word_vectors, hidden_words)
        output_words = all_outputs_words[:, -1]
        
        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]
        
        output = torch.cat((output_words, output_chars), 1)
        
        return output

    def init_hidden(self):
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        
        var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
        var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [202]:
whole_encoder_rnn = WholeEncoderRNN(words_input_size=len(common_words), chars_input_size=len(chars_normal),
                         words_hidden_size=128, chars_hidden_size=128,
                         words_layers=2, chars_layers=2).cuda()
whole_encoder_rnn

WholeEncoderRNN (
  (rnn_words): LSTM(8192, 64, num_layers=2, batch_first=True, bidirectional=True)
  (rnn_chars): LSTM(104, 64, num_layers=2, batch_first=True, bidirectional=True)
)

In [209]:
class WholeDecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super(WholeDecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        
        self.rnn = nn.GRU(input_size, hidden_size, n_layers,
                                 batch_first=True, bidirectional=False)
        
        self.lin_out = nn.Linear(hidden_size, input_size)
        #self.softmax = nn.LogSoftmax()

    def forward(self, char, hidden):
        #char = char.view(1,1,-1)
        #hidden = hidden.view(1,1,-1)
        output, hidden = self.rnn(char, hidden)
        output = output[:, -1] # view(1,-1)
        output = self.lin_out(output)
        output = F.log_softmax(output)
        return output, hidden
    
    def init_rest_hidden(self, input_var):
        hid_var = Variable(torch.zeros(self.n_layers - 1, 1, self.hidden_size)).cuda()
        res = torch.cat((input_var, hid_var), 0)
        return res
        

whole_decoder_rnn = WholeDecoderRNN(input_size=len(chars_after), hidden_size=256, n_layers=2).cuda()
whole_decoder_rnn

WholeDecoderRNN (
  (rnn): GRU(32, 256, num_layers=2, batch_first=True)
  (lin_out): Linear (256 -> 32)
)

In [210]:
whole_encoder_rnn.load_state_dict(torch.load('data/models/whole_gen_2_chars/200000_EncoderRNN'))
whole_decoder_rnn.load_state_dict(torch.load('data/models/whole_gen_2_chars/200000_DecoderRNN'))

In [219]:
def whole_model_eval(s_bef, s_sentence):
    #words_t = words_to_tensor(list(s_sentence), common_words_index)
    words_t = words_to_tensor(sentence_arr_tokenize(s_sentence), common_words_index)
    words_t = Variable(words_t).cuda()
    
    string_t = string_to_tensor(s_bef, chars_normal_index)
    string_t = Variable(string_t).cuda()
    
    encoder_output = whole_encoder_rnn(words_t, string_t)
    
    encoder_output = encoder_output.view(1,1,-1)
    
    decoder_hidden = whole_decoder_rnn.init_rest_hidden(encoder_output)
    decoder_input = Variable(whole_onehot_sos).cuda()

    decoded_output = []
    max_length = 20
    
    decoder_confidences = []
    
    for _ in range(max_length):
        decoder_output, decoder_hidden = whole_decoder_rnn(decoder_input, decoder_hidden)
        #return decoder_output

        topv, topi = decoder_output.data.topk(1)
        char_index = topi[0][0]
        char = chars_after[char_index] # Use own prediction as next input
        
        decoder_confidences.append(topv[0][0])
                
        if char == EOS_TOKEN:
            break

        decoded_output.append(char)
        
        decoder_input = after_string_to_tensor([char], include_eos=False)
        decoder_input = Variable(decoder_input).cuda()
    
    output = ''.join(decoded_output)
    return output, np.sum(decoder_confidences)
    
whole_model_eval('4200', ['he', 'was', '<SAMPLE>', 'years', 'old'])
whole_model_eval('&', ['<SAMPLE>'])

('four hundred thousan', -6.1360671520233154)

('and', -0.0025339126586914062)

### Letters

In [15]:
def letters(x):
    try:
        x = re.sub('[^a-zA-Z]', '', x)
        x = x.lower()
        result_string = ''
        for i in range(len(x)):
            result_string = result_string + x[i] + ' '
        return(result_string.strip())  
    except:
        return x

In [16]:
letters('X.D')

'x d'

### Plain

In [17]:
def plain(x):
    return x

### Verbatim

In [18]:
def verbatim(x):
    return x

### Electronic

In [19]:
def electronic(x):
    return x

# Running

In [20]:
len(test_data)

1088564

In [113]:
if

[1, 2, 3]

In [101]:
def run_rows_category(pd_rows):
    global current_row
    iter_len = len(pd_rows)
    rows_iter = pd_rows.itertuples()
    iteration_idx = 0
    start = time.time()
    
    #for sample_row in test_data[0:100].itertuples():
    for sample_row in log_progress(rows_iter, every=10, size=iter_len):
        before = sample_row.before
        current_row = sample_row

        sentence_rows = test_data_sentence_index.loc[sample_row.sentence_id]
        sentence_words = list(sentence_rows.before)
        token_id_idx = list(sentence_rows['token_id']).index(sample_row.token_id)
        sentence_words[token_id_idx] = SAMPLE_WORD_TOKEN

        manual_pre_check_after = manual_pre_checks(before)
        if manual_pre_check_after:
            test_data.at[sample_row.Index, 'after'] = manual_pre_check_after
            test_data.at[sample_row.Index, 'pred_class'] = 'MANUAL'
            continue

        try:
            category, category_conf = categorize(before, sentence_words)
            test_data.at[sample_row.Index, 'pred_class'] = category
            test_data.at[sample_row.Index, 'pred_c_conf'] = category_conf
            if category == 'NOT_CHANGED':
                test_data.at[sample_row.Index, 'after'] = result
        except: # Exception as inst:
            test_data.at[sample_row.Index, 'pred_class'] = 'PROBLEM'
            continue

        iteration_idx += 1
        if iteration_idx%10000 == 0:
            print("{:>7d} {:>2.2%} ({:>8})".format(iteration_idx, iteration_idx/iter_len, time_since(start)))

        test_data.at[sample_row.Index, 'after'] = result

        elif category == 'NUMBERS':
            result = numbers(before)
        elif category == 'LETTERS':
            result = letters(before)
        elif category == 'PLAIN':
            result = plain(before)
        elif category == 'VERBATIM':
            result = verbatim(before)
        elif category == 'ELECTRONIC':
            result = electronic(before)
        else:
            print("PROBLEM WITH:", sample_row)

In [22]:
run_rows(test_data[0:20])

In [28]:
run_rows(test_data[:])

  10000 0.92% (  1m 19s)
  20000 1.84% (  2m 32s)
  30000 2.76% (  3m 41s)
  40000 3.67% (  4m 51s)
  50000 4.59% (  6m 10s)
  60000 5.51% (  7m 29s)
  70000 6.43% (  8m 50s)
  80000 7.35% (  10m 1s)
  90000 8.27% ( 10m 50s)
 100000 9.19% (  12m 5s)
 110000 10.11% ( 13m 25s)
 120000 11.02% ( 14m 47s)
 130000 11.94% ( 16m 11s)
 140000 12.86% ( 17m 37s)
 150000 13.78% (  19m 4s)
 160000 14.70% ( 20m 27s)
 170000 15.62% ( 21m 51s)
 180000 16.54% ( 23m 15s)
 190000 17.45% ( 24m 43s)
 200000 18.37% (  26m 9s)
 210000 19.29% ( 27m 35s)
 220000 20.21% (  29m 3s)
 230000 21.13% ( 30m 23s)
 240000 22.05% ( 31m 51s)
 250000 22.97% ( 33m 16s)
 260000 23.88% ( 34m 42s)
 270000 24.80% (  36m 6s)
 280000 25.72% ( 37m 25s)
 290000 26.64% ( 38m 43s)
 300000 27.56% ( 39m 59s)
 310000 28.48% ( 41m 14s)
 320000 29.40% ( 42m 36s)
 330000 30.32% ( 43m 55s)
 340000 31.23% ( 45m 11s)
 350000 32.15% ( 46m 33s)
 360000 33.07% ( 47m 49s)
 370000 33.99% ( 49m 13s)
 380000 34.91% ( 50m 32s)
 390000 35.83% ( 51m 5

In [31]:
pickle.dump(test_data, open('data/output_testing_2_1_first_run.pkl', 'wb'))

In [32]:
test_data[test_data['pred_class']=='PROBLEM']

Unnamed: 0,sentence_id,token_id,before,pred_class,pred_c_conf,after


In [36]:
test_data.groupby('pred_class')['pred_class'].count()/len(test_data)

pred_class
ELECTRONIC     0.000906
LETTERS        0.027239
MANUAL         0.003980
NOT_CHANGED    0.900857
NUMBERS        0.053430
PLAIN          0.012385
VERBATIM       0.001203
Name: pred_class, dtype: float64

In [37]:
test_data.sample(2)

Unnamed: 0,sentence_id,token_id,before,pred_class,pred_c_conf,after
218253,14025,8,times,NOT_CHANGED,0.999523,times
889067,57163,11,pursue,NOT_CHANGED,0.999051,pursue


In [44]:
test_data[test_data['pred_c_conf'] < 0.4].sample(10)

Unnamed: 0,sentence_id,token_id,before,pred_class,pred_c_conf,after
171576,11047,10,Don,PLAIN,0.353442,Don
270842,17405,6,v,NUMBERS,0.392677,v
1036163,66637,7,Page,PLAIN,0.386125,Page
845127,54367,0,Don,NOT_CHANGED,0.385927,Don
659862,42489,2,v,LETTERS,0.383359,v
183033,11762,13,or,NOT_CHANGED,0.385437,or
523532,33676,9,v,LETTERS,0.38464,v
1044552,67181,9,years,LETTERS,0.37761,y e a r s
807594,51960,12,v,NOT_CHANGED,0.34538,v
760125,48900,0,The,NOT_CHANGED,0.393858,The


In [243]:
def run_rows_after_categorizing(pd_rows):
    global current_row
    iter_len = len(pd_rows)
    rows_iter = pd_rows.itertuples()
    iteration_idx = 0
    start = time.time()
    
    #for sample_row in test_data[0:100].itertuples():
    for sample_row in log_progress(rows_iter, every=10, size=iter_len):
        before = sample_row.before
        current_row = sample_row

        sentence_rows = test_data_sentence_index.loc[sample_row.sentence_id]
        sentence_words = list(sentence_rows.before)
        token_id_idx = list(sentence_rows['token_id']).index(sample_row.token_id)
        sentence_words[token_id_idx] = SAMPLE_WORD_TOKEN
        
        category = sample_row.pred_class
        if category == 'NOT_CHANGED':
            result = (before, 0)
        elif category == 'NUMBERS':
            result = num_model_eval(before, sentence_words)
        elif category == 'LETTERS':
            result = whole_model_eval(before, sentence_words)
        elif category == 'PLAIN':
            result = whole_model_eval(before, sentence_words)
        elif category == 'VERBATIM':
            result = whole_model_eval(before, sentence_words)
        elif category == 'ELECTRONIC':
            result = whole_model_eval(before, sentence_words)
        else:
            print("PROBLEM WITH:", sample_row)

        iteration_idx += 1
        if iteration_idx%10000 == 0:
            print("{:>7d} {:>2.2%} ({:>8})".format(iteration_idx, iteration_idx/iter_len, time_since(start)))

        test_data.at[sample_row.Index, 'after'] = result[0]
        test_data.at[sample_row.Index, 'after_conf'] = result[1]

In [244]:
run_rows_after_categorizing(test_data[test_data['pred_class']=='NUMBERS'])

  10000 17.19% (  0m 58s)
  20000 34.39% (  1m 58s)
  30000 51.58% (  2m 59s)
  40000 68.77% (  3m 58s)
  50000 85.97% (  4m 59s)


In [245]:
test_data[test_data['pred_class']=='NUMBERS'].sample(5)

Unnamed: 0,sentence_id,token_id,before,pred_class,pred_c_conf,after,after_conf
922635,59326,8,Dec 1965,NUMBERS,1.0,december nineteen sixty five,-0.005705
959176,61695,14,1,NUMBERS,0.999999,one,-5.3e-05
817151,52565,6,146000,NUMBERS,1.0,one hundred forty six thousand,-0.078651
472613,30385,15,2004-04-06,NUMBERS,0.999995,the sixth of april two thousand four,-0.008052
821847,52866,9,7th,NUMBERS,0.999815,seventh,-0.000122


In [246]:
run_rows_after_categorizing(test_data[
    (test_data['pred_class']!='NUMBERS') &
    (test_data['pred_class']!='NOT_CHANGED') &
    (test_data['pred_class']!='MANUAL') ])

  10000 22.01% (  1m 21s)
  20000 44.02% (  2m 43s)
  30000 66.04% (   4m 2s)
  40000 88.05% (  5m 23s)


In [247]:
test_data[test_data['pred_class']=='LETTERS'].sample(5)

Unnamed: 0,sentence_id,token_id,before,pred_class,pred_c_conf,after,after_conf
744989,47934,4,A.,LETTERS,0.99984,a,-0.004233
51274,3286,0,WZRX,LETTERS,0.999607,w z r r,-3.142048
826725,53185,0,GamePro,LETTERS,0.524487,g a b l e,-5.681352
65591,4210,14,Oh,LETTERS,0.573894,o d,-1.613022
278287,17878,6,WHO,LETTERS,0.993327,w h o,-0.744287


In [248]:
test_data[test_data['pred_class']=='PLAIN'].sample(5)

Unnamed: 0,sentence_id,token_id,before,pred_class,pred_c_conf,after,after_conf
151421,9745,14,exit,PLAIN,0.782171,e x i i,-2.752991
285043,18314,4,-,PLAIN,0.999987,to,-0.028054
186468,11992,6,no,PLAIN,0.918404,number,-0.099773
1008827,64892,4,feet,PLAIN,0.64473,f e e e,-3.764598
483956,31133,14,-,PLAIN,0.999996,to,-0.028176


In [224]:
test_data.loc[375381]

sentence_id          24102
token_id                 6
before                  is
pred_class     NOT_CHANGED
pred_c_conf       0.999976
after                   is
Name: 375381, dtype: object

In [None]:
current_row

# Saving results

### Modify data

In [249]:
result_data = test_data.copy()

tmp = list(result_data.columns)
tmp[tmp.index('before')] = 'after'
result_data.columns = tmp

In [250]:
result_data['id'] = result_data.apply(lambda row: "{}_{}".format(row['sentence_id'], row['token_id']), axis=1)

In [251]:
result_data.sample(2)

Unnamed: 0,sentence_id,token_id,before,pred_class,pred_c_conf,after,after_conf,id
746860,48054,21,Galvan,NOT_CHANGED,0.973958,Galvan,,48054_21
863245,55515,8,Need,NOT_CHANGED,0.988249,Need,,55515_8


result_data.loc[10, 'after'] = '"'

### Saving to file

In [252]:
result_file_path = 'data/en_submission_1.csv'

In [253]:
result_data.to_csv(result_file_path, index=False, columns=['id', 'after'], quoting=csv.QUOTE_ALL)