In [1]:
%matplotlib inline
import importlib
from pytorch_utils_oh_2 import *

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import pytorch_utils_oh_2; importlib.reload(pytorch_utils_oh_2); from pytorch_utils_oh_2 import *;

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


# Data loading

In [4]:
# Character and word vocabularies handed to the tensor-encoding helpers further down.
chars_normal, chars_normal_index = load_characters_pkl('data/en_features/chars_normal.pkl')
common_words, common_words_index = load_common_words_10k()

# NOTE: pickle.load can execute arbitrary code; only load trusted feature files.
with open('data/en_features/chars_with_changes.pkl', "rb") as pkl_file:
    chars_with_changes = pickle.load(pkl_file)

# Matches any single character that is NOT listed in chars_with_changes.
# re.escape keeps characters such as ']', '^', '-' or '\' from being
# interpreted as character-class syntax instead of literal characters.
chars_with_no_changes_re = re.compile("[^{}]".format(re.escape(''.join(chars_with_changes))))

with open('data/en_features/chars_with_single_output_dict.pkl', "rb") as pkl_file:
    chars_with_single_output_dict = pickle.load(pkl_file)

In [7]:
# keep_default_na=False: tokens are read as literal text, so strings pandas would
# otherwise treat as missing values are not converted to NaN.
test_data_org = pd.read_csv('data/en_test_2.csv', keep_default_na=False)
len(test_data_org)

956046

In [8]:
# Work on a copy so the freshly loaded frame stays untouched; the processing
# functions below write their results into `test_data`.
test_data = test_data_org.copy()
# Same rows indexed by sentence_id, used to fetch all tokens of one sentence by label.
test_data_sentence_index = test_data.set_index('sentence_id')

In [10]:
test_data.sample(2)

Unnamed: 0,sentence_id,token_id,before
305292,22746,11,coach
321626,23944,1,direct


### Build per-token context sentences (same approach as the other data_preparation notebooks)

In [16]:
def prepare_sentence_tokenized_by_rows(pd_rows):
    """Build the context sentence of every row and store it in ``test_data['sentence']``.

    For each row, all tokens of the same sentence are tokenised with
    ``simple_tokeniser``, the row's own token is replaced by
    ``SAMPLE_WORD_TOKEN`` and the pieces are joined with single spaces.

    Args:
        pd_rows: DataFrame whose rows (``sentence_id``, ``token_id``) are processed.
            Results are written into the global ``test_data`` at the row's index
            label, not into ``pd_rows`` itself.

    Side effects:
        Sets the global ``current_row`` to the row being processed (handy for
        inspecting the offending row after a failure) and prints progress.
    """
    global current_row
    iter_len = len(pd_rows)
    # Report roughly every 10% of the rows actually being processed; max(1, ...)
    # avoids a modulo-by-zero when fewer than ten rows are passed in.
    report_every = max(1, iter_len // 10)
    start = time.time()

    progress_rows = log_progress(pd_rows.itertuples(), every=10, size=iter_len)
    for iteration_idx, sample_row in enumerate(progress_rows, 1):
        current_row = sample_row

        # The list selector forces a DataFrame result even when the sentence has a
        # single token; a scalar label would return a Series there and the
        # column lookups below would operate on one row's fields instead.
        sent_rows = test_data_sentence_index.loc[[sample_row.sentence_id]]
        befores = list(sent_rows['before'])
        token_id_idx = list(sent_rows['token_id']).index(sample_row.token_id)

        befores = [simple_tokeniser(w) for w in befores]
        befores[token_id_idx] = [SAMPLE_WORD_TOKEN]
        sentence = ' '.join(np.concatenate(befores))

        test_data.at[sample_row.Index, 'sentence'] = sentence

        if iteration_idx % report_every == 0 or iteration_idx == 5000:
            print("{:>7d} {:>2.2%} ({:>8})".format(iteration_idx, iteration_idx/iter_len, time_since(start)))

In [19]:
prepare_sentence_tokenized_by_rows(test_data)

   5000 0.52% (   0m 3s)
  95604 10.00% (   1m 5s)
 191208 20.00% (  2m 11s)
 286812 30.00% (  3m 16s)
 382416 40.00% (  4m 22s)
 478020 50.00% (  5m 27s)
 573624 60.00% (  6m 35s)
 669228 70.00% (  7m 39s)
 764832 80.00% (  8m 43s)
 860436 90.00% (  9m 48s)
 956040 100.00% ( 10m 53s)


In [23]:
test_data[30:33]

Unnamed: 0,sentence_id,token_id,before,sentence
30,3,4,register,the party applied to <SAMPLE> this with the el...
31,3,5,this,the party applied to register <SAMPLE> with th...
32,3,6,with,the party applied to register this <SAMPLE> th...


In [26]:
# Context manager guarantees the file is flushed and closed once the dump finishes.
with open('data/en_test_2_sentences.pkl', 'wb') as out_file:
    pickle.dump(test_data, out_file)

# Classification (category_8_fixes)

In [31]:
# Class labels of the categoriser, in the order of its output units.
categories_all = [
    'ELECTRONIC',
    'LETTERS',
    'NOT_CHANGED',
    'NUMBERS',
    'PLAIN',
    'VERBATIM',
]
# Reverse lookup: label -> position in categories_all.
categories_index = {label: position for position, label in enumerate(categories_all)}

### Manual rule-based pre-checks

In [27]:
def manual_pre_checks(before):
    """Resolve a token by lookup rules before any model is consulted.

    Returns:
        ``before`` itself when it contains a character matched by
        ``chars_with_no_changes_re`` (a character outside ``chars_with_changes``);
        otherwise the entry of ``chars_with_single_output_dict`` for ``before``
        if there is one; otherwise ``False``.
    """
    has_unchanged_char = chars_with_no_changes_re.search(before) is not None
    if has_unchanged_char:
        return before
    # Falls back to False when the token has no fixed single output.
    return chars_with_single_output_dict.get(before, False)

In [28]:
manual_pre_checks('yes')
manual_pre_checks('Ε')
manual_pre_checks('利')

False

'epsilon'

'利'

### Model

In [29]:
class CategorizeRNN(nn.Module):
    """Token classifier combining two bidirectional LSTMs (words input and characters input).

    The last time step of each LSTM's output is concatenated and mapped by a
    linear layer to ``output_size`` log-probabilities.

    Args:
        output_size: number of classes.
        words_input_size: feature size of each step of the words input.
        chars_input_size: feature size of each step of the characters input.
        words_hidden_size: combined (both directions) output size of the words LSTM.
        chars_hidden_size: combined (both directions) output size of the chars LSTM.
        words_dropout, chars_dropout: dropout passed to the respective LSTM.
        words_layers, chars_layers: number of stacked layers of the respective LSTM.

    Both hidden sizes are halved per direction with ``// 2``, so they should be
    even; otherwise the LSTM outputs no longer add up to the linear layer's input.
    """

    def __init__(self, output_size, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_dropout=0, chars_dropout=0, words_layers=1, chars_layers=1):
        super(CategorizeRNN, self).__init__()

        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 dropout=words_dropout, batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                 dropout=chars_dropout, batch_first=True, bidirectional=True)

        self.lin_output = nn.Linear(words_hidden_size+chars_hidden_size, output_size)

    def forward(self, words_tensor, string_tensor, init_hidden = True):
        """Return log-probabilities over the classes.

        Args:
            words_tensor: batch-first input of the words LSTM.
            string_tensor: batch-first input of the chars LSTM.
            init_hidden: when True, start from the explicit zero states built by
                ``init_hidden()`` (batch size 1, moved to the GPU). When False,
                no state is passed and each LSTM falls back to its own default
                initial state.
        """
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        else:
            # This path used to raise UnboundLocalError because the states were
            # never bound; None lets nn.LSTM create its default initial state.
            hidden_words = hidden_chars = None

        all_outputs_words, hidden_words = self.rnn_words(words_tensor, hidden_words)
        output_words = all_outputs_words[:, -1]  # output at the last time step

        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]  # output at the last time step

        output = torch.cat((output_words, output_chars), 1)

        output = self.lin_output(output)
        # `dim` is left implicit on purpose: the notebook reports PyTorch 0.2.0,
        # and the explicit argument is assumed to be unavailable there — TODO confirm.
        output = F.log_softmax(output)

        return output

    def init_hidden(self):
        """Build zero (h_0, c_0) pairs for both LSTMs, for batch size 1, on the GPU.

        Returns:
            ``((h_words, c_words), (h_chars, c_chars))``; each tensor has shape
            ``(2 * layers, 1, hidden_size // 2)`` (the factor 2 is for the two directions).
        """
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
        var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))

In [32]:
# Categoriser instance moved to the GPU.
# NOTE(review): these sizes/layer counts presumably have to match the checkpoint
# loaded in the next cell — confirm before changing any of them.
cat_model = CategorizeRNN(len(categories_all), 
                      words_input_size=len(common_words), chars_input_size=len(chars_normal),
                      words_hidden_size=256, chars_hidden_size=384,
                      words_layers=2, chars_layers=2,
                      words_dropout=0.2, chars_dropout=0.2)
cat_model = cat_model.cuda()

cat_model

CategorizeRNN (
  (rnn_words): LSTM(8192, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (rnn_chars): LSTM(104, 192, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (lin_output): Linear (640 -> 6)
)

In [34]:
cat_model.load_state_dict(torch.load('data/models/category_8_fixes/700000_CategorizeRNN'))

In [35]:
def categorize(s_bef, s_sentence):
    """Predict the category of token ``s_bef`` given its context ``s_sentence``.

    Returns:
        ``(category, confidence)``: the first element of what
        ``category_from_output`` returns for the model output, and the largest
        softmax value of that output.
    """
    # KNOWN BUG, KEPT ON PURPOSE: the model had a bug and was using only chars
    # instead of sentences. list(s_sentence) splits a str into characters, so the
    # "words" input is really a character sequence. The word-level call would be:
    #   words_to_tensor(sentence_arr_tokenize(s_sentence), common_words_index)
    # Do not switch to it unless the model is fixed as well.
    words_t = Variable(words_to_tensor(list(s_sentence), common_words_index)).cuda()
    string_t = Variable(string_to_tensor(s_bef, chars_normal_index)).cuda()

    log_probs = cat_model(words_t, string_t)
    best_guess = category_from_output(log_probs, categories_all)
    # The model emits log-softmax values; softmax over them gives the probabilities back.
    top_prob = torch.nn.functional.softmax(log_probs).topk(1)[0].data[0][0]
    return best_guess[0], top_prob

categorize('hello', ['<SAMPLE> it\'s me'])
categorize('hello', 'Hello welcome to <SAMPLE>'.split(' '))

('NOT_CHANGED', 1.0)

('NOT_CHANGED', 1.0)

### Process

In [50]:
def run_rows_category(pd_rows):
    """Classify every row of ``pd_rows`` and record the outcome in the global ``test_data``.

    Per row:
      * if ``manual_pre_checks`` returns a truthy value, it becomes ``after``
        and ``pred_class`` is set to ``'MANUAL'``;
      * otherwise ``categorize`` is run and ``pred_class`` / ``pred_c_conf`` are
        stored; for ``'NOT_CHANGED'`` the token itself is copied to ``after``;
      * if categorisation raises, ``pred_class`` is set to ``'PROBLEM'``.

    Args:
        pd_rows: DataFrame with ``before`` and ``sentence`` columns. Results are
            written to ``test_data`` at each row's index label, not to ``pd_rows``.

    Side effects:
        Sets the global ``current_row`` to the row being processed and prints progress.
    """
    global current_row
    iter_len = len(pd_rows)
    # max(1, ...) avoids a modulo-by-zero when fewer than ten rows are passed in.
    report_every = max(1, int(iter_len * 0.1))
    start = time.time()

    # Counting every row (including MANUAL/PROBLEM ones) keeps the printed
    # percentage in line with the rows actually consumed.
    progress_rows = log_progress(pd_rows.itertuples(), every=10, size=iter_len)
    for iteration_idx, sample_row in enumerate(progress_rows, 1):
        current_row = sample_row
        before = sample_row.before
        sentence = sample_row.sentence
        manual_pre_check_after = manual_pre_checks(before)

        if manual_pre_check_after:
            test_data.at[sample_row.Index, 'after'] = manual_pre_check_after
            test_data.at[sample_row.Index, 'pred_class'] = 'MANUAL'
        else:
            try:
                category, category_conf = categorize(before, sentence)
                test_data.at[sample_row.Index, 'pred_class'] = category
                test_data.at[sample_row.Index, 'pred_c_conf'] = category_conf
                if category == 'NOT_CHANGED':
                    test_data.at[sample_row.Index, 'after'] = before
            except Exception:
                # Best effort: mark the row and carry on. Catching Exception (not a
                # bare except) lets KeyboardInterrupt still stop a long run.
                test_data.at[sample_row.Index, 'pred_class'] = 'PROBLEM'

        if ((iteration_idx % report_every == 0 and iteration_idx > 1000) or
            iteration_idx == 5000):
            print("{:>7d} {:>2.2%} ({:>8})".format(iteration_idx, iteration_idx/iter_len, time_since(start)))

In [51]:
run_rows_category(test_data[0:10])

In [54]:
test_data[8:(8+3)]

Unnamed: 0,sentence_id,token_id,before,sentence,pred_class,pred_c_conf,after
8,1,4,Channel,there 's more to clear <SAMPLE> than ' the lar...,NOT_CHANGED,1.0,Channel
9,1,5,Than,there 's more to clear channel <SAMPLE> ' the ...,NOT_CHANGED,1.0,Than
10,1,6,',there 's more to clear channel than <SAMPLE> t...,,,


In [55]:
run_rows_category(test_data)

   5000 0.52% (  0m 37s)
  95604 10.00% ( 11m 38s)
 191208 20.00% ( 23m 13s)
 286812 30.00% ( 35m 25s)
 382416 40.00% ( 47m 25s)
 478020 50.00% (  59m 8s)
 573624 60.00% (  71m 2s)
 669228 70.00% ( 82m 59s)
 764832 80.00% ( 94m 48s)
 860436 90.00% (106m 37s)


In [56]:
test_data.sample(5)

Unnamed: 0,sentence_id,token_id,before,sentence,pred_class,pred_c_conf,after
190558,14352,3,",",royal tunbridge wells <SAMPLE> kent : panini c...,NOT_CHANGED,1.0,","
772292,56682,7,.,""" thjalfarar ia 1951 - 2007 "" <SAMPLE>",NOT_CHANGED,1.0,.
931507,68225,15,including,this allowed efilecabinet to expand from servi...,NOT_CHANGED,0.999999,including
379102,28158,26,.,the museum of polo and hall of fame is a 501 (...,NOT_CHANGED,1.0,.
262339,19606,5,figs,it is usually eaten with <SAMPLE> during the s...,NOT_CHANGED,0.999925,figs


In [57]:
# Context manager guarantees the file is flushed and closed once the dump finishes.
with open('data/en_test_2_3_categorized.pkl', 'wb') as out_file:
    pickle.dump(test_data, out_file)

# Whole RNN

In [63]:
MAX_ATTENTION_LENGTH = 30

In [59]:
# NOTE(review): absolute, machine-specific path — this cell fails on any other
# machine; consider deriving it from a configurable data directory.
wv_vecs, wv_words, wv_idx = load_glove('/home/ohu/koodi/data/glove_wordvec/glove.6B.50d.txt')

In [66]:
# Output vocabulary of the decoder. The file is closed as soon as it is read.
# NOTE: pickle.load can execute arbitrary code; only load trusted feature files.
with open("data/en_features/words_after_ext.pkl", 'rb') as pkl_file:
    words_after_common = pickle.load(pkl_file)
# Reverse lookup: word -> position in words_after_common.
words_after_index = {word: position for position, word in enumerate(words_after_common)}
words_after_common[0:7]
len(words_after_common)

['<EOS>', '<SOS>', '<UNK>', '<0000>', '<SAMPLE>', 'two', 'twenty']

1351

In [67]:
# One-hot vector for SOS_TOKEN over the output vocabulary, shaped (1, 1, vocab size);
# it is the first input given to the decoder.
sos_tensor = torch.zeros(1, 1, len(words_after_index))
sos_tensor[0, 0, words_after_index[SOS_TOKEN]] = 1
sos_tensor.size()
#del(onehot_sos)

# Index-based alternative, not used (the decoder takes one-hot inputs through a linear layer):
# sos_tensor = torch.LongTensor([words_after_index[SOS_TOKEN]])

torch.Size([1, 1, 1351])

### Encoder

In [64]:
class EncoderRNN(nn.Module):
    """Encoder made of two bidirectional LSTMs (word vectors and token characters).

    Args:
        words_input_size: feature size of each step of the words input.
        chars_input_size: feature size of each step of the characters input.
        words_hidden_size: combined (both directions) output size of the words LSTM.
        chars_hidden_size: combined (both directions) output size of the chars LSTM.
        words_layers, chars_layers: number of stacked layers of the respective LSTM.

    ``hidden_size`` is the sum of both hidden sizes. Each is halved per
    direction with ``// 2``, so they should be even.
    """

    def __init__(self, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_layers=1, chars_layers=1):
        super(EncoderRNN, self).__init__()

        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size
        self.hidden_size = words_hidden_size + chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                batch_first=True, bidirectional=True)

    def forward(self, word_vectors, string_tensor, hidden = None, init_hidden = True):
        """Encode one sample (only batch element 0 is used when building the outputs).

        Args:
            word_vectors: batch-first input of the words LSTM.
            string_tensor: batch-first input of the chars LSTM.
            hidden: optional ``((h_words, c_words), (h_chars, c_chars))`` initial
                states, used only when ``init_hidden`` is False.
            init_hidden: when True (default), start from the zero states built by
                ``init_hidden()``. When False, use ``hidden`` if given, otherwise
                let each LSTM use its own default initial state.

        Returns:
            ``(output, hidden_states_cat)``: ``output`` is the 1-D concatenation of
            both LSTMs' last-step outputs; ``hidden_states_cat`` has shape
            ``(MAX_ATTENTION_LENGTH, hidden_size)``, where row ``i`` is the words
            output joined with the chars output at position ``i``. Rows past the
            string length stay zero.
        """
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden()
        elif hidden is not None:
            hidden_words, hidden_chars = hidden
        else:
            # This path used to raise UnboundLocalError because the states were
            # never bound; None lets nn.LSTM create its default initial state.
            hidden_words = hidden_chars = None

        all_outputs_words, hidden_words = self.rnn_words(word_vectors, hidden_words)
        output_words = all_outputs_words[:, -1]

        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]

        # Fixed-size attention memory: characters beyond MAX_ATTENTION_LENGTH are dropped.
        hidden_states_cat = Variable(torch.zeros(MAX_ATTENTION_LENGTH, self.hidden_size)).cuda()
        for ei in range(min(MAX_ATTENTION_LENGTH, len(string_tensor[0]))):
            hidden_states_cat[ei] = torch.cat((output_words, all_outputs_chars[0, ei].view(1,-1)), 1)

        output = torch.cat((output_words[0], output_chars[0]), 0)

        return output, hidden_states_cat

    def init_hidden(self):
        """Build zero (h_0, c_0) pairs for both LSTMs, for batch size 1, on the GPU.

        Returns:
            ``((h_words, c_words), (h_chars, c_chars))``; each tensor has shape
            ``(2 * layers, 1, hidden_size // 2)`` (the factor 2 is for the two directions).
        """
        var1_1 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var1_2 = Variable(torch.zeros(2 * self.words_layers, 1, self.words_hidden_size // 2))
        var2_1 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))
        var2_2 = Variable(torch.zeros(2 * self.chars_layers, 1, self.chars_hidden_size // 2))

        var1_1 = var1_1.cuda(); var1_2 = var1_2.cuda()
        var2_1 = var2_1.cuda(); var2_2 = var2_2.cuda()
        return ((var1_1, var1_2), (var2_1, var2_2))
    
    
encoder_rnn = EncoderRNN(words_input_size=wv_vecs.shape[-1], chars_input_size=len(chars_normal),
                         words_hidden_size=256, chars_hidden_size=384,
                         words_layers=1, chars_layers=1).cuda()
encoder_rnn


EncoderRNN (
  (rnn_words): LSTM(50, 128, batch_first=True, bidirectional=True)
  (rnn_chars): LSTM(104, 192, batch_first=True, bidirectional=True)
)

### Decoder

In [71]:
class DecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder states.

    Args:
        input_size: size of the one-hot input / output vocabulary.
        hidden_size: size of the GRU state and of each encoder state row.
        n_layers: number of stacked GRU layers.
        dropout_p: dropout applied to the projected input.
        max_length: number of encoder state rows attended over.
    """

    def __init__(self, input_size, hidden_size, n_layers=1, dropout_p=0.1, max_length=MAX_ATTENTION_LENGTH):
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        # The GRU consumes the attention-combined vector, hence hidden_size inputs.
        self.rnn = nn.GRU(hidden_size, hidden_size, n_layers,
                          batch_first=True, bidirectional=False)

        self.lin_out = nn.Linear(hidden_size, input_size)

        # Linear projection of the one-hot input (used in place of an embedding lookup).
        self.emb_lin = nn.Linear(input_size, hidden_size)
        self.max_length = max_length
        self.attn = nn.Linear(2 * hidden_size, self.max_length)
        self.attn_combine = nn.Linear(2 * self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, last_input, hidden, encoder_outputs):
        """Run one decoding step.

        Args:
            last_input: previous output token as a one-hot tensor.
            hidden: current GRU state.
            encoder_outputs: encoder states to attend over, one row per position.

        Returns:
            ``(log_probs, hidden, attn_weights)``.
        """
        projected = self.dropout(self.emb_lin(last_input).view(1, 1, -1))

        # Attention weights come from the projected input and the first layer's state.
        attn_scores = self.attn(torch.cat((projected[0], hidden[0]), 1))
        attn_weights = F.softmax(attn_scores)
        context = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        combined = torch.cat((projected[0], context[0]), 1)
        rnn_input = F.relu(self.attn_combine(combined).unsqueeze(0))

        rnn_output, hidden = self.rnn(rnn_input, hidden)

        log_probs = F.log_softmax(self.lin_out(rnn_output[0]))
        return log_probs, hidden, attn_weights

    def init_rest_hidden(self, input_var):
        """Use ``input_var`` as the first layer's state and zero states for any further layers."""
        if self.n_layers <= 1:
            return input_var
        zero_layers = Variable(torch.zeros(self.n_layers - 1, 1, self.hidden_size)).cuda()
        return torch.cat((input_var, zero_layers), 0)
        

decoder_rnn = DecoderRNN(input_size=len(words_after_common), hidden_size=640, n_layers=1)
decoder_rnn = decoder_rnn.cuda()
decoder_rnn

DecoderRNN (
  (rnn): GRU(640, 640, batch_first=True)
  (lin_out): Linear (640 -> 1351)
  (emb_lin): Linear (1351 -> 640)
  (attn): Linear (1280 -> 30)
  (attn_combine): Linear (1280 -> 640)
  (dropout): Dropout (p = 0.1)
)

In [90]:
def whole_encoder_decoder(s_bef, sentence):
    """Greedily decode the normalised form of token ``s_bef`` in context ``sentence``.

    Returns:
        ``(text, (log_prob_sum, log_prob_avg))``: the decoded words joined by
        spaces, plus the sum and the mean of the top log-probability of every
        decoding step (including the step that produced EOS_TOKEN, if any).
    """
    decoder_rnn.eval()
    encoder_rnn.eval()

    # NOTE, KNOWN BUG: list(sentence) splits a str into characters rather than
    # words. Do not pass list(sentence) by default in new code. It is kept here
    # presumably to match how the loaded checkpoint was trained — confirm
    # before changing.
    sentence_vectors = words_to_word_vectors_tensor(list(sentence), wv_vecs, wv_idx)
    words_t = Variable(sentence_vectors).cuda()
    string_t = Variable(string_to_tensor(s_bef, chars_normal_index)).cuda()

    _, encoder_outputs = encoder_rnn(words_t, string_t)

    # The decoder starts from the first attention row, reshaped to a GRU state.
    decoder_hidden = decoder_rnn.init_rest_hidden(encoder_outputs[0].view(1,1,-1))
    decoder_input = Variable(sos_tensor).cuda()

    words_out = []
    step_log_probs = []
    max_steps = 20

    for _step in range(max_steps):
        decoder_output, decoder_hidden, _attn = decoder_rnn(decoder_input, decoder_hidden, encoder_outputs)

        top_values, top_indices = decoder_output.data.topk(1)
        word_index = top_indices[0][0]
        word = words_after_common[word_index]
        step_log_probs.append(top_values[0][0])

        if word == EOS_TOKEN:
            break
        words_out.append(word)

        # Feed the prediction back in as the next one-hot input.
        one_hot = torch.zeros(1, 1, len(words_after_index))
        one_hot[0, 0, word_index] = 1
        decoder_input = Variable(one_hot).cuda()

    return ' '.join(words_out), (sum(step_log_probs), np.average(step_log_probs))
    

In [91]:
whole_encoder_decoder('12.12.2017', 'and here is our <SAMPLE> sentence')

('the twelfth of october twenty ten',
 (-2.7835006713867188, -0.39764295305524555))

### Running the encoder-decoder on the test data

In [74]:
#state_dict_path = 'data/models/whole_gen_11_fixes/500000_'
# Checkpoint prefix: the decoder and encoder weights live in two files sharing
# this prefix. The commented-out line is an alternative checkpoint prefix.
#state_dict_path = 'data/models/whole_gen_11_fixes/500000_'
state_dict_path = 'data/models/whole_gen_11_fixes/2750000_'
decoder_rnn.load_state_dict(torch.load(state_dict_path + 'DecoderRNN'))
encoder_rnn.load_state_dict(torch.load(state_dict_path + 'EncoderRNN'))

In [96]:
def run_rows_on_gen_whole_rnn(pd_rows):
    """Run the encoder-decoder on every row of ``pd_rows`` and store the result in ``test_data``.

    Writes ``after``, ``after_conf_sum`` and ``after_conf_avg`` into the global
    ``test_data`` at each row's index label (``pd_rows`` itself is not modified),
    sets the global ``current_row`` to the row being processed, and prints
    progress every 10000 rows.
    """
    global current_row
    iter_len = len(pd_rows)
    start = time.time()

    progress_rows = log_progress(pd_rows.itertuples(), every=100, size=iter_len)
    for iteration_idx, sample_row in enumerate(progress_rows, 1):
        current_row = sample_row

        decoded_text, (conf_sum, conf_avg) = whole_encoder_decoder(sample_row.before, sample_row.sentence)

        if iteration_idx % 10000 == 0:
            print("{:>7d} {:>2.2%} ({:>8})".format(iteration_idx, iteration_idx/iter_len, time_since(start)))

        row_label = sample_row.Index
        test_data.at[row_label, 'after'] = decoded_text
        test_data.at[row_label, 'after_conf_sum'] = conf_sum
        test_data.at[row_label, 'after_conf_avg'] = conf_avg

In [107]:
# Rows that still need a generated output: everything not already settled as
# MANUAL or NOT_CHANGED. NOTE: run_rows_on_gen_whole_rnn writes its results into
# test_data, so this selection has to be re-created to see the updated columns.
gen_data = test_data[~test_data['pred_class'].isin(['MANUAL', 'NOT_CHANGED'])]

In [108]:
gen_data[0:2]

Unnamed: 0,sentence_id,token_id,before,sentence,pred_class,pred_c_conf,after,after_conf_sum,after_conf_avg
2,0,2,2016-03-31,last modified <SAMPLE> .,NUMBERS,1.0,the thirty first of march twenty sixteen,-0.403225,-0.050403
37,3,11,April 2017,the party applied to register this with the el...,NUMBERS,1.0,april twenty fourteen,-0.004974,-0.001244


In [105]:
run_rows_on_gen_whole_rnn(gen_data)

  10000 11.21% (  1m 26s)
  20000 22.41% (  2m 47s)
  30000 33.62% (   4m 6s)
  40000 44.82% (  5m 24s)
  50000 56.03% (  6m 42s)
  60000 67.24% (   8m 0s)
  70000 78.44% (  9m 18s)
  80000 89.65% ( 10m 36s)


In [104]:
test_data[~test_data['pred_class'].isin(['MANUAL', 'NOT_CHANGED'])][0:4]

Unnamed: 0,sentence_id,token_id,before,sentence,pred_class,pred_c_conf,after,after_conf_sum,after_conf_avg
2,0,2,2016-03-31,last modified <SAMPLE> .,NUMBERS,1.0,the thirty first of march twenty sixteen,-0.419025,-0.052378
37,3,11,April 2017,the party applied to register this with the el...,NUMBERS,1.0,april twenty fourteen,-0.008377,-0.002094
43,3,17,May 2017,the party applied to register this with the el...,NUMBERS,1.0,may twenty seventeen,-0.000137,-3.4e-05
45,4,0,21 February 2017,<SAMPLE> .,NUMBERS,1.0,the twenty first of february twenty eleven,-0.573418,-0.071677


In [114]:
gen_data[gen_data['after_conf_avg'] < -0.2]

Unnamed: 0,sentence_id,token_id,before,sentence,pred_class,pred_c_conf,after,after_conf_sum,after_conf_avg
357,33,5,2016,"college board , class of <SAMPLE> sat particip...",NUMBERS,1.000000,two thousand,-0.802635,-0.267545
358,33,6,SAT,"college board , class of 2016 <SAMPLE> partici...",PLAIN,0.984779,saturday,-0.630402,-0.315201
364,33,12,2017,"college board , class of 2016 sat participatio...",NUMBERS,1.000000,twenty thousand,-1.052792,-0.350931
525,48,6,2016,""" greenwood elementary school fast facts <SAMP...",NUMBERS,1.000000,twenty thousand,-0.769245,-0.256415
958,82,13,6 years,"eighteen pupils were killed , of whom sixteen ...",NUMBERS,1.000000,six thousand e a e,-3.654732,-0.609122
1074,93,6,021798,patent and exclusivity for : n <SAMPLE> .,NUMBERS,1.000000,o seven seven seven seven eight eight,-4.490807,-0.561351
1092,95,4,2020,the projected deficit in <SAMPLE> / 21 is £415...,NUMBERS,1.000000,two thousand two,-1.264744,-0.316186
1096,95,8,£415 million,the projected deficit in 2020 / 21 is <SAMPLE> .,NUMBERS,0.999975,four hundred hundred million pounds,-2.201344,-0.366891
1312,116,7,2016,""" miles franklin literary award , the <SAMPLE>...",NUMBERS,1.000000,twenty thousand,-0.786472,-0.262157
1573,137,7,28 years,the transfer of the concession took nearly <SA...,NUMBERS,1.000000,two two thousand s s,-2.829502,-0.471584


In [None]:
current_row

# Saving results

### Modify data

In [115]:
result_data = test_data.copy()

# Baseline fallback: with no predictions yet, submit the raw token as 'after'.
# When test_data already carries an 'after' column (filled by the steps above),
# renaming 'before' would create a second column with the same label and the
# ['id', 'after'] selection at export time would no longer be a single column.
if 'after' not in result_data.columns:
    result_data.columns = ['after' if col == 'before' else col for col in result_data.columns]

In [116]:
# Submission key "<sentence_id>_<token_id>", built with vectorised string
# concatenation instead of a Python-level call per row.
result_data['id'] = result_data['sentence_id'].astype(str) + '_' + result_data['token_id'].astype(str)

In [117]:
result_data.sample(2)

Unnamed: 0,sentence_id,token_id,before,sentence,pred_class,pred_c_conf,after,after_conf_sum,after_conf_avg,id
357385,26563,8,show's,she held that position from may 2000 through t...,NOT_CHANGED,1.0,show's,,,26563_8
3448,287,4,.,community media group llc <SAMPLE>,NOT_CHANGED,1.0,.,,,287_4


result_data.loc[10, 'after'] = '"'

### Saving to file

In [118]:
result_file_path = 'data/en_submission_2_3.csv'

In [119]:
result_data.to_csv(result_file_path, index=False, columns=['id', 'after'], quoting=csv.QUOTE_ALL)