In [1]:
%matplotlib inline
import importlib
from pytorch_utils_oh_2 import *

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


In [2]:
# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import pytorch_utils_oh_2; importlib.reload(pytorch_utils_oh_2); from pytorch_utils_oh_2 import *;

Pytorch utils oh: pytorch_utils_oh_2.py
Pytorch: 0.2.0_4


# Data loading

In [4]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load trusted, locally produced files.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


chars_normal, chars_normal_index = load_characters_pkl('data/en_features/chars_normal.pkl')
common_words, common_words_index = load_common_words_10k()
chars_with_changes = _load_pickle('data/en_features/chars_with_changes.pkl')
# Matches any character that is NOT in `chars_with_changes`. Every character is escaped
# so that regex metacharacters (']', '\', '^', '-') are taken literally inside the class.
# NOTE(review): assumes the pickle holds raw, unescaped characters - confirm.
chars_with_no_changes_re = re.compile(
    "[^{}]".format(''.join(re.escape(char) for char in chars_with_changes)))
chars_with_single_output_dict = _load_pickle('data/en_features/chars_with_single_output_dict.pkl')

In [5]:
# keep_default_na=False: tokens such as "NA" or "null" stay literal strings instead of being parsed as NaN.
test_data_org = pd.read_csv('data/en_test.csv', keep_default_na=False)
len(test_data_org)

1088564

In [6]:
# Work on a copy so the frame read from disk stays untouched.
test_data = test_data_org.copy()
# The same rows keyed by sentence_id (set_index returns a new frame; test_data itself is unchanged).
test_data_sentence_index = test_data.set_index('sentence_id')

In [7]:
test_data.sample(2)

Unnamed: 0,sentence_id,token_id,before
733936,47226,5,Fire
413241,26551,2,loss


In [8]:
def manual_pre_checks(before):
    """Try to resolve a token with lookup rules only.

    Returns:
        `before` itself when it contains a character matched by
        `chars_with_no_changes_re`; otherwise the value stored for `before` in
        `chars_with_single_output_dict`; False when neither rule applies.
    """
    has_never_changed_char = chars_with_no_changes_re.search(before) is not None
    if has_never_changed_char:
        return before
    return chars_with_single_output_dict[before] if before in chars_with_single_output_dict else False

In [9]:
manual_pre_checks('yes')
manual_pre_checks('Ε')
manual_pre_checks('利')

False

'epsilon'

'利'

In [10]:
manual_pre_checks('?')

'?'

# Models

### Categorizer

In [11]:
# Category names; categories_index maps each name to its position in this list.
categories_all = ['NOT_CHANGED', 'NUMBERS', 'LETTERS', 'PLAIN', 'VERBATIM', 'ELECTRONIC']
categories_index = {name: position for position, name in enumerate(categories_all)}

In [12]:
class CategorizeRNN(nn.Module):
    """Classifier that combines a word-level and a character-level bidirectional LSTM.

    The last time step of each LSTM output is concatenated and projected by a
    linear layer; the result is returned as log-probabilities over
    `output_size` classes.

    Args:
        output_size: number of output classes.
        words_input_size: feature size of every step of the word sequence.
        chars_input_size: feature size of every step of the character sequence.
        words_hidden_size: combined (both directions) width of the word LSTM;
            each direction gets `words_hidden_size // 2` units.
        chars_hidden_size: combined width of the character LSTM, split the same way.
        words_dropout, chars_dropout: dropout value passed to the respective nn.LSTM.
        words_layers, chars_layers: number of stacked layers of the respective nn.LSTM.
    """

    def __init__(self, output_size, words_input_size, chars_input_size, words_hidden_size, chars_hidden_size,
                 words_dropout=0, chars_dropout=0, words_layers=1, chars_layers=1):
        super(CategorizeRNN, self).__init__()

        self.words_layers = words_layers
        self.chars_layers = chars_layers
        self.words_hidden_size = words_hidden_size
        self.chars_hidden_size = chars_hidden_size

        self.rnn_words = nn.LSTM(words_input_size, words_hidden_size // 2, words_layers,
                                 dropout=words_dropout, batch_first=True, bidirectional=True)

        self.rnn_chars = nn.LSTM(chars_input_size, chars_hidden_size // 2, chars_layers,
                                 dropout=chars_dropout, batch_first=True, bidirectional=True)

        # Each bidirectional LSTM emits 2 * (hidden // 2) features. Using that exact width
        # (identical to words_hidden_size + chars_hidden_size for even sizes) keeps the
        # linear layer consistent with the LSTMs when an odd hidden size is given.
        lstm_features = 2 * (words_hidden_size // 2) + 2 * (chars_hidden_size // 2)
        self.lin_output = nn.Linear(lstm_features, output_size)

    def forward(self, words_tensor, string_tensor, init_hidden=True):
        """Return log-probabilities of shape (batch, output_size).

        Args:
            words_tensor: input of the word LSTM, batch first.
            string_tensor: input of the character LSTM, batch first.
            init_hidden: when True, both LSTMs start from the explicit zero
                states built by `init_hidden`; when False no state is passed
                and nn.LSTM uses its own default initial state.
        """
        # Default to None so that init_hidden=False no longer hits unbound locals.
        hidden_words = hidden_chars = None
        if init_hidden:
            hidden_words, hidden_chars = self.init_hidden(words_tensor.size(0))

        all_outputs_words, hidden_words = self.rnn_words(words_tensor, hidden_words)
        output_words = all_outputs_words[:, -1]  # last time step only

        all_outputs_chars, hidden_chars = self.rnn_chars(string_tensor, hidden_chars)
        output_chars = all_outputs_chars[:, -1]  # last time step only

        output = torch.cat((output_words, output_chars), 1)
        output = self.lin_output(output)
        # No `dim` argument: the notebook runs PyTorch 0.2, whose log_softmax does not accept one.
        output = F.log_softmax(output)

        return output

    def init_hidden(self, batch_size=1):
        """Build zero (h_0, c_0) states for both LSTMs.

        Args:
            batch_size: batch dimension of the states (default 1, as before).

        Returns:
            ((words_h0, words_c0), (chars_h0, chars_c0)), each of shape
            (2 * layers, batch_size, hidden_size // 2), placed on the GPU only
            when the model parameters live there.
        """
        on_gpu = next(self.parameters()).data.is_cuda

        def zero_state(layers, hidden_size):
            state = Variable(torch.zeros(2 * layers, batch_size, hidden_size // 2))
            return state.cuda() if on_gpu else state

        words_state = (zero_state(self.words_layers, self.words_hidden_size),
                       zero_state(self.words_layers, self.words_hidden_size))
        chars_state = (zero_state(self.chars_layers, self.chars_hidden_size),
                       zero_state(self.chars_layers, self.chars_hidden_size))
        return (words_state, chars_state)

In [13]:
m_categorize = CategorizeRNN(len(categories_all), 
                              words_input_size=len(common_words), chars_input_size=len(chars_normal),
                              words_hidden_size=128, chars_hidden_size=128,
                              words_layers=2, chars_layers=2,
                              words_dropout=0.2, chars_dropout=0.2)
m_categorize = m_categorize.cuda()

m_categorize

# Restore the trained weights from the saved checkpoint.
m_categorize.load_state_dict(torch.load('data/models/category_6_mod_data_common_words/400000_CategorizeRNN'))
# This notebook only runs inference: switch to eval mode so the LSTM dropout (0.2) is
# disabled and predictions are deterministic. Assigned back so the call is not echoed.
m_categorize = m_categorize.eval()

CategorizeRNN (
  (rnn_words): LSTM(8192, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (rnn_chars): LSTM(104, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (lin_output): Linear (256 -> 6)
)

In [14]:
def categorize(s_bef, s_sentence):
    """Predict the category of the token `s_bef` within its sentence.

    Args:
        s_bef: token text, converted with `string_to_tensor` and `chars_normal_index`.
        s_sentence: iterable describing the sentence; it is turned into a list
            and converted with `words_to_tensor` and `common_words_index`.

    Returns:
        (category, confidence): the first element of `category_from_output`'s
        result and the highest softmax probability of the model output.
    """
    sentence_input = Variable(words_to_tensor(list(s_sentence), common_words_index)).cuda()
    token_input = Variable(string_to_tensor(s_bef, chars_normal_index)).cuda()

    log_probs = m_categorize(sentence_input, token_input)
    prediction = category_from_output(log_probs, categories_all)
    top_probability, _ = torch.nn.functional.softmax(log_probs).topk(1)
    return prediction[0], top_probability.data[0][0]

# NOTE(review): a plain string is passed as the sentence here, so list(s_sentence)
# yields single characters rather than words - confirm this is intended.
categorize('hello', '<SAMPLE> its me')

('NOT_CHANGED', 0.9996647834777832)

### Numbers

In [15]:
def numbers(x):
    """Handler for tokens categorized as NUMBERS.

    Currently a pass-through: the input is returned unchanged.
    """
    return x

### Letters

In [16]:
def letters(x):
    """Spell a token out letter by letter.

    Every character other than ASCII a-z / A-Z is dropped; the remaining
    letters are lower-cased and joined with single spaces, e.g. 'X.D' -> 'x d'.

    Args:
        x: token text.

    Returns:
        The spaced, lower-case letters ('' when `x` has no ASCII letters).
        A value that is not a string (e.g. None) is returned unchanged.
    """
    try:
        only_letters = re.sub('[^a-zA-Z]', '', x)
    except TypeError:
        # re.sub rejects non-string input; keep the best-effort contract and
        # hand the value back untouched instead of swallowing every exception.
        return x
    return ' '.join(only_letters.lower())

In [17]:
letters('X.D')

'x d'

### Plain

In [18]:
def plain(x):
    """Handler for tokens categorized as PLAIN.

    Currently a pass-through: the input is returned unchanged.
    """
    return x

### Verbatim

In [19]:
def verbatim(x):
    """Handler for tokens categorized as VERBATIM.

    Currently a pass-through: the input is returned unchanged.
    """
    return x

### Electronic

In [20]:
def electronic(x):
    """Handler for tokens categorized as ELECTRONIC.

    Currently a pass-through: the input is returned unchanged.
    """
    return x

# Running

In [21]:
len(test_data)

1088564

In [98]:
def _sentence_words_with_sample(sample_row):
    """Return the words of the row's sentence with the row's own token replaced by SAMPLE_WORD_TOKEN."""
    sentence_rows = test_data_sentence_index.loc[sample_row.sentence_id]
    if sentence_rows.ndim == 1:
        # A sentence with a single token makes .loc return that one row as a Series
        # instead of a DataFrame; wrap the scalars so the code below still gets lists.
        sentence_words = [sentence_rows['before']]
        token_ids = [sentence_rows['token_id']]
    else:
        sentence_words = list(sentence_rows['before'])
        token_ids = list(sentence_rows['token_id'])
    sentence_words[token_ids.index(sample_row.token_id)] = SAMPLE_WORD_TOKEN
    return sentence_words


def _normalize_row(sample_row):
    """Return the (column, value) pairs to store for one row, in write order."""
    before = sample_row.before

    manual_pre_check_after = manual_pre_checks(before)
    if manual_pre_check_after:
        return [('after', manual_pre_check_after), ('pred_class', 'MANUAL')]

    # Built only for rows that actually reach the model.
    sentence_words = _sentence_words_with_sample(sample_row)
    try:
        category, category_conf = categorize(before, sentence_words)
    except Exception:
        # Best effort: flag the row and carry on (KeyboardInterrupt still stops the run).
        return [('pred_class', 'PROBLEM')]

    updates = [('pred_class', category), ('pred_c_conf', category_conf)]
    if category == 'NOT_CHANGED':
        result = before
    elif category == 'NUMBERS':
        result = numbers(before)
    elif category == 'LETTERS':
        result = letters(before)
    elif category == 'PLAIN':
        result = plain(before)
    elif category == 'VERBATIM':
        result = verbatim(before)
    elif category == 'ELECTRONIC':
        result = electronic(before)
    else:
        # Unknown category: report it and store no 'after' value, rather than
        # reusing the result left over from a previous row.
        print("PROBLEM WITH:", sample_row)
        return updates

    updates.append(('after', result))
    return updates


def run_rows(pd_rows):
    """Normalize every row of `pd_rows` and store the outcome in the global `test_data`.

    For each row the columns 'pred_class', 'pred_c_conf' (model rows only) and
    'after' are written to `test_data` at the row's index label.

    Args:
        pd_rows: DataFrame with 'sentence_id', 'token_id' and 'before' columns
            whose index labels exist in `test_data` (e.g. a slice of it).

    Side effects:
        Updates `test_data` in place, prints a progress line every 100 rows and
        keeps the row being processed in the global `current_row` so it can be
        inspected from another cell after a failure.
    """
    global current_row
    iter_len = len(pd_rows)
    start = time.time()

    progress_rows = log_progress(pd_rows.itertuples(), every=10, size=iter_len)
    # Count every processed row (manual and problem rows included) so the
    # printed percentage reaches 100%.
    for iteration_idx, sample_row in enumerate(progress_rows, 1):
        current_row = sample_row

        for column, value in _normalize_row(sample_row):
            test_data.at[sample_row.Index, column] = value

        if iteration_idx%100 == 0:
            print("{:>7d} {:>2.2%} ({:>8})".format(iteration_idx, iteration_idx/iter_len, time_since(start)))


In [99]:
run_rows(test_data[1000:2000])

    100 10.00% (   0m 1s)
    200 20.00% (   0m 2s)
    300 30.00% (   0m 3s)
    400 40.00% (   0m 4s)
    500 50.00% (   0m 6s)
    600 60.00% (   0m 7s)
    700 70.00% (   0m 8s)
    800 80.00% (   0m 9s)
    900 90.00% (  0m 10s)
   1000 100.00% (  0m 11s)


In [79]:
sample_row = current_row

In [85]:
%%timeit
sentence_rows = test_data_sentence_index.loc[sample_row.sentence_id, 'before']

237 µs ± 2.19 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [86]:
%%timeit
sentence_rows = test_data_sentence_index.loc[sample_row.sentence_id]

205 µs ± 4.82 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [47]:
%%timeit
run_rows(test_data[1000:1100])

3.78 s ± 121 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [45]:
%%timeit
run_rows(test_data[1000:1100])

2.25 s ± 61.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
%%timeit
run_rows(test_data[1000:1100])

3.77 s ± 81.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [77]:
%%timeit
# using at instead of loc
run_rows(test_data[1000:1100])

447 ms ± 7.74 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [73]:
%%timeit
test_data.loc[1090, 'after'] = 'test'

14.7 ms ± 941 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [72]:
%%timeit
test_data.iloc[1090, 5] = 'iloc_test'

14.3 ms ± 486 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [53]:
%%timeit
test_data.set_value(1090, 'after', 'testa')

7.59 µs ± 99 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [66]:
%%timeit
test_data.at[1090, 'after'] = 'at_test'

12.8 µs ± 144 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [69]:
%%timeit
test_data.iat[1090, 5] = 'iat_test'

7.79 µs ± 76.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [74]:
# Single .loc lookup instead of chained indexing (no intermediate row Series is built).
test_data.loc[1090, 'after']

'test'

In [44]:
current_row

pandas.core.frame.Pandas

In [39]:
test_data.loc[1099]

sentence_id             74
token_id                10
before                   ,
pred_class     NOT_CHANGED
pred_c_conf              1
after                    ,
Name: 1099, dtype: object


        if category == 'NOT_CHANGED':
            result = before
        elif category == 'NUMBERS':
            result = numbers(before)
        elif category == 'LETTERS':
            result = letters(before)
        elif category == 'PLAIN':
            result = plain(before)
        elif category == 'VERBATIM':
            result = verbatim(before)
        elif category == 'ELECTRONIC':
            result = electronic(before)
        else:
            print("PROBLEM WITH:", sample_row)

        iteration_idx += 1
        if iteration_idx%10000 == 0:
            print("{:>7d} {:>2.2%} ({:>8})".format(iteration_idx, iteration_idx/iter_len, time_since(start)))

        test_data.loc[sample_row.Index, 'after'] = result

In [31]:
1+1

In [24]:
current_row

Pandas(Index=19, sentence_id=1, token_id=4, before='contains')

In [None]:
# run_rows takes the DataFrame itself: it calls len() and .itertuples() on its argument.
run_rows(test_data)

In [None]:
run_rows(test_data[24000:])

In [None]:
current_row

In [None]:
test_data[0:10]

In [None]:
test_data[100000:100005]

# Saving results

### Modify data

result_data = test_data[0:20].copy()

In [None]:
result_data = test_data.copy()

In [None]:
if 'after' in result_data.columns:
    # Predictions already exist: fall back to the original token only where no
    # prediction was stored, instead of creating a second column named 'after'.
    result_data['after'] = result_data['after'].fillna(result_data['before'])
else:
    # No predictions were produced: submit every token unchanged.
    result_data = result_data.rename(columns={'before': 'after'})

In [None]:
# Vectorized "<sentence_id>_<token_id>" key; avoids a Python-level apply over every row.
result_data['id'] = result_data['sentence_id'].astype(str) + '_' + result_data['token_id'].astype(str)

In [None]:
result_data.sample(2)

result_data.loc[10, 'after'] = '"'

### Saving to file

In [None]:
result_file_path = 'data/en_submission_1.csv'

In [None]:
# Write only the 'id' and 'after' columns, without the index; QUOTE_ALL wraps every field in quotes.
result_data.to_csv(result_file_path, index=False, columns=['id', 'after'], quoting=csv.QUOTE_ALL)

In [87]:
# NOTE(review): run_rows calls this helper, so on a fresh kernel this cell has to be
# executed before run_rows is invoked.
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of `sequence` while showing an ipywidgets progress bar.

    Args:
        sequence: iterable to wrap.
        every: refresh the widget every `every` items. When omitted and the
            size is known it is 1 for up to 200 items, else int(size / 200).
        size: total number of items. When omitted, len(sequence) is tried; if
            that raises TypeError the sequence is treated as an iterator of
            unknown length and `every` must be given.
        name: label prefix shown next to the counter.

    Yields:
        Each item of `sequence`, unchanged.

    The bar style becomes 'danger' when an exception passes through the
    generator (it is re-raised) and 'success' when the sequence is exhausted.
    """
    # Imported lazily so the widgets are only required when the helper is used.
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        # Unknown length: show a full bar in 'info' style and only update the label.
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            # Refresh on the first item and then on every `every`-th item.
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        # Mark the bar as failed, then let the exception propagate unchanged.
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )