In [1]:
import os
import random
from collections import defaultdict

import pandas as pd
import numpy as np

from config import read_config, DEFAULT_CONFIG_FILE
from dialogue_denoiser_lstm import get_sample_weight, make_dataset
from data_utils import make_vocabulary

  from ._conv import register_converters as _register_converters


In [2]:
# Folder holding the dataset files; 'trainset.json' is read from it in the
# data-loading cell.
DATA_FOLDER = 'deep_disfluency_dataset_timings'
# NOTE(review): MODEL_FOLDER is not referenced in the visible cells --
# presumably the output folder for this experiment; confirm.
MODEL_FOLDER = 'importance_sampling'
# Experiment settings loaded from the project's default config file.
CONFIG = read_config(DEFAULT_CONFIG_FILE)

# Fix the seeds of both the stdlib and the NumPy global RNGs; the
# np.random.choice draw further down depends on the NumPy one.
random.seed(273)
np.random.seed(273)

In [3]:
# Load the training set, then build the token and label vocabularies and the
# encoded dataset from it.
trainset = pd.read_json(os.path.join(DATA_FOLDER, 'trainset.json'))
if CONFIG['use_pos_tags']:
    # Fuse every token with its POS tag into a single "token_POS" symbol.
    utterances = [
        ['{}_{}'.format(token, pos) for token, pos in zip(utterance, postags)]
        for utterance, postags in zip(trainset['utterance'], trainset['pos'])
    ]
else:
    # This branch must pair with the `if`, not with a `for` loop: as a
    # for/else it would discard the POS-augmented utterances whenever POS
    # tags are enabled and leave `utterances` undefined whenever they are not.
    utterances = trainset['utterance']
vocab, _ = make_vocabulary(utterances, CONFIG['max_vocabulary_size'])
label_vocab, _ = make_vocabulary(trainset['tags'].values,
                                 CONFIG['max_vocabulary_size'],
                                 special_tokens=[])
# Inverse mapping (label id -> label string), used to print readable labels.
# `.items()` works on both Python 2 and Python 3 dicts.
rev_label_vocab = {label_id: label
                   for label, label_id in label_vocab.items()}
X_train, y_train = make_dataset(trainset, vocab, label_vocab, CONFIG)

# Collapse the last axis of y_train to the index of its largest entry.
# NOTE(review): presumably y_train is one-hot encoded, making these class
# ids -- confirm against make_dataset.
y_train_flattened = np.argmax(y_train, axis=-1)

In [4]:
def samples_to_class_ids(in_samples, in_labels, in_label_vocab):
    """Translate sampled positions into entries of `in_label_vocab`.

    The labels found at positions `in_samples` of `in_labels` are picked with
    `take`, and each picked label is passed through `in_label_vocab.get`, so a
    label missing from the mapping comes back as None.
    """
    picked_labels = in_labels.take(in_samples)
    lookup = in_label_vocab.get
    return map(lookup, picked_labels)

In [5]:
def make_freq_dict(in_list):
    """Count the occurrences of every element of `in_list`.

    Returns a defaultdict mapping each element to its count stored as a
    float; looking up an element that never occurred yields 0.
    """
    counts = defaultdict(int)
    for item in in_list:
        counts[item] = counts[item] + 1.0
    return counts

In [6]:
def softmax(x, t=1.0, axis=None):
    """Compute the temperature-scaled softmax of `x`.

    Args:
        x: scores, as a NumPy array or any array-like (e.g. a list).
        t: temperature the scores are divided by before exponentiation.
        axis: axis along which the scores are normalized. The default, None,
            normalizes over all elements of `x` at once, so the whole result
            sums to 1; pass e.g. -1 to get an independent softmax per row.

    Returns:
        An array with the shape of `x` whose entries sum to 1 along `axis`.
    """
    # true_divide accepts plain sequences and never floor-divides integer
    # scores by an integer temperature.
    x_t = np.true_divide(x, t)
    # Subtracting the maximum does not change the result but keeps np.exp
    # from overflowing on large scores.
    e_x = np.exp(x_t - np.max(x_t, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)

In [9]:
def get_class_weight_proportional(in_labels, smoothing_coef=1.0):
    """Compute per-class weights that shrink as a class gets more frequent.

    Each distinct label in `in_labels` gets the weight
    ``1 / freq ** (1 / smoothing_coef)``, where ``freq`` is its number of
    occurrences. With ``smoothing_coef=1.0`` this is the plain inverse
    frequency; larger values reduce the exponent and so flatten the
    differences between classes.

    Args:
        in_labels: iterable of hashable labels.
        smoothing_coef: non-zero smoothing coefficient (see above).

    Returns:
        A dict mapping every label seen in `in_labels` to its weight; empty
        if `in_labels` is empty.

    Raises:
        ZeroDivisionError: if `smoothing_coef` is 0.
    """
    label_freqs = defaultdict(float)
    for label in in_labels:
        label_freqs[label] += 1.0
    exponent = 1.0 / smoothing_coef
    # `.items()` instead of the Python-2-only `.iteritems()` keeps this
    # working on both Python 2 and Python 3.
    return {label: 1.0 / np.power(freq, exponent)
            for label, freq in label_freqs.items()}

In [145]:
# Per-class weights 1 / freq ** (1 / smoothing_coef), computed over the
# values in y_train_flattened.
class_weight = get_class_weight_proportional(y_train_flattened, smoothing_coef=1.05)
# NOTE(review): get_sample_weight comes from dialogue_denoiser_lstm;
# presumably it assigns each label its class weight -- confirm.
sample_weights = get_sample_weight(y_train_flattened, class_weight)
# Normalize the weights to sum to 1 so they can be passed as the `p`
# argument of np.random.choice.
sample_probs = sample_weights / float(sum(sample_weights))

In [146]:
# Draw 10000 indices according to `sample_probs`, then print how many times
# each class label was drawn, most frequent first.
# Passing an int n to np.random.choice samples from np.arange(n), so no
# explicit index list has to be built.
sample = np.random.choice(sample_probs.shape[0], size=10000, p=sample_probs)
sampled_class_freqs = make_freq_dict(
    samples_to_class_ids(sample, y_train_flattened, rev_label_vocab))
# `.items()` and the parenthesized single-argument print behave the same on
# Python 2 and Python 3.
for key, value in sorted(sampled_class_freqs.items(),
                         key=lambda x: x[1], reverse=True):
    print('{}:\t{}'.format(key, value))

<f/>:	610.0
<e/>:	476.0
<rm-3/><rpMid/>:	453.0
<rm-2/><rpMid/>:	441.0
<rpEndSub/>:	428.0
<rm-2/><rpEndSub/>:	425.0
<rm-1/><rpEndSub/>:	423.0
<rm-1/><rpMid/>:	409.0
<rm-4/><rpMid/>:	404.0
<rm-3/><rpEndSub/>:	394.0
<rm-6/><rpMid/>:	392.0
<rm-5/><rpMid/>:	390.0
<rm-2/><rpEndDel/>:	383.0
<rm-1/><rpEndDel/>:	376.0
<rm-3/><rpEndDel/>:	365.0
<rm-8/><rpMid/>:	361.0
<rm-4/><rpEndSub/>:	354.0
<rm-7/><rpMid/>:	343.0
<rm-5/><rpEndSub/>:	341.0
<rm-4/><rpEndDel/>:	335.0
<rm-6/><rpEndSub/>:	332.0
<rm-7/><rpEndDel/>:	326.0
<rm-5/><rpEndDel/>:	314.0
<rm-8/><rpEndSub/>:	313.0
<rm-7/><rpEndSub/>:	309.0
<rm-6/><rpEndDel/>:	303.0
