In [15]:
from collections import namedtuple
# Record type bundling the fields x, y, vocab and lm for one corpus; instances
# are created further down as dataset(x=..., y=..., vocab=..., lm=...).
dataset = namedtuple('Dataset', ['x', 'y', 'vocab', 'lm'])
import linear
import kenlm
import solver
import nltk
%matplotlib inline
import matplotlib.pyplot as plt
# NOTE(review): later cells use `np`, which is not imported in this cell;
# presumably it leaks into the namespace from a %run script -- confirm.
# Re-import solver so that edits to the module are picked up by this kernel
# (`reload` is a builtin only on Python 2).
reload(solver)
Printer = solver.Printer

In [16]:
def generate_all(models, names, dataset_word, dataset_char, n, results, lm_loss_limit=2.0, **kwargs):
    """Run the solver for every named model and store the outcome in ``results``.

    Parameters
    ----------
    models : mapping of model name -> model object. Entries whose name starts
        with 'NB' must expose a ``clf`` attribute.
    names : iterable of keys of ``models`` to process, in order.
    dataset_word : dataset record (x, y, vocab, lm) used for every model except
        those whose name starts with 'VDCNN'.
    dataset_char : dataset record used for models whose name starts with 'VDCNN'.
    n : number of leading examples (``x[0:n]`` / ``y[0:n]``) to process.
    results : dict that is filled in place, keyed by model name.
    lm_loss_limit : forwarded as ``lm_loss_limit`` to ``linear.main`` and as the
        one-element tuple ``lm_loss_limits`` to ``solver.main``.
    **kwargs : forwarded unchanged to ``linear.main`` / ``solver.main``.

    Returns
    -------
    The same ``results`` dict that was passed in.
    """
    for name in names:
        # The solver module reads the language model from a module attribute.
        solver.lm = dataset_word.lm
        if name.startswith('NB'):
            # Use the dataset handed in by the caller rather than a notebook
            # global, so the function works for any corpus, not only Yelp.
            results[name] = linear.main(models[name].clf, dataset_word, n,
                                        lm_loss_limit=lm_loss_limit, **kwargs)
            continue
        # `data` (not `dataset`) avoids shadowing the module-level namedtuple factory.
        data = dataset_char if name.startswith('VDCNN') else dataset_word
        results[name] = solver.main(models[name], data.x[0:n], data.y[0:n], data.vocab,
                                    lm_loss_limits=(lm_loss_limit, ), latex=False,
                                    printer=solver.NullPrinter(), **kwargs)
    return results
def show_lm(data, results, n_samples=5, key='original_tokens'):
    """Print a few sample texts with their per-token LM score and plot a histogram.

    Only entries of ``results`` whose ``'score'`` exceeds 0.5 are considered.
    For each of them ``solver.score_paragraph(data.lm, entry[key])`` is
    evaluated; the first returned value divided by the second is the quantity
    that is histogrammed (20 bins). Up to ``n_samples`` randomly chosen entries
    with a ratio above -1 are printed as
    ``Sample <index in results> <ratio> <joined tokens>``.

    Parameters
    ----------
    data : dataset record; only ``data.lm`` is used.
    results : list of result dicts containing at least ``'score'`` and ``key``.
    n_samples : maximum number of samples to print.
    key : which token list of each entry to score and print.

    Nothing is printed or plotted when no entry passes the score filter.
    """
    # Remember the positions of the kept entries so that the printed text always
    # belongs to the same entry as the printed ratio.
    selected = [idx for idx, entry in enumerate(results) if entry['score'] > 0.5]
    if not selected:
        # An empty array cannot be indexed as [:, 0]; there is nothing to show.
        return
    lm_results = np.array([solver.score_paragraph(data.lm, results[idx][key]) for idx in selected])
    pl = lm_results[:, 0] / lm_results[:, 1]
    indices = np.nonzero(pl > -1)[0]
    np.random.shuffle(indices)
    for i in indices[:n_samples]:
        source_idx = selected[i]
        print('Sample', source_idx, pl[i], ' '.join(results[source_idx][key]))
    plt.hist(pl, bins=20)
def generate_random(model, data, n, lm_loss_limits, **kwargs):
    """Run ``solver.main`` with ``method='random'`` on the first ``n`` examples.

    ``data.lm`` is installed as ``solver.lm`` before the call. Output is sent
    to a fresh ``solver.NullPrinter()``; any extra keyword arguments are
    forwarded to ``solver.main``, whose return value is returned unchanged.
    """
    solver.lm = data.lm
    inputs = data.x[:n]
    labels = data.y[:n]
    return solver.main(
        model,
        inputs,
        labels,
        data.vocab,
        lm_loss_limits,
        printer=solver.NullPrinter(),
        method='random',
        **kwargs
    )

In [17]:
from __future__ import print_function
def full_scores(lm, s, printer):
    """Print a per-token language-model breakdown for every sentence of ``s``.

    ``s`` is split into sentences with ``nltk.sent_tokenize``; each sentence is
    word-tokenized and tokens for which ``is_punct`` is true are dropped. For
    every sentence the raw sentence is printed first, followed by one line per
    item yielded by ``lm.full_scores(...)`` (bos and eos enabled) containing:

    * the score of the token in context,
    * that score minus ``lm.score`` of the token on its own (no bos/eos),
    * the n-gram that was matched (the last ``ngram`` tokens ending here),
    * the matched n-gram length.

    Parameters
    ----------
    lm : object providing ``full_scores`` and ``score``
        (a ``kenlm`` model elsewhere in this notebook).
    s : text to analyse.
    printer : object with a ``print`` method that receives each output line.
    """
    for sent in nltk.sent_tokenize(s):
        # A list comprehension (instead of filter) yields a real list on both
        # Python 2 and Python 3, so it can be concatenated with the markers.
        words = [tok for tok in nltk.word_tokenize(sent) if not is_punct(tok)]
        tokens = ['<s>'] + words + ['</s>']
        printer.print(sent)
        scored = lm.full_scores(' '.join(words), eos=True, bos=True)
        for i, (score, ngram, oov) in enumerate(scored):
            # tokens[i + 1] is the token being scored (index 0 is '<s>').
            unigram = lm.score(tokens[i + 1], bos=False, eos=False)
            context = ' '.join(tokens[i + 2 - ngram: i + 2])
            printer.print('{:.2f} {:.2f} {} {}'.format(score, score - unigram, context, ngram))

def encode_html(s):
    """Escape ``&``, ``<`` and ``>`` in ``s`` so it can be embedded in HTML text.

    The ampersand is replaced first; otherwise the ``&`` introduced by the
    ``&lt;`` / ``&gt;`` replacements would be escaped a second time, and input
    that already looks like an entity (e.g. ``&lt;``) would be rendered as
    markup instead of literally.
    """
    return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

def is_punct(s):
    """Return True when ``s`` contains no ASCII letter or ASCII digit.

    The check is performed on ``s.lower()``. An empty string therefore counts
    as punctuation, and so does text made only of non-ASCII characters.
    """
    return not any(
        'a' <= ch <= 'z' or '0' <= ch <= '9'
        for ch in s.lower()
    )

def parse_tokens_lm(tokens):
    """Join ``tokens`` into one space-separated, lower-cased string.

    Tokens for which ``is_punct`` returns true are left out.
    """
    kept = []
    for token in tokens:
        if is_punct(token):
            continue
        kept.append(token.lower())
    return ' '.join(kept)
def full_classify(clf, tokens, vocab, p):
    """Print every token followed by its class-1 minus class-0 log-probability.

    Parameters
    ----------
    clf : classifier exposing ``feature_log_prob_``; rows 0 and 1 are used.
    tokens : sequence of token strings, or a sequence of ids. When the first
        element is not a ``str`` the sequence is passed through
        ``np.trim_zeros`` and each remaining id is mapped to a token with
        ``vocab.vocabulary_.reverse``.
    vocab : object whose ``vocabulary_`` provides ``get`` (token -> index) and
        ``reverse`` (index -> token).
    p : printer; ``p.print`` is called once per token with ``end=' '``.

    Each printed item has the form ``<html-escaped token> (<delta, 2 decimals>)``.
    """
    lookup = vocab.vocabulary_
    first_is_text = isinstance(tokens[0], str)
    if not first_is_text:
        tokens = [lookup.reverse(idx) for idx in np.trim_zeros(tokens)]
    log_prob = clf.feature_log_prob_
    delta_prob = log_prob[1] - log_prob[0]
    for token in tokens:
        word_idx = lookup.get(token)
        label = encode_html(token)
        weight = delta_prob[word_idx]
        p.print('{} ({:.2f})'.format(label, weight), end=' ')
def col(results, key):
    """Collect ``result[key]`` for every item of ``results``, in order, as a list."""
    column = []
    for result in results:
        column.append(result[key])
    return column

In [18]:
from __future__ import print_function
reload(solver)
def score_all(data, results_dict):
    """Print summary statistics for every result list in ``results_dict``.

    For each key (in sorted order) two lines are printed:

    1. the key and the fraction of entries whose ``'score'`` exceeds 0.5;
    2. if that fraction is non-zero: the mean of ``diff / len(tokens)`` over
       those entries, and ``ppl`` of the summed first value divided by the
       summed second value returned by ``solver.score_paragraph`` on their
       ``'tokens'``.

    Afterwards a ``Clean`` section prints the same ``ppl`` figure computed from
    the ``'original_tokens'`` of *all* entries of the last result list (last
    key in sorted order), followed by the number of entries.
    NOTE(review): using only the last list presumably assumes every model was
    run on the same inputs -- confirm.

    Parameters
    ----------
    data : dataset record; only ``data.lm`` is used.
    results_dict : mapping name -> list of result dicts with the keys
        ``'score'``, ``'tokens'``, ``'diff'`` and ``'original_tokens'``.

    Nothing is printed when ``results_dict`` is empty.
    """
    def ppl(x):
        # 10 ** -x: turns an average log10 score into a perplexity-style figure.
        return 1 / 10 ** x

    if not results_dict:
        # Without any result list there is nothing to summarise, and the
        # "Clean" section below would have no entries to read.
        return

    results = None
    # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
    for k, results in sorted(results_dict.items()):
        successes = [entry for entry in results if entry['score'] > 0.5]
        lm_results = np.array([solver.score_paragraph(data.lm, entry['tokens']) for entry in successes])
        # The 1e-6 keeps the division defined for an empty token list.
        diffs = [entry['diff'] / (len(entry['tokens']) + 1e-6) for entry in successes]
        print(k, 1.0 * len(lm_results) / len(results))
        if len(lm_results) > 0:
            print(np.mean(diffs), ppl(np.sum(lm_results[:, 0]) / np.sum(lm_results[:, 1])))

    # `results` now refers to the list of the last key in sorted order.
    lm_results = np.array([solver.score_paragraph(data.lm, entry['original_tokens']) for entry in results])
    print('Clean')
    print(ppl(np.sum(lm_results[:, 0]) / np.sum(lm_results[:, 1])))
    print(len(lm_results))

# Train Model
First, run the following scripts in a shell to train the models.
LSTM models should use a tag matching lstm\*; convolutional models should use a tag matching conv\*.
The deep CNN is trained with a separate script (`deep_model.py`), and its `-v` value should match 2\*.
```sh
python word_model.py train --dataset yelp_review_polarity_csv --num_filters 512 --decay 2e-4 --tag lstm-mean-2 --gpu 1 --mem 0.5
python word_model.py train --dataset yelp_review_polarity_csv --num_filters 300 --decay 2e-4 --tag conv300x1_dc2e4_l200 --gpu 1 --mem 0.5
python deep_model.py train --dataset yelp_review_polarity_csv --blocks 1,1,1,1 -v 2-layer9-run2 --gpu 1 --mem 0.5
```

## Load Models

In [19]:
# Value substituted into the `--gpu {GPU}` argument of the %run commands below.
GPU = '1' 
#GPU = '""' # CPU

In [43]:
# Language model shared by the word-level and character-level Yelp datasets.
yelp_lm = kenlm.LanguageModel('dataset/yelp_review_polarity_csv/lm.arpa.bin')
# Model name -> model object; the names are the keys later passed to generate_all.
yelp_models = dict()

#print('training first model')

# NOTE(review): each %run below is expected to leave `model`, `x_shuffled`,
# `y_shuffled` (and, for word_model.py, `vocab`) in the notebook namespace;
# they are not defined anywhere in this notebook -- confirm in the scripts.
# Because every %run overwrites these names, the statement order matters.
%run word_model.py notebook --dataset yelp_review_polarity_csv --num_filters 512 --decay 2e-4 --tag lstm-mean-2 --gpu {GPU} --mem 0.5
yelp_models['LSTM'] = model
# Word-level dataset, captured right after the first word_model.py run.
yelp_data = dataset(x=x_shuffled, y=y_shuffled, vocab=vocab, lm=yelp_lm)

#print('training second model')

%run word_model.py notebook --dataset yelp_review_polarity_csv --num_filters 300 --decay 2e-4 --tag conv300x1_dc2e4_l200 --gpu {GPU} --mem 0.5
yelp_models['WordCNN'] = model
%run deep_model.py notebook --dataset yelp_review_polarity_csv --blocks 1,1,1,1 -v 2-layer9-run2 --gpu {GPU} --mem 0.5
yelp_models['VDCNN-11'] = model
# Character-level dataset: x/y come from the deep_model.py run, the vocabulary is solver.alphabet.
yelp_data_char = dataset(x=x_shuffled, y=y_shuffled, vocab=solver.alphabet, lm=yelp_lm)

# 'NB' entry: a classifier built by linear.model, wrapped in solver.SklearnAdaptor.
yelp_nb_data = linear.load_dataset('yelp_review_polarity_csv')
yelp_clf = linear.model(*yelp_nb_data)
yelp_models['NB'] = solver.SklearnAdaptor(yelp_clf)

INFO:tensorflow:Restoring parameters from word/runs_yrpc/lstm-mean-2/model-7876
Prepare for run in notebook
INFO:tensorflow:Restoring parameters from word/runs_yrpc/conv300x1_dc2e4_l200/model-7876
Prepare for run in notebook
Data from dataset/yelp_review_polarity_csv/test.csv loaded.
INFO:tensorflow:Restoring parameters from runs_yrpc/v2-layer9-run2/model-27566
Prepare for run in notebook
Dataset yelp_review_polarity_csv loaded ..
0.878947368421


## Generate Adversarial Examples

In [None]:
# Filled in place by generate_all, keyed by model name.
yelp_results=dict()
# Number of leading examples of each dataset handed to the solver.
n = 10
# The return value is the same dict as yelp_results, so it is discarded here.
_ = generate_all(yelp_models, ['NB', 'VDCNN-11', 'WordCNN', 'LSTM'], yelp_data, yelp_data_char, n=n, results=yelp_results, 
             target_diffs=0.5, 
             lm_loss_limit=2.0,
             target_proba=(0.9, ) )
# Number of results passed to solver.show_results as `n`.
n_display = 10
solver.show_results(yelp_results, (0.9, ), fraction_words=1.00, n = n_display)

## Legacy Code

In [None]:
# Legacy loader for the trec07p models (word-level LSTM and CNN only).
# NOTE(review): relies on `model`, `x_shuffled`, `y_shuffled` and `vocab`
# being left in the namespace by each %run -- confirm in word_model.py.
trec_models = dict()
%run word_model.py test --tag lstm-mean --dropout 0.5,0.5 --num_filters 300 --dataset trec07p --gpu 1 --mem 0.4
trec_lm = kenlm.LanguageModel('lm/trec07p_train.arpa.bin')
trec_data = dataset(x=x_shuffled, y=y_shuffled, vocab=vocab, lm=trec_lm)
trec_models['LSTM'] = model
%run word_model.py test --tag conv300x1 --num_filters 300 --dropout 0.5 --dataset trec07p --gpu 1 --mem 0.4
trec_models['WordCNN'] = model

In [None]:
# Legacy loader for the Amazon review polarity models.
# NOTE(review): relies on `model`, `x_shuffled`, `y_shuffled` and `vocab`
# being left in the namespace by each %run -- confirm in the scripts.
amazon_models=dict()
%run word_model.py notebook --dataset amazon_review_polarity_csv --num_filters 300 --decay 2e-4 --tag conv300x1_dc2e4_l200 --gpu 1 --mem 0.25 --checkpoint 32000 
amazon_models['WordCNN'] = model
%run word_model.py notebook --dataset amazon_review_polarity_csv --num_filters 512 --decay 2e-4 --tag lstm-mean --gpu 1 --mem 0.25
amazon_models['LSTM'] = model
# Word-level dataset, captured after the second word_model.py run.
amazon_data = dataset(x=x_shuffled, y=y_shuffled, vocab=vocab, lm=kenlm.LanguageModel('lm/arpc_train.arpa.bin'))
%run deep_model.py notebook --gpu 1 --dataset amazon_review_polarity_csv --mem 0.3 -v 2
# Character-level dataset; reuses the language model object of amazon_data.
amazon_data_char = dataset(x=x_shuffled, y=y_shuffled, vocab=solver.alphabet, lm=amazon_data.lm)
amazon_models['VDCNN'] = model
amazon_nb_data = linear.load_dataset('amazon_review_polarity_csv')
amazon_clf = linear.model(*amazon_nb_data)
amazon_models['NB'] = solver.SklearnAdaptor(amazon_clf)

In [None]:
# Legacy loader for the aclImdb models (no character-level model is loaded here).
# NOTE(review): relies on `model`, `x_shuffled`, `y_shuffled` and `vocab`
# being left in the namespace by each %run -- confirm in word_model.py.
imdb_models = dict()
%run word_model.py notebook --dataset aclImdb --num_filters 512 --decay 2e-4 --tag lstm-mean --gpu 1 --mem 0.3 --checkpoint 352
imdb_models['LSTM'] = model
imdb_data = dataset(x=x_shuffled, y=y_shuffled, vocab=vocab, lm=kenlm.LanguageModel('lm/aclImdb_train.arpa.bin'))
%run word_model.py notebook --dataset aclImdb --num_filters 300 --decay 2e-4 --tag conv300x1_dc2e4_l200 --gpu 1 --mem 0.25 --checkpoint 704 
imdb_models['WordCNN'] = model
imdb_nb_data = linear.load_dataset('aclImdb')
imdb_clf = linear.model(*imdb_nb_data)
imdb_models['NB'] = solver.SklearnAdaptor(imdb_clf)

In [None]:
# Legacy Yelp loader. Running this cell rebinds yelp_lm, yelp_models, yelp_data
# and yelp_data_char, replacing the objects created in the "Load Models" cell.
# It uses a different language model file and explicit --checkpoint values.
yelp_lm = kenlm.LanguageModel('lm/yrpc_train.arpa.bin')
yelp_models = dict()
# Same definition as in the first cell of the notebook.
dataset = namedtuple('Dataset', ['x', 'y', 'vocab', 'lm'])
#%run word_model.py notebook --dataset yelp_review_polarity_csv --num_filters 512 --decay 2e-4 --tag lstm-mean --gpu 1 --mem 0.3 --checkpoint 7876
#yelp_models['LSTM-I'] = model
# NOTE(review): relies on `model`, `x_shuffled`, `y_shuffled` and `vocab`
# being left in the namespace by each %run -- confirm in the scripts.
%run word_model.py notebook --dataset yelp_review_polarity_csv --num_filters 512 --decay 2e-4 --tag lstm-mean-2 --gpu {GPU} --mem 0.3 --checkpoint 7876
yelp_models['LSTM'] = model
yelp_data = dataset(x=x_shuffled, y=y_shuffled, vocab=vocab, lm=yelp_lm)
%run word_model.py notebook --dataset yelp_review_polarity_csv --num_filters 300 --decay 2e-4 --tag conv300x1_dc2e4_l200 --gpu {GPU} --mem 0.25 --checkpoint 7876 
yelp_models['WordCNN'] = model
%run deep_model.py notebook --gpu {GPU} --dataset yelp_review_polarity_csv --mem 0.3
yelp_models['VDCNN-19'] = model
# Character-level dataset, captured after the first deep_model.py run.
yelp_data_char = dataset(x=x_shuffled, y=y_shuffled, vocab=solver.alphabet, lm=yelp_lm)
#%run deep_model.py notebook --gpu 1 --dataset yelp_review_polarity_csv --mem 0.4 --blocks 1,1,1,1 -v 2-layer9 --checkpoint 36000
#yelp_models['VDCNN-11-I'] = model
%run deep_model.py notebook --gpu {GPU} --dataset yelp_review_polarity_csv --mem 0.4 --blocks 1,1,1,1 -v 2-layer9-run2 --checkpoint 27566
yelp_models['VDCNN-11'] = model

yelp_nb_data = linear.load_dataset('yelp_review_polarity_csv')
yelp_clf = linear.model(*yelp_nb_data)
yelp_models['NB'] = solver.SklearnAdaptor(yelp_clf)