In [1]:
import argparse
import json
from models import MLPClassifier, Baseline_Embeddings
from models import Seq2Seq, MLP_D, MLP_G, MLP_I, MLP_I_AE, JSDistance, Seq2SeqCAE, Baseline_Embeddings, Baseline_LSTM
from utils import to_gpu, Corpus, batchify, SNLIDataset, collate_snli
import random
import pickle as pkl
import torch
import numpy as np
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable


In [2]:
# Equivalent command-line run of the original script:
#python3.6 train_surrogate.py --data_path ./data/classifier --save_path game_output/ --classifier_path ./data --load_pretrained .
cur_dir = '.'

# Word -> index vocabulary; used further down to turn word lists into index tensors
# (unknown words fall back to index 3).
with open('{0}/vocab.json'.format(cur_dir), 'r') as fin:
    corpus_vocab = json.load(fin)

# SNLI splits, both read from the same directory with the same vocabulary size.
corpus_train = SNLIDataset(path='./data/classifier', train=True, vocab_size=11004-4)
corpus_test = SNLIDataset(path='./data/classifier', train=False, vocab_size=11004-4)

# Training batches are shuffled and consumed through one shared iterator; test batches keep file order.
trainloader = torch.utils.data.DataLoader(
    corpus_train, batch_size=32, shuffle=True, collate_fn=collate_snli)
train_iter = iter(trainloader)
testloader = torch.utils.data.DataLoader(
    corpus_test, batch_size=32, shuffle=False, collate_fn=collate_snli)

# Seed Python, NumPy and PyTorch with the same value.
random.seed(1111)
np.random.seed(1111)
torch.manual_seed(1111)

EPS = 3e-2


original vocab 41574; pruned to 11004
Number of sentences dropped from ./data/classifier/train.txt: 448221 out of 549367 total
original vocab 41574; pruned to 11004
Number of sentences dropped from ./data/classifier/test.txt: 8288 out of 9824 total


In [3]:
# NOTE(review): torch.load and pickle.load execute arbitrary code while unpickling;
# only point these paths at checkpoints you trust.

# Pretrained autoencoder and inverter, stored as whole pickled modules.
# Passing the path lets torch.load open *and close* the file itself.
autoencoder = torch.load(cur_dir + '/models/autoencoder_model.pt')
#gan_gen = torch.load(open(cur_dir + '/models/gan_gen_model.pt', 'rb'))
#gan_disc = torch.load(open(cur_dir + '/models/gan_disc_model.pt', 'rb'))
inverter = torch.load(cur_dir + '/models/inverter_model.pt')

# Target classifier: architecture is built here, weights come from a state dict.
classifier1 = Baseline_Embeddings(100, vocab_size=11004)
# classifier1 = Baseline_LSTM(100,300,maxlen=10, gpu=args.cuda)

classifier1.load_state_dict(torch.load('./models' + "/baseline/model_emb.pt"))
# The classifier has its own word -> index vocabulary (used by classifier_pred).
with open('./models' + "/vocab.pkl", 'rb') as vocab_file:
    vocab_classifier1 = pkl.load(vocab_file)

# Surrogate classifier operating on the two inverted latent codes (premise, hypothesis).
mlp_classifier = MLPClassifier(100 * 2, 3, layers='100-50')
#if not args.train_mode:
mlp_classifier.load_state_dict(torch.load('./surrogate{0}.pt'.format('100-50')))

print(classifier1)
print(autoencoder)
print(inverter)
print(mlp_classifier)

# Optimizer used by train_process to update the surrogate.
# The previous value `1e03` is a learning rate of 1000.0, which cannot train
# anything with Adam; 1e-3 (Adam's customary step size) was evidently intended.
optimizer = optim.Adam(mlp_classifier.parameters(),
                       lr=1e-3,
                       betas=(0.9, 0.999))




Baseline_Embeddings(
  (embedding_prem): Embedding(11004, 100)
  (embedding_hypo): Embedding(11004, 100)
  (linear): Linear(in_features=200, out_features=3, bias=True)
)
Seq2SeqCAE(
  (embedding): Embedding(11004, 300)
  (embedding_decoder): Embedding(11004, 300)
  (encoder): Sequential(
    (layer-1): Conv1d(300, 500, kernel_size=(3,), stride=(1,))
    (bn-1): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (activation-1): LeakyReLU(negative_slope=0.2, inplace)
    (layer-2): Conv1d(500, 700, kernel_size=(3,), stride=(2,))
    (bn-2): BatchNorm1d(700, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (activation-2): LeakyReLU(negative_slope=0.2, inplace)
    (layer-3): Conv1d(700, 1000, kernel_size=(3,), stride=(2,))
    (bn-3): BatchNorm1d(1000, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (activation-3): LeakyReLU(negative_slope=0.2, inplace)
  )
  (linear): Linear(in_features=1000, out_features=300, bias=

In [4]:
from torch.autograd import Variable

def evaluate_model():
    """Measure the accuracy of ``classifier1`` over ``trainloader``.

    Every batch is moved to the device that holds ``classifier1``'s parameters,
    the arg-max of the classifier output is compared with the gold target, and
    the fraction of matches is printed and returned.

    Returns:
        float: number of correct predictions divided by number of examples.

    Raises:
        ZeroDivisionError: if the loader yields no examples.
    """
    classifier1.eval()

    # The notebook never defines `args` (it only exists in the command-line
    # script), so the old `if args.cuda:` raised NameError. Follow the device
    # the classifier actually lives on instead.
    device = next(classifier1.parameters()).device

    # NOTE(review): this iterates the *training* loader although the original
    # local was called `test_iter`; confirm whether `testloader` was intended.
    correct = 0
    total = 0
    with torch.no_grad():  # inference only, no autograd graph needed
        for batch in trainloader:
            premise, hypothesis, target, _, _, _, _ = batch

            premise = premise.to(device)
            hypothesis = hypothesis.to(device)
            target = target.to(device)

            prob_distrib = classifier1((premise, hypothesis))
            predictions = np.argmax(prob_distrib.cpu().numpy(), 1)
            correct += int((target.cpu().numpy() == predictions).sum())
            total += premise.size(0)
    acc = correct / float(total)
    print("Accuracy:{0}".format(acc))
    return acc

# Put every model used below on the GPU.
# The autoencoder additionally carries a `gpu` flag and a `start_symbols`
# tensor that have to be switched / moved by hand.
autoencoder.gpu = True
autoencoder = autoencoder.cuda()
autoencoder.start_symbols = autoencoder.start_symbols.cuda()
# The GAN generator/discriminator are not loaded in this notebook:
#gan_gen = gan_gen.cuda()
#gan_disc = gan_disc.cuda()
classifier1, inverter, mlp_classifier = (
    classifier1.cuda(),
    inverter.cuda(),
    mlp_classifier.cuda(),
)


In [5]:
def train_process(premise, hypothesis, target, premise_words, hypothesis_words, premise_length, hypothesis_length):
    """Run one optimisation step of the surrogate ``mlp_classifier`` on a batch.

    The premise and hypothesis word lists are re-indexed with ``corpus_vocab``
    (unknown words map to index 3), encoded by ``autoencoder`` and mapped
    through ``inverter``; the detached results are the surrogate's inputs.
    The surrogate is fitted against the detached output of ``classifier1`` on
    the original ``(premise, hypothesis)`` index tensors.

    Args:
        premise, hypothesis: index tensors fed to ``classifier1``.
        target: gold labels, used only for the two accuracy figures.
        premise_words, hypothesis_words: per-sentence word lists
            (assumed to be padded to equal length — TODO confirm).
        premise_length, hypothesis_length: lengths forwarded to ``autoencoder.encode``.

    Returns:
        tuple: ``(loss, acc, acc_surrogate)`` as Python floats, where ``acc`` is
        the accuracy of ``classifier1`` and ``acc_surrogate`` that of the
        surrogate, both against ``target``.
    """
    def words_to_indices(sentences):
        # Word -> corpus vocabulary index, 3 for out-of-vocabulary words.
        rows = [[corpus_vocab.get(word, 3) for word in sentence] for sentence in sentences]
        return torch.tensor(rows).cuda()

    # Only the surrogate is trained; everything else stays in eval mode.
    for frozen_model in (autoencoder, inverter, classifier1):
        frozen_model.eval()
    mlp_classifier.train()

    prem_indices = words_to_indices(premise_words)
    hypo_indices = words_to_indices(hypothesis_words)

    # Inverted latent codes, detached so no gradient reaches autoencoder/inverter.
    prem_latent = inverter(autoencoder.encode(prem_indices, premise_length, noise=False)).detach()
    hypo_latent = inverter(autoencoder.encode(hypo_indices, hypothesis_length, noise=False)).detach()

    surrogate_out = mlp_classifier(prem_latent, hypo_latent)
    teacher_out = classifier1((premise, hypothesis)).detach()

    acc = (teacher_out.argmax(1) == target).float().mean().item()
    acc_surrogate = (surrogate_out.argmax(1) == target).float().mean().item()

    # Negative batch mean of <surrogate output, softmax(teacher output)>.
    # NOTE(review): this is a soft-label cross-entropy only if mlp_classifier
    # returns log-probabilities — confirm against its definition.
    teacher_probs = F.softmax(teacher_out, dim=1)
    loss = -(surrogate_out * teacher_probs).sum(1).mean(0)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item(), acc, acc_surrogate


In [6]:
def classifier_pred(pw, hw):
    """Return ``classifier1``'s softmax output for one premise/hypothesis pair.

    Args:
        pw: premise as a sequence of words.
        hw: hypothesis as a sequence of words.

    Returns:
        numpy.ndarray: softmax over dimension 1 of the classifier output with
        the batch dimension squeezed away.
    """
    classifier1.eval()

    def as_batch_of_one(words):
        # Classifier vocabulary lookup (3 for unknown words), then add a batch axis.
        token_ids = [vocab_classifier1.get(token, 3) for token in words]
        return torch.tensor(token_ids).cuda().unsqueeze(0)

    premise_idx = as_batch_of_one(pw)
    hypothesis_idx = as_batch_of_one(hw)

    scores = classifier1((premise_idx, hypothesis_idx))
    probabilities = F.softmax(scores, 1)
    return probabilities.squeeze(0).cpu().detach().numpy()


In [7]:
def cross_entropy(p, q):
    """Return ``-sum(p * log(q))`` over all elements as a 0-dim tensor.

    Args:
        p: tensor of weights (the distribution the expectation is taken under).
        q: tensor of positive values whose logarithm is taken.
    """
    weighted_log = p * q.log()
    return -weighted_log.sum()

In [8]:
# Weight of the L2-norm penalty on the inverted hypothesis latent in perturb()'s loss.
ALPHA = 0.05

In [43]:
def perturb(criterion, premise, hypothesis, target, premise_words, hypothesis_words, premise_length, hypothesis_length,
            num_restarts=5, num_steps=500, lr=1e-4, verbose=True):
    """Search the autoencoder code space for a replacement hypothesis.

    A random code of shape ``(1, 300)`` is optimised with Adam so that the
    surrogate ``mlp_classifier`` (fed the fixed inverted premise and the
    inverted candidate code) minimises
    ``cross_entropy(surrogate_probs, classifier_probs) + ALPHA * ||inverter(code)||_2``,
    where ``classifier_probs`` is ``classifier1``'s softmax on the original pair.
    The search is restarted ``num_restarts`` times; the code whose last
    evaluated loss is lowest is decoded with ``autoencoder.generate``.

    Args:
        criterion, premise, hypothesis, target, hypothesis_length: accepted for
            signature compatibility; not used by the search.
        premise_words: premise as a sequence of words.
        hypothesis_words: original hypothesis as a sequence of words.
        premise_length: length of the premise, forwarded to ``autoencoder.encode``.
        num_restarts: number of random initialisations (>= 1).
        num_steps: Adam steps per initialisation (>= 1).
        lr: Adam learning rate.
        verbose: print the loss at every step (the historical behaviour).

    Returns:
        numpy.ndarray: output of ``autoencoder.generate(code, 10, False)`` with
        the batch dimension squeezed (presumably token indices — confirm
        against ``Seq2SeqCAE.generate``).

    Raises:
        ValueError: if ``num_restarts`` or ``num_steps`` is smaller than 1.
    """
    if num_restarts < 1 or num_steps < 1:
        raise ValueError("num_restarts and num_steps must both be >= 1")

    autoencoder.eval()
    inverter.eval()
    classifier1.eval()
    mlp_classifier.eval()

    # Wrap the single example as a batch of one.
    premise_words = [premise_words]
    hypothesis_words = [hypothesis_words]
    # Was misspelled `premisea_length`, so the bare length (not a one-element
    # batch, as in perturb_premiseonly) reached autoencoder.encode.
    premise_length = [premise_length]

    premise_idx = torch.tensor([[corpus_vocab.get(w, 3) for w in s] for s in premise_words]).cuda()
    hypothesis_idx = torch.tensor([[corpus_vocab.get(w, 3) for w in s] for s in hypothesis_words]).cuda()

    c_prem = autoencoder.encode(premise_idx, premise_length, noise=False)
    z_prem = inverter(c_prem).detach()

    mlp_classifier.zero_grad()
    inverter.zero_grad()

    # The classifier's distribution on the original pair does not depend on the
    # candidate code, so compute it once instead of on every optimisation step.
    # NOTE(review): classifier1 is fed corpus_vocab indices here, whereas
    # classifier_pred uses vocab_classifier1 — confirm the vocabularies agree.
    with torch.no_grad():
        classifier_probs = F.softmax(classifier1((premise_idx, hypothesis_idx)), dim=1)

    best_loss = None
    best_code = None
    for _ in range(num_restarts):
        code = torch.Tensor(1, 300).normal_(0, 0.1).cuda().detach()
        code.requires_grad = True
        code_optimizer = torch.optim.Adam([code], lr=lr)

        for _ in range(num_steps):
            z_hypoprime = inverter(code)
            surrogate_probs = F.softmax(mlp_classifier(z_prem, z_hypoprime), dim=1)
            loss4 = cross_entropy(surrogate_probs, classifier_probs) + ALPHA * torch.norm(z_hypoprime, p=2)
            code_optimizer.zero_grad()
            loss4.backward()
            code_optimizer.step()
            if verbose:
                print(loss4)

        # Compare restarts by the last loss that was evaluated (as before),
        # stored as a float so no autograd graph is kept alive.
        final_loss = loss4.item()
        if best_loss is None or final_loss < best_loss:
            best_loss = final_loss
            best_code = code

    nhypo_idx = autoencoder.generate(best_code, 10, False)
    return nhypo_idx.squeeze(0).cpu().numpy()

In [49]:
def perturb_premiseonly(criterion, premise, hypothesis, target, premise_words, hypothesis_words, premise_length, hypothesis_length,
                        num_restarts=5, num_steps=500, lr=1e-4):
    """Generate a hypothesis for a fixed premise that the surrogate assigns to class 0.

    Only the premise is taken from the input; the original hypothesis is
    ignored. A random code of shape ``(1, 300)`` is optimised with Adam to
    minimise ``cross_entropy(surrogate_probs, [0.98, 0.01, 0.01])``. Because
    ``log(0.98)`` is by far the largest entry of the log-objective, this pushes
    the surrogate's probability mass towards class 0. The search is restarted
    ``num_restarts`` times and the code with the lowest last evaluated loss is
    decoded with ``autoencoder.generate``.

    Args:
        criterion, premise, hypothesis, target, hypothesis_words,
        hypothesis_length: accepted for signature compatibility; not used.
        premise_words: premise as a sequence of words.
        premise_length: length of the premise, forwarded to ``autoencoder.encode``.
        num_restarts: number of random initialisations (>= 1).
        num_steps: Adam steps per initialisation (>= 1).
        lr: Adam learning rate.

    Returns:
        numpy.ndarray: output of ``autoencoder.generate(code, 10, False)`` with
        the batch dimension squeezed; the caller maps its entries back to words.

    Raises:
        ValueError: if ``num_restarts`` or ``num_steps`` is smaller than 1.
    """
    if num_restarts < 1 or num_steps < 1:
        raise ValueError("num_restarts and num_steps must both be >= 1")

    autoencoder.eval()
    inverter.eval()
    classifier1.eval()
    mlp_classifier.eval()

    # Wrap the single premise as a batch of one.
    premise_words = [premise_words]
    premise_length = [premise_length]

    premise_idx = torch.tensor([[corpus_vocab.get(w, 3) for w in s] for s in premise_words]).cuda()

    c_prem = autoencoder.encode(premise_idx, premise_length, noise=False)
    z_prem = inverter(c_prem).detach()

    # Constant objective, built once rather than on every step.
    objtive = torch.tensor([[0.98, 0.01, 0.01]]).cuda()

    best_loss = None
    best_code = None
    for _ in range(num_restarts):
        code = torch.Tensor(1, 300).normal_(0, 0.1).cuda().detach()
        code.requires_grad = True
        code_optimizer = torch.optim.Adam([code], lr=lr)

        for _ in range(num_steps):
            # The previous version also decoded the code with
            # autoencoder.generate on every step and discarded the result;
            # decoding now happens once, for the winning code only.
            z_hypoprime = inverter(code)
            surrogate_probs = F.softmax(mlp_classifier(z_prem, z_hypoprime), dim=1)
            loss4 = cross_entropy(surrogate_probs, objtive)
            code_optimizer.zero_grad()
            loss4.backward()
            code_optimizer.step()

        # Compare restarts by the last loss that was evaluated (as before),
        # stored as a float so no autograd graph is kept alive.
        final_loss = loss4.item()
        if best_loss is None or final_loss < best_loss:
            best_loss = final_loss
            best_code = code

    nhypo_idx = autoencoder.generate(best_code, 10, False)
    return nhypo_idx.squeeze(0).cpu().numpy()

In [11]:
# Scratch cell: checks that torch.tensor builds a tensor from a Python list.
torch.tensor([1, 2, 3])

tensor([1, 2, 3])

In [12]:
def maximum(array):
    """Return the index of the largest entry of ``array`` (first one on ties).

    Works for any non-empty indexable sequence; the earlier version only ever
    looked at indices 0..2, so it failed on shorter inputs and ignored
    everything past the third entry.

    Args:
        array: non-empty sequence (list, tuple, 1-D numpy array, ...) whose
            elements support ``<``.

    Returns:
        int: position of the maximum.

    Raises:
        IndexError: if ``array`` is empty.
    """
    best_idx = 0
    best_val = array[0]
    for i in range(1, len(array)):
        # Strict comparison keeps the first occurrence when values tie.
        if best_val < array[i]:
            best_idx = i
            best_val = array[i]
    return best_idx

In [13]:
# Scratch cell: a tensor built with torch.Tensor(...) does not track gradients
# by default, which is why the perturb functions set requires_grad explicitly.
torch.Tensor(1, 300).fill_(0).requires_grad

False

In [14]:
def kl_divergence(p, q):
    """Return ``sum(p * log(p / q))`` over all elements as a 0-dim tensor.

    Args:
        p: tensor the expectation is taken under.
        q: tensor of the same shape that ``p`` is compared against.
    """
    log_ratio = (p / q).log()
    return (p * log_ratio).sum()

In [15]:
# Sanity check of kl_divergence on two 3-way probability vectors
# (each sums to roughly 1); prints sum(b * log(b / a)).
a = torch.tensor([0.5118122,  0.23435633, 0.25383145])
b = torch.tensor([0.51985246, 0.29073852, 0.18940896])
print(kl_divergence(b, a))

tensor(0.0153)


In [16]:
import secrets

def samples(alist):
    """Return a random half of ``alist`` drawn without replacement.

    Uses the OS entropy source through ``secrets.SystemRandom``, so the result
    is not affected by the notebook's random seeds.

    Args:
        alist: sequence to draw from.

    Returns:
        list: ``len(alist) // 2`` distinct positions' worth of elements.
    """
    half = len(alist) // 2
    return secrets.SystemRandom().sample(alist, half)

In [50]:
# Attack loop: for every example, generate a new hypothesis with
# perturb_premiseonly and compare classifier1's prediction on the original
# pair with its prediction on (premise, generated hypothesis).
criterion = nn.CrossEntropyLoss().cuda()

niter = 0

# Inverse of corpus_vocab: index -> word, used to decode generated indices.
idx2words = {index: word for word, index in corpus_vocab.items()}
oldcorrect = 0
newcorrect = 0
n = 0
alloutputarr = []
# NOTE(review): the number of batches comes from `testloader`, but the batches
# themselves are drawn from the training iterator — confirm this is intended.
while niter < len(testloader):
    niter += 1
    # `iterator.next()` does not exist on current DataLoader iterators; the
    # built-in next() works on every version. The iterator is shared notebook
    # state, so restart it if earlier runs of this cell exhausted it.
    try:
        batch = next(train_iter)
    except StopIteration:
        train_iter = iter(trainloader)
        batch = next(train_iter)
    for p, h, t, pw, hw, pl, hl in zip(*batch):
        nh = perturb_premiseonly(criterion, p.cuda(), h.cuda(), t.cuda(), pw, hw, pl, hl)
        print('--------------------------------')
        print('Target ', t)
        print(' '.join(pw))
        print(' '.join(hw))
        # Decode the generated indices and cap at 10 tokens including <sos>.
        nhw = (['<sos>'] + [idx2words[i] for i in nh])[:10]
        print(' '.join(nhw))
        oldpred = classifier_pred(pw, hw)
        newpred = classifier_pred(pw, nhw)
        old_label = maximum(oldpred)
        new_label = maximum(newpred)
        print('Old ', oldpred)
        print('New ', newpred)
        print('Old Prediction: ' + str(old_label))
        print('New Prediction: ' + str(new_label))
        # Reported as "similarity", but this is KL(new || old): 0 means identical.
        print('similarity: ' + str(kl_divergence(torch.tensor(newpred), torch.tensor(oldpred))))
        gold_label = t.item()
        if old_label == gold_label:
            oldcorrect += 1
        if new_label == gold_label:
            newcorrect += 1
        n += 1
print('oldcorrect: ' + str(oldcorrect))
print('newcorrect: ' + str(newcorrect))
print('number of premises ' + str(n))
#0 entailment #1 neutral #2 contradiction



--------------------------------
Target  tensor(2)
<sos> a woman plays guitar on the beach . <eos>
<sos> a woman plays harmonica at a beach luau .
<sos> a man outside , shooting . <eos> <eos> <eos>
Old  [0.30798134 0.26576054 0.4262581 ]
New  [0.5514052  0.23071563 0.21787916]
Old Prediction: 2
New Prediction: 0
similarity: tensor(0.1423)
--------------------------------
Target  tensor(1)
<sos> two girls waving two fingers . <eos> <pad> <pad>
<sos> the girls are twenty <pad> <pad> <pad> <pad> <pad>
<sos> there with lots of a sitting . <eos> <eos>
Old  [0.43067834 0.2610901  0.30823156]
New  [0.3923007  0.08578537 0.52191395]
Old Prediction: 0
New Prediction: 2
similarity: tensor(0.1428)
--------------------------------
Target  tensor(0)
<sos> a concert outside for good morning america . <eos>
<sos> music is playing on good morning america . <pad>
<sos> there people are on ocean coming <eos> <eos> <eos>
Old  [0.277981   0.40899327 0.31302577]
New  [0.5247121  0.15352434 0.3217636 ]
Old 

--------------------------------
Target  tensor(2)
<sos> a girls is jumping in the sand . <eos>
<sos> a girl is sitting in the sand . <pad>
<sos> there are animals behind a drawing . <eos> <eos>
Old  [0.20457244 0.13908385 0.6563437 ]
New  [0.6379066  0.14980009 0.2122933 ]
Old Prediction: 2
New Prediction: 0
similarity: tensor(0.4970)
--------------------------------
Target  tensor(2)
<sos> men fighting in front of a bar . <eos>
<sos> they are fighting in a boxing ring . <pad>
<sos> people are playing along live <eos> <eos> <eos> people
Old  [0.11490432 0.45096514 0.43413052]
New  [0.57313555 0.23055735 0.19630711]
Old Prediction: 1
New Prediction: 0
similarity: tensor(0.6106)
--------------------------------
Target  tensor(2)
<sos> a black dog sits on the floor . <eos>
<sos> a white dog stands on the couch . <pad>
<sos> someone is enjoying with <eos> <eos> <eos> <eos> <eos>
Old  [0.1711847  0.17021489 0.6586004 ]
New  [0.52920383 0.30931398 0.16148219]
Old Prediction: 2
New Predictio

--------------------------------
Target  tensor(0)
<sos> a man hangs up unusually decorated lamps . <eos>
<sos> unusual lamps being hung up by a man <pad>
<sos> people are surrounded by trash this . <eos> <eos>
Old  [0.44125652 0.29614452 0.2625989 ]
New  [0.5269074  0.24962063 0.22347203]
Old Prediction: 0
New Prediction: 0
similarity: tensor(0.0148)
--------------------------------
Target  tensor(2)
<sos> man about to kick football on a field .
<sos> the man is at home sleeping <pad> <pad> <pad>
<sos> people dancing in stage store . <eos> the people
Old  [0.11818311 0.14022085 0.74159604]
New  [0.5746137  0.16147746 0.26390886]
Old Prediction: 2
New Prediction: 0
similarity: tensor(0.6589)
--------------------------------
Target  tensor(0)
<sos> colorful glassware for sale by an outdoor vendor .
<sos> a vendor is showcasing his glassware collection to passerby
<sos> multiple people are crowded fireworks . <eos> <eos> kids
Old  [0.19264947 0.52633005 0.28102052]
New  [0.6479264  0.209

KeyboardInterrupt: 