In [9]:
# Notebook bootstrap: enable autoreload, pick the GPU, and make the project importable.
from argparse import Namespace
# Re-import edited project modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2
import sys
import os
# NOTE(review): reads HOME directly, so this raises KeyError where HOME is unset.
home = os.environ['HOME']
# Expose only GPU 0 to CUDA; set here, ahead of the cell that imports torch.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
print(os.environ['CUDA_VISIBLE_DEVICES'])
# Work from the project root; relative paths such as 'data/penn' used later
# are resolved against this directory.
os.chdir(f'{home}/pycharm/automl')
# os.chdir(f'{home}/pycharm/automl/search_policies/rnn')
# Make the two project checkouts importable. Re-running this cell appends the
# same entries to sys.path again.
sys.path.append(f'{home}/pycharm/nasbench')
sys.path.append(f'{home}/pycharm/automl')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
0


In [2]:
# Set the random seed manually for reproducibility.
import numpy as np
import torch
import torch.backends.cudnn as cudnn

seed = 1000
np.random.seed(seed)
torch.manual_seed(seed)

# cuDNN autotuning (benchmark=True) may select different kernels from run to
# run, which undermines the fixed seed. Disable it and request deterministic
# kernels so the model-vs-model comparisons below are repeatable.
cudnn.benchmark = False
cudnn.deterministic = True

if torch.cuda.is_available():
    torch.cuda.set_device(0)
    cudnn.enabled = True
    # Seed the generators of every visible CUDA device.
    torch.cuda.manual_seed_all(seed)


In [3]:
from search_policies.rnn.model import RNNModel
from search_policies.rnn.softws.soft_weight_sharing_model import RNNModelSoftWS

from genotypes import Genotype, PRIMITIVES
from search_policies.search_space import SearchSpace
from search_policies.search_configs import parser

# Parse the project's search configuration with --num_intermediate_node set to 2.
args = parser.parse_args(['--num_intermediate_node', '2']) 
search_space = SearchSpace(args)
# Attach the search space to args; args is what both model constructors below receive.
args.search_space = search_space
# NOTE(review): the return value is discarded — looks like a leftover sanity call.
search_space.genotype_from_id(0)

# Soft weight-sharing setting: 4 parameters per node (with 2 nodes this matches
# the 8 `_Ws` entries shown when RNNModelSoftWS is printed further down).
args.softws_num_param_per_node = 4
# Zero out every dropout setting — presumably so that the two models can be
# compared output-for-output without random masking; confirm against the models.
args.dropoute = 0
args.dropouti = 0
args.dropout = 0
args.dropoutx = 0
args.dropouth = 0

ntoken = 10000
genotype_id = 0
batch_size = 64
# Reference model built for genotype 0.
model = RNNModel(ntoken=ntoken, args=args, 
                 genotype_id=genotype_id, 
                 genotype=search_space.genotype_from_id(genotype_id, 2))

# Soft weight-sharing counterpart, constructed with the same arguments.
# genotype_from_id is called a second time, so each model gets its own return value.
ws_model = RNNModelSoftWS(
    ntoken=ntoken, args=args, 
                 genotype_id=genotype_id, 
                 genotype=search_space.genotype_from_id(genotype_id, 2)
)


import torch.nn as nn

def initialize_soft_ws_from_original(model, ws_model):
    """Load the weights of ``model`` into ``ws_model`` and return ``ws_model``.

    Only the first recurrent cell (``rnns[0]``), the encoder weight and the
    decoder bias are transferred. Tensors are detached and cloned, so the two
    models do not share storage afterwards. The decoder weight of ``ws_model``
    is bound to the same Parameter object as its encoder weight.

    Args:
        model: source model exposing ``rnns[0]._W0``, ``rnns[0]._Ws``,
            ``encoder.weight`` and ``decoder.bias``.
        ws_model: soft weight-sharing model; its ``rnns[0]`` must provide
            ``initialize_ws_by_values(init_tensors=...)``.

    Returns:
        The same ``ws_model`` object, modified in place.
    """
    def _as_new_parameter(tensor):
        # Detached clone: an independent copy with no autograd history.
        return nn.Parameter(tensor.detach().clone())

    source_cell = model.rnns[0]
    target_cell = ws_model.rnns[0]

    target_cell._W0 = _as_new_parameter(source_cell._W0)
    detached_ws = [weight.detach() for weight in source_cell._Ws]
    target_cell.initialize_ws_by_values(init_tensors=detached_ws)

    ws_model.encoder.weight = _as_new_parameter(model.encoder.weight)
    # Decoder weight is the very same Parameter object as the encoder weight.
    ws_model.decoder.weight = ws_model.encoder.weight
    ws_model.decoder.bias = _as_new_parameter(model.decoder.bias)
    return ws_model

# Copy the reference weights into the soft-WS model, then verify the copy.
ws_model = initialize_soft_ws_from_original(model, ws_model)

# Effective weights produced by the soft-WS cell vs. the raw parameters of the
# reference cell. The printed lengths are 2 and 3: the reference list also
# contains _W0 in front, which is why index 1 is compared with index 0 below.
ws_model_params = ws_model.rnns[0].compute_ws()
model_params = [p for p in model.rnns[0].parameters()]

# Check sizes and values; both summed differences below are expected to be 0.
print(len(ws_model_params), len(model_params))
print([w.size() for w in ws_model_params])
print([w.size() for w in model.rnns[0]._Ws])
# NOTE: these are signed sums, so differences of opposite sign can cancel out.
print((model_params[1] - ws_model_params[0]).sum())
print((ws_model.rnns[0]._W0 - model.rnns[0]._W0).sum())

# Prepare the data.
from dataloader import Corpus
import utils 
# Relative path: resolved against the working directory set in the first cell.
corpus = Corpus('data/penn')
args.cuda = 1
train_data = utils.batchify(corpus.train, batch_size, args)
model.cuda()
ws_model.cuda()
# A single hidden state from the reference model is fed to both models below.
hidden = model.init_hidden(batch_size)

# Testing forward pass
inp_x, inp_y = utils.get_batch(train_data, 10, args, 35)
output_1 = model(inp_x, hidden, return_h=True)
output_2 = ws_model(inp_x, hidden, return_h=True)

# Summed difference of the first returned element of each model; expected 0.
print((output_1[0]- output_2[0]).sum())


[0.38076125 0.76041522 0.75035755]
32
Cannot import graphviz package
2 3
[torch.Size([850, 1700]), torch.Size([850, 1700])]
[torch.Size([850, 1700]), torch.Size([850, 1700])]
tensor(0., grad_fn=<SumBackward0>)
tensor(0., grad_fn=<SumBackward0>)
torch.Size([14524, 64])
tensor(0., device='cuda:0', grad_fn=<SumBackward0>)


In [36]:
# Utility functions
def compare_gradient_ws_model(model, ws_model):
    """Print how the gradient norms of ``model`` and ``ws_model`` compare.

    For each of Encoder, Decoder, W0, W1 and W2 the gradient norm of the
    reference parameter and of its soft-WS counterpart is computed. When both
    gradients exist the difference ``reference - soft_ws`` is printed;
    otherwise both values are printed side by side, with ``None`` standing
    for a missing gradient.

    W1 pairs ``model.rnns[0]._Ws[0]`` with ``ws_model.rnns[0]._Ws[0]`` and W2
    pairs ``model.rnns[0]._Ws[1]`` with ``ws_model.rnns[0]._Ws[4]``. The
    soft-WS norms of W1 and W2 are multiplied by 4 before comparison —
    presumably because each node's weight is spread over 4 soft-shared
    parameters; TODO confirm against the soft-WS cell implementation.

    Args:
        model: reference model exposing ``encoder``, ``decoder`` and ``rnns[0]``.
        ws_model: soft weight-sharing model with the same attributes.
    """
    ws_grad_scale = 4
    cell, ws_cell = model.rnns[0], ws_model.rnns[0]
    value_pair = {
        'Encoder': [model.encoder.weight, ws_model.encoder.weight],
        'Decoder': [model.decoder.weight, ws_model.decoder.weight],
        'W0': [cell._W0, ws_cell._W0],
        'W1': [cell._Ws[0], ws_cell._Ws[0]],
        'W2': [cell._Ws[1], ws_cell._Ws[4]],
    }
    for k, v in value_pair.items():
        grads = [p.grad.norm().item() if p.grad is not None else None for p in v]
        # Only rescale a gradient that exists; multiplying None would raise TypeError.
        if k in ('W1', 'W2') and grads[1] is not None:
            grads[1] = ws_grad_scale * grads[1]
        # Test for presence explicitly: a norm of exactly 0.0 is a valid value
        # and must not be mistaken for a missing gradient.
        if all(g is not None for g in grads):
            print(f"{k} grad compare: {grads[0] - grads[1]}")
        else:
            print(f"{k} grad compare: {grads[0]} | ws {grads[1]}")

def compare_weights_ws_model(model, ws_model):
    """Print the summed element-wise weight differences between the two models.

    The first recurrent cell of each model is inspected. The effective weights
    returned by ``ws_model.rnns[0].compute_ws()`` at index 0 and 1 are compared
    with entries 1 and 2 of ``model.rnns[0].parameters()``, and ``_W0`` is
    compared directly. Each printed value is a signed sum, so differences of
    opposite sign can cancel.
    """
    ws_cell = ws_model.rnns[0]
    ref_cell = model.rnns[0]

    effective_ws = ws_cell.compute_ws()
    ref_params = list(ref_cell.parameters())

    first_gap = (ref_params[1] - effective_ws[0]).sum()
    print("param 1 difference", utils.to_item(first_gap))

    second_gap = (ref_params[2] - effective_ws[1]).sum()
    print("param 2 difference", utils.to_item(second_gap))

    w0_gap = (ws_cell._W0 - ref_cell._W0).sum()
    print("param 0 difference", w0_gap.item())


def compare_raw_weights(model, ws_model):
    """Print the weight norms of the reference model next to the soft-WS ones.

    Compared pairs: encoder weight, decoder weight, ``_W0``, the raw ``_Ws``
    entries (index 0 vs 0 and index 1 vs 4), and the reference ``_Ws`` entries
    against the weights returned by ``ws_model.rnns[0].compute_ws()``.
    A pair is printed only when both norms are non-zero.
    """
    effective_ws = ws_model.rnns[0].compute_ws()
    ref_cell = model.rnns[0]
    ws_cell = ws_model.rnns[0]

    labelled_pairs = (
        ('Encoder', model.encoder.weight, ws_model.encoder.weight),
        ('Decoder', model.decoder.weight, ws_model.decoder.weight),
        ('W0', ref_cell._W0, ws_cell._W0),
        ('W1', ref_cell._Ws[0], ws_cell._Ws[0]),
        ('W2', ref_cell._Ws[1], ws_cell._Ws[4]),
        ('W1-compute', ref_cell._Ws[0], effective_ws[0]),
        ('W2-compute', ref_cell._Ws[1], effective_ws[1]),
    )
    for label, ref_weight, ws_weight in labelled_pairs:
        ref_norm = ref_weight.norm().item()
        ws_norm = ws_weight.norm().item()
        # A zero norm on either side suppresses the line for this pair.
        if not ref_norm or not ws_norm:
            continue
        print(f"{label} weights norm compare : {ref_norm} | {ws_norm}")

# Test the forward and backward passes of both models on the same batches.
# The loop stops once `batch > 30`, i.e. after at most 31 batches.
import numpy as np
from torch.optim import SGD
# NOTE(review): pop_index is not used anywhere in this cell.
batch, i, pop_index = 0, 0, 0

# Baseline check before any update: both models should produce the same output.
hidden = model.init_hidden(batch_size)
output_1 = model(inp_x, hidden, return_h=True)
output_2 = ws_model(inp_x, hidden, return_h=True)
print('before optimization: ', (output_2[0] - output_1[0]).sum())
compare_weights_ws_model(model, ws_model)

# Each optimizer updates only the _W0 parameter of its model's first cell.
# Gradients of the other parameters are still computed and compared, but
# those parameters are never stepped.
optimizer1 = SGD([model.rnns[0]._W0], lr=0.01)
# optimizer1 = SGD(model.parameters(), lr=0.01)
optimizer2 = SGD([ws_model.rnns[0]._W0], lr=0.01)
# optimizer2 = SGD(ws_model.parameters(), lr=0.01)
# 
# Looping the dataset.
while i < train_data.size(0) - 1 - 1:
    print("Start a new batch ... ")
    # computing the genotype of the next particle
    # 32 is hard-coded as the number of genotype ids to sample from.
    genotype_id = np.random.randint(32)
    new_genotype = search_space.genotype_from_id(genotype_id)

    # selecting the current subDAG in our DAG to train
    model.change_genotype(genotype=new_genotype, genotype_id=genotype_id)
    ws_model.change_genotype(genotype=new_genotype, genotype_id=genotype_id)

    # Fixed sequence length; both values are loop-invariant.
    bptt = 35
    seq_len = int(bptt)

    # preparing batch of data for training
    cur_data, cur_targets = utils.get_batch(train_data, i, args, seq_len=seq_len)
    cur_targets = cur_targets.contiguous().view(-1)
    # print(cur_data.size(2))
    optimizer1.zero_grad()
    optimizer2.zero_grad()
    
    hidden = utils.repackage_hidden(hidden)
    # hidden_valid[s_id] = repackage_hidden(hidden_valid[s_id])

    # forward pass
    # The second returned value is discarded (`_`), so `hidden` is never
    # replaced by anything the models return.
    log_prob, _, rnn_hs, dropped_rnn_hs = model(cur_data,hidden,return_h=True)
    # print('hidden norm before', hidden[0].norm())
    hidden = utils.repackage_hidden(hidden)
    log_prob2, _, rnn_hs2, dropped_rnn_hs2 = ws_model(cur_data,hidden,return_h=True)
    print("Model output output 1 - output 2 = ", (log_prob - log_prob2).sum().item())
    # print('hidden norm after', hidden[0].norm())
    
    # loss using negative-log-likelihood
    raw_loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), cur_targets)
    raw_loss2 = nn.functional.nll_loss(log_prob2.view(-1, log_prob2.size(2)), cur_targets)
    
    loss = raw_loss
    loss2 = raw_loss2
    print("Loss of original model : ", loss.item())
    print("Loss of new model : ", loss2.item())
    
    # Backward through the soft-WS model first, then the reference model.
    loss2.backward()
    loss.backward()

    # applying the gradient updates
    # utils.clip_grad_norm(model.parameters(), args.clip)
    # utils.clip_grad_norm(ws_model.parameters(), args.clip)
    
    # Compare gradients before either optimizer modifies the weights.
    compare_gradient_ws_model(model, ws_model)
    optimizer1.step()
    optimizer2.step()
    
    batch += 1
    i += seq_len
    # The break sits before the final print, so the last processed batch
    # does not emit "Finish batch".
    if batch > 30:
        break
        
    print("Finish batch \n")

before optimization:  tensor(0., device='cuda:0', grad_fn=<SumBackward0>)
param 1 difference 0.0
param 2 difference 0.0
param 0 difference 0.0
Start a new batch ... 
Model output output 1 - output 2 =  0.0
Loss of original model :  9.210322380065918
Loss of new model :  9.210322380065918
Encoder grad compare: 0.0
Decoder grad compare: 0.0
W0 grad compare: 0.0
W1 grad compare: 0.0
W2 grad compare: 0.0
Finish batch 

Start a new batch ... 
Model output output 1 - output 2 =  0.0
Loss of original model :  9.210332870483398
Loss of new model :  9.210332870483398
Encoder grad compare: 0.0
Decoder grad compare: 0.0
W0 grad compare: 0.0
W1 grad compare: 0.0
W2 grad compare: 0.0
Finish batch 

Start a new batch ... 
Model output output 1 - output 2 =  0.0
Loss of original model :  9.210433959960938
Loss of new model :  9.210433959960938
Encoder grad compare: 0.0
Decoder grad compare: 0.0
W0 grad compare: 0.0
W1 grad compare: 0.0
W2 grad compare: 0.0
Finish batch 

Start a new batch ... 
Model 

In [6]:

# Print the weight norms of the reference model next to those of the soft-WS model.
compare_raw_weights(model, ws_model)


Encoder weights norm compare : 67.33435821533203 | 67.33435821533203
Decoder weights norm compare : 67.33435821533203 | 67.33435821533203
W0 weights norm compare : 39.251556396484375 | 39.251556396484375
W1 weights norm compare : 27.766395568847656 | 27.766395568847656
W2 weights norm compare : 27.747833251953125 | 27.747833251953125
W1-compute weights norm compare : 27.766395568847656 | 27.766395568847656
W2-compute weights norm compare : 27.747833251953125 | 27.747833251953125


In [7]:
# Step each optimizer separately and print the weight differences after each
# step, to see which update moves the two models apart. The steps reuse
# whatever gradients are currently stored on the parameters.
print("Before step")
compare_weights_ws_model(model, ws_model)
optimizer1.step()
print("After step 1")
compare_weights_ws_model(model, ws_model)
optimizer2.step()
print("After step 2")
compare_weights_ws_model(model, ws_model)


# check if the weights of ws model are the same or not.

# print(ws_model.rnns[0].soft_param_dicts[0][0])
# Number of parameter tensors in each model (the saved output shows 11 vs 5).
print(len([p.size() for p in ws_model.parameters()]))
print(len([p.size() for p in model.parameters()]))

Before step
param 1 difference 0.0
param 2 difference 0.0
param 0 difference -0.6326582431793213
After step 1
param 1 difference 0.0
param 2 difference 0.0
param 0 difference -0.6183065176010132
After step 2
param 1 difference 0.0
param 2 difference 0.0
param 0 difference -0.6303144097328186
11
5


In [8]:
# Show the module structure of both models, soft-WS model first.
print(ws_model)
print(model)

RNNModelSoftWS(
  (lockdrop): LockedDropout()
  (encoder): Embedding(10000, 850)
  (rnns): ModuleList(
    (0): BenchmarkCellSoftWS(
      (_Ws): ParameterList(
          (0): Parameter containing: [torch.cuda.FloatTensor of size 850x1700 (GPU 0)]
          (1): Parameter containing: [torch.cuda.FloatTensor of size 850x1700 (GPU 0)]
          (2): Parameter containing: [torch.cuda.FloatTensor of size 850x1700 (GPU 0)]
          (3): Parameter containing: [torch.cuda.FloatTensor of size 850x1700 (GPU 0)]
          (4): Parameter containing: [torch.cuda.FloatTensor of size 850x1700 (GPU 0)]
          (5): Parameter containing: [torch.cuda.FloatTensor of size 850x1700 (GPU 0)]
          (6): Parameter containing: [torch.cuda.FloatTensor of size 850x1700 (GPU 0)]
          (7): Parameter containing: [torch.cuda.FloatTensor of size 850x1700 (GPU 0)]
      )
    )
  )
  (decoder): Linear(in_features=850, out_features=10000, bias=True)
)
RNNModel(
  (lockdrop): LockedDropout()
  (encoder): Em

In [9]:
# Gradient shapes for every parameter of the reference model. This raises
# AttributeError if any parameter's .grad is still None.
print([p.grad.size() for p in model.parameters()])
# print([p.grad.size() for p in ws_model.parameters()])


[torch.Size([10000, 850]), torch.Size([1700, 1700]), torch.Size([850, 1700]), torch.Size([850, 1700]), torch.Size([10000])]


In [10]:

# loss2.backward()
# Gradient shapes for every parameter of the soft-WS model. This raises
# AttributeError if any parameter's .grad is still None.
print([p.grad.size() for p in ws_model.parameters()])
# optimizer1.step()
# Raw gradient stored on the first soft-shared weight of the first cell.
print(ws_model.rnns[0]._Ws[0].grad)


[torch.Size([10000, 850]), torch.Size([1700, 1700]), torch.Size([850, 1700]), torch.Size([850, 1700]), torch.Size([850, 1700]), torch.Size([850, 1700]), torch.Size([850, 1700]), torch.Size([850, 1700]), torch.Size([850, 1700]), torch.Size([850, 1700]), torch.Size([10000])]
tensor([[-2.3829e-05, -1.3522e-05,  2.8911e-05,  ..., -1.2082e-04,
         -1.2098e-04,  1.7041e-04],
        [-2.2591e-05, -1.4650e-05,  2.8086e-05,  ..., -1.4077e-04,
         -1.5443e-04,  2.2530e-04],
        [-3.2155e-05, -1.8940e-05,  3.7770e-05,  ..., -1.6261e-04,
         -1.7970e-04,  2.4731e-04],
        ...,
        [-1.2547e-05, -6.9961e-06,  1.4064e-05,  ..., -7.7625e-05,
         -7.0915e-05,  9.7396e-05],
        [-2.9369e-05, -1.7936e-05,  3.6615e-05,  ..., -1.6770e-04,
         -1.7058e-04,  2.3913e-04],
        [ 8.8107e-06,  5.8585e-06, -8.3779e-06,  ...,  7.0062e-05,
          7.8304e-05, -1.1476e-04]], device='cuda:0')


# Testing the mapping function.


In [10]:

import search_policies.rnn.softws.soft_ws_mapping_node2 as genotype_mapping 
from search_policies.rnn.softws.soft_weight_sharing_model import get_fn_map

def visualize_mapping_for_genotype(map_fn, search_space, num_genotypes=32,
                                   num_params=None, num_nodes=2):
    """Print every genotype together with the mapping ``map_fn`` returns for it.

    For each id in ``range(num_genotypes)`` the genotype is looked up with
    ``search_space.genotype_from_id`` and printed, followed by the result of
    ``map_fn(genotype, genotype_id, num_params, num_nodes)``.

    Args:
        map_fn: callable taking ``(genotype, genotype_id, num_params, num_nodes)``.
            Class-style mappings must be instantiated by the caller first.
        search_space: object providing ``genotype_from_id(id)``.
        num_genotypes: how many consecutive ids, starting at 0, to show.
            Defaults to 32, the value previously hard-coded.
        num_params: list forwarded as the third argument of ``map_fn``.
            Defaults to ``[4, 4]`` — presumably the number of soft-shared
            parameters per node; confirm against the mapping module.
        num_nodes: value forwarded as the fourth argument of ``map_fn``.
            Defaults to 2 — presumably the number of intermediate nodes.
    """
    if num_params is None:
        num_params = [4, 4]
    for g_id in range(num_genotypes):
        genotype = search_space.genotype_from_id(g_id)
        print("Genotype {} {}".format(g_id, genotype))
        # Hand map_fn a fresh list on every call, so a mapping that mutates
        # its argument cannot affect later iterations.
        print(map_fn(genotype, g_id, list(num_params), num_nodes))
        


# visualize_mapping_for_genotype(genotype_mapping.map_v1, search_space)
# visualize_mapping_for_genotype(genotype_mapping.map_v2, search_space)
# visualize_mapping_for_genotype(get_fn_map('soft_map_v3', None), search_space)
# visualize_mapping_for_genotype(genotype_mapping.map_random_v1, search_space)
# visualize_mapping_for_genotype(genotype_mapping.map_random_v2, search_space)
# visualize_mapping_for_genotype(genotype_mapping.map_random_v3, search_space)
# visualize_mapping_for_genotype(default_genotype_to_param_query, search_space)
# Use a dedicated name for the mapping options. Rebinding the global `args`
# here would replace the parsed configuration (carrying `search_space`, `cuda`
# and the dropout settings) that the model and data cells read, so re-running
# any of them after this cell would see a one-field Namespace instead.
mapping_args = Namespace(softws_init_v=0.05)
visualize_mapping_for_genotype(get_fn_map('soft_map_v3_init', mapping_args), search_space)

Genotype 0 Genotype(recurrent=[('tanh', 0), ('tanh', 0)], concat=range(1, 3))
[[0.85, 0.05, 0.05, 0.05], [0.85, 0.05, 0.05, 0.05]]
Genotype 1 Genotype(recurrent=[('tanh', 0), ('relu', 0)], concat=range(1, 3))
[[0.05, 0.85, 0.05, 0.05], [0.85, 0.05, 0.05, 0.05]]
Genotype 2 Genotype(recurrent=[('tanh', 0), ('sigmoid', 0)], concat=range(1, 3))
[[0.05, 0.05, 0.85, 0.05], [0.85, 0.05, 0.05, 0.05]]
Genotype 3 Genotype(recurrent=[('tanh', 0), ('identity', 0)], concat=range(1, 3))
[[0.05, 0.05, 0.05, 0.85], [0.85, 0.05, 0.05, 0.05]]
Genotype 4 Genotype(recurrent=[('tanh', 0), ('tanh', 1)], concat=range(1, 3))
[[0.85, 0.05, 0.05, 0.05], [0.85, 0.05, 0.05, 0.05]]
Genotype 5 Genotype(recurrent=[('tanh', 0), ('relu', 1)], concat=range(1, 3))
[[0.05, 0.85, 0.05, 0.05], [0.85, 0.05, 0.05, 0.05]]
Genotype 6 Genotype(recurrent=[('tanh', 0), ('sigmoid', 1)], concat=range(1, 3))
[[0.05, 0.05, 0.85, 0.05], [0.85, 0.05, 0.05, 0.05]]
Genotype 7 Genotype(recurrent=[('tanh', 0), ('identity', 1)], concat=rang

In [None]:
# This cell has no execution count, so it was not run in the saved session.
# map_random_v2 is constructed here with ([4, 4], 2, search_space=...) rather
# than being called per genotype like the function-style mappings.
map_fn = genotype_mapping.map_random_v2([4,4], 2, search_space=search_space)
# Inspect the mapping's id_query attribute and the size of the search space.
print(map_fn.id_query)
print(search_space.num_solutions)