## Unleash and visualize thousands of reproducible experiments on toolkit

**Only four steps:**

1. Define the trainval function 
2. Define hyperparameters      
3. Run the experiments sequentially or on toolkit 
4. Visualize their results and job status

	
                    

**Folder structure for a single experiment**
```
results/
├── <exp_hash_id>/             # the md5 hash of the hyperparameters
│   │  
│   ├── code/                  # copy of the code for reproducibility
│   ├── exp_dict.json          # the experiment hyperparameters
│   ├── score_list.json        # contains the metric scores at each epoch
│   ├── model.pth              # saved model weights
│   └── job_dict.json          # contains the job id that ran this experiment
```

##  1. Define the trainval function for a small mnist experiment

In [3]:
import torch, torchvision, os, pprint
from tqdm import notebook as tqdm
import pandas as pd

from haven import haven_utils as hu

def trainval(exp_dict, savedir_base):
  """
  exp_dict: dictionary defining the hyperparameters of the experiment
  savedir_base: the base directory where the experiment will be saved
  """
  exp_id = hu.hash_dict(exp_dict)
  print('\n%s\nexp_id: %s' % ("="*20, exp_id))
  pprint.pprint(exp_dict)

  # create experiment directory 
  savedir = os.path.join(savedir_base, exp_id)
  os.makedirs(savedir, exist_ok=True)
  hu.save_json(os.path.join(savedir, "exp_dict.json"), exp_dict)

  # get dataset and loader
  transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])
  dataset = torchvision.datasets.MNIST(savedir_base, 
                                       train=False, download=True,
                                       transform=transform)
  train_loader = torch.utils.data.DataLoader(dataset, batch_size=64)

  # get model and optimizer
  model = torch.nn.Linear(784, 10, bias=False)
  opt = torch.optim.Adam(model.parameters(), lr=exp_dict['lr'])

  # run training loop
  score_list = []
  for e in range(exp_dict['max_epoch']):
    pbar = tqdm.tqdm(total=len(train_loader), leave=False)

    for batch in train_loader:
      images, labels = batch

      # train on batch
      logits = model.forward(images.view(images.shape[0], -1))
      loss = torch.nn.CrossEntropyLoss()(logits, labels)

      # update optimizer
      opt.zero_grad()
      loss.backward()
      opt.step()

      pbar.set_description("%d/%d - Loss: %.3f" % 
                          (e, exp_dict['max_epoch'], loss))
      pbar.update(1)

    pbar.close()

    # get score dict
    score_dict = {'loss': float(loss), 'epoch':e}

    # save score_list and model
    score_list += [score_dict]
    hu.save_pkl(os.path.join(savedir, "score_list.pkl"), score_list)

  print('\n', pd.DataFrame(score_list))
  print('experiment finished')
    
#   metric_to_metric_map = [{'acc_u':'acc' }, {'acc':'acc'}, {'accuracy':'acc'}]

## 2. Define hyperparameters across learning rates

In [4]:
# define each experiment as a dictionary of hyperparameters
exp_list = []
for lr in [1e-1, 1e-4, 1e-10]:
  exp_dict = {'dataset': 'mnist', 
              'model': 'logistic', 
              'max_epoch': 3,
              'lr':lr}

  exp_list += [exp_dict]

pprint.pprint(exp_list)

[{'dataset': 'mnist', 'lr': 0.1, 'max_epoch': 3, 'model': 'logistic'},
 {'dataset': 'mnist', 'lr': 0.0001, 'max_epoch': 3, 'model': 'logistic'},
 {'dataset': 'mnist', 'lr': 1e-10, 'max_epoch': 3, 'model': 'logistic'}]


## 3.1 Run the experiments sequentially 

In [5]:
# define directory for the results
savedir_base = 'results'

# run trainval for each exp_dict (sequentially)
for exp_dict in exp_list:
    trainval(exp_dict=exp_dict, savedir_base=savedir_base)


exp_id: cabe35c36d2ea37fd7104e3ff3cb6c50
{'dataset': 'mnist', 'lr': 0.1, 'max_epoch': 3, 'model': 'logistic'}
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to results/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting results/MNIST/raw/train-images-idx3-ubyte.gz to results/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to results/MNIST/raw/train-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting results/MNIST/raw/train-labels-idx1-ubyte.gz to results/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to results/MNIST/raw/t10k-images-idx3-ubyte.gz



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting results/MNIST/raw/t10k-images-idx3-ubyte.gz to results/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to results/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting results/MNIST/raw/t10k-labels-idx1-ubyte.gz to results/MNIST/raw
Processing...


  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


Done!


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


        loss  epoch
0  0.648104      0
1  0.300613      1
2  0.031733      2
experiment finished

exp_id: e9d2bfeddf2aa2d7dc49fd1e92a1e5c9
{'dataset': 'mnist', 'lr': 0.0001, 'max_epoch': 3, 'model': 'logistic'}


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


        loss  epoch
0  1.744576      0
1  1.352030      1
2  1.090075      2
experiment finished

exp_id: 34256462904885fe0ffce27c4bb3dac9
{'dataset': 'mnist', 'lr': 1e-10, 'max_epoch': 3, 'model': 'logistic'}


HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=157.0), HTML(value='')))


        loss  epoch
0  2.285532      0
1  2.285532      1
2  2.285532      2
experiment finished


## 3.2 Run the experiments on toolkit

In [13]:
# run trainval for each exp_dict (on toolkit)
from haven import haven_jobs as hjb

job_config = {'image': 'registry.console.elementai.com/%s/ssh' % os.environ['EAI_ACCOUNT_ID']  ,
              'data': ['eai.colab.public:/mnt/public'],
              'restartable':True,
              'resources': {'cpu': 4, 'mem': 8, 'gpu': 1},}

jm = hjb.JobManager(exp_list=exp_list, 
            savedir_base='/mnt/public/results/example', 
            workdir='/mnt/home/projects/haven',
            account_id=os.environ['EAI_ACCOUNT_ID'],
            job_config=job_config
            )

for exp_dict in exp_list:
    command = 'python example.py'
    savedir = os.path.join(savedir_base, hu.hash_dict(exp_dict))
    job_dict = jm.launch_exp_dict(exp_dict, savedir, command, job=None)
    job_id = job_dict['job_id']

    print(job_id)
    break

  > Copying code from /mnt/home/projects/haven/ to results/cabe35c36d2ea37fd7104e3ff3cb6c50/code


ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 15 Oct 2020 17:00:48 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '197', 'Connection': 'keep-alive', 'Access-Control-Allow-Credentials': 'true', 'Access-Control-Allow-Origin': 'https://console.elementai.com', 'Access-Control-Max-Age': '86400', 'Cache-Control': 'no-cache, no-store, must-revalidate, private', 'Pragma': 'no-cache', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains', 'Vary': 'Origin', 'X-Content-Type-Options': 'nosniff', 'X-Frame-Options': 'deny', 'X-Request-Id': '21fbe359-d117-4a88-9c99-e6429631bab0'})
HTTP response body: {"error":{"status":"Bad Request","code":400,"type":"job","message":"job: Invalid workdir parameter. A valid workdir must start with '/'. Got value: results/cabe35c36d2ea37fd7104e3ff3cb6c50/code"}}



In [12]:
os.path.dirname(os.path.realpath(__file__))

NameError: name '__file__' is not defined

## 4. Visualize Experiments

In [14]:
from haven import haven_jupyter as hj
from haven import haven_results as hr
from haven import haven_utils as hu

# visualize experiments using dashboard
rm = hr.ResultManager(exp_list=None, savedir_base='results', verbose=0)
filterby_list = {'model':'logistic'}
y_metrics = ['loss']
x_metric = 'epoch'
hj.get_dashboard(rm, vars(), wide_display=True)

100%|██████████| 4/4 [00:00<00:00, 571.68it/s]


'Selected 3/3 experiments using "filterby_list"'

VBox(children=(Label(value='Select exp_group', layout=Layout(width='200px')), HBox(children=(Dropdown(layout=L…

<IPython.core.display.Javascript object>

Output()

<haven.haven_jupyter.DashboardManager at 0x7efef9e0c810>

In [None]:
# visualize dataframe manually
dataframe = rm.get_score_df()
rm.get_score_lists()
rm.get_plot_all(y_metric_list=['train_loss', 'val_acc'], x_metric='epoch', figsize=(10,5))


Unnamed: 0,exp_id,batch_size,dataset,model.n_layers,model.name
0,'b170a17c1f7c2121dc4015e5911cd0e3',1,'mnist',30,'mlp'


## INSTALLATIONS

In [None]:
!pip install --upgrade git+https://github.com/haven-ai/haven-ai