In [1]:
# Load the TensorBoard Jupyter extension; it provides the `%tensorboard`
# line magic that a later cell uses to launch the dashboard.
%load_ext tensorboard

In [None]:
# Show NVIDIA GPU/driver information by shelling out to `nvidia-smi`.
# NOTE(review): presumably only useful on a machine with an NVIDIA driver
# installed — confirm before relying on this cell in a full run.
!nvidia-smi

In [2]:
# Install requirements.
!pip install -r ../requirements.txt

Collecting aiohttp==3.8.1
  Downloading aiohttp-3.8.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 1.1 MB/s eta 0:00:01
[?25hCollecting aiosignal==1.2.0
  Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)
Collecting asttokens==2.0.5
  Downloading asttokens-2.0.5-py2.py3-none-any.whl (20 kB)
Collecting async-timeout==4.0.2
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting attrs==21.4.0
  Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)
[K     |████████████████████████████████| 60 kB 3.2 MB/s eta 0:00:01
Collecting black==22.3.0
  Downloading black-22.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 5.5 MB/s eta 0:00:01
[?25hCollecting cachetools==5.0.0
  Downloading cachetools-5.0.0-py3-none-any.whl (9.1 kB)
Collecting certifi==2021.10.8
  Downloading certifi-2021.10.8-py2.py3

In [3]:
import datasets
import torch
import numpy as np
import pytorch_lightning as pl
import transformers
import torchmetrics
import gzip
import requests
import pandas as pd
import re
import os
import json
import yaml
import sys

# Put the parent of the current working directory on the import path
# (presumably where the `lib` package imported below lives), once only.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from lib.data import load_amazon, load_imdb, stats, build_collate
from lib.train import train
from lib.test import test

In [4]:
# Pick the compute device: CUDA when torch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Notebook-wide settings.
seed: int = 42  # passed to both dataset loaders
val_ratio: float = 0.1  # passed to load_imdb as `val_ratio`
root_dir: str = "logs"  # same directory name the cleanup and TensorBoard cells use

In [6]:
# IMDB splits (train / validation / test), loaded from ../data/imdb.
imdb_train, imdb_val, imdb_test = load_imdb(
    directory="../data/imdb",
    val_ratio=val_ratio,
    seed=seed,
)

# Amazon splits, requested with the same sizes as the IMDB splits above.
amazon_train, amazon_val, amazon_test = load_amazon(
    directory="../data/amazon",
    train_size=len(imdb_train),
    val_size=len(imdb_val),
    test_size=len(imdb_test),
    seed=seed,
)

Loading dataset from [../data/imdb]
Loading dataset from [../data/amazon]


In [7]:
# Report the total number of examples per corpus, followed by the per-split
# breakdown printed by `stats`. The totals are interpolated into real
# f-strings (the originals carried an `f` prefix but had no placeholders);
# the printed text is unchanged.
print(f"IMDb: {len(imdb_train) + len(imdb_val) + len(imdb_test)}")
stats(imdb_train, imdb_val, imdb_test)
print()  # blank line between the two corpora
print(f"Amazon: {len(amazon_train) + len(amazon_val) + len(amazon_test)}")
stats(amazon_train, amazon_val, amazon_test)

IMDb: 50000
Train (22500)
| Positive: 11250 (50.00%)
| Negative: 11250 (50.00%)
Val (2500)
| Positive: 1250 (50.00%)
| Negative: 1250 (50.00%)
Test (25000)
| Positive: 12500 (50.00%)
| Negative: 12500 (50.00%)

Amazon: 50000
Train (22500)
| Positive: 11250 (50.00%)
| Negative: 11250 (50.00%)
Val (2500)
| Positive: 1250 (50.00%)
| Negative: 1250 (50.00%)
Test (25000)
| Positive: 12500 (50.00%)
| Negative: 12500 (50.00%)


In [8]:
def _load_experiment_config(path):
    """Parse the YAML file at `path` and expand its experiment list.

    Args:
        path: Location of a YAML document with a "common" mapping and an
            "experiments" sequence of mappings.

    Returns:
        A `(config, experiments)` pair. `config` is the parsed document;
        `experiments` holds one dict per entry of `config["experiments"]`,
        built by overlaying that entry on `config["common"]` (the entry's
        values win when a key appears in both).

    Raises:
        KeyError: If the document lacks a "common" or "experiments" key.
    """
    # Explicit UTF-8: the default text encoding of open() depends on the
    # platform locale, which can mis-decode non-ASCII YAML content.
    with open(path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(stream=f)
    experiments = [
        {**config["common"], **experiment} for experiment in config["experiments"]
    ]
    return config, experiments


# Load train configs.
train_config, train_experiments = _load_experiment_config("../experiments/train.yaml")

# Load test configs.
test_config, test_experiments = _load_experiment_config("../experiments/test.yaml")

In [9]:
# Clear cache and checkpoint logs.
# WARNING: destructive — recursively deletes the `logs` directory relative to
# the current working directory, so results of earlier runs are lost.
!rm -rf logs
# NOTE(review): /tmp/.tensorboard-info is presumably where TensorBoard keeps
# records of running instances; removing it looks intended to avoid reusing
# a stale session — confirm.
!rm -rf /tmp/.tensorboard-info

In [10]:
# Launch a TensorBoard session reading from the `logs` directory (the same
# directory the previous cell deletes). Requires the `tensorboard` extension
# loaded in the first cell.
%tensorboard --logdir=logs

In [None]:
# Run every train experiment on the train/validation splits of the corpus
# named by its "dataset" entry.
for experiment_args in train_experiments:
    dataset_name = experiment_args["dataset"]
    if dataset_name == "imdb":
        splits = {"train_dataset": imdb_train, "val_dataset": imdb_val}
    elif dataset_name == "amazon":
        splits = {"train_dataset": amazon_train, "val_dataset": amazon_val}
    else:
        # Fail on a config that names a corpus this notebook did not load.
        raise ValueError(f"Unknown dataset: {experiment_args['dataset']}")
    train(**experiment_args, **splits)

In [None]:
# Run every test experiment on the test split of the corpus named by its
# "dataset" entry.
for experiment_args in test_experiments:
    dataset_name = experiment_args["dataset"]
    if dataset_name == "imdb":
        test_dataset = imdb_test
    elif dataset_name == "amazon":
        test_dataset = amazon_test
    else:
        # Fail on a config that names a corpus this notebook did not load.
        raise ValueError(f"Unknown dataset: {experiment_args['dataset']}")
    test(**experiment_args, test_dataset=test_dataset)