In [36]:
import torch
import argparse
import os
import logging
import time
from torch import nn
from contextlib import nullcontext
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
from transformers import BertTokenizer, BertModel, AdamW
#from tqdm.auto import tqdm

In [37]:
# get distributed conf
WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1))
RANK = int(os.environ.get("RANK", 0))

In [38]:
def parse_args(args):
    print(123)
    import argparse
    parser = argparse.ArgumentParser(description="PyTorch PERT Example")
    parser.add_argument("--batch-size", type=int, default=16, metavar="N",
                        help="input batch size for training (default: 16)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=1, metavar="N",
                        help="number of epochs to train (default: 10)")
    parser.add_argument("--lr", type=float, default=1e-5, metavar="LR",
                        help="learning rate (default: 0.01)")
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument("--dataset", type=int, default=1, metavar="D",
                        help="dataset size (default 1 * 9600)")
    parser.add_argument("--save-model", action="store_true", default=False,
                        help="For Saving the current Model")
    parser.add_argument("--local-only", action="store_true", default=False,
                        help="If set to true, then load model from disk")
    parser.add_argument("--model-path", type=str, default="/ppml/model",
                        help="Where to load model")
    parser.add_argument("--device", type=str, default="cpu",
                    help="Where to train model, default is cpu")
    # Only for test purpose
    parser.add_argument("--load-model", action="store_true", default=False,
                        help="For loading the current model")
    parser.add_argument("--mini-batch", type=int, default=0, metavar="M",
                    help="If set, the PyTorch will conduct M local-batch computation before doing a all_reduce sync")
    parser.add_argument("--log-interval", type=int, default=2, metavar="N",
                    help="how many batches to wait before logging training status")
    parser.add_argument("--log-path", type=str, default="",
                    help="Path to save logs. Print to StdOut if log-path is not set")
    return parser.parse_args(args)

In [39]:
def set_log_path(args):
    print(456)
    if args.log_path == "":
        logging.basicConfig(
        format="%(asctime)s %(levelname)-8s %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%SZ",
        level=logging.DEBUG)
    else:
        logging.basicConfig(
            format="%(asctime)s %(levelname)-8s %(message)s",
            datefmt="%Y-%m-%dT%H:%M:%SZ",
            level=logging.DEBUG,
            filename=args.log_path)

In [40]:
class Dataset(torch.utils.data.Dataset):
    # data_type is actually split, so that we can define dataset for train set/validate set
    def __init__(self, data_type, dataset_load):
        self.dataset_load = dataset_load
        self.data = self.load_data(data_type)

    def load_data(self, data_type):
        tmp_dataset = load_dataset(path='seamew/ChnSentiCorp', split=data_type)
        Data = {}
        # So enumerate will return a index, and  the line?
        # line is a dict, including 'text', 'label'
#         if data_type == 'train':
        for i in range(self.dataset_load):
            for idx, line in enumerate(tmp_dataset):
                sample = line
                Data[idx + i * len(tmp_dataset)] = sample
#         else:
#             for idx, line in enumerate(tmp_dataset):
#                 sample = line
#                 Data[idx] = sample
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [41]:
def get_dataloader(args, train_data, valid_data, tokenizer):
    if is_distributed():
        train_sampler = DistributedSampler(
            train_data, num_replicas=WORLD_SIZE, rank=RANK, shuffle=True, drop_last=False, seed=args.seed)
        valid_sampler = DistributedSampler(
            valid_data, num_replicas=WORLD_SIZE, rank=RANK, shuffle=True, drop_last=False, seed=args.seed)
        train_dataloader = DataLoader(
            train_data, batch_size=args.batch_size, collate_fn=lambda x: collate_fn(x, tokenizer), sampler=train_sampler)
        valid_dataloader = DataLoader(
            valid_data, batch_size=args.test_batch_size, collate_fn=lambda x: collate_fn(x, tokenizer), sampler=valid_sampler)
    else:
        train_dataloader = DataLoader(
            train_data, batch_size=args.batch_size, shuffle=True, collate_fn=lambda x: collate_fn(x, tokenizer))
        valid_dataloader = DataLoader(
            valid_data, batch_size=args.test_batch_size, shuffle=True, collate_fn=lambda x: collate_fn(x, tokenizer))
    return train_dataloader, train_dataloader

In [42]:
def should_distribute():
    return dist.is_available() and WORLD_SIZE > 1

In [43]:
def is_distributed():
    return dist.is_available() and dist.is_initialized()

Define dataset, so it is easier to load different split in the dataset

In [55]:
# Return a batch of data, which is used for training
def collate_fn(batch_samples, tokenizer):
    batch_text = []
    batch_label = []
    for sample in batch_samples:
        batch_text.append(sample['text'])
        batch_label.append(int(sample['label']))
    # The tokenizer will make the data to be a good format for our model to understand
    X = tokenizer(
        batch_text,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    y = torch.tensor(batch_label)
    return X, y

In [56]:
# define model
class NeuralNetwork(nn.Module):
    def __init__(self, args):
        super(NeuralNetwork, self).__init__()
        if args.local_only:
            checkpoint = args.model_path
            self.bert_encoder = BertModel.from_pretrained(
                checkpoint, local_files_only=True)
        else:
            checkpoint = 'hfl/chinese-pert-base'
            self.bert_encoder = BertModel.from_pretrained(checkpoint)
        self.classifier = nn.Linear(768, 2)

    def forward(self, x):
        bert_output = self.bert_encoder(**x)
        cls_vectors = bert_output.last_hidden_state[:, 0]
        logits = self.classifier(cls_vectors)
        return logits

In [57]:
# define training loop
def train_loop(args, dataloader, model, loss_fn, optimizer, epoch, total_loss):
    # Set to train mode
    model.train()
    total_dataset = 0
    optimizer.zero_grad(set_to_none=True)
    enumerator = enumerate(dataloader, start=1)
    for batch, (X, y) in enumerator:
        my_context = model.no_sync if WORLD_SIZE > 1 and args.mini_batch > 0 and batch % args.mini_batch != 0 else nullcontext
        with my_context():
            X, y = X.to(args.device), y.to(args.device)
            # Forward pass
            pred = model(X)
            loss = loss_fn(pred, y)
            loss.backward()
            total_loss += loss.item()
        if args.mini_batch == 0 or batch % args.mini_batch == 0:
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

        total_dataset += args.batch_size
        from torch.utils.tensorboard import SummaryWriter   
        writer = SummaryWriter('/ppml/test/pert.log')
        writer.add_scalar('loss',loss.item(), (epoch - 1) * len(dataloader) + batch)
        if batch % args.log_interval == 0:
            msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
                epoch, batch, len(dataloader),
                100. * batch / len(dataloader), loss.item())
            logging.info(msg)

    return total_loss, total_dataset

In [58]:
# define test loop to get acc
def test_loop(args, dataloader, model, mode='Test'):
    assert mode in ['Valid', 'Test']
    size = len(dataloader.dataset)
    correct = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(args.device), y.to(args.device)
            pred = model(X)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    #correct /= size
    #correct *= WORLD_SIZE
    correct = correct / (size / WORLD_SIZE)
    print(f"{mode} Accuracy: {(100*correct):>0.1f}%\n")
    return correct

In [59]:
def do_train(args, train_dataloader, valid_dataloader, model, loss_fn, optimizer):
    total_loss = 0.
    best_acc = 0.
    total_time = 0.
    total_throughput = 0.

    for t in range(args.epochs):
        print(f"Epoch {t+1}/{args.epochs + 1}\n-------------------------------")
        if is_distributed():
            # set seed
            train_dataloader.sampler.set_epoch(t)
            valid_dataloader.sampler.set_epoch(t)
        start = time.perf_counter()
        # start to train
        total_loss, total_dataset = train_loop(
            args, train_dataloader, model, loss_fn, optimizer, t+1, total_loss)
        end = time.perf_counter()
        print(f"Epoch {t+1}/{args.epochs + 1} Elapsed time:",
              end - start, flush=True)
        print(f"Epoch {t+1}/{args.epochs + 1} Processed dataset length:",
              total_dataset, flush=True)
        msg = "Epoch {}/{} Throughput: {: .4f}".format(
            t+1, args.epochs+1, 1.0 * total_dataset / (end-start))
        total_time += (end - start)
        total_throughput += total_dataset
        print(msg, flush=True)
        # to valid acc
        valid_acc = test_loop(args, valid_dataloader, model, mode='Valid')

    print("[INFO]Finish all test", flush=True)
    msg = "[INFO]Average training time per epoch: {: .4f}".format(total_time / args.epochs)
    print(msg, flush=True)

    msg = "[INFO]Average throughput per epoch: {: .4f}".format(total_throughput / total_time)
    print(msg, flush=True)

In [60]:
def main(args=None):
    # parse args
    args = parse_args(args)
    print(args)
    # set log_path
    set_log_path(args)
    
    # init tokenizer
    if args.local_only:
        checkpoint = args.model_path
        tokenizer = BertTokenizer.from_pretrained(
        checkpoint, model_max_length=512, local_files_only=True)
    else:
        checkpoint = 'hfl/chinese-pert-base'
        tokenizer = BertTokenizer.from_pretrained(checkpoint, model_max_length=512)

    torch.manual_seed(args.seed) # set seed
    
    # init pytorch distributed network if need
    if should_distribute():
        print("Using distributed PyTorch with {} backend".format(
            "GLOO"), flush=True)
        dist.init_process_group(backend=dist.Backend.GLOO)

    # Load train and valid data
    print("[INFO]Before data get loaded", flush=True)
    train_data = Dataset('train', args.dataset)
    print("######train data length:", len(train_data.data), flush=True)
    valid_data = Dataset('validation', 1)

    # init data_loader
    train_dataloader, valid_dataloader = get_dataloader(args, train_data, valid_data, tokenizer)
    print("[INFO]Data get loaded successfully", flush=True)

    #init model
    model = NeuralNetwork(args).to(args.device)
    print("what happen")
    # load pre-train model
    if (args.load_model):
        model.load_state_dict(torch.load('./pert.bin'))
    # local or distributed model
    if is_distributed():
        Distributor = nn.parallel.DistributedDataParallel
        model = Distributor(model, find_unused_parameters=True)
    
    # set loss function and optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=args.lr)
    
    # train epoch
    save_model = do_train(args, train_dataloader, valid_dataloader, model, loss_fn, optimizer)

    # save model and exit
    if (args.save_model):
        torch.save(save_model.state_dict(), "pert.bin")
    if is_distributed():
        dist.destroy_process_group()

In [61]:
if __name__ == "__main__":
    import os
    os.environ['HF_DATASETS_OFFLINE'] = '1'
    args=["--epoch", "2",
         "--log-interval", "20",
          "--test-batch-size", "16", 
          "--batch-size", "16",
          "--local-only",
          "--dataset", "1",
          "--model-path", "/ppml/model"
         ]
    main(args)

123
Namespace(batch_size=16, dataset=1, device='cpu', epochs=2, load_model=False, local_only=True, log_interval=20, log_path='', lr=1e-05, mini_batch=0, model_path='/ppml/model', save_model=False, seed=1, test_batch_size=16)
456
[INFO]Before data get loaded


2023-05-16T01:47:10Z DEBUG    open file: /root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85/dataset_info.json
2023-05-16T01:47:10Z DEBUG    open file: /root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85/dataset_info.json


######train data length: 9600


2023-05-16T01:47:11Z DEBUG    open file: /root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85/dataset_info.json
2023-05-16T01:47:11Z DEBUG    open file: /root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85/dataset_info.json


[INFO]Data get loaded successfully


Some weights of the model checkpoint at /ppml/model were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


what happen
Epoch 1/3
-------------------------------




Epoch 1/3 Elapsed time: 1373.5751556998584
Epoch 1/3 Processed dataset length: 9600
Epoch 1/3 Throughput:  6.9891
Valid Accuracy: 94.8%

Epoch 2/3
-------------------------------




Epoch 2/3 Elapsed time: 1402.1715151919052
Epoch 2/3 Processed dataset length: 9600
Epoch 2/3 Throughput:  6.8465
Valid Accuracy: 95.0%

[INFO]Finish all test
[INFO]Average training time per epoch:  1387.8733
[INFO]Average throughput per epoch:  6.9171


In [2]:
import k8s_deployment
args = ["--nnodes", "2",
        "--namespace", "default",
        "--image", "10.239.45.10/arda/intelanalytics/bigdl-ppml-trusted-deep-learning-gramine-ref:2.3.0-SNAPSHOT",
        "--env", "http_proxy", "http://child-prc.intel.com:913/",
        "--env", "https_proxy", "http://child-prc.intel.com:913/",
        "--env", "no_proxy", "10.239.45.10:8081,10.112.231.51,10.239.45.10,172.168.0.*",
        "--env", "GLOO_TCP_IFACE", "ens803f0",
        "--env", "HF_DATASETS_OFFLINE", "1",
        "--env", "SGX_ENABLED", "false",
        "--pod_cpu", "13",
        "--pod_memory", "64G",
        "--pod_epc_memory", "68719476736",
        "--driver_port", "29500",
        "--volume", '{"name":"device-plugin", "hostPath":{"path":"/var/lib/kubelet/device-plugins"}}',
        "--volume", '{"name":"aesm-socket", "hostPath":{"path":"/var/run/aesmd/aesm.socket"}}',
        "--volume", '{"name":"source-code", "nfs":{"server":"172.168.0.205", "path": "/mnt/sdb/disk1/nfsdata/wangjian/idc"}}',
        "--volume", '{"name":"nfs-data", "nfs":{"server":"172.168.0.205", "path":"/mnt/sdb/disk1/nfsdata/guancheng/hf"}}',
        "--volume", '{"name":"nfs-model", "nfs":{"server":"172.168.0.205", "path":"/mnt/sdb/disk1/nfsdata/guancheng/model/chinese-pert-base"}}',
        "--volume_mount", '{"mountPath":"/var/lib/kubelet/device-plugins","name":"device-plugin"}',
        "--volume_mount", '{"mountPath":"/var/run/aesmd/aesm.socket","name":"aesm-socket"}',
        "--volume_mount", '{"mountPath":"/root/.cache","name":"nfs-data"}',
        "--volume_mount", '{"mountPath":"/ppml/model","name":"nfs-model"}',
        "--volume_mount", '{"mountPath": "/ppml/test", "name":"source-code"}',
        "--main_script", "/ppml/test/pert.ipynb",
        "--main_script_args", "--epochs 5 --log-interval 20 --test-batch-size 16 --batch-size 16 --local-only --dataset 1 --model-path /ppml/model"]
k8s_deployment.run_k8s(args)

begin of the script
Namespace(driver_port='29500', env=[['http_proxy', 'http://child-prc.intel.com:913/'], ['https_proxy', 'http://child-prc.intel.com:913/'], ['no_proxy', '10.239.45.10:8081,10.112.231.51,10.239.45.10,172.168.0.*'], ['GLOO_TCP_IFACE', 'ens803f0'], ['HF_DATASETS_OFFLINE', '1'], ['SGX_ENABLED', 'false']], image='10.239.45.10/arda/intelanalytics/bigdl-ppml-trusted-deep-learning-gramine-ref:2.3.0-SNAPSHOT', main_script='/ppml/test/pert.ipynb', main_script_args='--epochs 5 --log-interval 20 --test-batch-size 16 --batch-size 16 --local-only --dataset 1 --model-path /ppml/model', namespace='default', nnodes=2, pod_cpu='13', pod_epc_memory='68719476736', pod_memory='64G', volume=['{"name":"device-plugin", "hostPath":{"path":"/var/lib/kubelet/device-plugins"}}', '{"name":"aesm-socket", "hostPath":{"path":"/var/run/aesmd/aesm.socket"}}', '{"name":"source-code", "nfs":{"server":"172.168.0.205", "path": "/mnt/sdb/disk1/nfsdata/wangjian/idc"}}', '{"name":"nfs-data", "nfs":{"server"

[NbConvertApp] Converting notebook /ppml/test/pert.ipynb to script


/ppml/test/pert.ipynb
['python', '/ppml/test/pert.py', '--epochs', '5', '--log-interval', '20', '--test-batch-size', '16', '--batch-size', '16', '--local-only', '--dataset', '1', '--model-path', '/ppml/model']
before service get created
Created Driver Service: bigdl-idc-e7db535-driver-service
service get created successfully
Created Driver Pod: bigdl-idc-e7db535-driver
Created Rank 1 Pod: bigdl-idc-e7db535-worker-1
You can use the following commands to check out the pods status and logs.
**** kubectl get pods -l bigdl-app=e7db535 ****
**** kubectl logs bigdl-idc-e7db535-driver ****


[NbConvertApp] Writing 14358 bytes to /ppml/test/pert.py


In [None]:
1.define model
2.define dataset and dataloader

In [None]:
1. init ppml_context
    k8s or local
2. init model
3. init dataset and dataloader
4. 