In [1]:
import torch
import argparse
import os
import logging
import time
from torch import nn
from contextlib import nullcontext
from datasets import load_dataset
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
from transformers import BertTokenizer, BertModel, AdamW

In [2]:
# Distributed launch settings read from the environment; when the variables
# are absent we behave as a single local process (world size 1, rank 0).
WORLD_SIZE = int(os.getenv("WORLD_SIZE", "1"))
RANK = int(os.getenv("RANK", "0"))

In [3]:
def parse_args(args):
    """Parse the options of the PERT fine-tuning example.

    Args:
        args: list of command-line style strings (e.g. ``["--epochs", "2"]``),
            or ``None`` to let argparse read ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``batch_size``, ``test_batch_size``, ``epochs``,
        ``lr``, ``seed``, ``dataset_size``, ``model_save_path``,
        ``model_load_path``, ``device`` and ``load_current_model``.
    """
    parser = argparse.ArgumentParser(description="PyTorch PERT Example")
    # Help strings use argparse's %(default)s so the documented default can
    # never drift away from the real one.
    parser.add_argument("--batch-size", type=int, default=16, metavar="N",
                        help="input batch size for training (default: %(default)s)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N",
                        help="input batch size for testing (default: %(default)s)")
    parser.add_argument("--epochs", type=int, default=1, metavar="N",
                        help="number of epochs to train (default: %(default)s)")
    parser.add_argument("--lr", type=float, default=1e-5, metavar="LR",
                        help="learning rate (default: %(default)s)")
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: %(default)s)")
    parser.add_argument("--dataset-size", type=int, default=1, metavar="D",
                        help="dataset size (default 1 * 9600)")
    parser.add_argument("--model-save-path", type=str, default="",
                        help="For Saving the current Model")
    parser.add_argument("--model-load-path", type=str, default="",
                        help="Where to load pretrained model, can set it to /ppml/model")
    parser.add_argument("--device", type=str, default="cpu",
                        help="Where to train model, default is cpu")
    # Only for test purpose
    parser.add_argument("--load-current-model", type=str, default="",
                        help="For loading the current model")
    return parser.parse_args(args)

In [4]:
class Dataset(torch.utils.data.Dataset):
    """In-memory, map-style view over one split of ``seamew/ChnSentiCorp``.

    ``data_type`` is the split name handed to ``load_dataset`` (so the same
    class serves the train and validation sets) and ``dataset_load_iter`` is
    how many times that split is repeated back to back.
    """

    def __init__(self, data_type, dataset_load_iter):
        self.dataset_load_iter = dataset_load_iter
        self.data = self.load_data(data_type)

    def load_data(self, data_type):
        """Return a dict mapping consecutive integer indices to samples."""
        split = load_dataset(path='seamew/ChnSentiCorp', split=data_type)
        records = {}
        for repeat in range(self.dataset_load_iter):
            # Each repetition is shifted by one full split length so keys
            # stay unique and contiguous.
            offset = repeat * len(split)
            records.update(
                (offset + position, row) for position, row in enumerate(split)
            )
        return records

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [5]:
# Turn a list of raw samples into one training batch
def collate_fn(batch_samples, tokenizer):
    """Tokenize the texts of ``batch_samples`` and stack their labels.

    Each sample must expose ``'text'`` and ``'label'``. Returns ``(X, y)``
    where ``X`` is whatever ``tokenizer`` returns for the list of texts
    (called with padding, truncation and PyTorch tensors requested) and ``y``
    is a tensor of the integer labels.
    """
    texts = [sample['text'] for sample in batch_samples]
    labels = [int(sample['label']) for sample in batch_samples]
    # The tokenizer converts raw strings into the padded tensor inputs the
    # model consumes.
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    return encoded, torch.tensor(labels)

In [6]:
def get_dataloader(tokenizer_load_path, batch_size, test_batch_size, train_data, valid_data):
    """Build the training and validation ``DataLoader`` objects.

    Args:
        tokenizer_load_path: local directory holding the tokenizer files; an
            empty string falls back to the ``hfl/chinese-pert-base`` checkpoint.
        batch_size: batch size of the training loader.
        test_batch_size: batch size of the validation loader.
        train_data: dataset used for training.
        valid_data: dataset used for validation.

    Returns:
        ``(train_dataloader, valid_dataloader)``.
    """
    # init tokenizer
    if tokenizer_load_path != "":
        # A local path is loaded strictly from disk.
        tokenizer = BertTokenizer.from_pretrained(
            tokenizer_load_path, model_max_length=512, local_files_only=True)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            'hfl/chinese-pert-base', model_max_length=512)

    def collate(batch):
        # Bind the tokenizer so the DataLoader can call a one-argument collate.
        return collate_fn(batch, tokenizer)

    train_dataloader = DataLoader(
        train_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
    # Evaluation does not depend on sample order, so the validation loader is
    # left unshuffled to keep runs deterministic.
    valid_dataloader = DataLoader(
        valid_data, batch_size=test_batch_size, shuffle=False, collate_fn=collate)
    # Fixed: the original returned the training loader twice, so validation
    # was silently computed on the training set.
    return train_dataloader, valid_dataloader

In [7]:
# define model
class NeuralNetwork(nn.Module):
    """BERT-style encoder followed by a linear layer producing 2 class logits.

    ``args.model_load_path`` selects the encoder weights: a non-empty value is
    loaded from local files only, an empty string falls back to the
    ``hfl/chinese-pert-base`` checkpoint.
    """

    def __init__(self, args):
        super().__init__()
        if args.model_load_path != "":
            self.bert_encoder = BertModel.from_pretrained(
                args.model_load_path, local_files_only=True)
        else:
            self.bert_encoder = BertModel.from_pretrained('hfl/chinese-pert-base')
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so checkpoints with another hidden width also work.
        self.classifier = nn.Linear(self.bert_encoder.config.hidden_size, 2)

    def forward(self, x):
        """Return logits computed from the first-token hidden state.

        ``x`` is unpacked as keyword arguments into the encoder.
        """
        bert_output = self.bert_encoder(**x)
        # Position 0 of the sequence axis is used as the sentence representation.
        cls_vectors = bert_output.last_hidden_state[:, 0]
        logits = self.classifier(cls_vectors)
        return logits

In [8]:
# define training loop
def train_loop(device, dataloader, model, loss_fn, optimizer, epoch, total_loss, log_interval=1):
    """Run one training epoch over ``dataloader``.

    Args:
        device: device the inputs and labels are moved to.
        dataloader: iterable of ``(X, y)`` batches.
        model: module called as ``model(X)``.
        loss_fn: callable computing the loss from ``(pred, y)``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, used only in the progress message.
        total_loss: running loss carried in from earlier epochs.
        log_interval: print progress every this many batches (default 1, i.e.
            every batch); a value of 0 or ``None`` disables progress output.

    Returns:
        ``(total_loss, total_dataset)``: the updated running loss and the
        number of samples actually processed in this epoch.
    """
    # Set to train mode
    model.train()
    total_dataset = 0
    optimizer.zero_grad(set_to_none=True)
    for batch, (X, y) in enumerate(dataloader, start=1):
        X, y = X.to(device), y.to(device)
        # Forward pass
        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        batch_loss = loss.item()
        total_loss += batch_loss
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

        # Count the labels really present in this batch: the last batch may be
        # smaller than dataloader.batch_size, which would overstate throughput.
        total_dataset += y.size(0)
        if log_interval and batch % log_interval == 0:
            num_batches = len(dataloader)
            msg = "Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
                epoch, batch, num_batches,
                100. * batch / num_batches, batch_loss)
            print(msg, flush=True)

    return total_loss, total_dataset

In [9]:
# define test loop to get acc
def test_loop(device, dataloader, model, mode='Test'):
    assert mode in ['Valid', 'Test']
    size = len(dataloader.dataset)
    correct = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    #correct /= size
    #correct *= WORLD_SIZE
    correct = correct / (size / WORLD_SIZE)
    print(f"{mode} Accuracy: {(100*correct):>0.1f}%\n")
    return correct

In [10]:
def do_train(epochs, device, train_dataloader, valid_dataloader, model, loss_fn, optimizer):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Per epoch it prints the elapsed time, the number of processed samples and
    the throughput (samples per second), then runs ``test_loop`` in ``'Valid'``
    mode. After the last epoch it prints the average epoch time and the
    overall throughput.

    Returns:
        The trained ``model`` (the same object that was passed in), so callers
        can save it.
    """
    total_loss = 0.
    total_time = 0.
    total_samples = 0

    for t in range(epochs):
        start = time.perf_counter()
        # start to train
        total_loss, epoch_samples = train_loop(
            device, train_dataloader, model, loss_fn, optimizer, t+1, total_loss)
        elapsed = time.perf_counter() - start
        print(f"Epoch {t+1}/{epochs} Elapsed time:",
              elapsed, flush=True)
        print(f"Epoch {t+1}/{epochs} Processed dataset length:",
              epoch_samples, flush=True)
        epoch_throughput = 1.0 * epoch_samples / elapsed if elapsed > 0 else 0.0
        # Fixed: the denominator shown here used to be epochs+1.
        msg = "Epoch {}/{} Throughput: {: .4f}".format(
            t+1, epochs, epoch_throughput)
        total_time += elapsed
        total_samples += epoch_samples
        print(msg, flush=True)
        # to valid acc
        test_loop(device, valid_dataloader, model, mode='Valid')

    print("[INFO]Finish all test", flush=True)
    # Guard the averages so epochs == 0 reports zeros instead of raising
    # ZeroDivisionError.
    avg_time = total_time / epochs if epochs > 0 else 0.0
    msg = "[INFO]Average training time per epoch: {: .4f}".format(avg_time)
    print(msg, flush=True)

    avg_throughput = total_samples / total_time if total_time > 0 else 0.0
    msg = "[INFO]Average throughput per epoch: {: .4f}".format(avg_throughput)
    print(msg, flush=True)

    # Fixed: the function used to return None although the caller saves the
    # returned model's state_dict.
    return model

In [11]:
def main(args=None):
    """Entry point: parse options, build data and model, train, optionally save.

    Args:
        args: list of command-line style strings forwarded to ``parse_args``,
            or ``None`` to read ``sys.argv``.

    The weights are written to ``args.model_save_path`` only when that option
    is non-empty.
    """
    # parse args
    args = parse_args(args)
    print(args)
    torch.manual_seed(args.seed)  # set seed

    # init
    # NOTE(review): ppml_context is only referenced by the disabled distributed
    # lines below; the import is kept in case it has import-time effects.
    import ppml_context
    # ppml_context = ppml_context.PPMLContext(k8s_enabled=True)

    # Load train and valid data and init data_loader
    train_data = Dataset('train', args.dataset_size)
    print("######train data length:", len(train_data.data), flush=True)
    valid_data = Dataset('validation', 1)

    # The tokenizer is loaded from the same location as the model weights.
    train_dataloader, valid_dataloader = get_dataloader(
        args.model_load_path, args.batch_size, args.test_batch_size,
        train_data, valid_data)
    print("[INFO]Data get loaded successfully", flush=True)

    # init model
    model = NeuralNetwork(args).to(args.device)

    # load pre-train model
    if args.load_current_model != "":
        # map_location lets a checkpoint saved on another device be restored
        # on args.device. torch.load unpickles the file, so only load
        # checkpoints from a trusted source.
        state_dict = torch.load(args.load_current_model, map_location=args.device)
        model.load_state_dict(state_dict)

    # set loss function and optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=args.lr)

    # get distributed
    # model, train_dataloader, valid_dataloader = ppml_context.get_distributed(model, train_dataloader, valid_dataloader)

    # train epoch
    trained_model = do_train(args.epochs, args.device, train_dataloader,
                             valid_dataloader, model, loss_fn, optimizer)
    if trained_model is None:
        # do_train updates `model` in place; fall back to it when nothing is
        # returned so saving never calls .state_dict() on None.
        trained_model = model

    # save model and exit
    if args.model_save_path != "":
        torch.save(trained_model.state_dict(), args.model_save_path)
    

In [None]:
if __name__ == "__main__":
    import os
    import sys
    # Make Hugging Face datasets use the local cache instead of the network.
    os.environ['HF_DATASETS_OFFLINE'] = '1'
    import ppml_conf
    # Each .set(key, value) pair is emitted as "--key value" by conf_to_args().
    # The key is spelled "epochs" to match the parser's --epochs option exactly
    # rather than relying on argparse's prefix matching of "--epoch".
    local_conf = (
        ppml_conf.PPMLConf(k8s_enabled=False)
        .set("epochs", "2")
        .set("test-batch-size", "16")
        .set("batch-size", "16")
        .set("dataset-size", "1")
        .set("model-load-path", "/ppml/model")
    )

    args1 = local_conf.conf_to_args()

    main(args1)
    sys.exit()

init
['--epoch', '2', '--test-batch-size', '16', '--batch-size', '16', '--dataset-size', '1', '--model-load-path', '/ppml/model']
Namespace(batch_size=16, test_batch_size=16, epochs=2, lr=1e-05, seed=1, dataset_size=1, model_save_path='', model_load_path='/ppml/model', device='cpu', load_current_model='')


Found cached dataset chn_senti_corp (/root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


######train data length: 9600


Found cached dataset chn_senti_corp (/root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85)


[INFO]Data get loaded successfully


Some weights of the model checkpoint at /ppml/model were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




In [3]:
import k8s_deployment

import ppml_conf

# Cluster-side launch configuration: environment variables for the pods and
# the resources requested for each of them.
k8s_conf = (
    ppml_conf.PPMLConf(k8s_enabled=True, sgx_enabled=False)
    .set_k8s_env("GLOO_TCP_IFACE", "ens803f0")
    .set_k8s_env("HF_DATASETS_OFFLINE", "1")
    .set_k8s("nnodes", "2")
    .set_k8s("pod_cpu", "13")
    .set_k8s("pod_memory", "64G")
    .set_k8s("pod_epc_memory", "68719476736")
)

# Volume helpers:
#   set_volume_host(volume_name, host_path)
#   set_volume_nfs(volume_name, nfs_server, nfs_path)
#   set_volume_mount(mount_path, volume_name)
# The chain is parenthesized instead of using backslash continuations; the
# original ended with a dangling "\" that only worked because a blank line
# followed it.
(
    k8s_conf
    .set_volume_nfs("source-code", "172.168.0.205", "/mnt/sdb/disk1/nfsdata/wangjian/idc")
    .set_volume_mount("/ppml/notebook/nfs", "source-code")
    .set_volume_nfs("nfs-data", "172.168.0.205", "/mnt/sdb/disk1/nfsdata/guancheng/hf")
    .set_volume_mount("/root/.cache", "nfs-data")
    .set_volume_nfs("nfs-model", "172.168.0.205", "/mnt/sdb/disk1/nfsdata/guancheng/model/chinese-pert-base")
    .set_volume_mount("/ppml/model", "nfs-model")
)

k8s_args = k8s_conf.conf_to_args()


k8s_deployment.run_k8s(k8s_args)

init
['--namespace', 'default', '--image', 'intelanalytics/bigdl-ppml-trusted-deep-learning-gramine-ref:2.4.0-SNAPSHOT', '--driver_port', '29500', '--nnodes', '2', '--pod_cpu', '13', '--pod_memory', '64G', '--pod_epc_memory', '68719476736', '--env', 'http_proxy', 'http://child-prc.intel.com:913/', '--env', 'https_proxy', 'http://child-prc.intel.com:913/', '--env', 'no_proxy', '10.239.45.10:8081,10.112.231.51,10.239.45.10,172.168.0.*', '--env', 'SGX_ENABLED', 'false', '--env', 'GLOO_TCP_IFACE', 'ens803f0', '--env', 'HF_DATASETS_OFFLINE', '1', '--volume', '{"name": "device-plugin", "hostPath": {"path": "/var/lib/kubelet/device-plugins"}}', '--volume', '{"name": "aesm-socket", "hostPath": {"path": "/var/run/aesmd/aesm.socket"}}', '--volume', '{"name": "source-code", "nfs": {"server": "172.168.0.205", "path": "/mnt/sdb/disk1/nfsdata/wangjian/idc"}}', '--volume', '{"name": "nfs-data", "nfs": {"server": "172.168.0.205", "path": "/mnt/sdb/disk1/nfsdata/guancheng/hf"}}', '--volume', '{"name": 