In [None]:
# Using parallelformers to parallelize models from Hugging Face
import os
import time

import torch
from parallelformers import parallelize
from torch import nn
from torch.distributed.pipeline.sync import Pipe
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM, pipeline

In [None]:
# Runtime settings. `device` and `dtype` are read again by the pipeline cell
# further down; `dtype` is also used for the model load just below.
device = 'cpu'
dtype = torch.float

# Checkpoint identifier shared by the tokenizer and the model.
checkpoint = "t5-base"

# Keyword arguments forwarded to the model's `from_pretrained` call.
model_load_options = dict(
    cache_dir='/root/NAS/huggingface_cache',
    torch_dtype=dtype,
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, **model_load_options)

In [None]:
# Call the model object's own `parallelize()` method with its default arguments.
# NOTE(review): presumably this is the Hugging Face built-in model-parallel helper
# for T5 — confirm. The model was loaded with device_map='auto' above and the next
# cell applies parallelformers' `parallelize` to the same object; these look like
# alternative experiments rather than steps meant to be run together — verify.
model.parallelize()

In [None]:
# Hand `model` to parallelformers' `parallelize`, requesting 2 GPUs with fp16 disabled.
# NOTE(review): `model` may already have been placed on devices by device_map='auto'
# and by the previous cell's `model.parallelize()` — confirm which of the three
# placement strategies is intended before running this cell.
parallelize(model, num_gpus=2, fp16=False)

In [None]:
# The same prompt repeated 1000 times; the loop below generates for each copy.
text = ["How babies are born?"]*1000

# Generation settings do not change between iterations, so build them once.
# The pipeline cell below also reads `generate_kwargs`.
generate_kwargs = {'max_length': 300}

for inp in tqdm(text):
    # Tokenize one prompt and move the encoded batch to the GPU.
    input_ids = tokenizer(inp, return_tensors="pt").to('cuda')
    outs = model.generate(**input_ids, **generate_kwargs).cpu()

    aa = tokenizer.batch_decode(outs, skip_special_tokens=True)

# Only the decoded output of the final iteration is shown.
print(aa)

In [None]:
# `model` is loaded through AutoModelForSeq2SeqLM (an encoder-decoder model), so the
# matching pipeline task is 'text2text-generation'; 'text-generation' is the task for
# causal (decoder-only) language models.
# NOTE(review): `device` is 'cpu' here while the model was loaded with
# device_map='auto' — confirm the pipeline should be given an explicit device at all.
generator = pipeline('text2text-generation', model=model, tokenizer=tokenizer, device=device, torch_dtype=dtype)
generator(text, **generate_kwargs)

In [None]:
# Memory Consumption:
# 1. Size of model
# 2. Intermediate outputs during forward pass (depends on batch size)
# 3. Discard intermediate values and store gradient values in backward pass (equal to model size)
# 4. Store all the optimizer states for each weight parameter in the optim step (n_states x model size)

# 5. Mixed Precision Training cuts the memory in half during forward pass by using 16bit for forward pass and storing gradients in 32bits
# 6. Twice the memory to store the model when using DDP (Need to store gradients communicated from all other GPUs)
# 7. During inference, memory consumed is only model size



# Transformer hyperparameters read by the model definitions in the cells below.
d_model = 4096             # embedding / hidden width
nhead = 64                 # attention heads per layer
num_encoder_layers = 8
num_decoder_layers = 4
# The original cell assigned 10240 and then immediately overwrote it with 80;
# only the effective value is kept.
dim_feedforward = 80

In [None]:
# Model Parallelism on 2 GPUs

class Transformer(nn.Module):
    """Encoder-decoder transformer with the encoder and decoder on separate devices.

    Layer sizes come from the module-level hyperparameters (`d_model`, `nhead`,
    `dim_feedforward`, `num_encoder_layers`, `num_decoder_layers`).

    Args:
        enc_device: device that holds the encoder stack (default 0).
        dec_device: device that holds the decoder stack (default 1).
    """

    def __init__(self, enc_device=0, dec_device=1):
        super().__init__()

        # Remembered so forward() moves inputs to wherever each half was placed.
        self.enc_device = enc_device
        self.dec_device = dec_device

        en_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)
        den_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward)

        # The encoder stays wrapped in nn.Sequential so parameter names keep the
        # `enc.0.` prefix.
        self.enc = nn.Sequential(nn.TransformerEncoder(en_layer, num_encoder_layers)).to(enc_device)
        self.dec = nn.TransformerDecoder(den_layer, num_decoder_layers).to(dec_device)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Run the encoder on its device, then the decoder on its device.

        Args:
            src: encoder input; moved to the encoder device.
            tgt: decoder input; moved to the decoder device.
            src_mask: optional attention mask forwarded to the encoder.
            tgt_mask: optional attention mask forwarded to the decoder.

        Returns:
            The decoder output, located on the decoder device.
        """
        if src_mask is not None:
            src_mask = src_mask.to(self.enc_device)
        if tgt_mask is not None:
            tgt_mask = tgt_mask.to(self.dec_device)

        # nn.Sequential only forwards a single positional input, so the wrapped
        # encoder is called directly in order to pass the mask through.
        memory = self.enc[0](src.to(self.enc_device), mask=src_mask)

        # Encoder activations cross devices here before feeding the decoder.
        return self.dec(tgt.to(self.dec_device), memory.to(self.dec_device), tgt_mask=tgt_mask)

model = Transformer()

# Count trainable parameters; the printed figure multiplies that count by 4.
trainable_param_count = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f'Size of model is: {trainable_param_count * 4:,} bytes')

In [None]:
cretation = nn.MSELoss()
optim = torch.optim.AdamW(model.parameters())

in_seq_len = 12
out_seq_len = 18
batch_size = 128
embedding_size = d_model


def _cuda_sync_all():
    """Wait for queued work on every visible CUDA device to finish (no-op without CUDA)."""
    if torch.cuda.is_available():
        for dev in range(torch.cuda.device_count()):
            torch.cuda.synchronize(dev)


# CUDA kernels are launched asynchronously, so synchronize around the timed
# region; otherwise the wall-clock reading can stop before the GPUs are done.
_cuda_sync_all()
start = time.time()
for i in tqdm(range(10)):
    # Random sequence-first batches: (seq_len, batch, embedding).
    src = torch.randn(in_seq_len, batch_size, embedding_size)
    tgt = torch.randn(out_seq_len, batch_size, embedding_size)

    # Forward pass: feed tgt[:-1] to the decoder and score against tgt[1:].
    optim.zero_grad()
    out = model(src, tgt[:-1, :, :])
    # Put the target on whatever device the model output landed on.
    loss = cretation(out, tgt[1:, :].to(out.device))
    # Backward pass
    loss.backward()
    # Optimizer step
    optim.step()

_cuda_sync_all()
print(f"Time taken for Model parallel on 2 GPUs: {time.time() - start :0.2f}s")


In [None]:
from varuna import CutPoint, Varuna

In [None]:
# Model Parallelism on 2 GPUs with Varuna

def get_batch_fn(size: int, device: torch.device = None, in_seq_len: int = 12,
                 out_seq_len: int = 18, embedding_size: int = None):
    """Build a random batch-first `src`/`tgt` pair.

    Args:
        size: number of sequences in the batch (first tensor dimension).
        device: device to allocate the tensors on; ``None`` uses torch's
            default device.
        in_seq_len: length of each source sequence.
        out_seq_len: length of each target sequence.
        embedding_size: feature width of every position; when ``None`` the
            module-level ``d_model`` is used.

    Returns:
        dict with ``"src"`` of shape ``(size, in_seq_len, embedding_size)`` and
        ``"tgt"`` of shape ``(size, out_seq_len, embedding_size)``.
    """
    if embedding_size is None:
        embedding_size = d_model

    # Allocate directly on the target device rather than creating on CPU and
    # copying. With a fixed seed, the sampled values can therefore differ from
    # CPU-generated ones on non-CPU devices.
    src = torch.randn(size, in_seq_len, embedding_size, device=device)
    tgt = torch.randn(size, out_seq_len, embedding_size, device=device)

    return {"src": src, "tgt": tgt}

class TransformerVaruna(nn.Module):
    """Batch-first encoder-decoder transformer with a Varuna cut-point after the encoder.

    Layer sizes come from the module-level hyperparameters (`d_model`, `nhead`,
    `dim_feedforward`, `num_encoder_layers`, `num_decoder_layers`).
    """

    def __init__(self):
        super().__init__()

        en_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, batch_first=True)
        den_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, batch_first=True)

        num_cutpoints = 1
        # nn.ModuleList registers the cut-points as submodules, so they show up in
        # named_modules() and follow .to(); a plain Python list would hide them.
        # NOTE(review): assumes varuna.CutPoint is an nn.Module — confirm.
        self.cutpoints = nn.ModuleList(CutPoint() for _ in range(num_cutpoints))

        self.enc = nn.Sequential(nn.TransformerEncoder(en_layer, num_encoder_layers))
        self.dec = nn.TransformerDecoder(den_layer, num_decoder_layers)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        """Encode `src`, pass the result through the cut-point, then decode `tgt`.

        `src_mask` and `tgt_mask` are accepted but not used by this
        implementation.
        """
        memory = self.enc(src)
        # The single cut-point sits between the encoder and the decoder.
        memory = self.cutpoints[0](memory)

        return self.dec(tgt, memory)

model = TransformerVaruna()

# Count trainable parameters; the printed figure multiplies that count by 4.
trainable_param_count = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print(f'Size of model is: {trainable_param_count * 4:,} bytes')


In [None]:
# Wrap the model in Varuna and attach the optimizer to the wrapper.
# NOTE(review): `args` is not defined anywhere in the visible notebook; it
# presumably comes from a launcher's argument parser — confirm before running.
model = Varuna(
    model,
    args.stage_to_rank_map,
    get_batch_fn,
    args.batch_size,
    args.chunk_size,
    args.fp16,
    local_rank=args.local_rank,
    device=args.local_rank,
)

cretation = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters())
model.set_optimizer(optimizer)



In [None]:
# Time ten Varuna steps on freshly generated random batches.
start = time.time()
for i in range(10):
    inputs = get_batch_fn(100, 'cpu')
    loss, overflow = model.step(inputs)
    loss = torch.Tensor([loss])

    # Skip the parameter update when the step reported an overflow.
    if overflow:
        continue
    optimizer.step()

elapsed_seconds = time.time() - start
print(f"Time taken for Model parallel on 2 GPUs: {elapsed_seconds:0.2f}s")