# Setup environment

In [1]:
# One-time dependency installation, kept commented out so that re-running the
# notebook does not reinstall anything. Uncomment on a fresh environment.
# !pip install -q accelerate
# !pip install -q bitsandbytes
# !pip install -q peft
# !pip install -q jsonargparse
# !pip install requests==2.27.1
# Print the installed versions of accelerate and bitsandbytes.
!pip freeze | grep accelerate
!pip freeze | grep bitsandbytes

accelerate==0.31.0
bitsandbytes==0.43.1


In [2]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from pathlib import Path
from typing import Optional
from torch.utils.data import random_split
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from torch import nn, optim
import torch
import os
from tqdm.notebook import tqdm
from torch.cuda.amp import GradScaler, autocast
import torch.nn.functional as F
from transformers import BitsAndBytesConfig
import numpy as np
from transformers import get_cosine_schedule_with_warmup

# Disable the Hugging Face tokenizers thread pool. The variable name is
# plural ("TOKENIZERS_..."); a second assignment that used the misspelled
# singular name had no effect and has been dropped.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import urllib3

# NOTE(review): the next two lines silence urllib3's insecure-request warning
# and blank out the CA bundle path, presumably to get through the TLS-
# intercepting proxy configured below. This weakens certificate checking for
# every download made from this kernel -- confirm it is still required.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
os.environ['CURL_CA_BUNDLE'] = ''

# Route outbound HTTP(S) traffic through the proxy.
proxy = 'http://192.168.5.8:3128'
os.environ['HTTP_PROXY'] = proxy
os.environ['HTTPS_PROXY'] = proxy

# General Settings

In [3]:
# Model identifier passed to the tokenizer / model `from_pretrained` calls.
llm_backbone = 'mistralai/Mistral-7B-v0.1'

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Optimiser and schedule settings.
num_epochs = 1
learning_rate = 3e-5
weight_decay = 5e-3
warmup_steps = 1000

# Batching: sequences are cut to `max_seq_length` tokens, and gradients from
# `accumulation_steps` micro-batches are summed before each optimiser step.
max_seq_length = 380
micro_batch_size = 2
accumulation_steps = 2

# Reporting cadence, counted in micro-batches. `eval_steps` is not referenced
# by the cells visible in this notebook.
logging_step = 50
eval_steps = 200

# Prepare data

In [4]:
from prepare_data import prepare
# prepare()

# Load data

In [5]:
# Read the serialized train / validation splits from disk.
# NOTE(review): torch.load unpickles its input, so only point it at files
# produced by a trusted pipeline.
train_data = torch.load(f="./data/dolly/train_set.pt")
val_data = torch.load(f='./data/dolly/test_set.pt')

In [6]:
def longest_seq_length(data):
    """Find the longest ``input_ids`` entry in ``data``.

    Args:
        data: iterable of mappings, each with an ``'input_ids'`` value that
            supports ``len()``.

    Returns:
        A ``(length, index)`` tuple: the largest ``len(sample['input_ids'])``
        and the position of the first sample that reaches it.

    Raises:
        ValueError: if ``data`` is empty.
    """
    token_counts = []
    for sample in data:
        token_counts.append(len(sample['input_ids']))

    # max() returns the first maximal element, so ties resolve to the
    # earliest position.
    best_idx = max(range(len(token_counts)), key=token_counts.__getitem__)
    return token_counts[best_idx], best_idx

In [7]:
torch.manual_seed(289)
def get_batch(data, mode, train_idx, longest_seq_idx, max_seq_length, micro_batch_size):
    """Assemble one right-padded micro-batch and move it to ``device``.

    Args:
        data: indexable collection of samples; each sample is a mapping with
            1-D tensors under ``'input_ids'`` and ``'labels'``.
        mode: ``'train'`` uses ``train_idx`` as the sample indices; any other
            value draws ``micro_batch_size`` random indices with
            ``torch.randint``.
        train_idx: sample indices for ``mode == 'train'`` (ignored otherwise).
            The caller's object is never modified.
        longest_seq_idx: when not ``None``, the first sample of the batch is
            replaced by this index.
        max_seq_length: when truthy, both outputs are cut to this many columns.
        micro_batch_size: number of random samples drawn outside train mode.

    Returns:
        ``(x, y)``: ``x`` holds the input ids padded on the right with ``1``,
        ``y`` holds the labels padded on the right with ``-1`` (the value
        ``Net.forward`` passes as ``ignore_index``). Both are on ``device``.
    """
    if mode == 'train':
        idx = train_idx
    else:
        idx = torch.randint(len(data), (micro_batch_size,))

    if longest_seq_idx is not None:
        # Work on a copy: writing into `idx` directly would overwrite the
        # first element of the caller's `train_idx` tensor.
        idx = torch.as_tensor(idx).clone()
        idx[0] = longest_seq_idx

    input_ids = [data[i]['input_ids'] for i in idx]
    labels = [data[i]['labels'] for i in idx]

    # Every row is padded to the longest input sequence in this batch.
    max_len = max(len(s) for s in input_ids)

    def pad_right(x, pad_id):
        n = max_len - len(x)
        return torch.cat([x, torch.full((n,), pad_id)], dim=0)

    x = torch.stack([pad_right(seq, pad_id=1) for seq in input_ids])
    y = torch.stack([pad_right(seq, pad_id=-1) for seq in labels])

    if max_seq_length:
        x = x[:, :max_seq_length]
        y = y[:, :max_seq_length]

    return x.to(device), y.to(device)

# Smoke-test the batching helper on the first two training samples.
# In train mode the indices come from `train_idx`; `micro_batch_size` is not used.
input_ids, targets = get_batch(
    data=train_data,
    mode='train',
    train_idx=torch.tensor([0, 1]),
    longest_seq_idx=None,
    max_seq_length=500,
    micro_batch_size=4,
)

# Tokenizer

In [8]:
# The Hugging Face access token is read from the HF_TOKEN environment variable
# instead of being written into the notebook. When the variable is unset,
# `token=None` is passed, which is the library default.
# NOTE(review): a token was previously committed in this cell; it should be
# treated as leaked and revoked.
tokenizer = AutoTokenizer.from_pretrained(
    llm_backbone,
    token=os.environ.get('HF_TOKEN'),
    cache_dir="../cache",
)
# Reuse the end-of-sequence token for padding when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Decode one training sample back to text as a sanity check.
tokenizer.decode(train_data[597]['input_ids'])

'<s> Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nIs Leonidas from Sparta a real man? What was he famous for?\n\n### Response:Yes, Leonidas from Sparta was a real man. He was famous for his courage and leadership during the Battle of Thermopylae in 480 BC.</s>'

# Architecture

In [9]:
class Net(nn.Module):
    """Causal language model wrapped with LoRA adapters, plus a loss head.

    The backbone named by ``llm_backbone`` is loaded with 4-bit NF4
    quantisation (double quantisation, float16 compute) and wrapped by
    ``get_peft_model`` with a LoRA configuration (r=8, alpha=16, dropout=0.05).
    """

    def __init__(self, hf_token=None):
        """Load the quantised backbone and attach the LoRA adapters.

        Args:
            hf_token: optional Hugging Face access token. When ``None`` the
                ``HF_TOKEN`` environment variable is used; if that is unset
                too, ``token=None`` (the library default) is passed.
        """
        super(Net, self).__init__()

        # Never embed the access token in source; a token that was previously
        # hard-coded here should be treated as leaked and revoked.
        if hf_token is None:
            hf_token = os.environ.get('HF_TOKEN')

        bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16
        )

        self.backbone =  AutoModelForCausalLM.from_pretrained(
            llm_backbone,
            quantization_config=bnb_config,
            cache_dir="../cache",
            token=hf_token
        )

        self.peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=8,
            lora_alpha=16,
            lora_dropout=0.05
        )

        self.backbone = get_peft_model(self.backbone, self.peft_config)
        self.backbone.print_trainable_parameters()

    def forward(self, input_ids, targets):
        """Compute next-token logits and the cross-entropy loss.

        Args:
            input_ids: token ids fed to the backbone.
            targets: label ids with the same layout as ``input_ids``;
                positions equal to ``-1`` are excluded from the loss.

        Returns:
            ``(logits, loss)`` where ``logits`` is flattened to
            ``(B*T, C)`` after dropping the last position, and ``loss`` is the
            cross-entropy against ``targets`` shifted left by one.
        """
        logits = self.backbone(input_ids).logits
        # Position t predicts token t+1: drop the last logit and first target.
        logits = logits[..., :-1, :]
        targets = targets[..., 1:]

        B, T, C = logits.shape
        logits = logits.reshape(B*T, C)
        targets = targets.reshape(-1)

        loss = F.cross_entropy(logits, targets, ignore_index=-1)

        return logits, loss

# Fine-tuning

In [10]:
torch.manual_seed(279)
model = Net().to(device)

optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Build the micro-batch index list once so the scheduler knows how many steps
# an epoch contains. Splitting the permutation and keeping only full chunks
# retains the final full micro-batch even when the dataset size is an exact
# multiple of `micro_batch_size` (a range() bound of
# `len(train_data) - micro_batch_size` silently dropped it).
shuffled_indices = torch.randperm(len(train_data))
train_indices = [
    chunk for chunk in shuffled_indices.split(micro_batch_size)
    if len(chunk) == micro_batch_size
]

# The training loop steps the scheduler once per micro-batch, so the schedule
# length is counted in micro-batches.
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=len(train_indices)*num_epochs
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.0470


In [11]:
torch.manual_seed(279)

# Per-micro-batch training losses and per-logging-step validation losses;
# the progress line reports the running mean of each list.
train_losses = []
val_losses = []

for epoch in range(num_epochs):

    # Reshuffle every epoch and keep only full micro-batches. Splitting the
    # permutation keeps the last full chunk even when the dataset size is an
    # exact multiple of `micro_batch_size` (a range() bound of
    # `len(train_data) - micro_batch_size` silently dropped it).
    shuffled_indices = torch.randperm(len(train_data))
    train_indices = [
        chunk for chunk in shuffled_indices.split(micro_batch_size)
        if len(chunk) == micro_batch_size
    ]

    train_indices = tqdm(train_indices, desc=f'Epoch: {epoch+1}')

    # True while gradients have been accumulated but not yet applied.
    pending_grads = False

    for batch_idx, batch in enumerate(train_indices):

        model.train()

        input_ids, targets = get_batch(
            train_data,
            train_idx=batch,
            mode='train',
            micro_batch_size=None,
            longest_seq_idx=None,
            max_seq_length=max_seq_length
        )

        _, loss = model(input_ids, targets)

        train_losses.append(loss.item())
        # Scale so that the summed gradient over one accumulation window
        # matches the mean loss of that window.
        loss = loss / accumulation_steps
        loss.backward()
        pending_grads = True

        if (batch_idx + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            pending_grads = False

        # NOTE(review): the scheduler advances once per micro-batch (not once
        # per optimiser step); this matches a schedule whose length is counted
        # in micro-batches.
        scheduler.step()

        if not batch_idx % logging_step:
            with torch.no_grad():
                model.eval()

                # Validation loss on one randomly drawn micro-batch.
                input_ids, targets = get_batch(
                    val_data,
                    train_idx=None,
                    mode='val',
                    micro_batch_size=micro_batch_size,
                    longest_seq_idx=None,
                    max_seq_length=max_seq_length
                )

                _, val_loss = model(input_ids, targets)
                val_losses.append(val_loss.item())

                print(
                f'Epoch: {epoch + 1}/{num_epochs}'
                f' | Batch: {batch_idx}/{len(train_indices)}'
                f' | Train Loss: {np.mean(train_losses)}'
                f' | Val loss: {np.mean(val_losses)}'
                )


        if not batch_idx % 100:
            # Qualitative check: generate a completion for a fixed prompt.
            with torch.no_grad():
                model.eval()
                text = """Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nTell me about Viettel Networks (VTNet)?\n\n### Response:\n"""
                answer = tokenizer.decode(
                    model.backbone.generate(
                        **tokenizer(text, return_tensors='pt').to(device),
                        max_new_tokens=300,
                        pad_token_id=tokenizer.pad_token_id,
                    )[0]
                )
                print(answer)

    # Apply gradients left over from an incomplete accumulation window so they
    # are neither lost nor mixed into the first window of the next epoch.
    if pending_grads:
        optimizer.step()
        optimizer.zero_grad()

Epoch: 1:   0%|          | 0/6754 [00:00<?, ?it/s]

Epoch: 1/1 | Batch: 0/6754 | Train Loss: 1.7163934707641602 | Val loss: 2.467076301574707
<s> Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Tell me about Viettel Networks (VTNet)?

### Response:

Viettel Networks (VTNet) is a Vietnamese telecommunications company that provides a range of services, including mobile, fixed-line, and internet services. The company was founded in 2001 and is headquartered in Hanoi, Vietnam.

VTNet is a subsidiary of Viettel Group, which is a state-owned enterprise that is majority-owned by the Vietnamese government. The company has a strong presence in Vietnam, with a network that covers over 90% of the country’s population.

VTNet offers a range of mobile services, including voice, data, and SMS services. The company also provides fixed-line services, including broadband and voice services. In addition, VTNet offers a range of internet services, including broadband, mobile data,

In [12]:
# Write every entry of the wrapper's state dict to a single file in the
# current working directory.
torch.save(obj=model.state_dict(), f='mistral_7b_dolly.pt')