## Scenario 2 - Newly trained model implementation

In [1]:
import os
import torch
from typing import List, Optional
import llama
from llama import Llama, Dialog
# import datasets # needed for handling datasets
from datasets import (  load_dataset_builder, # finding info, description, etc.
                        load_dataset, # Loading from our Huggingface profile
                        )
# import transformers
from transformers import (
    LlamaForCausalLM , 
    # LlamaTokenizer # Two core modules for handling model and tokenizer
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    HfArgumentParser,
    TrainingArguments,
    IntervalStrategy,
    pipeline,
    logging
)
# trl stands for Transformer Reinforcement Learning
from trl import SFTTrainer
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    TaskType
)
import evaluate

from tabulate import tabulate
import matplotlib.pyplot as plt
import datetime


  from .autonotebook import tqdm as notebook_tqdm
`AnnotionFormat` is deprecated and will be removed in v4.38. Please use `transformers.image_utils.AnnotationFormat` instead.



bin d:\GitHub repositories\CRMSC\LlamaChatTraining Environment\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda118.dll




In [2]:
# Define major elements of Llama2 7b
#
# Checkpoint and tokenizer locations are published through os.environ so that
# other cells can look them up by key.
# The backslashes of the Windows path are escaped explicitly: '\G' and '\l'
# are not valid escape sequences and trigger a Deprecation/SyntaxWarning,
# while '\\' yields exactly the same runtime value.
os.environ['models_loc']            = 'D:\\GitHub repositories\\llama\\'
os.environ['ckpt_dir']              = os.environ['models_loc'] + 'llama-2-7b-chat'
# The tokenizer file is named 'tokenizer.model' (the original literal used a comma).
os.environ['tokenizer_path']        = os.environ['models_loc'] + 'tokenizer.model'
os.environ['ckpt_dir_crmsc']        = os.environ['models_loc'] + 'llama-2-7b-chat-hf'
os.environ['ckpt_dir_crmsc_output'] = os.environ['models_loc'] + 'llama-2-7b-chat-hf-crmsc'

# Single-process settings.
# NOTE(review): presumably read by torch.distributed-style initialisation in
# the reference `llama` package -- confirm they are still needed here.
os.environ['RANK']        = '0'
os.environ['WORLD_SIZE']  = '1'
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '12355'

# Prompt markup tags used when building and parsing prompts.
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS   = "<<SYS>>", "<</SYS>>"
PAD            = '[PAD]'

# Dataset files.
train_dataset = [
    'env_1 - converted.txt',
    # 'env_2 - converted.txt',
    # 'env_3 - converted.txt',
    # 'env_4 - converted.txt',
]
# NOTE(review): 'eenv_1' may be an intentional evaluation-set prefix or a typo
# for 'env_1' -- confirm against the files on disk.
validation_dataset = [
    'eenv_1 - converted.txt',
]

# Quantization (bitsandbytes) settings.
bnb_4bit_compute_dtype = 'float16'  # Compute dtype for 4-bit base models
use_4bit               = True       # Activate 4-bit precision base model loading
bnb_4bit_quant_type    = 'nf4'      # Quantization type (fp4 or nf4)
use_nested_quant       = False      # Activate nested quantization for 4-bit base models

# Target device: first CUDA device when available, otherwise CPU.
__cuda     = torch.device("cuda:0" if torch.cuda.is_available() else 'cpu')
device_map = __cuda  # alternative: {"": 0} to load the entire model on device 0

# LoRA settings.
lora_alpha   = 64
lora_dropout = 0.05
lora_r       = 512  # might be too much, needs to be modified later

# Trainer settings.
per_device_train_batch_size = 2
per_device_eval_batch_size  = 2
gradient_accumulation_steps = 1      # Number of update steps to accumulate the gradients for, before performing a backward/update pass.
gradient_checkpointing      = False  # Default is False. If True, use gradient checkpointing to save memory at the expense of a slower backward pass.
optim                       = "paged_adamw_32bit"  # adamw_torch , adamw_hf
save_steps                  = 100    # save every x steps
logging_steps               = 1      # log every x update steps
learning_rate               = 8e-4
fp16                        = False  # Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
bf16                        = False  # Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training.
max_grad_norm               = 0.64   # Maximum gradient norm (for gradient clipping). Default is 1.0
max_steps                   = -1     # Number of optimizer update steps / training steps to perform
# warmup_ratio                = 0      # Ratio of total training steps used for a linear warmup from 0 to learning_rate.
# warmup_steps                = 0
# weight_decay                = 0.0
# group_by_length             = True
# lr_scheduler_type           = "linear"  # better than cosine
max_seq_length              = None
max_new_tokens              = 600
packing                     = False  # use packing dataset training
# NOTE(review): the name is misspelled ("evalaution"); it is kept unchanged
# because cells outside this one may reference it.
evalaution_strategy         = IntervalStrategy.STEPS

# Settings for the tokenizer.
padding_side                 = 'left'
max_length                   = 400   # Might be the same as max_seq_length; kept separate to distinguish the tokenizer limit from the trainer limit.
clean_up_tokenization_spaces = True  # False by default
use_default_system_prompt    = True  # False by default
# Inherited from Guardrail ML ( https://colab.research.google.com/drive/134o_cXcMe_lsvl15ZE_4Y75Kstepsntu?usp=sharing#scrollTo=nAMzy_0FtaUZ )
def load( padding_side : str = padding_side):
    """Load and configure the tokenizer of the Hugging Face chat checkpoint.

    The tokenizer is read from the directory stored in
    ``os.environ['ckpt_dir_crmsc']``. BOS/EOS tokens are not added
    automatically, and the pad token is set to the closing instruction tag
    ``E_INST``.

    Args:
        padding_side: Side on which the tokenizer pads sequences. The default
            is the module-level ``padding_side`` value captured when this
            function was defined.

    Returns:
        The configured tokenizer instance.
    """
    # Resolve the dtype name (e.g. 'float16') to the torch dtype object once
    # and hand that object to the quantization config, instead of computing
    # it and then passing the raw string.
    compute_dtype   =   getattr(torch, bnb_4bit_compute_dtype) # focusing on 4 bits quantization
    # NOTE(review): bnb_config is only consumed by the commented-out model
    # loading below; it is currently built but not used.
    bnb_config      =   BitsAndBytesConfig (
        load_in_4bit    =   use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant
    )

    # Initializing the model
    # model   =   AutoModelForCausalLM.from_pretrained( os.environ['ckpt_dir_crmsc'],
    #                                                 device_map              =   __cuda,
    #                                                 quantization_config     =   bnb_config,
    #                                                   )
    # model.config.use_cache      =   False   # Whether or not the model should return the last key/values attentions
    # model.config.pretraining_tp =   1       # for faster computation, but inaccurate, increase for better accuracy but slow calculation

    # Initializing Parameter-Efficient Fine-Tuning configuration (Peft)
    # Harnessing Low-Rank approximation technique
    # peft_config     =   LoraConfig    (
    #     lora_alpha=lora_alpha,
    #     lora_dropout=lora_dropout,
    #     r=lora_r,
    #     bias='lora_only',
    #     task_type=TaskType.CAUSAL_LM
    # )

    # Finally, loading the tokenizer.
    # The checkpoint directory is used instead of the '.model' file to avoid a
    # deprecation warning; the trust_remote_code argument still needs review.
    tokenizer       =   AutoTokenizer.from_pretrained( os.environ['ckpt_dir_crmsc'] ,
                                                      trust_remote_code=True,
                                                      padding_side=padding_side,
                                                      add_bos_token=False,   # bos is True by default
                                                      add_eos_token=False,   # eos is False by default
                                                      clean_up_tokenization_spaces  =   clean_up_tokenization_spaces,
                                                      use_default_system_prompt     =   use_default_system_prompt,
                                                      )
    # tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # or 'tokenizer.eos_token
    # NOTE(review): the pad token is the closing instruction tag '[/INST]'
    # rather than a dedicated pad token -- confirm this matches the setup the
    # model was fine-tuned with.
    tokenizer.pad_token = E_INST
    return tokenizer #, peft_config

In [3]:
# Build the tokenizer once, using load()'s default padding side; the Ask()
# helper defined further down reads this module-level `tokenizer`.
tokenizer  = load()

In [4]:
# Load the model stored under os.environ['ckpt_dir_crmsc_output'] as a plain
# LlamaForCausalLM.
# NOTE(review): despite the variable name, no PEFT adapter is attached here --
# the PeftModel path below is commented out. Presumably the saved checkpoint
# already contains merged weights; confirm.
peftmodel       =   LlamaForCausalLM.from_pretrained( os.environ['ckpt_dir_crmsc_output'] )
# peftmodel       =   PeftModel.from_pretrained( model , os.environ['ckpt_dir_crmsc_output'] )
# peftmodel.merge_and_unload()

Loading checkpoint shards:  67%|██████▋   | 2/3 [00:03<00:01,  1.79s/it]

In [None]:
# Environment description handed to the model as context. The text starts and
# ends with a newline and is spelled out one sentence per literal so that the
# line breaks are explicit.
Scenario1 = (
    "\n"
    "On this environment, we have one country called 'Z', in which one supplier and one consumer can be found, and their names are 'X' and 'A' respectively.\n"
    "There are no more countries on this environment\n"
    "This country holds 205 metric tons of Lithium Ore, providing it for its suppliers. suppliers do not supply ore but the hydroxide type of lithium to consumers.\n"
    "The supplier can convert Lithium Ore to Lithium Hydroxide by the conversion ratio of 0.9, meaning that one ton of Lithium Ore can be converted to 0.9 tons of Lithium Hydroxide.\n"
    "The supplier ('X') provides 4 tons of Lithium Hydroxide to the consumer, 'A'.\n"
    "The transportation delivery from 'X' to 'A' is 3 days.\n"
)

# Questions asked against the scenario above.
Question1 = "What are the relationships between suppliers and consumers? On which country are they located?"
Question2 = "What is the HHI of Lithium Ore on this environment?"
Question3 = "How much Lithium Hydroxide can be produced from one ton of lithium ore, considering the extraction and refining process?"
Question4 = "Can you make a brief report of the interactions between suppliers and consumers?"

In [None]:
def Ask(Question: str, Scenario: Optional[str] = None) -> None:
    """Ask the loaded model a question and print the question and its answer.

    A prompt is assembled from the optional scenario (wrapped in the
    ``B_SYS``/``E_SYS`` tags) and the question (wrapped in the
    ``B_INST``/``E_INST`` tags), sent through a text-generation pipeline built
    on the module-level ``peftmodel`` and ``tokenizer``, and the text after
    the first ``E_INST`` tag of the result is printed.

    Args:
        Question: The question to ask.
        Scenario: Optional context description. When ``None`` the system
            section is left out of the prompt instead of rendering the
            literal text "None".

    Returns:
        None. The question and the response are written to stdout.
    """
    generation_config = GenerationConfig(
        # More than one beam switches from greedy search to beam search:
        # several hypotheses are tracked at each step and the one with the
        # highest overall probability for the whole sequence is chosen.
        num_beams=12,
        # Group (diverse) beam search; the field is `num_beam_groups`. The
        # previous spelling `num_beams_group` is not a generation parameter.
        num_beam_groups=3,
        diversity_penalty=0.3,
        do_sample=False,  # no sampling, so group beam search is used
        # top_k=150,
        # Sampling parameters; they only take effect when do_sample=True.
        top_p=1.0,
        temperature=0.7,
        # max_new_tokens=1024,
        max_length=2048,
        early_stopping=True,
        use_cache=False,
        eos_token_id=tokenizer.eos_token_id,  # End of sequence token
        bos_token_id=tokenizer.bos_token_id,  # Beginning of sequence token
        pad_token_id=tokenizer.pad_token_id,  # Padding token
    )
    # pipeline()'s `config` argument is the model configuration, not the
    # generation settings, so the generation config is passed with the call
    # below, where it is forwarded to generate().
    pipe = pipeline(
        task='text-generation',
        model=peftmodel,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
    )
    ## This is going to be deprecated
    # generator   =   pipeline( task="text-generation" , model=trainer.model , tokenizer=trainer.tokenizer )
    # result      =   generator(f"<s>[INST] {prompt} [/INST]")
    # print( result[0]['generated_text'])
    if Scenario is None:
        # No context supplied: skip the system section entirely.
        Prompt = f"<s>{B_INST} {Question} {E_INST}"
    else:
        Prompt = f"<s>{B_SYS} {Scenario} {E_SYS}{B_INST} {Question} {E_INST}"
    Results = pipe(Prompt, generation_config=generation_config)
    generated_text = Results[0]['generated_text']

    # Keep only what follows the first closing instruction tag, if present.
    index_end = generated_text.find(E_INST)
    if index_end != -1:  # something has been found :D
        substring = generated_text[index_end + len(E_INST):].strip()
    else:  # no tag in the output :(
        substring = generated_text.strip()

    print("*" * 20, end=' ')
    print(" << Question >> ", end=' ')
    print("*" * 20)
    print(Question)
    print("*" * 20, end=' ')
    print(" << Response >> ", end=' ')
    print("*" * 20)
    print(substring)
    


In [None]:
# Question 1: supplier/consumer relationships and the country they are located in.
Ask( Question1 , Scenario1 )

In [None]:
# Question 2: HHI of Lithium Ore in the described environment.
Ask( Question2 , Scenario1 )

In [None]:
# Question 3: Lithium Hydroxide yield from one ton of Lithium Ore.
Ask( Question3 , Scenario1 )

In [None]:
# Question 4: brief report of the supplier/consumer interactions.
Ask( Question4 , Scenario1 )

In [None]:
# with torch.no_grad(): # Disabling gradient calculation is useful for inference, when you are sure that you will not call Tensor.backward(). It will reduce memory consumption for computations that would otherwise have requires_grad=True
    # inputs = {k: v.to('cpu') for k, v in inputs.items()}
    # outputs = model.generate( input_ids = inputs['input_ids'] , max_new_tokens = 100 )