In [24]:
import os, json, time
import gc
from IPython.display import display, Markdown 
from dotenv import load_dotenv

In [2]:
class CFG:
    OFFLINE = False #True # for Test offline environment
    USE_LLAMA3 = False # for GPU version
    USE_GAMMA = True # for GPU version
    USE_GEMMA2 = False # for GPU version only 
    TASK_GEN = True # for generative Text output task (suitable for RAG project)
    model1 = "meta-llama/Meta-Llama-3-8B-Instruct"  # llama3 8B
    model2 =  "google/gemma-1.1-2b-it" #  gemma 2B
    model3 = "google/gemma-7b-it"# gemma 7B
    model4 =  "google/gemma-2-9b-it" # gemma 2 9B
    model5 =  'yentinglin/Llama-3-Taiwan-8B-Instruct' # Chinese version of llama3
    model6 = 'Qwen/Qwen-7B' # support Chinese version 
    model7 = "THUDM/chatglm-6b" # support Chinese version
    embedModel1 = 'intfloat/multilingual-e5-small' # for embedding model support chinese
    embedModel2 = "all-MiniLM-L6-v2"
    embedModel3 = "BAAI/bge-base-en-v1.5" # for embedding model support chinese
    embedModel4 = "BAAI/bge-m3" # for multilingual embedding model
    


    FEW_SHOT_TEST= False#True
    USE_RAG = False#False#False #True#True , in this project, prefer use fine tuning for p
    USE_WANDB = True#True # for  LLM evalution and debug , track fine tuning performance
    USE_TRULENS = False # for LLM evalution For RAG prefer 
    USE_DEEPEVAL = False # for LLM evalution   (require openAI API key)
    USE_TRAIN =  True #True #False#True Much be use GPU for Training 
    loggingSteps= 10#100 #100, #20, #5,#10,
    maxTrainData = 200#3500#5000 #10000#5000 #10000
    maxEvalData = 20#100 # 20 
    maxToken=  512#768#512#768 # 512 for test only

In [3]:
import numpy as np
import pandas as pd
import transformers
import torch
from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig,
                         AutoModelForCausalLM,
                         TrainingArguments)

from langchain_community.document_loaders import (TextLoader,
                                                PyPDFLoader)

from langchain.prompts.prompt import  PromptTemplate
from langchain_community.vectorstores import (FAISS, 
                                              Chroma 
                                              )

from langchain_text_splitters import (RecursiveCharacterTextSplitter,
                                      CharacterTextSplitter ,
                                       SentenceTransformersTokenTextSplitter)   


from langchain.embeddings import HuggingFaceEmbeddings
from datasets import Dataset, DatasetDict, load_dataset


import evaluate
import trulens

2024-07-21 14:59:26.752519: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-21 14:59:26.973749: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-21 14:59:27.084258: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-21 14:59:27.085037: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-21 14:59:27.250704: I tensorflow/core/platform/cpu_feature_gua

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
def clearMemory():
    for _ in range(5):
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(0.3)

### Get HuggingFace Hub Access for download model

In [6]:
load_dotenv()
huggingfaceToken = os.getenv("HuggingFace") #get huggeface token from .env file

In [7]:
if CFG.USE_WANDB:
    # train report to  W&B tool
    import wandb
    reportTo= "wandb"
    my_secret = os.getenv("wandb_api_key") 
    wandb.login(key=my_secret) # login 
else: 
    reportTo = "none"# None

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## LLM Model Initialization

In [8]:
# Quantized Config for GPU support only
bnb_config = BitsAndBytesConfig(
        load_in_4bit = True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True # Activate nested quantization for 4-bit base models (double quantization)

)


In [9]:
# tokenizer = AutoTokenizer.from_pretrained(CFG.model2, token=huggingfaceToken)

In [10]:
if device.type == "cuda": # use 7b/8b/9b model gain performance
    if CFG.USE_LLAMA3:
        modelSel = CFG.model1
        llmModel = "llama3_8b"
        
    elif CFG.USE_GEMMA2:
        modelSel = CFG.model4
        llmModel = "gemma2_9b"
    
    elif CFG.USE_GAMMA:
        modelSel = CFG.model3
        llmModel = "gemma_7b"
    else:
        modelSel = CFG.model2
        llmModel = 'gemma_2b'
        
    if CFG.TASK_GEN:
        model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto",  
                                                 quantization_config= bnb_config ,
                                                 token=huggingfaceToken)

    else:
        model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto",  
                                                 quantization_config= bnb_config, token=huggingfaceToken)
    tokenizer = AutoTokenizer.from_pretrained(modelSel, token=huggingfaceToken) # inital tokenizer
    tokenizer.padding_side = "right"
    

else: # for cpu select smaller model
    modelSel = CFG.model2
    llmModel = 'gemma_2b'
    if CFG.TASK_GEN:
        model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto", token=huggingfaceToken)

    else:
        model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto", token=huggingfaceToken)

    tokenizer = AutoTokenizer.from_pretrained(modelSel, token=huggingfaceToken) # inital tokenizer
    tokenizer.padding_side = "right"
    

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [11]:
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 3072, padding_idx=0)
    (layers): ModuleList(
      (0-27): 28 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=3072, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=24576, bias=False)
          (down_proj): Linear4bit(in_features=24576, out_features=3072, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
  

In [12]:
llmModel


'gemma_7b'

# Prompt Engineering

In [14]:

templatePrompt1 = """Question: {question}.\nOnly require given final result in JSON format with key 'answer'
            """
templatePrompt2 = "Answer the user Question.\n###\n{format_instructions}\n###\nQuestion: {query}\n"

"Question: {question}.\nOnly require given final result in JSON format with key 'answer'\n            "

## Generate LLM response

In [18]:
def generateResponse(query, maxOutToken = 512):
    """
    Direct send message to LLM model, get response
    """
    inputIds = tokenizer(query, return_tensors="pt").to(device)
    response = model.generate(**inputIds,
                              do_sample=True,
                              top_p=0.95,
                              top_k = 3,
                              temperature=0.5,
                              max_new_tokens= maxOutToken,
                             )
    return tokenizer.decode(response[0][len(inputIds["input_ids"]):], skip_special_tokens = True)
    
    

## simple parser for extract data

In [42]:
import re
from  json.decoder import JSONDecodeError
if CFG.TASK_GEN:

    def isInteger(text):
        try:
            if int(text) >= 0:
                return True
            else:
                return False
        except ValueError:
            return False

    def llmJSONparser(txt, key="answer"):
        """
        try to get answer from LLM response , expect in JSON format, 
        """
        try:
            subText = txt.split("{") # split several {} in list 
            for txtSeg in subText: # loop in list to find answer
                end = txtSeg.find("}") # find end position in text segment
                sub = txtSeg[:end] #subsring with {} context
                print(sub)
                temp = sub.replace("*", "") # remove * symbol
                temp = temp.replace("\"", "") # reomve \" symbol
                temp = temp.lower() # convert to lower case
                answerloc = temp.find(key) # find key word "answer" position
                if answerloc != -1:
                    print(f"find answer location : {answerloc}")
                    newTxt = temp[answerloc:] # substring start answer
#                   print("Temp: ", temp)
                    subTxt = newTxt.split("\n")
                    #       print(subTxt)
                    rel =subTxt[0][len(key):].strip() # get answer value with remove space
                    rel= rel.replace(',', '') # remove , symbol
                    print(rel)
                    return rel
                
            return None # can't find answer
        except :
            print(f"""Error LLM JSON parser input txt {txt}""" )
            return None
        return None


    def getLLMAnswerParser(txt, key="answer:"):
        """
        when json parser failure, seem answer not JSON format, 
        use "answer" for key word search final answer 
        """
         # find answer  
        temp = txt.replace("*", "") # remove * symbol
        temp = temp.replace("\"", "") # reomve "" symbol
        temp = temp.lower() # convert to lower case
        # find answer key word
        start = temp.find(key)
        print(f"Start loc: {start}")
        subStr = temp[start:]
        if start != -1:
            subTxt = subStr.split("\n")
           #print(subTxt)
            rel =subTxt[0][len(key):].strip() # get answer value with remove space
            rel= rel.replace(',', '') # remove , symbol
            print(rel)
            return rel
    
        print(subStr)
        return None

## Add parser  to control extreact data from  LLM Structure Output

In [30]:
from langchain_core.output_parsers import (StrOutputParser, 
                                           JsonOutputParser,
                                           PydanticOutputParser,
                                          )
# for LLM structure output
from langchain_core.pydantic_v1 import BaseModel, Field, validator
# from pydantic import BaseModel, Field

### Test LLM Model

In [26]:
%%time
ret =generateResponse("What is Machine Learning?" , maxOutToken=256)

CPU times: user 6.13 s, sys: 0 ns, total: 6.13 s
Wall time: 6.13 s


In [27]:
print(ret)

What is Machine Learning?

Machine learning is a field of computer science that enables computers to learn from data, identify patterns, and make predictions. It involves teaching computers to learn from experience, rather than being explicitly programmed.

**Key Concepts:**

* **Learning:** The process of a computer acquiring knowledge from data.
* **Algorithm:** A set of rules or instructions that guide the learning process.
* **Model:** A representation of the learned knowledge that can be used to make predictions or decisions.
* **Data:** The raw material that is used for learning.
* **Supervised Learning:** Learning from labeled data, where the computer is shown examples and learns from them.
* **Unsupervised Learning:** Learning from unlabeled data, where the computer discovers patterns without any examples.

**Types of Learning:**

* **Supervised Learning:** Involves learning from labeled data, where the computer is shown examples and learns from them.
* **Unsupervised Learning:

In [28]:
display(Markdown(ret)) # display in

What is Machine Learning?

Machine learning is a field of computer science that enables computers to learn from data, identify patterns, and make predictions. It involves teaching computers to learn from experience, rather than being explicitly programmed.

**Key Concepts:**

* **Learning:** The process of a computer acquiring knowledge from data.
* **Algorithm:** A set of rules or instructions that guide the learning process.
* **Model:** A representation of the learned knowledge that can be used to make predictions or decisions.
* **Data:** The raw material that is used for learning.
* **Supervised Learning:** Learning from labeled data, where the computer is shown examples and learns from them.
* **Unsupervised Learning:** Learning from unlabeled data, where the computer discovers patterns without any examples.

**Types of Learning:**

* **Supervised Learning:** Involves learning from labeled data, where the computer is shown examples and learns from them.
* **Unsupervised Learning:** Involves learning from unlabeled data, where the computer discovers patterns without any examples.
* **Reinforcement Learning:** Involves learning through trial and error, where the computer learns by interacting with its environment.

**Applications:**

Machine learning has a wide range of applications in various fields, including

In [45]:
%%time
query = "What is Machine Learning?"
newPrompt = PromptTemplate(template=templatePrompt1,
                           input_variables=["question"])
finalPrompt = newPrompt.format(
                question=query    
            )
rel =generateResponse(finalPrompt, maxOutToken=1024)
# jsonTxt = getLLMAnswerParser(rel, key="answer")
# print(f"Question : {query}\nResponse Answer: {jsonTxt}")

CPU times: user 885 ms, sys: 25.4 ms, total: 911 ms
Wall time: 909 ms


In [46]:
print(rel)

Question: What is Machine Learning?.
Only require given final result in JSON format with key 'answer'
            {
                "answer": "Machine learning is a field of computer science that enables computers to learn from data, identify patterns, and make predictions."
            }


In [47]:
clearMemory()

In [None]:
## Prepare RAG 