In [8]:
OFFLINE = True

In [None]:
import os, json, time
import gc
from IPython.display import display, Markdown

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import transformers
import torch
from transformers import AutoTokenizer,BitsAndBytesConfig,AutoModelForCausalLM, TrainingArguments
from langchain_community.document_loaders import TextLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import ConfigurableField
from langchain_community.vectorstores import FAISS, Chroma
# Text embedding / text splitter for RAG
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from datasets import Dataset, DatasetDict, load_dataset

# Adv RAG library 

# for evaluating the LLM
import evaluate # require online 
# from deepeval.metrics import GEval
# from deepeval.test_case import LLMTestCase
# from deepeval.metrics import AnswerRelevancyMetric, ContextualPrecisionMetric , ContextualRelevancyMetric, ContextualRecallMetric
import pytest
# import trulens

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

In [7]:
def clearMemory(rounds=5, delay=0.3):
    """
    Release cached CUDA memory and run Python garbage collection.

    The release cycle is repeated several times with a short pause after each
    pass. Calling it with no arguments behaves exactly as before (5 passes,
    0.3 s pause each).

    Args:
        rounds: number of release passes to run.
        delay: seconds to sleep after each pass; 0 disables the pause.
    """
    for _ in range(rounds):
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(delay)

In [None]:
# Feature switches for this notebook run.
FEW_SHOT_TEST = False
USE_RAG = True
USE_TRULENS = False    # LLM evaluation with TruLens
USE_DEEPEVAL = False   # LLM evaluation with DeepEval (requires an OpenAI API key)
USE_INFER = False      # submission prediction only, no model testing

# W&B (LLM evaluation/debugging, fine-tuning metrics) is only usable online.
USE_WANDB = False
USE_WANDB = (not OFFLINE) and USE_WANDB

# Fine-tuning requires a GPU, so it is switched off on CPU-only sessions.
USE_TRAIN = True
USE_TRAIN = device.type != "cpu" and USE_TRAIN

if not USE_WANDB:
    # No experiment tracker: the trainer reports nowhere.
    reportTo = "none"
else:
    # Report training runs to W&B, logging in with the key stored in Kaggle secrets.
    import wandb
    from kaggle_secrets import UserSecretsClient
    reportTo = "wandb"
    user_secrets = UserSecretsClient()
    my_secret = user_secrets.get_secret("wandb_api_key")
    wandb.login(key=my_secret)
    

In [None]:
if USE_TRULENS:
    from trulens_eval import Tru
    tru = Tru()
    tru.reset_database()

In [None]:
reportTo


In [None]:
device.type


In [None]:
if device.type == "cuda" and USE_TRAIN == True: #requred GPU support
    # for LoRA fine tuning
    from trl import SFTTrainer
    from peft import LoraConfig, PeftModel

In [None]:
# Input file locations; every path is an absolute path under /kaggle/input.

# "ai-mathematical-olympiad-prize" input: sample submission, train and test CSVs.
sampleSubmitFile = "/kaggle/input/ai-mathematical-olympiad-prize/sample_submission.csv"
trainFile = "/kaggle/input/ai-mathematical-olympiad-prize/train.csv"
testFile = "/kaggle/input/ai-mathematical-olympiad-prize/test.csv"
# "math-qsa-dataset" input (CSV).
mathQSATrainFile = "/kaggle/input/math-qsa-dataset/train.csv"
mathQSATestFile = "/kaggle/input/math-qsa-dataset/test.csv"
# GSM8K "main" split (parquet).
gsm8kTrainFile = "/kaggle/input/gsm8k-grade-school-math-8k-dataset-for-llm/gsm8k/main/train-00000-of-00001.parquet"
gsm8kTestFile = "/kaggle/input/gsm8k-grade-school-math-8k-dataset-for-llm/gsm8k/main/test-00000-of-00001.parquet"
# MathQA (JSON).
mathQATrainFile = "/kaggle/input/math-qa-for-aqua-rat-dataset/MathQA/train.json"
mathQATestFile = "/kaggle/input/math-qa-for-aqua-rat-dataset/MathQA/test.json"
# Orca math word problems 200k (parquet, train split only).
orcaMath200kFile = "/kaggle/input/microsoftorca-math-word-problems-200k/orca-math-word-problems-200k/data/train-00000-of-00001.parquet"

In [None]:
clearMemory()


In [None]:
trainDF = pd.read_csv(trainFile)
trainDF

In [None]:
trainDF.describe()


In [None]:
trainDF.iloc[7]["answer"]


In [None]:
testDF = pd.read_csv(testFile)
testDF.info()

In [None]:
trainQSADF = pd.read_csv(mathQSATrainFile)
trainQSADF.head()

In [None]:
trainQSADF["problem"][0]


In [None]:
trainQSADF.describe()


In [None]:
trainQADF= pd.read_json(mathQATrainFile)
trainQADF.head()

In [None]:
trainQADF["options"][5]


In [None]:
trainGSM8KDF =pd.read_parquet(gsm8kTrainFile)
trainGSM8KDF.head()


In [None]:
trainGSM8KDF.info()


In [None]:
trainGSM8KDF.iloc[0]["answer"]


In [None]:
testGSM8KDF = pd.read_parquet(gsm8kTestFile)
testGSM8KDF.head()

In [None]:
testGSM8KDF["answer"][12]


In [None]:
trainOracMath200kDF = pd.read_parquet(orcaMath200kFile)
trainOracMath200kDF.head()

In [None]:
trainOracMath200kDF["answer"].iloc[1]


In [None]:
trainDF["problem"][9]


In [None]:
## cleaning data set
# trainDF["problem"] = trainDF["problem"].str.replace("$", '')
# trainDF["problem"] = trainDF["problem"].str.replace("\\vert", '|')
# trainDF["problem"] = trainDF["problem"].str.replace("\\left", '')
# trainDF["problem"] = trainDF["problem"].str.replace("\\right", '')
# trainDF["problem"] = trainDF["problem"].str.replace("\\mathbb", '')

In [None]:
print(trainDF["problem"][1])


In [None]:
print(trainDF["answer"][1])


In [None]:
# --- Candidate model checkpoints (local Kaggle model directories) ---
modelName1 = "/kaggle/input/gemma/transformers/2b-it/3"
modelName2 = "/kaggle/input/gemma/transformers/7b-it/3"  # memory hungry: can run out of memory on CPU or GPU
modelName3 = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"
USE_LLAMA3 = False  # GPU runs only: pick the Llama 3 checkpoint instead of Gemma 7b

# --- Text generation settings ---
# NOTE(review): the generate* helpers visible in this section hard-code their own
# sampling values rather than reading these names; confirm where these are used.
max_length = 512
num_beams = 3
temperature = 0.2
top_k = 2
top_p = 0.95
do_sample = True

# --- 4-bit quantization settings passed to from_pretrained on GPU ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,  # nested (double) quantization for 4-bit base models
)

In [None]:
device.type


In [None]:
# Step 1: choose the checkpoint. CPU sessions use the small Gemma 2b model;
# GPU sessions use a larger model (Llama 3 8b or Gemma 7b).
if device.type != "cuda":
    modelSel, llmModel = modelName1, "gemma_2b"
elif USE_LLAMA3:
    modelSel, llmModel = modelName3, "llama3_8b"
else:
    modelSel, llmModel = modelName2, "gemma_7b"

# Step 2: load the model. The quantization config is only passed on GPU.
if device.type == "cuda":
    model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto", quantization_config=bnb_config)
else:
    model = AutoModelForCausalLM.from_pretrained(modelSel, device_map="auto")

# Step 3: the tokenizer always comes from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(modelSel)

In [None]:
model


In [None]:
llmModel


In [None]:
def generateResponse(query, maxOutToken=256):
    """
    Send a raw prompt to the loaded model and return only the generated text.

    Args:
        query: prompt text, passed to the tokenizer unchanged.
        maxOutToken: maximum number of new tokens to generate.

    Returns:
        The decoded completion, with the prompt tokens and special tokens removed.
    """
    inputIds = tokenizer(query, return_tensors="pt").to(device)
    response = model.generate(**inputIds,
                              do_sample=True,
                              top_p=0.95,
                              top_k=2,
                              temperature=0.2,
                              max_new_tokens=maxOutToken,
                              )
    # input_ids is a 2-D (batch, sequence) tensor, so the prompt length is its
    # second dimension. len(input_ids) is the batch size (1) and would strip
    # only the first token, leaving the prompt in the returned text.
    promptLen = inputIds["input_ids"].shape[1]
    return tokenizer.decode(response[0][promptLen:], skip_special_tokens=True)
    

In [None]:
def generateReponseInst(promptTemp, query, maxOutToken=256):
    """
    Ask the model a question under an instruction template, requesting JSON output.

    The prompt is the template, the question, and an instruction to answer in
    JSON with the keys 'answer' and 'explanation'.

    Args:
        promptTemp: instruction/template text placed before the question.
        query: the question text.
        maxOutToken: maximum number of new tokens to generate.

    Returns:
        The decoded completion with the prompt tokens removed. Special tokens
        are kept (skip_special_tokens=False), so markers such as "<eos>" can
        appear in the result.
    """
    # Same prompt text as before, written with explicit escapes so the trailing
    # "space, newline, four spaces" cannot be lost to whitespace trimming.
    prompt = (
        f"{promptTemp}\nQuestion: {query}\n"
        "### Instruction: Given Answer in JSON format with key 'answer' and 'explanation' ### \n    "
    )
    inputIds = tokenizer(prompt, return_tensors="pt").to(device)
    response = model.generate(**inputIds,
                              do_sample=True,
                              top_p=0.95,
                              top_k=2,
                              temperature=0.2,
                              max_new_tokens=maxOutToken,)
    # input_ids is a 2-D (batch, sequence) tensor; the prompt length is its second
    # dimension. len(input_ids) is the batch size (1) and would leave the prompt
    # in the returned text.
    promptLen = inputIds["input_ids"].shape[1]
    return tokenizer.decode(response[0][promptLen:], skip_special_tokens=False)

In [None]:
def generateReponseRAG(promptTemp, ragContext, query, maxOutToken=256):
    """
    Ask the model a question with retrieved context included in the prompt.

    Args:
        promptTemp: instruction/template text placed first in the prompt.
        ragContext: retrieved context as an iterable of strings, joined with
            newlines. A single string is also accepted and used as-is.
        query: the question text.
        maxOutToken: maximum number of new tokens to generate.

    Returns:
        The decoded completion, with the prompt tokens and special tokens removed.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(ragContext, str):
        ragContext = [ragContext]
    info = "\n".join(ragContext)
    prompt = f"""
    {promptTemp}\n
    Question: {query}\n
    Information: {info}\n
    Answer:
    """
    inputIds = tokenizer(prompt, return_tensors="pt").to(device)
    response = model.generate(**inputIds,
                              do_sample=True,
                              top_p=0.95,
                              top_k=2,
                              temperature=0.2,
                              max_new_tokens=maxOutToken,)
    # input_ids is a 2-D (batch, sequence) tensor; the prompt length is its second
    # dimension. len(input_ids) is the batch size (1) and would leave the prompt
    # in the returned text.
    promptLen = inputIds["input_ids"].shape[1]
    return tokenizer.decode(response[0][promptLen:], skip_special_tokens=True)

In [None]:
import re


In [None]:
from json.decoder import JSONDecodeError
def isInteger(text):
    """
    Report whether `text` converts to a non-negative integer.

    Args:
        text: value to check, normally a string taken from model output.

    Returns:
        True when int(text) succeeds and the result is >= 0; False for negative
        numbers and for anything int() cannot convert.
    """
    try:
        return int(text) >= 0
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text; TypeError: None or other non-numeric
        # objects; OverflowError: float infinity.
        return False
    

def llmJSONParser1(txt):
    """
    Parse the first Markdown code fence (``` ... ```) of a model reply as JSON.

    "<eos>" markers are removed first. Only a leading language tag ("json") on
    the fenced block is dropped; the word "json" inside the payload itself is
    left untouched.

    Args:
        txt: raw model output.

    Returns:
        The decoded JSON value, or None when there is no fenced block, the
        block is not valid JSON, or `txt` is not a string.
    """
    try:
        segments = txt.replace("<eos>", "").split("```")
        if len(segments) < 2:
            # No code fence in the reply.
            print(f"""Error LLM JSON parser input txt {txt}""")
            return None
        payload = segments[1].strip()
        # Strip only the fence's language tag. A global replace would also
        # delete "json" from inside string values.
        if payload[:4].lower() == "json":
            payload = payload[4:]
        return json.loads(payload)
    except JSONDecodeError as e:
        print("Error LLM JSON parser", e)
        return None
    except Exception:
        # Best-effort parser: any other failure (e.g. non-string input) means "no answer".
        print(f"""Error LLM JSON parser input txt {txt}""")
        return None


def llmJSONParser2(txt):
    """
    Parse the first JSON object embedded in free-form model output.

    Decoding starts at the first "{" and lets the JSON decoder find the end of
    the object, so nested objects and braces inside string values are handled.
    Text after the object is ignored.

    Args:
        txt: raw model output.

    Returns:
        The decoded JSON object, or None when there is no "{", the text from
        the first "{" is not valid JSON, or `txt` is not a string.
    """
    try:
        start = txt.find("{")
        if start == -1:
            print("Error LLM JSON parser", "no '{' found")
            return None
        # raw_decode parses one complete JSON value starting at `start` and
        # ignores whatever follows it. Cutting at the first "}" would truncate
        # nested objects.
        parsed, _ = json.JSONDecoder().raw_decode(txt, start)
        return parsed
    except JSONDecodeError as e:
        print("Error LLM JSON parser", e)
        return None
    except Exception:
        # Best-effort parser: any other failure (e.g. non-string input) means "no answer".
        print(f"""Error LLM JSON parser input txt {txt}""")
        return None
    
def llmJSONParser3(txt):
    '''
    Extract a non-negative integer answer from model output without the json library.

    The text is split on "{". In each piece, the part before the first "}" (or
    the whole piece when no "}" follows) is lower-cased and stripped of "*" and
    double quotes. If it contains "answer:", the rest of that line, with commas
    removed, is taken as the candidate value.

    Args:
        txt: raw model output.

    Returns:
        The answer as a string of digits, or None when no piece yields a value
        accepted by isInteger, or `txt` is not a string.
    '''
    try:
        for segment in txt.split("{"):
            closing = segment.find("}")
            # find() returns -1 for an unclosed object; slicing with -1 would
            # silently drop the last character (e.g. "42" -> "4").
            body = segment if closing == -1 else segment[:closing]
            normalized = body.replace("*", "").replace("\"", "").lower()
            answerLoc = normalized.find("answer:")
            if answerLoc == -1:
                continue
            # The value is whatever follows "answer:" on the same line.
            firstLine = normalized[answerLoc:].split("\n")[0]
            value = firstLine[len("answer:"):].strip().replace(",", "").strip()
            if isInteger(value):
                return value
        return None  # no piece contained a usable answer
    except Exception:
        # Best-effort parser: any failure (e.g. non-string input) means "no answer".
        print(f"""Error LLM JSON parser input txt {txt}""")
        return None
    