# LLM Evaluation

### Libraries

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.evaluation import load_evaluator
from langchain.vectorstores import FAISS
import os, yaml, time, mlflow, pandas as pd

## Configuration

### Parameters

In [None]:
# Read config.yaml
config_path = '/jupyterlab/config/config.yaml'
# Explicit encoding so the result does not depend on the platform's locale default.
with open(config_path, 'r', encoding='utf-8') as file:
    # safe_load returns None for an empty document; fall back to an empty
    # mapping so later `config.get(...)` calls do not fail on None.
    config = yaml.safe_load(file) or {}

# Hyperparameters for one evaluation run. Later cells read these values and
# the whole mapping is passed to mlflow.log_params.
params = dict(
    # Generation model (directory name appended to '/jupyterlab/') and its settings.
    main_model_id="GritLM-7B",
    main_model_temperature=0.7,
    main_model_max_new_tokens=200,
    # Embedding model (directory name appended to '/jupyterlab/').
    embeddings_model_id="mxbai-embed-large-v1",
    normalize_embeddings=False,
    # Retrieval and chunking settings.
    search_k=1,
    chunk_size=1024,
    chunk_overlap=128,
    # Prompt with two positional slots: the retrieved context, then the
    # question. Spelled with explicit "\n" escapes so the trailing blanks of
    # the original multi-line literal stay visible.
    template=(
        "\n"
        "    DOCUMENT: {} \n"
        "    \n"
        "    QUESTION: {} \n"
        "    \n"
        "    INSTRUCTIONS: \n"
        "    Answer the users QUESTION using the DOCUMENT text above. Keep your answer ground in the facts of the DOCUMENT.\n"
        "    "
    ),
)

### Pretrained LLM

##### Model load

With quantization

In [None]:
# 4-bit quantization
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4")
# 8-bit quantization
# bnb_config = BitsAndBytesConfig(load_in_8bit=True, activation_dtype="int8")

# Model and tokenizer must be read from the same local directory. The '/'
# after '/jupyterlab' is required: without it the model id is glued directly
# onto the directory name and points at a different (non-existent) path than
# the one the tokenizer uses.
model = AutoModelForCausalLM.from_pretrained('/jupyterlab/' + params['main_model_id'], quantization_config=bnb_config, device_map = 'cuda')
tokenizer = AutoTokenizer.from_pretrained('/jupyterlab/' + params['main_model_id'], device_map = 'cuda', trust_remote_code=True)

Without quantization

In [None]:
# Load the model without a quantization config, plus its tokenizer, from the
# same local directory. Running this cell rebinds `model` and `tokenizer`.
model = AutoModelForCausalLM.from_pretrained(
    f"/jupyterlab/{params['main_model_id']}",
    device_map='cuda',
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    f"/jupyterlab/{params['main_model_id']}",
    device_map='cuda',
    trust_remote_code=True,
)

##### Pipeline generation

In [None]:
# Generation settings are given to the transformers pipeline itself so that
# they are applied on every call. Temperature only influences decoding when
# sampling is enabled, and 0 is not a valid sampling temperature, so a
# non-positive value falls back to the pipeline's default (non-sampling)
# decoding.
generation_kwargs = {'max_new_tokens': params['main_model_max_new_tokens']}
if params['main_model_temperature'] > 0:
    generation_kwargs.update(do_sample=True, temperature=params['main_model_temperature'])

pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, **generation_kwargs)

# Still handed to the LangChain wrapper as before.
# NOTE(review): with a ready-made pipeline these values are not used for
# generation, which is why the temperature is also set on `pipe` above.
model_kwargs = {
    'temperature': params['main_model_temperature'],
    'device' : 0
}

# LangChain wrapper around the pipeline; used for answering and as the judge
# LLM of the criteria evaluator.
llm = HuggingFacePipeline(
    pipeline = pipe,
    model_kwargs=model_kwargs
)

### Manual processing

In [None]:
# Load the Markdown manual referenced by config.yaml and split it into chunks.
new_manual_path = config.get('new_manual_path')
if not new_manual_path:
    # Fail here with an explicit message rather than letting the loader
    # receive None.
    raise ValueError(f"'new_manual_path' is missing or empty in {config_path}")

loader = UnstructuredMarkdownLoader(new_manual_path)
document = loader.load()

# Split on newlines into chunks sized by params['chunk_size'], overlapping by
# params['chunk_overlap'].
splitter = CharacterTextSplitter(
    chunk_size=params['chunk_size'],
    chunk_overlap=params['chunk_overlap'],
    separator='\n',
)
documents = splitter.split_documents(document)

### Embeddings model

In [None]:
# Embedding model shared by the FAISS index and the embedding-distance
# evaluator. Note that this rebinds `model_kwargs`, which the pipeline cell
# also assigns.
model_kwargs = dict(device=0)
encode_kwargs = dict(normalize_embeddings=params['normalize_embeddings'])
embeddings = HuggingFaceEmbeddings(
    model_name=f"/jupyterlab/{params['embeddings_model_id']}",
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
)

### Vector DB

In [None]:
# Build the FAISS vector index from the manual chunks, embedding them with `embeddings`.
db = FAISS.from_documents(documents, embeddings)

## Performance evaluation

### Dataframe

In [None]:
# Evaluation set stored as JSON Lines (one record per line). Later cells read
# its 'Question' and 'Answer' columns.
jsonl_path = '/jupyterlab/datasets/test_data.jsonl'
df = pd.read_json(path_or_buf=jsonl_path, lines=True)

### Query

In [None]:
def query(question):
    """Answer `question` with the LLM, using retrieved manual chunks as context.

    Args:
        question: Question text; also used as the similarity-search query.

    Returns:
        A tuple ``(answer, docs, duration)``: the generated text without the
        echoed prompt, the retrieved documents, and the seconds spent inside
        ``llm.invoke``.
    """
    # FAISS.similarity_search takes the number of results as `k`; `topk` is
    # not one of its parameters, so params['search_k'] previously had no effect.
    docs = db.similarity_search(question, k=params['search_k'])
    context = '\n'.join(doc.page_content for doc in docs)

    # Named `prompt` rather than `query` so the function's own name is not
    # shadowed inside its body.
    prompt = params['template'].format(context, question)

    # perf_counter is a monotonic clock, so the duration cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()
    answer = llm.invoke(prompt)
    duration = time.perf_counter() - start

    # Depending on the pipeline configuration the output either starts with
    # the prompt or contains only the completion. Strip the prompt only when
    # it is really there, so the beginning of an answer is never cut off.
    if answer.startswith(prompt):
        answer = answer[len(prompt):]

    return answer, docs, duration

# Run every question through `query` and spread the returned
# (answer, documents, seconds) triple over three new columns.
query_results = df['Question'].apply(lambda question: pd.Series(query(question)))
df[['Prediction', 'Sources', 'Time']] = query_results

### String evaluator

In [None]:
def evaluation(row, evaluator):
    """Evaluate one row's prediction with the given string evaluator.

    Args:
        row: Mapping-like record with 'Question', 'Answer' and 'Prediction'
            entries.
        evaluator: Object exposing ``evaluate_strings`` with ``input``,
            ``prediction`` and ``reference`` keyword arguments.

    Returns:
        Whatever ``evaluator.evaluate_strings`` returns for this row.
    """
    return evaluator.evaluate_strings(
        input=row['Question'],
        reference=row['Answer'],
        prediction=row['Prediction'],
    )

##### Embedding distance

In [None]:
# "embedding_distance" evaluator, using the same embedding model as the
# vector index. Each row's 'score' entry is stored per question.
evaluator = load_evaluator("embedding_distance", embeddings=embeddings)
df['Embedding Distance'] = df.apply(
    lambda row: evaluation(row, evaluator)['score'],
    axis=1,
)

##### String distance

In [None]:
# "string_distance" evaluator. Each row's 'score' entry is stored per
# question; this rebinds the shared `evaluator` name.
evaluator = load_evaluator("string_distance")
df['String Distance'] = df.apply(
    lambda row: evaluation(row, evaluator)['score'],
    axis=1,
)

##### Criteria Evaluation

In [None]:
# Criteria evaluator judging "relevance", with the same LLM acting as judge.
evaluator = load_evaluator("labeled_criteria", llm=llm, criteria="relevance")

# Expand each result dict into columns, then map every result key to its
# dataframe column explicitly. Assigning the expanded frame positionally would
# silently depend on the order of the keys in the evaluator's output.
criteria_results = df.apply(lambda r: pd.Series(evaluation(r, evaluator=evaluator)), axis=1)
df['Criteria Reasoning'] = criteria_results['reasoning']
df['Criteria Value'] = criteria_results['value']
df['Criteria Score'] = criteria_results['score']

### Total metrics

In [None]:
# Totals over all evaluated questions; these are also logged to MLflow later.
sum_embedding_dist = df['Embedding Distance'].sum()
sum_string_dist = df['String Distance'].sum()
sum_criteria_score = df['Criteria Score'].sum()
sum_time = df['Time'].sum()

# Append a totals row. Values are placed by column name and every other column
# gets an empty string, so the row stays correct even if the dataframe has
# extra columns or a different column order than the original hard-coded
# 10-element list assumed.
# Note: each execution of this cell appends another row.
totals_by_column = {
    'Time': sum_time,
    'Embedding Distance': sum_embedding_dist,
    'String Distance': sum_string_dist,
    'Criteria Score': sum_criteria_score,
}
df.loc[len(df)] = [totals_by_column.get(column, "") for column in df.columns]

### GPU Usage

In [None]:
import GPUtil as GPU

# Query the GPUs once so the logged value and the printed report come from the
# same snapshot.
GPUs = GPU.getGPUs()
if not GPUs:
    # Explicit message instead of a bare IndexError on GPUs[0].
    raise RuntimeError("No GPU reported by GPUtil; cannot record GPU memory usage.")

# Memory used on the first GPU; logged to MLflow later as 'Size in MB'.
mem_gpu = GPUs[0].memoryUsed

for gpu in GPUs:
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))

### Final Dataframe

In [None]:
# Show the dataframe (including the appended totals row) as the cell output.
df

### Final Results

In [None]:
# Print question, reference answer and prediction for every row, with a blank
# line after each triple.
for question, answer, prediction in zip(df['Question'], df['Answer'], df['Prediction']):
    print(question)
    print(answer)
    print(prediction)
    print()

# MLflow

### Connection

In [None]:
### MLFlow server credentials

### Logs

In [None]:
import jsonlines

# Only the question, the reference answer and the prediction are uploaded.
mlflow_df = df[['Question', 'Answer', 'Prediction']]
jsonl_filename = "df.jsonl"

experiment_name = 'Optimisation of hyperparameters'

# Look the experiment up once and create it only when it does not exist yet.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    mlflow.create_experiment(name=experiment_name)
    experiment = mlflow.get_experiment_by_name(experiment_name)

try:
    # Temporary JSON Lines export of the predictions, one record per row.
    with jsonlines.open(jsonl_filename, mode='w') as writer:
        for index, row in mlflow_df.iterrows():
            writer.write(row.to_dict())

    with mlflow.start_run(experiment_id = experiment.experiment_id):
        mlflow.log_params(params)
        mlflow.log_metric('Total Embedding Distance', sum_embedding_dist)
        mlflow.log_metric('Total String Distance', sum_string_dist)
        mlflow.log_metric('Total Criteria Score', sum_criteria_score)
        mlflow.log_metric('Total Time', sum_time)
        mlflow.log_metric('Size in MB', mem_gpu)
        # NOTE(review): the second argument of log_artifact is the artifact
        # directory, so the file is stored as df.jsonl/df.jsonl - confirm
        # that this layout is intended.
        mlflow.log_artifact(jsonl_filename, "df.jsonl")
finally:
    # Remove the temporary export even when writing or logging fails.
    if os.path.exists(jsonl_filename):
        os.remove(jsonl_filename)

# Output Example

In [None]:
# Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap `text` to `width` columns while keeping its existing line breaks.

    Args:
        text: String to wrap; it is split on '\\n' and every piece is wrapped
            on its own.
        width: Maximum line width handed to ``textwrap.fill``.

    Returns:
        The wrapped pieces joined back together with '\\n'.
    """
    return '\n'.join(
        textwrap.fill(piece, width=width) for piece in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print a response's wrapped answer followed by the source of each document.

    Args:
        llm_response: Mapping with a 'result' string and a 'source_documents'
            iterable whose items expose ``metadata['source']``.
    """
    wrapped_answer = wrap_text_preserve_newlines(llm_response['result'])
    print(wrapped_answer)
    print('\nSources:')
    for document in llm_response["source_documents"]:
        print(document.metadata['source'])

In [None]:
# Use the first question of the evaluation set. It gets its own name so the
# `query` function is not overwritten by a string.
example_question = df['Question'].iloc[0]

print('-------------------Instructor Embeddings------------------\n')
print(example_question)

# No `qa_chain` is defined in this notebook, so the response is produced with
# the `query` helper and packed into the mapping process_llm_response reads.
example_answer, example_sources, _example_duration = query(example_question)
llm_response = {'result': example_answer, 'source_documents': example_sources}
process_llm_response(llm_response)