### Requirements

Embedding Models
- https://huggingface.co/sentence-transformers/all-mpnet-base-v2
- https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
- https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2

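[Article by Nils Reimers: "OpenAI GPT-3 Text Embeddings – Really a new state-of-the-art in dense text embeddings?"](https://medium.com/@nils_reimers/openai-gpt-3-text-embeddings-really-a-new-state-of-the-art-in-dense-text-embeddings-6571fe3ec9d9)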



In [None]:
!pip install langchain sentence_transformers tiktoken chromadb faiss-cpu pypdf InstructorEmbedding -U sentence-transformers

In [None]:
# !pip install accelerate bitsandbytes sentencepiece Xformers

In [None]:
# Install llama-cpp-python, passing -DLLAMA_CUBLAS=on to CMake and setting
# FORCE_CMAKE=1 for the pip build.
# NOTE(review): the package version is not pinned; confirm that the installed
# release still accepts the LLAMA_CUBLAS CMake option.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

### Setting up the Llama model

In [None]:
!wget https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q5_K_M.gguf

In [5]:
# List the working directory to check that the model file is present.
%ls

llama-2-13b-chat.Q5_K_M.gguf  [0m[01;34msample_data[0m/


In [6]:
from langchain.chains import LLMChain
from langchain.embeddings import LlamaCppEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp

In [7]:
# Disabled experiment: everything below is wrapped in a string literal, so none
# of it is executed. It sketches loading a tokenizer and an 8-bit model through
# transformers (LlamaTokenizer / LlamaForCausalLM).
"""
# tokenizer
import torch
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig, pipeline

tokenizer = LlamaTokenizer.from_pretrained("TheBloke/Llama-2-13B-chat-GGUF")

model = LlamaForCausalLM.from_pretrained("TheBloke/Llama-2-13B-chat-GGUF",
                                              load_in_8bit=True,
                                              device_map='auto',
                                              torch_dtype=torch.float16,
                                              low_cpu_mem_usage=True
                                              )"""

SyntaxError: ignored

In [None]:
# Disabled experiment: kept as a string literal so it is not executed. It wraps
# a transformers text-generation pipeline (built from `model` and `tokenizer`)
# in a LangChain HuggingFacePipeline named `local_llm`.
"""from transformers import pipeline
from langchain.llms import HuggingFacePipeline
import torch

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=1024,
    temperature=0,
    top_p=0.95,
    repetition_penalty=1.15
)

local_llm = HuggingFacePipeline(pipeline=pipe)"""

In [None]:
#print(local_llm('What is the capital of Japan?'))

In [None]:
# Prompt markers: instruction delimiters and system-prompt delimiters.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"

# Earlier, longer system prompt, kept for reference:
# DEFAULT_SYSTEM_PROMPT = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information"
DEFAULT_SYSTEM_PROMPT = "Answer the question by the document you have."

# System prompt enclosed in its own delimiters.
SYSTEM_PROMPT = f"{B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}"


In [None]:
def get_prompt(instruction):
    """Build a full prompt string for ``instruction``.

    Returns the concatenation ``B_INST + SYSTEM_PROMPT + instruction + E_INST``,
    i.e. the system prompt followed by the instruction, enclosed in the
    instruction markers.
    """
    return "".join((B_INST, SYSTEM_PROMPT, instruction, E_INST))

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

In [None]:
# Template with a single "{user_message}" placeholder, built with the same
# helper that assembles complete prompts.
prompt_template = get_prompt("{user_message}")
prompt_template

In [None]:
# Chain that fills `prompt_template` with the user message and sends it to the
# local Llama model. The model object in this notebook is named `llama`; it is
# created in the LlamaCpp cell further down, which has to be run before this one.
llm_chain = LLMChain(llm=llama, prompt=PromptTemplate.from_template(prompt_template))

In [None]:
# Smoke test: send a single user message through the chain.
llm_chain.run("Hai tere!")

In [8]:
# Disabled variant of the LlamaCpp setup: kept as a string literal so it is not
# executed. This version passes n_ctx=2048, f16_kv=True and the callback manager.
"""n_batch = 512
n_gpu_layers = 1
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Alternative: llama = LlamaCppEmbeddings
llama = LlamaCpp(model_path="llama-2-13b-chat.Q5_K_M.gguf",
                           n_batch = n_batch,
                           n_gpu_layers = n_gpu_layers,
                           n_ctx=2048,
                           f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
                           callback_manager=callback_manager,
                           verbose=True,
                           ) """

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [None]:
# Inference settings for the local GGUF model.
n_batch = 512
n_gpu_layers = 1
# Context window in tokens. It is set explicitly because LangChain's default is
# much smaller, and the retrieval chain below stuffs several document chunks
# into a single prompt.
n_ctx = 2048
# Streams generated tokens to stdout while the model is producing them.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Alternative: llama = LlamaCppEmbeddings
llama = LlamaCpp(
    model_path="llama-2-13b-chat.Q5_K_M.gguf",
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    n_ctx=n_ctx,
    temperature=0.75,
    max_tokens=2048,
    top_p=1,
    callback_manager=callback_manager,  # previously created but never passed in
    verbose=True,
)

### FAISS with Retriever

In [9]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

  from tqdm.autonotebook import trange


In [10]:
# Mount Google Drive at /content/gdrive (force_remount=True is passed to the mount call).
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
# Base folder inside the mounted drive; the document path below is built from it.
root_dir = "/content/gdrive/My Drive"

Mounted at /content/gdrive


In [11]:
# loader = TextLoader('.txt')
# Load the files matching "./*.pdf" under <root_dir>/Documents/ with PyPDFLoader.
loader = DirectoryLoader(f'{root_dir}/Documents/', glob="./*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

In [12]:
# Number of entries returned by loader.load().
len(documents)

15

In [13]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=200)
# before they are embedded and indexed.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunked_data = text_splitter.split_documents(documents)

### HuggingFace Embeddings

In [None]:
# Standalone demo: encode two example sentences directly with sentence-transformers.
# The resulting `model` / `embeddings` are not used by the retrieval cells below.
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings
sentences = ["This is an example sentence", "Each sentence is converted"]

# Alternative model that was tried:
#model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')
embeddings = model.encode(sentences)
#print(embeddings)

In [15]:
import torch
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on a local machine or a CPU-only runtime.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

# Embedding model used to index and query the document chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L12-v2",
    model_kwargs={"device": embedding_device},
)

In [None]:
# Build the FAISS index from the chunks, embedding them with `embedding_model`
# (the HuggingFaceEmbeddings instance created in the previous cell).
db = FAISS.from_documents(chunked_data, embedding_model)

# Sanity check: run one similarity search and show the first returned chunk.
query = "How many Use Case are there?"
docs = db.similarity_search(query)

print(docs[0].page_content)

In [None]:
# Retriever over the FAISS index, configured with k=5 in search_kwargs.
retriever = db.as_retriever(search_kwargs={"k": 5}) # 3, 5 or 10

In [None]:
# Retrieval QA chain: the "stuff" chain type combined with the FAISS retriever;
# the source documents are returned together with the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llama,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
)

In [None]:
## Cite sources

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    """Wrap every line of ``text`` to at most ``width`` columns.

    The text is split on newline characters, each piece is wrapped on its own
    with ``textwrap.fill``, and the pieces are joined with newlines again, so
    the line breaks already present in ``text`` are kept.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

def process_llm_response(llm_response):
    """Print the answer of a QA-chain response followed by its sources.

    Reads ``llm_response['result']`` (printed wrapped via
    ``wrap_text_preserve_newlines``) and ``llm_response['source_documents']``;
    for each source document, ``metadata['source']`` is printed on its own line.
    """
    answer = llm_response['result']
    print(wrap_text_preserve_newlines(answer))
    print('\n\nSources:')
    for doc in llm_response["source_documents"]:
        print(doc.metadata['source'])

In [None]:
# Disabled query run: kept as a string literal so it is not executed.
"""# Mit Tokenizer
query = "What is the project about?"
llm_response = qa_chain(query)
process_llm_response(llm_response)"""

In [None]:
# With callback manager
query = "What is the project about?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

# too slow:
# all-MiniLM-L6-v2 output:Based on the provided context, it appears that the project is focused on developing a proof-of-concept prototype for simulating learning behavior and generating synthetic data using generative agents. The specific use case being explored is debating in a group of simulations with different opinions, but the project may also involve other use cases such as learning analytics or simulating learning behavior of small groups. The goal is to evaluate and select appropriate OS LLMs for the use case and work with commonly used LLM techniques...

# TODO all-MiniLM-L12-v2 output
# TODO all-mpnet-base-v2 output


In [None]:
# With callback manager
query = "How many use cases are mentioned?"
llm_response = qa_chain(query)
process_llm_response(llm_response)

# all-MiniLM-L6-v2 Output:  Yes, there are three use cases mentioned in the context given. They are Use Case 1, Use Case 2, and Use Case 3. Yes, there are three use cases mentioned in the context given. They are Use Case 1, Use Case 2, and Use Case
# 3. Sources: /content/gdrive/My Drive/Documents/AWT_2023_LLM_Agents.pdf

# TODO all-MiniLM-L12-v2 output
# TODO all-mpnet-base-v2 output

In [None]:
# Inspect the chain's retriever: its search type and the vector store behind it.
qa_chain.retriever.search_type , qa_chain.retriever.vectorstore

In [None]:
# Show the prompt template used by the LLM chain inside the combine-documents chain.
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)