In [1]:
# !pip install -q -U torch torchvision datasets transformers==4.45.1 tensorflow langchain playwright html2text sentence_transformers faiss-cpu
# !pip install -q accelerate peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7 # I did not install because my GPU only has 16 gb memory
# !pip install -U langchain-community # Install the missing langchain-community package

NOTE:
1. go to https://huggingface.co/, create or login. At the top right icon, click settings -> access tokens -> create new token (click all permissions). Copy and paste it to the login line.

2. if you get this error: `OSError: You are trying to access a gated repo.`, you need to go to https://huggingface.co/mistralai/Mistral-7B-v0.1 and accept usage terms 

In [2]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
#     BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
# from peft import LoraConfig, PeftModel

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline
from langchain.chains import LLMChain

# Load (NOT quantized) Mistral7b

In [3]:
#################################################################
# Tokenizer
#################################################################
from huggingface_hub import login

# Use your access token for read-only access
login(token="")
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

model_name='mistralai/Mistral-7B-Instruct-v0.1'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
# use_4bit = True

# Compute dtype for 4-bit base models
# bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
# bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
# use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
# compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=use_4bit,
#     bnb_4bit_quant_type=bnb_4bit_quant_type,
#     bnb_4bit_compute_dtype=compute_dtype,
#     bnb_4bit_use_double_quant=use_nested_quant,
# )

# Check GPU compatibility with bfloat16
# if compute_dtype == torch.float16 and use_4bit:
#     major, _ = torch.cuda.get_device_capability()
#     if major >= 8:
#         print("=" * 80)
#         print("Your GPU supports bfloat16: accelerate training with bf16=True")
#         print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
model = AutoModelForCausalLM.from_pretrained(
    model_name,
#     quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 7241732096
all model parameters: 7241732096
percentage of trainable model parameters: 100.00%


## Build Mistral text generation pipeline

In [5]:
text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=1000,
)

Device set to use cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 15.74 GiB of which 209.69 MiB is free. Process 3053087 has 2.41 GiB memory in use. Including non-PyTorch memory, this process has 12.61 GiB memory in use. Of the allocated memory 12.46 GiB is allocated by PyTorch, and 1.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [None]:
prompt_template = """
### [INST] Instruction: Answer the question based on your up to date sport knowledge. Here is context to help:

{context}

### QUESTION:
{question} [/INST]
 """

# Create prompt from prompt template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

llm_chain.invoke({"context": "", "question": "who won the 2024 world series?"})

## Load and chunk documents. Load chunked documents into FAISS index

In [None]:
# !pip install playwright
# # !sudo playwright install
# # !sudo playwright install-deps
# !pip install html2text
# !pip install sentence-transformers
# !pip install faiss-cpu
# # !pip install tf-keras

In [None]:
import nest_asyncio
nest_asyncio.apply()

# Articles to index
articles = ["https://www.si.com/mlb/sports-illustrated-celebrates-dodgers-2024-world-series-commemorative-issue",
]

# Scrapes the blogs above
loader = AsyncChromiumLoader(articles)
docs = loader.load()

In [None]:
# !pip show faiss-cpu

In [None]:
# Converts HTML to plain text
html2text = Html2TextTransformer()
docs_transformed = html2text.transform_documents(docs)

# Chunk text
text_splitter = CharacterTextSplitter(chunk_size=100,
                                      chunk_overlap=0)
chunked_documents = text_splitter.split_documents(docs_transformed)

# Load chunked documents into the FAISS index
db = FAISS.from_documents(chunked_documents, HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={'device':'cpu'}))

retriever = db.as_retriever()

In [None]:
rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)

result = rag_chain.invoke("Who won the 2024 world series?")

In [None]:
result['context']
print(result['text'])