In [2]:
%pip install langchain langchain-community bitsandbytes sentence-transformers faiss-cpu gradio

Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
from transformers import (LlavaNextProcessor,
                          LlavaNextForConditionalGeneration,
                          BitsAndBytesConfig, BlipProcessor,
                          BlipForConditionalGeneration,
                         )

import tqdm
import os

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

import torch

from langchain_core.documents import Document
from langchain_text_splitters.html import HTMLHeaderTextSplitter

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
large_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
large_model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    quantization_config=bnb_config
)
large_model.to(device)

preprocessor_config.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/70.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

LlavaNextForConditionalGeneration(
  (vision_tower): CLIPVisionModel(
    (vision_model): CLIPVisionTransformer(
      (embeddings): CLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
        (position_embedding): Embedding(577, 1024)
      )
      (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-23): 24 x CLIPEncoderLayer(
            (self_attn): CLIPSdpaAttention(
              (k_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear4bit(in_features=1024, out_features=1024, bias=True)
            )
            (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
            

In [5]:
small_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
small_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large", torch_dtype=torch.float16).to(device)

preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

In [6]:
def txt2doc(text: str, path: str, modality: str):
    metadata = {
        "source": path,
        "type": modality,
    }
    document = Document(page_content=text, metadata=metadata)

    return document

In [7]:
def split_html(path: str):
    headers = [
        ("p", ""),
    ]
    
    docs = []

    with open(path, "r") as html:
        content = html.read()
    splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers, return_each_element=True)
    splitted = splitter.split_text(content)
    splitted = filter(lambda x: len(x.page_content) >= 50, splitted)
    metadata = {
        "source": path,
        "type": "text",
    }
    for txt in splitted:
        txt.metadata = metadata
        docs.append(txt)
    
    return docs

In [8]:
txt_docs = []
img_docs = []

In [9]:
IMG_DIR = "/kaggle/input/scraped-data/scraped/scraped_images"
TEXT_DIR = "/kaggle/input/scraped-data/scraped/scraped_text"

In [10]:
for txt in tqdm.tqdm((os.listdir(TEXT_DIR)), total=len(os.listdir(TEXT_DIR))):
    splitted = split_html(os.path.join(TEXT_DIR, txt))
    txt_docs.extend(splitted)

100%|██████████| 1903/1903 [00:55<00:00, 34.43it/s]


In [11]:
def desc_img(img):
    inputs = small_processor(img, return_tensors="pt").to("cuda", torch.float16)

    out = small_model.generate(**inputs)
    return small_processor.decode(out[0], skip_special_tokens=True)


In [12]:
import cv2 as cv
def resize_img(img, fx, fy):
    resized_img = cv.resize(img, (0, 0), fx=fx, fy=fy)
    return resized_img

In [13]:
from PIL import Image
scale_x = scale_y = 0.5
for img in tqdm.tqdm((os.listdir(IMG_DIR)), total=len(os.listdir(IMG_DIR))):
    if img.endswith(('.jpg', '.png')):
        image = cv.imread(os.path.join(IMG_DIR, img))
        resized_img = resize_img(image, scale_x, scale_y)
        splitted = desc_img(resized_img)
        doc = txt2doc(splitted, os.path.join(IMG_DIR, img), "image")
        img_docs.append(doc)

100%|██████████| 1830/1830 [05:09<00:00,  5.92it/s]


In [14]:
txt_docs.extend(img_docs)
db = FAISS.from_documents(
    txt_docs,
    HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
)

  HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [23]:
from IPython.display import display

def MultimodalRAG(query: str):
    
    conversation = [
    {

      "role": "user",
      "content": [
          {"type": "text", "text": "Answer the question given the context.\n \
          If there are no provided photos or context, just say so, don't make up answer."},
        ],
    },
    ]
    retriever = db.as_retriever(
            search_type="similarity",
            search_kwargs={'k': 5, 'fetch_k': 50}
    )
    retrieved = retriever.invoke(query)
    images = []
    
    for context in retrieved:
        if context.metadata["type"] == "text":
            context_txt = {
                "type": "text", "text": context.page_content
            }
            conversation[0]["content"].append(context_txt)
        elif context.metadata["type"] == "image":
            path = context.metadata["source"]
            img = Image.open(path)
            display(img)
            context_img = {
                "type": "image"
            }
            conversation[0]["content"].append(context_img)
            images.append(img)
                    
    prompt = large_processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = large_processor(text=prompt, return_tensors="pt").to(device)
    output = large_model.generate(**inputs, max_new_tokens=300)

    print("-" * 20)
    return large_processor.decode(output[0], skip_special_tokens=True)

In [35]:
query = "How did Meta extended its Llama family of models?"
answer = MultimodalRAG(query)
print(answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


--------------------
[INST] Answer the question given the context.
           If there are no provided photos or context, just say so, don't make up answer.Meta extended its Llama family of models into two new categories: vision-language and sizes that are small enough to fit in edge devices.Meta extended its Llama family of models into two new categories: vision-language and sizes that are small enough to fit in edge devices.What’s new: Meta introduced Llama 3.2, including two larger vision-language models and two smaller text-only models as well as developer tools for building agentic applications based on the new models. Weights and code are free to developers who have less than 700 million monthly active users. Multiple providers offer cloud access.What’s new: Meta introduced Llama 3.2, including two larger vision-language models and two smaller text-only models as well as developer tools for building agentic applications based on the new models. Weights and code are free to develo