In [None]:
# Retrieval Augmented Generation with FAISS & LLM
# Run on Google Colab: the local memory and Intel CPU of a personal Mac are not enough

## Packages

In [2]:
#!pip install faiss-cpu langchain_huggingface datasets langchain_community bitsandbytes

In [3]:
# Data processing & modeling

from typing import Optional, List, Tuple
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM, BitsAndBytesConfig
import datasets
from datasets import Dataset
import torch
import multiprocessing as mp
import bitsandbytes

# Langchain

from langchain.docstore.document import Document as LangchainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Basic pandas

import pandas as pd
pd.set_option("display.max_colwidth", None)

# Visualize

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Record processing time

from datetime import datetime

In [19]:
# Show responses with wrap text

from IPython.display import HTML, display

def set_css(info=None):
    """Display a CSS rule that makes ``<pre>`` elements wrap long lines.

    Args:
        info: Ignored. It is optional so the function can be called with no
            arguments, and it is accepted so the function also matches
            IPython's documented ``pre_run_cell(info)`` callback prototype,
            where the hook receives one argument.
    """
    display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))


# Run set_css on IPython's 'pre_run_cell' event so the rule is displayed
# again for each executed cell.
get_ipython().events.register('pre_run_cell', set_css)

## Set models, common parameters and constraints

In [23]:
# Hierarchical list of separators used to split documents when building the
# corpus, ordered from the coarsest boundary (headings) to the finest
# (single characters).
# NOTE(review): several entries look like regular expressions ("#{1,6}",
# "---+"), but split_unq_docs does not pass any regex-related option to the
# splitter — confirm whether they are matched as regexes or as literal text.

MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

DATASET = 'kpericak/waterfront_centris_nearmontreal_feb2025' # HF parquet
DATA_SECT = 'train' # Part of dataset, default all = train
COL_NAME = 'Details' # Column with the text data to process

EMBEDDING_MODEL = 'thenlper/gte-small' # Embedding model
# Divisor, not a token count: split_unq_docs sets the chunk overlap to
# int(max_seq_length / CHUNK_OVERLAP) tokens.
CHUNK_OVERLAP = 50

DISTANCE_STRATEGY = DistanceStrategy.COSINE # With FAISS, others possible

NUM_RETURN_K = 10 # Number of chunks returned, top similarity results

READER_MODEL = 'Qwen/Qwen2.5-7B-Instruct-1M'
TEMPERATURE = 0.6 # Low = conservative selection on next token vs high = creative
REP_PENALTY = 1.1  # 1 is no penalty, > 1 penalty exists for repeat tokens
MAX_TOKENS = 500 # Passed as max_new_tokens: cap on tokens generated per reader call

## Process

In [5]:
def load_process_hf_lc(dataset_name, dataset_section, column_name):
    """Load one split of a dataset and wrap a text column as documents.

    Args:
        dataset_name: Name passed to ``datasets.load_dataset``.
        dataset_section: Split to load (passed as ``split=``).
        column_name: Key of the field in each row that holds the text.

    Returns:
        A list with one ``LangchainDocument`` per row, whose ``page_content``
        is the row's ``column_name`` value.
    """
    hf_split = datasets.load_dataset(dataset_name, split=dataset_section)

    raw_knowledge_base = []
    # tqdm only adds a progress bar around the row iteration.
    for row in tqdm(hf_split):
        raw_knowledge_base.append(
            LangchainDocument(page_content=row[column_name])
        )

    print('--->Raw knowledge base loaded.')
    return raw_knowledge_base

def split_unq_docs(raw_knowledge_base, embedding_model_name, chunk_overlap,
                   separator_list):
    """Split documents into token-bounded chunks and drop duplicate chunks.

    The chunk size is the ``max_seq_length`` reported by the embedding model,
    measured with that model's own tokenizer.

    Args:
        raw_knowledge_base: Iterable of documents to split.
        embedding_model_name: Model name used both to read ``max_seq_length``
            and to load the tokenizer that measures chunk length.
        chunk_overlap: Divisor, not a token count. The splitter's overlap is
            ``int(max_seq_length / chunk_overlap)``.
        separator_list: Separators handed to the recursive splitter.

    Returns:
        List of chunk documents with unique ``page_content``; for repeated
        text only the first chunk encountered is kept.
    """
    max_tokens_per_chunk = SentenceTransformer(embedding_model_name).max_seq_length
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer=AutoTokenizer.from_pretrained(embedding_model_name),
        chunk_size=max_tokens_per_chunk,
        chunk_overlap=int(max_tokens_per_chunk / chunk_overlap),
        add_start_index=True,
        strip_whitespace=True,
        separators=separator_list,
    )

    # Split each source document on its own, then flatten into one list.
    all_chunks = [
        chunk
        for source_doc in raw_knowledge_base
        for chunk in splitter.split_documents([source_doc])
    ]

    # Keep the first occurrence of each distinct chunk text, in order.
    seen_texts = set()
    corpus = []
    for chunk in all_chunks:
        if chunk.page_content in seen_texts:
            continue
        seen_texts.add(chunk.page_content)
        corpus.append(chunk)

    print('--->Documents tokenized and unique.')
    return corpus

def embedding_normalize_similarity(embedding_model_name,
                       corpus, distance_strategy_name):
    """Embed the corpus and index it in a FAISS vector store.

    Args:
        embedding_model_name: Model name given to ``HuggingFaceEmbeddings``.
        corpus: Documents to embed and index.
        distance_strategy_name: Value forwarded to ``FAISS.from_documents``
            as ``distance_strategy``.

    Returns:
        Tuple ``(embedding_model, vectors)``: the embedding wrapper and the
        FAISS store built from ``corpus``.
    """
    # Use the GPU when one is visible, otherwise fall back to the CPU rather
    # than failing on machines without CUDA.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    embedding_model = HuggingFaceEmbeddings(
        model_name = embedding_model_name,
        multi_process = True,
        model_kwargs = {'device': device},
        encode_kwargs = {'normalize_embeddings': True},
        )

    # Facebook similarity search

    vectors = FAISS.from_documents(
        corpus, embedding_model, distance_strategy=distance_strategy_name
        )
    print('--->Vectors with embeddings made.')
    return embedding_model, vectors

def inference(reader_model_name, temperature, penalty, max_tokens):
    """Load the reader model with 4-bit quantization and build a generator.

    Args:
        reader_model_name: Model name used for both the causal LM and its
            tokenizer.
        temperature: Sampling temperature given to the pipeline.
        penalty: Value given to the pipeline as ``repetition_penalty``.
        max_tokens: Value given to the pipeline as ``max_new_tokens``.

    Returns:
        Tuple ``(reader_llm, tokenizer)``: the text-generation pipeline and
        the tokenizer it was built with.
    """
    quantization = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype='float16',
    )

    # trust_remote_code is deliberately not passed. It would allow custom
    # model code from the HF repo, which models like DeepSeek need.
    causal_lm = AutoModelForCausalLM.from_pretrained(
        reader_model_name,
        quantization_config=quantization,
        torch_dtype="auto",
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(reader_model_name)

    # Sampling is enabled, and only newly generated text is returned
    # (return_full_text=False), not the prompt.
    reader_llm = pipeline(
        task="text-generation",
        model=causal_lm,
        tokenizer=tokenizer,
        do_sample=True,
        temperature=temperature,
        repetition_penalty=penalty,
        max_new_tokens=max_tokens,
        return_full_text=False,
    )

    print('--->LLM reader ready.')
    return reader_llm, tokenizer

## Pipeline

In [27]:
# Core pipeline

def main_pipeline():
    """Run every preparation stage once and report how long each one took.

    Stages, in order: load the dataset, split and deduplicate it into a
    corpus, embed and index the corpus, then load the reader LLM. All
    settings come from the module-level configuration constants.

    Returns:
        Tuple ``(reader_llm, vectors, tokenizer)``.
    """
    t_start = datetime.now()
    print(f'Start:{t_start!s}\n')

    # NOTE(review): presumably required by the multi-process embedding step
    # on GPU — confirm before removing.
    mp.set_start_method("spawn", force=True)

    raw_knowledge_base = load_process_hf_lc(DATASET, DATA_SECT, COL_NAME)
    t_loaded = datetime.now()
    print(f'Runtime:{t_loaded - t_start!s}\n')

    corpus = split_unq_docs(
        raw_knowledge_base, EMBEDDING_MODEL, CHUNK_OVERLAP, MARKDOWN_SEPARATORS
    )
    t_split = datetime.now()
    print(f'Runtime:{t_split - t_loaded!s}\n')

    # The embedding model is also returned by this step but is not used here.
    _, vectors = embedding_normalize_similarity(
        EMBEDDING_MODEL, corpus, DISTANCE_STRATEGY
    )
    t_indexed = datetime.now()
    print(f'Runtime:{t_indexed - t_split!s}\n')

    reader_llm, tokenizer = inference(
        READER_MODEL, TEMPERATURE, REP_PENALTY, MAX_TOKENS
    )
    t_end = datetime.now()
    print(f'Runtime: {t_end - t_indexed!s}')
    print(f'Total runtime: {t_end - t_start!s}')

    return reader_llm, vectors, tokenizer

# Get answer using LLM

def get_answer(user_query, reader_llm, tokenizer, vectors, k_num):
    """Answer ``user_query`` using the ``k_num`` chunks most similar to it.

    Args:
        user_query: Question text; also used as the similarity-search query.
        reader_llm: Callable that takes the final prompt string and returns a
            list whose first item is a dict with a ``"generated_text"`` key.
        tokenizer: Object providing ``apply_chat_template``.
        vectors: Object providing ``similarity_search(query=..., k=...)``,
            returning items that have a ``page_content`` attribute.
        k_num: Number of chunks to retrieve.

    Returns:
        The ``"generated_text"`` of the first result from ``reader_llm``.
    """
    retrieved_docs = vectors.similarity_search(query=user_query, k=k_num)

    # Label each chunk with its index and separate chunks with a newline.
    # split_unq_docs builds chunks with strip_whitespace=True, so joining
    # without a separator would run one chunk straight into the next label.
    numbered_docs = [
        f"{i}:\n{doc.page_content}" for i, doc in enumerate(retrieved_docs)
    ]
    context = ("\nMost relevant properties pertaining to question:\n"
               + "\n".join(numbered_docs))

    # Fill the placeholders BEFORE applying the chat template. Calling
    # str.format on the rendered template would also try to interpret any
    # literal braces the template itself emits and fail on them.
    user_content = """Context:
  {context}
  ---
  This is real estate information for the current market as of February 2025 in
  and around Montreal, Quebec. Use it to answer this question.

  Question: {question}""".format(context=context, question=user_query)

    prompt_in_chat_format = [
        {
            "role": "system",
            "content": """With only the information provided in the context,
          answer only the question asked concisely and comprehensively.
          When relevant, share the property address. Answer saying "Unable to
          answer." when the context does not provide enough information to
          answer.""",
        },
        {
            "role": "user",
            "content": user_content,
        },
    ]

    final_prompt = tokenizer.apply_chat_template(
        prompt_in_chat_format, tokenize=False, add_generation_prompt=True
    )

    answer = reader_llm(final_prompt)[0]["generated_text"]

    return answer

## Run main pipeline

In [13]:
# Build the vector store and load the reader LLM once; the query cells that
# follow pass these three objects to get_answer.
reader_llm, vectors, tokenizer = main_pipeline()

Start:2025-02-06 00:21:42.225448



  0%|          | 0/1582 [00:00<?, ?it/s]

--->Raw knowledge base loaded.
Runtime:0:00:01.148830

--->Documents tokenized and unique.
Runtime:0:00:18.983568

--->Vectors with embeddings made.
Runtime:0:00:15.206080



config.json:   0%|          | 0.00/825 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Device set to use cuda:0


--->LLM reader ready.
Runtime: 0:07:12.232313
Total runtime: 0:07:47.570791


## Query and answer with LLM

In [28]:
# Open-ended question: which town to search, given bedroom, ski, spa and
# waterfront requirements.
QUERY =  '''I am looking for a property that I could use for vacations and
weekends. I want at least 4 bedrooms, close proximity to a ski hill and a spa,
and waterfront access. Which town should I search?'''

# Retrieve the NUM_RETURN_K most similar chunks and ask the reader LLM.
answer = get_answer(QUERY, reader_llm, tokenizer, vectors, NUM_RETURN_K)
print(answer)

Based on your requirements, you should search in Mont-Tremblant, Quebec. This town is renowned for its ski slopes and nearby spa facilities, and many listings indicate properties with multiple bedrooms suitable for vacation and weekend stays.


In [40]:
# Listing question; the query text includes a hint about how the ski
# mountains are usually named.
QUERY =  '''Which ski mountains (usually called "Mont" something or "Saint"
something) are near short-term rentals?'''

answer = get_answer(QUERY, reader_llm, tokenizer, vectors, NUM_RETURN_K)
print(answer)

Based on the information provided, the following ski mountains with "Mont" or "Saint" in their names are near short-term rentals:

1. Mont-Avalanche
2. Mont de ski La Réserve
3. Mont-Tremblant
4. Montagne (part of Mont Tremblant complex)

These locations are mentioned in conjunction with short-term rentals within the given text.


In [41]:
# Yes/no question about a single named location.
QUERY =  '''Is there a spa near Mont de ski La Réserve?'''

answer = get_answer(QUERY, reader_llm, tokenizer, vectors, NUM_RETURN_K)
print(answer)

Based on the given information, there is no direct mention of a spa near Mont de ski La Réserve. The closest spa mentioned is located near Mont-Tremblant (property #3). Therefore, unable to answer based on the provided context.


In [36]:
# Filtered search: location, minimum bedroom count and short-term rental use.
QUERY =  '''What are some properties for sale near Mont-Avalanche that have 4 or
more bedrooms and that are used for short-term rentals?'''

answer = get_answer(QUERY, reader_llm, tokenizer, vectors, NUM_RETURN_K)
print(answer)

Based on the information provided:

1. **Property Address:** 153, Chemin du Grand-Duc, Mont-Blanc  
   - Bedrooms: 6 (4 in basement)  
   - Short-term rent possible  
   - Description: A large, lakeside, furnished house with water sports activities.

Answering the question about properties near Mont-Avalanche with 4 or more bedrooms suitable for short-term rentals:

**1 Property Found:**
- **Address:** 153, Chemin du Grand-Duc, Mont-Blanc  
- **Bedrooms:** 6 (including 4 in the basement)  
- **Features:** Comfortable, private location, lakeside, water sports in summer, skiing in winter, furnished living room, short-term rent possibility.


In [43]:
# Multi-constraint search: requested number of addresses, bedroom count,
# price ceiling, town, amenities and condition.
QUERY =  '''Can you share three addresses for properties for sale that have 4 or
more bedrooms, cost less than $1,000,000 and exist in
Mont-Tremblant? I want to be able to swim, ski, and go to the spa. It should be
move-in ready! No renovations needed.'''

answer = get_answer(QUERY, reader_llm, tokenizer, vectors, NUM_RETURN_K)
print(answer)

Based on the provided information:

1. **Address:** 133-135, Rue Jasmin, Mont-Tremblant  
   *Details:* Two charming cottages with 2 bedrooms each on the main floor and 1 bedroom in the basement. Total 6 bedrooms, 2 bathrooms, and 1 powder room. Price: $850,000. Move-in ready.

2. **Address:** 165, Impasse des Trèfles, Mont-Tremblant  
   *Details:* Modern and luxurious, constructed in 2022, 3 bedrooms, 2 bathrooms, and 1 powder room. Fully furnished. Price: $925,000. Move-in ready.

3. **Address:** 110, Chemin du Pont-de-Fer, Mont-Tremblant  
   *Details:* Full renovation, 6 general bedrooms, 4 full bathrooms, privileged access to the resort, and vibrant heart of Mont-Tremblant. Price: $1,325,000 but meets other criteria.


### Resources
https://arxiv.org/pdf/2312.10997<br>
https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/<br>
https://huggingface.co/learn/cookbook/en/advanced_rag<br>
https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/<br>
https://www.langchain.com/<br>
https://www.nature.com/articles/s42003-022-03628-x#Abs1<br>
https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard<br>
https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-1M<br>
https://huggingface.co/deepseek-ai/DeepSeek-R1#usage-recommendations<br>
https://research.ibm.com/blog/retrieval-augmented-generation-RAG<br>
https://huggingface.co/datasets/kpericak/waterfront_centris_nearmontreal_feb2025<br>
https://stackoverflow.com/questions/58890109/line-wrapping-in-collaboratory-google-results<br>

