## Installations

In [None]:
!pip install --upgrade transformers
!pip install sentence_transformers
!pip install datasets
!pip install "weaviate-client==3.*"



## Imports

In [None]:
import transformers
from sentence_transformers import SentenceTransformer, LoggingHandler
from datasets import load_dataset
import logging
import weaviate

## Loading the dataset

In [None]:
# Load Wikipedia dataset (config "20220301.simple").
# `datasets` warns that this dataset needs an explicit opt-in to run its loading
# code and that the flag becomes mandatory in the next major release, so pass it
# explicitly. NOTE: this executes code shipped with the dataset repository.
wiki_dataset = load_dataset("wikipedia", "20220301.simple", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
wiki_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 205328
    })
})

In [None]:
# Map article title -> article text for the first N_DOCUMENTS rows only, to keep
# embedding and upload times short. Raise N_DOCUMENTS to index more of the dump.
N_DOCUMENTS = 5000
train_split = wiki_dataset['train']
documents = {}
for row_idx in range(min(N_DOCUMENTS, len(train_split))):
    row = train_split[row_idx]  # fetch each row once instead of once per field
    documents[row['title']] = row['text']

## Configure Weaviate database Instance

In [None]:
import os

# Read the connection settings from the environment so real credentials are never
# saved inside the notebook; the placeholders are used only when the variables
# are not set.
WEAVIATE_INSTANCE_URL = os.environ.get('WEAVIATE_INSTANCE_URL', 'YOUR_WEAVIATE_INSTANCE_URL')
WEAVIATE_API_KEY = os.environ.get('WEAVIATE_API_KEY', 'YOUR_WEAVIATE_API_KEY')

client = weaviate.Client(
  url=WEAVIATE_INSTANCE_URL,
  auth_client_secret=weaviate.auth.AuthApiKey(api_key=WEAVIATE_API_KEY),
  timeout_config=(5, 15),  # (Optional) Set connection timeout & read timeout time in seconds
)

client.is_ready()  # Will return True if the client is connected & the server is ready to accept requests

True

## Configure the LLM

In [None]:
from transformers import AutoModelForCausalLM

# Checkpoint used for answer generation; the tokenizer loaded later in the
# notebook reuses MODEL_NAME so that both always match.
MODEL_NAME = 'bigscience/bloom-560m'
llm_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

## Load SentenceTransformer Model

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint used for both the documents and the user query.
EMBEDDER_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
embedder_model = SentenceTransformer(model_name_or_path=EMBEDDER_NAME)



## Embed documents

In [None]:
def encode(documents):
  """Embed the text of every document with the sentence-transformer model.

  Args:
    documents: mapping whose values are the texts to embed (keys are ignored).

  Returns:
    Whatever `embedder_model.encode` returns for the list of texts, with
    `normalize_embeddings=True`; rows follow the iteration order of `documents`.
  """
  texts = [text for text in documents.values()]
  embeddings = embedder_model.encode(sentences=texts, normalize_embeddings=True)
  return embeddings

In [None]:
documents_embeddings = encode(documents=documents) # this takes a bit of time!

In [None]:
documents_embeddings.shape # 5000 documents, embedding vector dimension = 384

(5000, 384)

In [None]:
documents_embeddings

array([[-0.04212961,  0.03468707,  0.07597769, ..., -0.01197653,
        -0.01345562,  0.00431713],
       [-0.01685511, -0.00634583, -0.0067565 , ...,  0.00422356,
        -0.05771038, -0.02872549],
       [ 0.02096751, -0.01108911,  0.0481586 , ...,  0.03826055,
         0.08381868,  0.01772718],
       ...,
       [-0.03387398, -0.00259509,  0.0308782 , ...,  0.00141246,
         0.04191242, -0.07859676],
       [ 0.05317277, -0.01632208, -0.03397854, ...,  0.00623156,
         0.03872793, -0.05830197],
       [-0.01964792,  0.02924611, -0.06285353, ...,  0.01391954,
        -0.01347917,  0.01298266]], dtype=float32)

## Store the documents and embeddings in the vector database

In [None]:
# Function to check if a class exists in Weaviate
def class_exists(class_name):
    """Return True when the Weaviate schema already contains `class_name`.

    The schema is fetched through the module-level `client`; a schema without a
    'classes' entry is treated as containing no classes.
    """
    for class_definition in client.schema.get().get('classes', []):
        if class_definition['class'] == class_name:
            return True
    return False

In [None]:
# Schema of the "Document" class: each object stores the document's index and
# its title; the vector itself is supplied at upload time.
doc_schema = {
    "class": "Document",
    "properties": [
        {"name": "doc_idx", "dataType": ["int"]},
        {"name": "title", "dataType": ["text"]},
    ],
    "vectorizer": "none",  # We are using precomputed vectors (using a SentenceTransformer)
}

# Create the class only when it is missing, so re-running this cell is harmless
if class_exists("Document"):
    print("Class 'Document' already exists. Skipping creation.")
else:
    client.schema.create_class(doc_schema)

Class 'Document' already exists. Skipping creation.


In [None]:
documents['April']

'April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril\'s flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nThe Month \n\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\n\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other\'s last days are exactly 35 weeks (245 days) apart.\n\nIn common years, April starts on the same day of the week as October of the previous year, and i

In [None]:
# Function to upload documents and embeddings to Weaviate
def upload_documents(documents, embeddings, class_name="Document"):
  """Create one Weaviate object per document, attaching its precomputed vector.

  Args:
    documents: mapping of title -> text. Only the titles (the keys) are stored;
      iteration order must match the row order of `embeddings`.
    embeddings: sequence of vectors, one per entry of `documents`.
    class_name: Weaviate class to insert into (defaults to "Document").

  Raises:
    ValueError: if the number of documents and embeddings differ. Without this
      check `zip` would silently drop the surplus items.
  """
  if len(documents) != len(embeddings):
    raise ValueError(
        f"Got {len(documents)} documents but {len(embeddings)} embeddings"
    )
  # Iterating a dict yields its keys, i.e. the document titles.
  for idx, (title, vector) in enumerate(zip(documents, embeddings)):
    data_object = {
        "doc_idx": idx,
        "title": title
    }
    client.data_object.create(
        data_object,
        class_name=class_name,
        vector=vector
    )

def get_documents(class_name):
    """Return the number of objects stored in the Weaviate class `class_name`.

    Uses an Aggregate meta-count query instead of fetching objects with `Get`:
    a `Get` without an explicit limit is capped by the server's default result
    limit, so counting its results under-reports large classes and transfers
    object payloads only to count them.
    """
    result = client.query.aggregate(class_name).with_meta_count().do()
    groups = result['data']['Aggregate'][class_name]
    # Be defensive in case the server returns no aggregation group at all.
    return groups[0]['meta']['count'] if groups else 0

# Function to check if a class is empty
def is_class_empty(class_name):
    """Return True when `get_documents` reports zero objects for `class_name`."""
    stored_count = get_documents(class_name)
    return stored_count == 0

In [None]:
# Upload the data only when the Document class holds no objects yet;
# otherwise just report that nothing was uploaded.
if not is_class_empty("Document"):
  print('No')
else:
  upload_documents(documents, documents_embeddings)

No


## Prepare User Query

In [None]:
user_query = 'give me some informations about the month April'
# Embed the query exactly like the documents (normalized embeddings), and encode
# the bare string so the result is a single 1-D vector rather than a batch of one.
user_query_embedding = embedder_model.encode(user_query, normalize_embeddings=True)

## Compute the similarity search

In [None]:
# Perform the search: the 3 stored documents closest to the query vector
near_vector_filter = {"vector": user_query_embedding}
similar_documents = (
    client.query
    .get("Document", ["doc_idx", "title"])
    .with_near_vector(near_vector_filter)
    .with_limit(3)
    .do()
)

In [None]:
similar_documents

{'data': {'Get': {'Document': [{'doc_idx': 0, 'title': 'April'},
    {'doc_idx': 204, 'title': 'January'},
    {'doc_idx': 242, 'title': 'May'}]}}}

In [None]:
len(similar_documents['data']['Get']['Document'])

3

In [None]:
def get_context(similar_documents, documents, max_doc_chars=None):
  """Build the prompt context from the documents returned by the search.

  Args:
    similar_documents: Weaviate `Get` response; the hits are read from
      ['data']['Get']['Document'] and each hit must carry a 'title'.
    documents: mapping of title -> text used to look up each hit's text.
    max_doc_chars: optional cap on the number of characters kept per document.
      None (the default) keeps every document in full.

  Returns:
    One line per hit, formatted "Document <n> : <text>", joined with newlines.
    An empty hit list yields an empty string.

  Raises:
    KeyError: if a hit's title is not present in `documents`.
  """
  hits = similar_documents['data']['Get']['Document']
  numbered_docs = []
  for position, hit in enumerate(hits, start=1):
    text = documents[hit['title']]
    if max_doc_chars is not None:
      text = text[:max_doc_chars]
    numbered_docs.append(f"Document {position} : {text}")
  return '\n'.join(numbered_docs)

In [None]:
context = get_context(similar_documents, documents)

## Prepare Prompt & Generate Responses

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the same checkpoint as the LLM (MODEL_NAME)
llm_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME
)

In [None]:
# The retrieved context is built from whole articles, so an uncapped prompt can
# become very long; this is the likely cause of the out-of-memory failure seen
# at the generation step. Cap the context to keep the prompt small.
MAX_CONTEXT_CHARS = 4000  # rough budget -- tune to the memory available
prompt = f'Context : {context[:MAX_CONTEXT_CHARS]}. \n Question (user query) : {user_query}. \n Answer : '
input_ids = llm_tokenizer.encode(
    text=prompt,
    return_tensors='pt'
)
# When running on a GPU, move the inputs (and the model) to it:
# input_ids = input_ids.to('cuda')

In [None]:
input_ids.shape

In [None]:
# Generate a continuation of the prompt with the causal LM.
generated_ids = llm_model.generate(
    input_ids,
    max_new_tokens=20 # upper bound on the number of newly generated tokens (the prompt tokens are not counted)
)

# Note: on the free tier of Google Colab this call ran out of RAM before a
# response was produced. For that reason only the small BLOOM model
# (560 million parameters) was tried here, and not Llama-2-7b.

In [None]:
# For a causal LM, `generate` returns the prompt tokens followed by the newly
# generated ones; drop the prompt part so only the model's answer is decoded.
prompt_length = input_ids.shape[-1]
generated_response = llm_tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens=True)
generated_response