# Building a RAG System with Gemma, MongoDB, and Open Source Models

## Setup

In [None]:
!pip install -qU datasets pandas pymongo sentence_transformers transformers accelerate

## Data sourcing and preparation

In this example, we will use the [`MongoDB/embedded_movies`](https://huggingface.co/datasets/MongoDB/embedded_movies) dataset.

In [None]:
from datasets import load_dataset
import pandas as pd

# Fetch the `MongoDB/embedded_movies` dataset and materialize its `train`
# split as a pandas DataFrame for the preprocessing steps that follow.
dataset = load_dataset("MongoDB/embedded_movies")
dataset_df = pd.DataFrame(data=dataset["train"])

README.md:   0%|          | 0.00/6.17k [00:00<?, ?B/s]

sample_mflix.embedded_movies.json:   0%|          | 0.00/42.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# Preview the first five rows (`head` defaults to n=5).
dataset_df.head()

Unnamed: 0,plot,runtime,genres,fullplot,directors,writers,countries,poster,languages,cast,title,num_mflix_comments,rated,imdb,awards,type,metacritic,plot_embedding
0,Young Pauline is left a lot of money when her ...,199.0,[Action],Young Pauline is left a lot of money when her ...,"[Louis J. Gasnier, Donald MacKenzie]","[Charles W. Goddard (screenplay), Basil Dickey...",[USA],https://m.media-amazon.com/images/M/MV5BMzgxOD...,[English],"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",The Perils of Pauline,0,,"{'id': 4465, 'rating': 7.6, 'votes': 744}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.0007293965299999999, -0.026834568000000003,..."
1,A penniless young man tries to save an heiress...,22.0,"[Comedy, Short, Action]",As a penniless man worries about how he will m...,"[Alfred J. Goulding, Hal Roach]",[H.M. Walker (titles)],[USA],https://m.media-amazon.com/images/M/MV5BNzE1OW...,[English],"[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",From Hand to Mouth,0,TV-G,"{'id': 10146, 'rating': 7.0, 'votes': 639}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.022837115, -0.022941574000000003, 0.014937..."
2,"Michael ""Beau"" Geste leaves England in disgrac...",101.0,"[Action, Adventure, Drama]","Michael ""Beau"" Geste leaves England in disgrac...",[Herbert Brenon],"[Herbert Brenon (adaptation), John Russell (ad...",[USA],,[English],"[Ronald Colman, Neil Hamilton, Ralph Forbes, A...",Beau Geste,0,,"{'id': 16634, 'rating': 6.9, 'votes': 222}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.00023330492999999998, -0.028511643000000003..."
3,"Seeking revenge, an athletic young man joins t...",88.0,"[Adventure, Action]",A nobleman vows to avenge the death of his fat...,[Albert Parker],"[Douglas Fairbanks (story), Jack Cunningham (a...",[USA],https://m.media-amazon.com/images/M/MV5BMzU0ND...,,"[Billie Dove, Tempe Pigott, Donald Crisp, Sam ...",The Black Pirate,1,,"{'id': 16654, 'rating': 7.2, 'votes': 1146}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[-0.005927917, -0.033394486, 0.0015323418, -0...."
4,An irresponsible young millionaire changes his...,58.0,"[Action, Comedy, Romance]","The Uptown Boy, J. Harold Manners (Lloyd) is a...",[Sam Taylor],"[Ted Wilde (story), John Grey (story), Clyde B...",[USA],https://m.media-amazon.com/images/M/MV5BMTcxMT...,[English],"[Harold Lloyd, Jobyna Ralston, Noah Young, Jim...",For Heaven's Sake,0,PASSED,"{'id': 16895, 'rating': 7.6, 'votes': 918}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.0059373598, -0.026604708, -0.0070914757000..."


We also need to do some preprocessing work:

In [None]:
# Drop rows whose `fullplot` is missing: that column is the text embedded later
# in this notebook, so rows without it cannot be used.
dataset_df = dataset_df.dropna(subset=['fullplot'])

# Report the remaining per-column null counts as a sanity check.
print(
    '\nNumber of missing values in each column after removal:',
    dataset_df.isna().sum(),
    sep='\n',
)


Number of missing values in each column after removal:
plot                    0
runtime                14
genres                  0
fullplot                0
directors              12
writers                13
countries               0
poster                 78
languages               1
cast                    1
title                   0
num_mflix_comments      0
rated                 279
imdb                    0
awards                  0
type                    0
metacritic            893
plot_embedding          1
dtype: int64


In [None]:
# The dataset ships with a precomputed `plot_embedding` column. Discard it,
# because new embeddings are generated further down in this notebook.
dataset_df = dataset_df.drop('plot_embedding', axis=1)
dataset_df.head()

Unnamed: 0,plot,runtime,genres,fullplot,directors,writers,countries,poster,languages,cast,title,num_mflix_comments,rated,imdb,awards,type,metacritic
0,Young Pauline is left a lot of money when her ...,199.0,[Action],Young Pauline is left a lot of money when her ...,"[Louis J. Gasnier, Donald MacKenzie]","[Charles W. Goddard (screenplay), Basil Dickey...",[USA],https://m.media-amazon.com/images/M/MV5BMzgxOD...,[English],"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",The Perils of Pauline,0,,"{'id': 4465, 'rating': 7.6, 'votes': 744}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,
1,A penniless young man tries to save an heiress...,22.0,"[Comedy, Short, Action]",As a penniless man worries about how he will m...,"[Alfred J. Goulding, Hal Roach]",[H.M. Walker (titles)],[USA],https://m.media-amazon.com/images/M/MV5BNzE1OW...,[English],"[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",From Hand to Mouth,0,TV-G,"{'id': 10146, 'rating': 7.0, 'votes': 639}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,
2,"Michael ""Beau"" Geste leaves England in disgrac...",101.0,"[Action, Adventure, Drama]","Michael ""Beau"" Geste leaves England in disgrac...",[Herbert Brenon],"[Herbert Brenon (adaptation), John Russell (ad...",[USA],,[English],"[Ronald Colman, Neil Hamilton, Ralph Forbes, A...",Beau Geste,0,,"{'id': 16634, 'rating': 6.9, 'votes': 222}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,
3,"Seeking revenge, an athletic young man joins t...",88.0,"[Adventure, Action]",A nobleman vows to avenge the death of his fat...,[Albert Parker],"[Douglas Fairbanks (story), Jack Cunningham (a...",[USA],https://m.media-amazon.com/images/M/MV5BMzU0ND...,,"[Billie Dove, Tempe Pigott, Donald Crisp, Sam ...",The Black Pirate,1,,"{'id': 16654, 'rating': 7.2, 'votes': 1146}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,
4,An irresponsible young millionaire changes his...,58.0,"[Action, Comedy, Romance]","The Uptown Boy, J. Harold Manners (Lloyd) is a...",[Sam Taylor],"[Ted Wilde (story), John Grey (story), Clyde B...",[USA],https://m.media-amazon.com/images/M/MV5BMTcxMT...,[English],"[Harold Lloyd, Jobyna Ralston, Noah Young, Jim...",For Heaven's Sake,0,PASSED,"{'id': 16895, 'rating': 7.6, 'votes': 918}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,


## Generating embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Load the `thenlper/gte-large` embedding model once at notebook level;
# `get_embedding` below reads this global on every call.
embedding_model = SentenceTransformer('thenlper/gte-large')

def get_embedding(text: str) -> list[float]:
    """Encode `text` with the notebook-level `embedding_model`.

    Parameters
    ----------
    text: str
        The text to embed.

    Returns
    -------
    list[float]
        The embedding converted to a plain Python list, or an empty list
        (after printing a notice) when `text` is empty or whitespace-only.
    """
    has_content = bool(text.strip())
    if has_content:
        # `.tolist()` converts the encoder output into plain Python floats
        return embedding_model.encode(text).tolist()

    print('Attempted to get embedding for empty text.')
    return []

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
# Embed every movie's `fullplot` and attach the vectors as an `embedding`
# column, then preview the result.
dataset_df = dataset_df.assign(
    embedding=dataset_df['fullplot'].apply(get_embedding)
)
dataset_df.head()

Unnamed: 0,plot,runtime,genres,fullplot,directors,writers,countries,poster,languages,cast,title,num_mflix_comments,rated,imdb,awards,type,metacritic,embedding
0,Young Pauline is left a lot of money when her ...,199.0,[Action],Young Pauline is left a lot of money when her ...,"[Louis J. Gasnier, Donald MacKenzie]","[Charles W. Goddard (screenplay), Basil Dickey...",[USA],https://m.media-amazon.com/images/M/MV5BMzgxOD...,[English],"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",The Perils of Pauline,0,,"{'id': 4465, 'rating': 7.6, 'votes': 744}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[-0.009285859763622284, -0.005062091164290905,..."
1,A penniless young man tries to save an heiress...,22.0,"[Comedy, Short, Action]",As a penniless man worries about how he will m...,"[Alfred J. Goulding, Hal Roach]",[H.M. Walker (titles)],[USA],https://m.media-amazon.com/images/M/MV5BNzE1OW...,[English],"[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",From Hand to Mouth,0,TV-G,"{'id': 10146, 'rating': 7.0, 'votes': 639}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.0024394148495048285, 0.02309592068195343, ..."
2,"Michael ""Beau"" Geste leaves England in disgrac...",101.0,"[Action, Adventure, Drama]","Michael ""Beau"" Geste leaves England in disgrac...",[Herbert Brenon],"[Herbert Brenon (adaptation), John Russell (ad...",[USA],,[English],"[Ronald Colman, Neil Hamilton, Ralph Forbes, A...",Beau Geste,0,,"{'id': 16634, 'rating': 6.9, 'votes': 222}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.012204292230308056, -0.011455780826508999, ..."
3,"Seeking revenge, an athletic young man joins t...",88.0,"[Adventure, Action]",A nobleman vows to avenge the death of his fat...,[Albert Parker],"[Douglas Fairbanks (story), Jack Cunningham (a...",[USA],https://m.media-amazon.com/images/M/MV5BMzU0ND...,,"[Billie Dove, Tempe Pigott, Donald Crisp, Sam ...",The Black Pirate,1,,"{'id': 16654, 'rating': 7.2, 'votes': 1146}","{'nominations': 0, 'text': '1 win.', 'wins': 1}",movie,,"[0.004541341681033373, -0.0006100545870140195,..."
4,An irresponsible young millionaire changes his...,58.0,"[Action, Comedy, Romance]","The Uptown Boy, J. Harold Manners (Lloyd) is a...",[Sam Taylor],"[Ted Wilde (story), John Grey (story), Clyde B...",[USA],https://m.media-amazon.com/images/M/MV5BMTcxMT...,[English],"[Harold Lloyd, Jobyna Ralston, Noah Young, Jim...",For Heaven's Sake,0,PASSED,"{'id': 16895, 'rating': 7.6, 'votes': 918}","{'nominations': 1, 'text': '1 nomination.', 'w...",movie,,"[-0.0022256169468164444, 0.011567807756364346,..."


## Database setup and connection

**MongoDB** acts as both an operational and a vector database. It offers a database solution that efficiently stores, queries and retrieves vector embeddings.


Make sure we have:
* a database cluster set up on MongoDB Atlas
* the `MONGO_URI` connection string for our cluster

The database will be named `movies` and the collection will be named `movie_collection_2`.

## Create a vector search index

We need to make sure that our vector index is created via MongoDB Atlas.

Then the next step is to conduct efficient and accurate vector-based searches based on the vector embeddings stored within the documents in the `movie_collection_2` collection.

Creating a Vector Search Index makes it possible to traverse the documents efficiently and retrieve those whose embeddings match the query embedding based on vector similarity.

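An example of a MongoDB Vector Search Index definition (name the index `vector_index`, which is the name the search code in this notebook uses):
```json
{
    "fields": [{
        "numDimensions": 1024,
        "path": "embedding",
        "similarity": "cosine",
        "type": "vector"
    }]
}
```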
The value `1024` corresponds to the dimension of the vector generated by the `gte-large` embedding model.

## Establish data connection

We will now use PyMongo to create a MongoDB client object, representing the connection to the cluster and enabling access to its databases and collections.

In [None]:
import pymongo
from google.colab import userdata

def get_mongo_client(mogo_url):
    """Establish connection to the MongoDB

    Parameters
    ----------
    mogo_url: str
        The MongoDB connection string (URI) of the cluster to connect to.
        (The parameter name is kept unchanged for backward compatibility.)

    Returns
    -------
    pymongo.MongoClient or None
        A client whose connection has been verified with a `ping` command,
        or None if the server could not be reached.
    """
    try:
        # Connect with the argument that was passed in, not with a
        # notebook-level global.
        client = pymongo.MongoClient(mogo_url)
        # MongoClient connects lazily, so force one round trip to the server;
        # otherwise an unreachable cluster would still be reported as success.
        client.admin.command('ping')
    except pymongo.errors.ConnectionFailure as e:
        print(f'Connection failed: {e}')
        return None

    print('Connection to MongoDB successful.')
    return client


# Read the connection string stored under the `MONGO_URI` key.
mongo_uri = userdata.get('MONGO_URI')
if not mongo_uri:
    # Fail fast: without a URI there is no way to reach the Atlas cluster,
    # and continuing would only produce a less obvious error further down.
    raise ValueError('`MONGO_URI` not set in the environment variables')

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
    # `get_mongo_client` returns None on connection failure; subscripting
    # None below would raise an unhelpful TypeError instead.
    raise ConnectionError('Could not connect to MongoDB; check `MONGO_URI` and the cluster access settings.')

# Select the database and the collection that will hold the movie documents
db = mongo_client['movies']
collection = db['movie_collection_2']

In [None]:
# Delete any existing records in the collection (an empty filter matches every
# document) so that re-running the ingestion does not create duplicates
collection.delete_many({})

Ingesting data into a MongoDB collection from a pandas DataFrame is a straightforward process.

In [None]:
# Turn every DataFrame row into a dict (one document per movie) and write
# them all to the collection with a single `insert_many` call.
documents = dataset_df.to_dict(orient='records')
collection.insert_many(documents)
print('Data ingestion into MongoDB completed.')

## Perform vector search on user queries

We will implement a function that returns a vector search result by generating a query embedding and defining a MongoDB aggregation pipeline.

The pipeline, consisting of the `$vectorSearch` and `$project` stages, executes queries using the generated vector and formats the results to include only the required information, such as plot, title, and genres while incorporating a search score for each result.

In [None]:
def vector_search(user_query, collection, index_name='vector_index', num_candidates=150, limit=4):
    """Perform a vector search in the MongoDB collection based on user query

    Parameters
    ----------
    user_query: str
        The user's query string
    collection: MongoCollection
        The MongoDB collection to search
    index_name: str, optional
        Name of the vector search index to query (default 'vector_index')
    num_candidates: int, optional
        Number of candidate matches to consider (default 150)
    limit: int, optional
        Maximum number of matches to return (default 4)

    Returns
    -------
    list
        A list of matching documents, each holding `fullplot`, `title`,
        `genres` and `score`. Empty when no embedding could be generated
        for the query (e.g. a blank query).
    """
    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # `get_embedding` reports failure with an empty list, not None. Return an
    # empty result so callers can always iterate over the return value.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            '$vectorSearch': {
                'index': index_name,
                'queryVector': query_embedding,
                'path': 'embedding',  # document field that stores the vectors
                'numCandidates': num_candidates,
                'limit': limit,
            }
        },
        {
            '$project': {
                '_id': 0,  # exclude the _id field
                'fullplot': 1,  # include the full plot text
                'title': 1,  # include the title field
                'genres': 1,  # include the genres field
                'score': {'$meta': 'vectorSearchScore'},  # include the search score
            }
        },
    ]

    # Execute the search and materialize the cursor into a list
    return list(collection.aggregate(pipeline))

## Handling user queries and loading Gemma

In [None]:
def get_search_result(query, collection):
    """Run a vector search and format the hits as prompt-ready text

    Parameters
    ----------
    query: str
        The user's query string
    collection: MongoCollection
        The MongoDB collection to search

    Returns
    -------
    str
        One line per hit in the form "Title: <title>, Plot: <fullplot>",
        each terminated by a newline; 'N/A' replaces a missing field.
        An empty string when there are no hits.
    """
    get_knowledge = vector_search(query, collection)

    # Build the text in one pass; the accumulator was previously initialized
    # under a misspelled name, which raised UnboundLocalError.
    search_result = "".join(
        f"Title: {result.get('title', 'N/A')}, Plot: {result.get('fullplot', 'N/A')}\n"
        for result in get_knowledge
    )

    return search_result

In [None]:
# Run the query, retrieve the matching sources, and assemble the prompt text
# that combines the question with the retrieved plots.
query = 'What is the best romantic movie to watch and why?'
source_information = get_search_result(query, collection)

combined_information = (
    f"Query: {query}\n"
    "Continue to answer the query by using the Searched Results:\n"
    f"{source_information}"
)
print(combined_information)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal language model for the
# `google/gemma-2b-it` checkpoint named by `model_id`.
model_id = 'google/gemma-2b-it'
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): `device_map='auto'` presumably delegates weight placement to
# the `accelerate` package installed in the setup cell - confirm.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')

In [None]:
# Tokenize the prompt and move the tensors to the device the model lives on.
# Using `model.device` instead of a hard-coded 'cuda' keeps this cell working
# on CPU-only runtimes and consistent with the `device_map='auto'` placement.
# Despite its name, `input_ids` holds the tokenizer's full output mapping,
# which is unpacked into `generate` below.
input_ids = tokenizer(
    combined_information,
    return_tensors='pt'
).to(model.device)

# Generate up to 500 new tokens as the answer
response = model.generate(
    **input_ids,
    max_new_tokens=500
)

# Decode the first (only) returned sequence back to text
print(tokenizer.decode(response[0]))