In [None]:
from pymongo import MongoClient
import json
import requests
from datasets import load_dataset
import os
import torch
import pandas as pd
# LlamaIndex downloads embedding models on demand. Point its cache at a
# "cache" folder under the current working directory (the default is the
# system temp dir) so that the downloaded artifacts are easy to inspect.
os.environ['LLAMA_INDEX_CACHE_DIR'] = os.path.abspath('cache')

# Load settings from .env file
from dotenv import find_dotenv, dotenv_values

# Locate the .env file and read its key/value pairs into `config`
dotenv_path = find_dotenv()
config = dotenv_values(dotenv_path)

# Credentials used later in the notebook (looked up with .get, so a missing
# key yields None instead of raising)
ATLAS_URI, GROQ_API = (config.get(key) for key in ('ATLAS_URI', 'GROQ_API'))

# Load the Steam games dataset
dataset = load_dataset("fronkongames/steam-games-dataset")

# Using the GPU is the normal path when one is available. To experiment on
# CPU only, uncomment the following line.
# os.environ["CUDA_VISIBLE_DEVICES"]=""

cuda_available = torch.cuda.is_available()
print("using CUDA/GPU: ", cuda_available)

# Report each visible CUDA device by index and name
for device_index in range(torch.cuda.device_count()):
    device_props = torch.cuda.get_device_properties(device_index)
    print("device ", device_index, device_props.name)

In [None]:
# Turn the "train" split of the loaded dataset into a pandas DataFrame
train_split = dataset['train']
datasets = pd.DataFrame(train_split)

# Preview the first five rows
datasets.head()

In [None]:
# Keep only the rows that have both a game description and a Metacritic URL
required_columns = ['About the game', 'Metacritic url']
dataset_games = datasets.dropna(subset=required_columns)
print("\nNumber of missing values in each column after removal:")
print(dataset_games.isna().sum())

# Drop the columns that are not needed for the rest of the notebook
unused_columns = [
    'Screenshots',
    'Movies',
    'Notes',
    'Average playtime forever',
    'Average playtime two weeks',
    'Median playtime forever',
    'Median playtime two weeks',
    'Score rank',
    'Website',
    'Support url',
    'Full audio languages',
]
dataset_games = dataset_games.drop(columns=unused_columns)

# Optional filter (disabled): discard games whose Recommendations value is 0
# dataset_games = dataset_games.query("`Recommendations` != 0")

# Number of rows that remain after filtering
length_dataset = len(dataset_games)
print(length_dataset)

dataset_games.head()

In [None]:
# Import necessary modules and classes
from llama_index.core.settings import Settings
from llama_index.llms.groq import Groq
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Initialize the Groq LLM client with the API key read from the .env settings
llm = Groq(model="mixtral-8x7b-32768", api_key=GROQ_API)

# Choose the embedding device at runtime instead of hard-coding 'cuda', so the
# notebook also runs on CPU-only machines or when CUDA_VISIBLE_DEVICES is
# blanked out in the first cell.
embed_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize the HuggingFace embedding model on the selected device.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# model repository execute locally -- confirm the model source is trusted.
embed_model = HuggingFaceEmbedding(
    model_name='Alibaba-NLP/gte-base-en-v1.5',
    device=embed_device,
    trust_remote_code=True,
)

# Register both models in the global Settings for use by later cells
Settings.llm = llm
Settings.embed_model = embed_model

In [None]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Collects the Document objects built from the records below
llama_documents = list()

# Round-trip the DataFrame through a JSON string (one object per row) to get
# a Python list with one dictionary per game
documents_json = dataset_games.to_json(orient='records')
documents_list = json.loads(documents_json)

# Helper that renders any value as a string
def safe_str(value):
    """Return `value` as a string; a list becomes its items joined by ', '."""
    if not isinstance(value, list):
        return str(value)
    return ', '.join(str(item) for item in value)

# Fields serialised to JSON strings before use as metadata, because a metadata
# value must be one of (str, int, float, None)
json_encoded_fields = ('Tags', 'Supported languages', 'Reviews', 'Achievements')

# Fields appended to the document text (in this order) after the description
labelled_text_fields = [
    'Price',
    'Developers',
    'Publishers',
    'Recommendations',
    'Metacritic score',
    'Categories',
    'Genres',
]

# Everything already present in the text is excluded from the metadata that
# the LLM and the embedding model receive
text_metadata_keys = ['About the game'] + labelled_text_fields

for document in documents_list:
    for field in json_encoded_fields:
        document[field] = json.dumps(document[field])

    # One line per field, each terminated by a period
    text_lines = [safe_str(document['About the game']) + '.']
    text_lines += [
        label + ' ' + safe_str(document[label]) + '.'
        for label in labelled_text_fields
    ]

    # Build the Document; the embedding model additionally skips 'Header image'
    llama_document = Document(
        text='\n'.join(text_lines),
        metadata=document,
        excluded_llm_metadata_keys=list(text_metadata_keys),
        excluded_embed_metadata_keys=text_metadata_keys + ['Header image'],
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n--------\nContent: {content}",
    )
    llama_documents.append(llama_document)

# Show, for the first document, what the LLM and the embedding model receive
first_document = llama_documents[0]
llm_view = first_document.get_content(metadata_mode=MetadataMode.LLM)
print("\nThe LLM sees this: \n", llm_view)
embed_view = first_document.get_content(metadata_mode=MetadataMode.EMBED)
print("\nThe Embedding model sees this: \n", embed_view)

In [None]:
# Import the SentenceSplitter class from the node_parser module
from llama_index.core.node_parser import SentenceSplitter

# Initialize a SentenceSplitter object with its default settings
parser = SentenceSplitter()

# Use the SentenceSplitter to split the llama_documents into nodes
nodes = parser.get_nodes_from_documents(llama_documents)

# Text to embed for each node: its content with metadata included.
# NOTE(review): metadata_mode="all" appears to ignore the
# excluded_embed_metadata_keys configured on the documents -- confirm whether
# MetadataMode.EMBED was intended here.
node_texts = [node.get_content(metadata_mode="all") for node in nodes]

# Embed all texts through the batch API of the embed_model defined in Settings
# rather than issuing one model call per node, so the model can process
# several texts per forward pass.
node_embeddings = Settings.embed_model.get_text_embedding_batch(
    node_texts, show_progress=True
)

# Attach each embedding to its node (the batch API preserves input order)
for node, node_embedding in zip(nodes, node_embeddings):
    node.embedding = node_embedding

print("Total nodes data:", len(nodes))

In [None]:
def get_mongo_client(mongo_uri):
    """Connect to MongoDB and return the client, or None if that fails.

    Args:
        mongo_uri: MongoDB connection string passed to MongoClient.

    Returns:
        The MongoClient once a ping to the server has succeeded, or None when
        creating the client or pinging the server raises a PyMongo error
        (the error is printed).
    """
    # Local import: the file only imports MongoClient from pymongo, and the
    # original handler referenced an undefined name (`Error`).
    from pymongo.errors import PyMongoError

    client = None
    try:
        client = MongoClient(mongo_uri)  # Create the client for the provided URI.
        # MongoClient does not connect eagerly; a ping forces a round trip to
        # the server so that connection problems surface here.
        client.admin.command('ping')
    except PyMongoError as e:
        print(f"Connection failed: {e}")  # Print error message if connection fails.
        if client is not None:
            client.close()  # Release the resources of the unusable client.
        return None

    print("Connection to MongoDB successful")  # Only reached after a successful ping.
    return client

# Read the MongoDB Atlas connection string from the settings loaded from .env
mongo_uri = config.get('ATLAS_URI')
if not mongo_uri:
    # Name the key that is actually looked up so the warning is actionable.
    print("ATLAS_URI not set in the .env file")

mongo_client = get_mongo_client(mongo_uri)  # Establish MongoDB client connection.
if mongo_client is None:
    # Without this guard the subscript below fails with an opaque TypeError.
    raise RuntimeError("Could not connect to MongoDB; check ATLAS_URI in the .env file")

DB_NAME = "steam_games2"  # Name of the database to use.
COLLECTION_NAME = "embedded_games2"  # Name of the collection to use within the database.

db = mongo_client[DB_NAME]  # Select the specified database.
collection = db[COLLECTION_NAME]  # Select the specified collection within the database.

In [None]:
# Start from a fresh collection: the empty filter {} is passed to delete_many
# so that any records already in the collection are deleted.
collection.delete_many({})

In [None]:
# Importing the MongoDBAtlasVectorSearch class for vector search functionality.
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
import threading

# Vector store on top of the MongoDB collection selected earlier, built from
# the client, the database name, the collection name and the index name.
vector_store = MongoDBAtlasVectorSearch(
    mongo_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    index_name="games_index2",
)

# Function to add nodes to the vector store from a worker thread
def add_nodes_with_threading(vector_store, nodes, batch_size=1):
    """Add `nodes` to `vector_store` on a worker thread and wait for it to end.

    Args:
        vector_store: Object exposing `add(list_of_nodes)`.
        nodes: Iterable of nodes to add.
        batch_size: Number of nodes passed to each `add` call. The default of
            1 keeps the original one-node-per-call behaviour.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
        Exception: Whatever `vector_store.add` raised inside the worker
            thread. Without re-raising, a failure would only be reported by
            the thread itself and the caller would continue as if every node
            had been stored.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    pending = list(nodes)  # Snapshot so the input can be sliced into batches.
    errors = []  # Filled by the worker when an add call fails.

    def add_nodes_thread():
        try:
            for start in range(0, len(pending), batch_size):
                vector_store.add(pending[start:start + batch_size])
        except Exception as exc:
            # Hand the failure over to the calling thread.
            errors.append(exc)

    # Create new thread to add nodes and block until it has finished
    thread = threading.Thread(target=add_nodes_thread)
    thread.start()
    thread.join()

    if errors:
        raise errors[0]

# Call the function to add all nodes to the vector store
add_nodes_with_threading(vector_store, nodes)

In [None]:
from llama_index.core import VectorStoreIndex  # Import the VectorStoreIndex class for indexing vector stores.

# (Re)create the vector store for the same client, database, collection and
# index name, so that this cell does not rely on the instance built earlier.
vector_store = MongoDBAtlasVectorSearch(
    mongo_client,
    db_name=DB_NAME,
    collection_name=COLLECTION_NAME,
    index_name="games_index2",
)

# Wrap the vector store in a VectorStoreIndex, which later cells query
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

In [None]:
# Testing if our RAG system is working

# Importing Markdown for displaying formatted text in IPython notebooks.
from IPython.display import Markdown
# Importing utility function for displaying query responses in notebooks.
from llama_index.core.response.notebook_utils import display_response

# Build a query engine on top of the index
query_engine = index.as_query_engine()

# Question used to check that the RAG pipeline answers end to end
query = "Is Red Dead Redemption 2 a good game?"

# Run the question through the query engine
response = query_engine.query(query)

# Render the answer as bold text via Markdown
answer_markdown = Markdown(f"<b>{response}</b>\n")
display(answer_markdown)

# Show the response again, this time with its source information
display_response(response, show_source=True)