In [1]:
from pymongo import MongoClient
import json
import requests
from datasets import load_dataset
import os
import torch
import pandas as pd
# LlamaIndex will download embeddings models as needed
# Set llamaindex cache dir to ../cache dir here (Default is system tmp)
# This way, we can easily see downloaded artifacts
os.environ['LLAMA_INDEX_CACHE_DIR'] = os.path.join(os.path.abspath('./'), 'cache')

# Load settings from .env file
from dotenv import find_dotenv, dotenv_values

# _ = load_dotenv(find_dotenv()) # read local .env file
config = dotenv_values(find_dotenv())

ATLAS_URI = config.get('ATLAS_URI')
GROQ_API = config.get('GROQ_API')

dataset = load_dataset("fronkongames/steam-games-dataset")

# To disable GPU and experiment, uncomment the following line
# Normally, you would want to use GPU, if one is available
# os.environ["CUDA_VISIBLE_DEVICES"]=""

print ("using CUDA/GPU: ", torch.cuda.is_available())

for i in range(torch.cuda.device_count()):
   print("device ", i , torch.cuda.get_device_properties(i).name)

  from .autonotebook import tqdm as notebook_tqdm


using CUDA/GPU:  True
device  0 NVIDIA GeForce GTX 1660 Ti with Max-Q Design


In [2]:
# Prepare the dataset
datasets = pd.DataFrame(dataset['train'])

datasets.head(5)

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [3]:
# Remove data point where About the game is missing
dataset_games = datasets.dropna(subset=['About the game', 'Metacritic url'])
print("\nNumber of missing values in each column after removal:")
print(dataset_games.isnull().sum())

# Remove the unecessary columns
dataset_games = dataset_games.drop(columns=['Screenshots', 'Movies', 'Notes', 'Average playtime forever', 'Average playtime two weeks', 'Median playtime forever', 'Median playtime two weeks', 'Score rank', 'Website', 'Support url', 'Full audio languages'])

# Remove data with Recomendations score = 0
# dataset_games = dataset_games.query("`Recommendations` != 0")

length_dataset = len(dataset_games)
print(length_dataset)

dataset_games.head(5)


Number of missing values in each column after removal:
AppID                            0
Name                             0
Release date                     0
Estimated owners                 0
Peak CCU                         0
Required age                     0
Price                            0
DLC count                        0
About the game                   0
Supported languages              0
Full audio languages             0
Reviews                       1851
Header image                     0
Website                        500
Support url                   1180
Support email                 1371
Windows                          0
Mac                              0
Linux                            0
Metacritic score                 0
Metacritic url                   0
User score                       0
Positive                         0
Negative                         0
Score rank                    3900
Achievements                     0
Recommendations                  0

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,User score,Positive,Negative,Achievements,Recommendations,Developers,Publishers,Categories,Genres,Tags
10,1026420,WARSAW,"Oct 2, 2019",20000 - 50000,5,0,23.99,0,Use everything at your disposal to help a team...,"['English', 'French', 'German', 'Polish', 'Rus...",...,0,589,212,34,427,Pixelated Milk,"Pixelated Milk,gaming company","Single-player,Steam Achievements,Steam Trading...","Indie,RPG","Tactical RPG,Turn-Based Strategy,Wargame,Histo..."
15,22670,Alien Breed 3: Descent,"Nov 17, 2010",200000 - 500000,3,0,9.99,0,Alien Breed™ 3: Descent is the final explosive...,"['English', 'French', 'German', 'Italian', 'Ja...",...,0,349,134,13,285,Team17 Digital Ltd,Team17 Digital Ltd,"Single-player,Multi-player,Co-op,Steam Achieve...",Action,"Action,Shooter,Sci-fi,Aliens,Third Person,Isom..."
42,231330,Deadfall Adventures,"Nov 15, 2013",100000 - 200000,4,0,19.99,0,Deadfall Adventures is an action-driven first-...,"['English', 'German', 'Polish', 'French', 'Rus...",...,0,1716,628,50,1140,The Farm 51,THQ Nordic,"Single-player,Multi-player,Co-op,Steam Achieve...","Action,Adventure","Adventure,Action,FPS,Shooter,Multiplayer,First..."
44,897820,Reigns: Game of Thrones,"Oct 18, 2018",50000 - 100000,2,0,3.99,0,Reigns: Game of Thrones is the heir to the awa...,"['English', 'French', 'German', 'Spanish - Spa...",...,0,698,203,10,815,Nerial,Devolver Digital,"Single-player,Steam Achievements,Full controll...","Adventure,Indie,RPG","RPG,Indie,Card Game,Adventure,Choices Matter,2..."
66,12140,Max Payne,"Jan 6, 2011",500000 - 1000000,49,17,3.49,0,Max Payne is a man with nothing to lose in the...,['English'],...,0,9516,1114,0,8684,Remedy Entertainment,Rockstar Games,Single-player,Action,"Action,Noir,Classic,Third-Person Shooter,Bulle..."


In [4]:
# Import necessary modules and classes
from llama_index.core.settings import Settings
from llama_index.llms.groq import Groq
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Initialize the Groq Api object with your API key
llm = Groq(model="mixtral-8x7b-32768", api_key=GROQ_API)

# Initialize the HuggingFaceEmbedding object with the specified model and device
embed_model = HuggingFaceEmbedding(model_name='Alibaba-NLP/gte-base-en-v1.5', device='cuda', trust_remote_code=True)

# Set the LLM and embed_model in the Settings for further usage
Settings.llm = llm
Settings.embed_model = embed_model

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid 

In [5]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Convert the DataFrame to a JSON string representation
documents_json = dataset_games.to_json(orient='records')

# Load the JSON string into a Python list of dictionaries
documents_list = json.loads(documents_json)

llama_documents = []

# Helper function to convert any type to a string
def safe_str(value):
    if isinstance(value, list):
        return ', '.join(map(str, value))
    return str(value)

for document in documents_list:

    # Value for metadata must be one of (str, int, float, None)
    document['Tags'] = json.dumps(document['Tags'])
    document['Supported languages'] = json.dumps(document['Supported languages'])
    document['Reviews'] = json.dumps(document['Reviews'])
    document['Achievements'] = json.dumps(document['Achievements'])

    # Create a Document object with the text and excluded metadata for llm and embedding models
    llama_document = Document(
        text=safe_str(document['About the game']) + '.\n' + 
            "Price " + safe_str(document['Price']) + '.\n' + 
            "Developers " + safe_str(document['Developers']) + '.\n' + 
            "Publishers " + safe_str(document['Publishers']) + '.\n' + 
            "Recommendations " + safe_str(document['Recommendations']) + '.\n' + 
            "Metacritic score " + safe_str(document['Metacritic score']) + '.\n' +
            "Categories " + safe_str(document['Categories']) + '.\n' + 
            "Genres " + safe_str(document['Genres']) + '.',
        metadata=document,
        excluded_llm_metadata_keys=['About the game', 'Price', 'Developers', 'Publishers', 'Recommendations', 'Metacritic score', 'Categories', 'Genres'],
        excluded_embed_metadata_keys=['About the game', 'Price', 'Developers', 'Publishers', 'Recommendations', 'Metacritic score', 'Categories', 'Genres', 'Header image'],
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n--------\nContent: {content}",
    )

    llama_documents.append(llama_document)

# Observing an example of what the llm and Embedding model receive as input
print(
    "\nThe LLM sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "\nThe Embedding model sees this: \n",
    llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED),
)


The LLM sees this: 
 Metadata: AppID=>1026420
Name=>WARSAW
Release date=>Oct 2, 2019
Estimated owners=>20000 - 50000
Peak CCU=>5
Required age=>0
DLC count=>0
Supported languages=>"['English', 'French', 'German', 'Polish', 'Russian']"
Reviews=>"\u201cNew WW2 Strategy Game Offers A Harrowing Look At Poland's Ill-Fated 1944 Uprising\u201d GameSpot \u201c(\u2026) in execution Warsaw manages to deliver its own experience entirely.\u201d Dualshockers \u201c(\u2026) Beautiful hand-painted artwork and turn-based combat (\u2026)\u201d Gameinformer"
Header image=>https://cdn.akamai.steamstatic.com/steam/apps/1026420/header.jpg?t=1657716289
Support email=>contact@pixmilk.com
Windows=>True
Mac=>False
Linux=>False
Metacritic url=>https://www.metacritic.com/game/pc/warsaw?ftag=MCD-06-10aaa1f
User score=>0
Positive=>589
Negative=>212
Achievements=>34
Tags=>"Tactical RPG,Turn-Based Strategy,Wargame,Historical,Strategy RPG,Perma Death,RPG,Difficult,Turn-Based Combat,2D,Rogue-lite,Party-Based RPG,Strat

In [6]:
# Import the SentenceSplitter class from the node_parser module
from llama_index.core.node_parser import SentenceSplitter

# Initialize a SentenceSplitter object
parser = SentenceSplitter()

# Use the SentenceSplitter to parse nodes from the llama_documents
nodes = parser.get_nodes_from_documents(llama_documents)

# Iterate through each node
for node in nodes:
    # Get the content of the node with metadata included
    content_with_metadata = node.get_content(metadata_mode="all")
    
    # Get the text embedding for the node content using the embed_model defined in the Settings
    node_embedding = Settings.embed_model.get_text_embedding(content_with_metadata)
    
    # Assign the calculated embedding to the node
    node.embedding = node_embedding

print("Total nodes data:", len(nodes))

Total nodes data: 4268


In [7]:
def get_mongo_client(mongo_uri):
    """Establish connection to the MongoDB."""
    try:
        client = MongoClient(mongo_uri)  # Establish connection to MongoDB using the provided URI.
        print("Connection to MongoDB successful")  # Print success message if connection is established.
        return client
    except Error as e:
        print(f"Connection failed: {e}")  # Print error message if connection fails.
        return None

mongo_uri = config.get('ATLAS_URI')  # Retrieve MongoDB URI from environment variables.
if not mongo_uri:
    print("MONGO_URI not set in environment variables")  # Print a warning if MongoDB URI is not set.

mongo_client = get_mongo_client(mongo_uri)  # Establish MongoDB client connection.

DB_NAME = "steam_games2"  # Name of the database to use.
COLLECTION_NAME = "embedded_games2"  # Name of the collection to use within the database.

db = mongo_client[DB_NAME]  # Select the specified database.
collection = db[COLLECTION_NAME]  # Select the specified collection within the database.

Connection to MongoDB successful


In [8]:
# To ensure we are working with a fresh collection
# Delete any existing records in the collection
collection.delete_many({})

DeleteResult({'n': 4268, 'electionId': ObjectId('7fffffff0000000000000313'), 'opTime': {'ts': Timestamp(1715832734, 938), 't': 787}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1715832734, 947), 'signature': {'hash': b'\xe5\xc4\x96\x1c\x82\xcc\x19\xbbr\xc3\x8cj\x16\xf1\x97\x91\xdb1G\x18', 'keyId': 7327385314178105346}}, 'operationTime': Timestamp(1715832734, 938)}, acknowledged=True)

In [9]:
# Importing the MongoDBAtlasVectorSearch class for vector search functionality.
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
import threading

# Initialize a MongoDBAtlasVectorSearch instance with the MongoDB client, database name, collection name, and index name.
# This object will be used for vector search operations on the specified MongoDB collection.
vector_store = MongoDBAtlasVectorSearch(mongo_client, db_name=DB_NAME, collection_name=COLLECTION_NAME, index_name="games_index2")

# Function to add nodes to MongoDB collection with threading
def add_nodes_with_threading(vector_store, nodes):
    def add_nodes_thread():
        for node in nodes:
            vector_store.add([node])

    # Create new thread to add nodes
    thread = threading.Thread(target=add_nodes_thread)
    thread.start()
    thread.join()

# Calling function to adding nodes with threading
add_nodes_with_threading(vector_store, nodes)

In [10]:
from llama_index.core import VectorStoreIndex  # Import the VectorStoreIndex class for indexing vector stores.

# Initialize a MongoDBAtlasVectorSearch instance with the MongoDB client, database name, collection name, and index name.
# This object will be used for vector search operations on the specified MongoDB collection.
vector_store = MongoDBAtlasVectorSearch(mongo_client, db_name=DB_NAME, collection_name=COLLECTION_NAME, index_name="games_index2")

# Create an index using the VectorStoreIndex class, initializing it with the provided vector store.
# This index will enable efficient search operations on the vectors stored in the MongoDB collection.
index = VectorStoreIndex.from_vector_store(vector_store)

In [17]:
# Testing if our RAG system is working

# Importing Markdown for displaying formatted text in IPython notebooks.
from IPython.display import Markdown
# Importing utility function for displaying query responses in notebooks.
from llama_index.core.response.notebook_utils import display_response

# Create a query engine from the index for querying vector data efficiently.
query_engine = index.as_query_engine()

# Define the query string to search for relevant games.
query = "is Red Dead Redemption 2 is a good game?"

# Perform the query using the query engine to retrieve relevant results.
response = query_engine.query(query)

# Display the response as bold text using Markdown.
display(Markdown(f"<b>{response}</b>\n"))

# Display the response with source information using the provided utility function.
display_response(response, show_source=True)

<b>Red Dead Redemption 2 has received critical acclaim, with a Metacritic score of 93 out of 100. It is a single-player and multiplayer game that offers a vast open world with various activities and a deep narrative. The game is known for its immersive gameplay, detailed graphics, and atmospheric soundtrack. It has also received praise for its story, characters, and attention to detail. However, it is important to note that the enjoyment of any game can be subjective and depends on personal preferences.</b>


**`Final Response:`** Red Dead Redemption 2 has received critical acclaim, with a Metacritic score of 93 out of 100. It is a single-player and multiplayer game that offers a vast open world with various activities and a deep narrative. The game is known for its immersive gameplay, detailed graphics, and atmospheric soundtrack. It has also received praise for its story, characters, and attention to detail. However, it is important to note that the enjoyment of any game can be subjective and depends on personal preferences.

---

**`Source Node 1/2`**

**Node ID:** 7dc2a9fb-e65b-457c-8df6-09236c4f29e6<br>**Similarity:** 0.8232114315032959<br>**Text:** America, 1899. Arthur Morgan and the Van der Linde gang are outlaws on the run. With federal agen...<br>

---

**`Source Node 2/2`**

**Node ID:** 9c49c45c-ab75-4b08-8fc7-46f5aae7ed7d<br>**Similarity:** 0.7639858722686768<br>**Text:** The genre-defining masterpiece Resident Evil 2 returns, completely rebuilt from the ground up for...<br>