In [1]:
import pandas as pd
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

from dotenv import load_dotenv

load_dotenv()
# Note: Gemini API Key ENV Var must be 'GOOGLE_API_KEY" for Langchain to use 

from opensearchpy import OpenSearch

# Local Openserach pasword: 08FDH5fj7*SG
# cd desktop/opensearch-2.17.1

# Import this to simulate the LLM we use
import google.generativeai as genai
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage, SystemMessage

from IPython.display import Markdown, display

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Import data for retrieval docs
abilities_entries = pd.read_json("Cleaned-Jsons/fandom_abilities.json")
maps_entries = pd.read_json("Cleaned-Jsons/fandom_maps.json")
players_entries = pd.read_json("Cleaned-Jsons/fandom_players.json") # change player id to player gamer tag
teams_entries = pd.read_json("Cleaned-Jsons/fandom_teams.json")
tournaments_entries = pd.read_json("Cleaned-Jsons/fandom_tournaments.json")
weapons_entries = pd.read_json("Cleaned-Jsons/fandom_weapons.json")

all_entries = [abilities_entries,maps_entries,players_entries,teams_entries,tournaments_entries,weapons_entries]

In [3]:
total_rows = sum(len(df) for df in all_entries)
total_rows

2025

In [4]:
# Stored consolidated string entries
docs = []

# Convert dataframe rows into strings
def rowToString(row):

    row_str = []

    for col_name, col_value in row.items():

            row_str.append(f"{col_name}: {col_value}")
    
    return ' | '.join(row_str)

# Convert all and add to corpus
for topic in all_entries:
      
      # Convert to strings
      converted_rows = (topic.apply(rowToString, axis=1)).tolist()
      docs.extend(converted_rows)

print((docs[2000]))


name: VCT 2024 - Americas League Kickoff | overview: 
==
*Group Stage:
**All matches are Best of 3
**Three Double-Elimination format (GSL) groups
**Top team from each group will advance to the Playoffs
**Second place team from each group advances to the Play-In
<br>
*Play-In:
**All matches are Best of 3
**Three team Round-Robin group
**Winning team will advance to the Playoffs
<br>
*Playoffs:
**Four team Single-Elimination bracket
**Semi-Finals are Best of 3
**Grand Final is Best of 5
**Top 2 teams qualify to [[VCT/2024_Season/Masters/Madrid|Masters Madrid]]

==
{{TournamentResults|prize=yes|prizeunit=USD|totalprize=|prize_ref=|pointstitle=Points |points=yes
|{{TournamentResults/Line|place=1 |prize=|points=3|team=}}
|{{TournamentResults/Line|place=2 |prize=|points=|team=}}
|{{TournamentResults/Line|place=3-4 |prize=|points=|team=}}
|{{TournamentResults/Line|place=3-4 |prize=|points=|team=}}
|{{TournamentResults/Line|place=5 |prize=|points=|team= |hide=true}}
|{{TournamentResults/Line|p

In [5]:
# Process text data into embeddings

text_embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')   # Initiate embeddings model

data_embeddings = text_embedder.encode(docs)



In [6]:
data_embeddings

array([[-0.00395763,  0.03527774, -0.00626034, ..., -0.01417634,
        -0.02766538,  0.0339634 ],
       [-0.00316342, -0.02700268,  0.0461862 , ...,  0.00160024,
        -0.02059869,  0.00449062],
       [-0.03680293,  0.05570466, -0.04343551, ..., -0.05702543,
        -0.0283303 ,  0.04192416],
       ...,
       [-0.00350216,  0.01157607, -0.04599394, ..., -0.09939551,
        -0.04148806, -0.00932844],
       [-0.03718558,  0.11075651, -0.04167727, ..., -0.08375157,
        -0.01441447,  0.04376922],
       [-0.00052026,  0.04080061, -0.05032773, ..., -0.05457349,
         0.04802196, -0.04538823]], dtype=float32)

In [28]:
# Create opensearch server connection
vstore = OpenSearch(
    hosts=[{'host': 'localhost', 'post':9200}],
    http_auth=('admin', '08FDH5fj7*SG'),
    use_ssl=True,
    verify_certs=False
)



{'acknowledged': True, 'persistent': {'cluster': {'routing': {'allocation': {'disk': {'watermark': {'low': '98%', 'flood_stage': '100%', 'high': '99%'}}}}}}, 'transient': {}}


In [29]:
# Set up indexing structure

# Define index mapping
index_mapping = {
    "settings": {
        "index": {
            "knn": True  
        }
    },
    "mappings": {
        "properties": {
            "text": {
                "type": "text"
            },
            "embedding": {
                "type": "knn_vector",
                "dimension": 384,  
                "index": True
            }
        }
    }
}

# Create index map structure
if vstore.indices.exists(index="docs"):
    vstore.indices.delete(index="docs")

print(vstore.indices.create(index="docs", body=index_mapping))



AuthorizationException: AuthorizationException(403, 'index_create_block_exception', 'blocked by: [FORBIDDEN/10/cluster create-index blocked (api)];')

In [30]:
cluster_health = vstore.cluster.health()
print(cluster_health)

{'cluster_name': 'opensearch', 'status': 'yellow', 'timed_out': False, 'number_of_nodes': 1, 'number_of_data_nodes': 1, 'discovered_master': True, 'discovered_cluster_manager': True, 'active_primary_shards': 5, 'active_shards': 5, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 1, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 83.33333333333334}




In [27]:
# Index docs with opensearch
for id, (doc, embeddings) in enumerate(zip(docs, data_embeddings)):

    doc_corpus = {
        'text': doc,
        'embedding': embeddings.tolist()
    } 

    vstore.index(index="docs", id=id, body=doc_corpus)



KeyboardInterrupt: 

In [12]:
# Request
query = "Can you tell me more about TenZ and what agents he plays in Valorant?"

query_embeddings = text_embedder.encode(query)

query_payload = {
    'query': {
        'knn': {
            'embedding': {
                'vector': query_embeddings.tolist(),
                'k': 5
            }
        }
    }
}

response = vstore.search(index="docs", body=query_payload)
print(response)



RequestError: RequestError(400, 'search_phase_execution_exception', "failed to create query: Field 'embedding' is not knn_vector type.")

In [31]:
mapping = vstore.indices.get_mapping(index="docs")
print(mapping)



NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [docs]', docs, index_or_alias)