## Requirements

In [None]:
!pip install faiss-cpu sentence-transformers transformers tqdm



In [None]:
import json
import numpy as np
import faiss
import torch
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel

## Encoders

### Sentence Transformer

In [None]:
def clean_up_id(id):
    """Return ``id`` with every hyphen-separated part capitalized.

    Each part has its first character upper-cased and the rest lower-cased
    (``str.capitalize``), e.g. ``"mr-mime"`` becomes ``"Mr-Mime"``.
    """
    pieces = id.split("-")
    return "-".join(map(str.capitalize, pieces))

def _load_jsonl_chunks(jsonl_files):
    """Parse every non-blank line of each file in ``jsonl_files`` as JSON."""
    chunks = []
    for jsonl_file in jsonl_files:
        with open(jsonl_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    chunks.append(json.loads(line))
    return chunks


def _chunk_header(chunk):
    """Build the short context prefix that is prepended to a chunk's text."""
    if chunk.get("pokemon"):
        return f'Pokemon: {chunk["pokemon"]}. Section: {chunk["section"]}.'

    move_name = chunk.get("move_name")
    if move_name:
        # The branch is selected by the top-level "move_name" key, while the
        # displayed name is read from the chunk's metadata. Fall back to the
        # top-level value so a chunk that only carries the key at the top
        # level no longer raises KeyError.
        # NOTE(review): confirm where move chunks actually store "move_name".
        metadata = chunk.get("metadata") or {}
        return f'Move: {metadata.get("move_name", move_name)}.'

    return f'{clean_up_id(chunk["id"])}. Section: {chunk["section"]}.'


def _chunk_metadata(chunk):
    """Select the fields of ``chunk`` that are stored next to the index."""
    record = {"id": chunk["id"]}
    # "pokemon" is only stored when present; it is inserted here so the key
    # order in the JSON output stays id, pokemon, section, text, metadata.
    if chunk.get("pokemon"):
        record["pokemon"] = chunk["pokemon"]
    record["section"] = chunk["section"]
    record["text"] = chunk["text"]
    record["metadata"] = chunk["metadata"]
    return record


def index_data(jsonl_files, index_file, metadata_file):
    """Embed JSONL chunks and write a FAISS index plus a metadata JSON file.

    Every non-blank line of every file in ``jsonl_files`` is parsed as one
    JSON chunk. Each chunk is turned into a text of the form
    ``"<header> <chunk text>"``, encoded with the ``all-MiniLM-L6-v2``
    sentence-transformer model and added to a ``faiss.IndexFlatL2``. The
    metadata list is built in the same order as the encoded texts, so entry
    ``i`` of the metadata file describes vector ``i`` of the index.

    Args:
        jsonl_files: Iterable of paths to UTF-8 JSONL files.
        index_file: Path the FAISS index is written to.
        metadata_file: Path the metadata list is written to as JSON.

    Raises:
        ValueError: If the input files contain no chunks at all.
        KeyError: If a chunk lacks a field that is read unconditionally
            ("id", "section", "text" or "metadata").
    """
    # Read and validate the input before loading the model, so an empty
    # corpus fails immediately with a clear message instead of an IndexError
    # on ``embeddings.shape[1]`` after the model has been loaded.
    data_chunks = _load_jsonl_chunks(jsonl_files)
    if not data_chunks:
        raise ValueError("No chunks found in the given JSONL files; nothing to index.")

    # Pull out the text to encode and keep the metadata, in matching order.
    corpus_texts = []
    corpus_meta = []
    for chunk in tqdm(
        data_chunks,
        desc="Encoding chunks",
        leave=False,
        unit="chunk"
    ):
        corpus_texts.append(" ".join([_chunk_header(chunk), chunk["text"]]))
        corpus_meta.append(_chunk_metadata(chunk))

    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(corpus_texts, convert_to_tensor=False, show_progress_bar=True)
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    # Create the FAISS index.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, index_file)

    # Save the metadata.
    with open(metadata_file, "w", encoding="utf-8") as meta_f:
        json.dump(corpus_meta, meta_f, indent=4, ensure_ascii=False)

In [None]:
# Input files, listed in the order their chunks are added to the index.
jsonl_files = [
    f"pokemon_{stem}.jsonl"
    for stem in (
        "abilities",
        "abilities_table",
        "breeding",
        "core",
        "description",
        "evolutions",
        "locations",
        "matchups",
        # pokemon_move_1.jsonl ... pokemon_move_9.jsonl
        *(f"move_{part}" for part in range(1, 10)),
        "pokemon_moves",
        "statistics",
        "text",
        "training",
    )
]

# Output locations for the FAISS index and its accompanying metadata.
index_file = "/content/pokemon_faiss.index"
metadata_file = "/content/pokemon_metadata.json"

index_data(
    jsonl_files=jsonl_files,
    index_file=index_file,
    metadata_file=metadata_file,
)


Encoding chunks:   0%|          | 0/66244 [00:00<?, ?chunk/s]

Batches:   0%|          | 0/2071 [00:00<?, ?it/s]