In [None]:
%store -r WEAVIATE_IP
from boto3 import Session

def _redact(secret, visible=4):
    """Return a short preview of a credential that is safe to show in cell output."""
    if not secret:
        return "<not set>"
    return f"{secret[:visible]}... ({len(secret)} chars)"


# Resolve the AWS credentials that boto3 picks up for the current environment.
session = Session()
credentials = session.get_credentials()
if credentials is None:
    # Fail early with a readable message instead of an AttributeError below.
    raise RuntimeError("No AWS credentials found - configure AWS access before running this notebook.")
current_credentials = credentials.get_frozen_credentials()

AWS_ACCESS_KEY = current_credentials.access_key
AWS_SECRET_KEY = current_credentials.secret_key
AWS_SECRET_TOKEN = current_credentials.token

# Only print redacted previews: notebook outputs are saved with the file and
# would otherwise leak the secret key and session token when it is shared.
print(f"AWS_ACCESS_KEY:\t{_redact(AWS_ACCESS_KEY)}")
print(f"AWS_SECRET_KEY:\t{_redact(AWS_SECRET_KEY)}")
print(f"AWS_SECRET_TOKEN:\t{_redact(AWS_SECRET_TOKEN)}")
print(f"WEAVIATE_IP:\t{WEAVIATE_IP}")

## Connect to the Weaviate instance
> This way we will see the number of used vector dimensions.

* Make sure `WEAVIATE_IP` (restored with `%store -r` in the previous cell) and your AWS credentials are set; they are passed to `connect_to_custom` below.

In [None]:
import weaviate

# Connect to the Weaviate instance at WEAVIATE_IP over plain HTTP/gRPC.
# The AWS credentials are forwarded as request headers.
client = weaviate.connect_to_custom(
    # Ports are passed as integers, matching the documented signature.
    http_host=WEAVIATE_IP, http_port=8080, http_secure=False,
    grpc_host=WEAVIATE_IP, grpc_port=50051, grpc_secure=False,

    headers={
        "X-AWS-Access-Key": AWS_ACCESS_KEY,
        "X-AWS-Secret-Key": AWS_SECRET_KEY,
        "X-AWS-Session-Token": AWS_SECRET_TOKEN,
    }
)

# Last expression of the cell: shows whether the instance reports ready.
client.is_ready()

## Adding PQ configuration to the Wikipedia example
In this step, we will use:
* named vectors ("text_vector")
* PQ configuration on `vector_index_config`

```
vector_index_config=Configure.VectorIndex.hnsw(
    quantizer=Configure.VectorIndex.Quantizer.pq(
        segments=128, # 768/6
        training_limit=25000
    )
),
```

In [None]:
from weaviate.classes.config import Configure

# client.collections.delete("WikipediaPQ")

# HNSW index settings for the named vector.
# AutoPQ - this only works with Async Indexing
text_vector_index = Configure.VectorIndex.hnsw(
    # TODO: add pq quantizer and set segments to 128
    # quantizer=Configure.VectorIndex.
)

# TODO: Note different syntax when we use named vectors
# Named vector "text_vector", built from the "text" property with the
# text2vec_aws vectorizer.
text_vector = Configure.NamedVectors.text2vec_aws(
    name="text_vector",
    source_properties=["text"],
    model="embed-multilingual-v2.0",
    region="us-west-2",
    # region="us-east-1",
    vector_index_config=text_vector_index,
)

# Create the collection with the single named vector defined above.
client.collections.create(
    name="WikipediaPQ",
    vectorizer_config=[text_vector],
)

## The rest is the same as in the Wikipedia example

In [None]:
!pip install -U -q datasets tqdm

In [None]:
from datasets import load_dataset
from tqdm import tqdm

def import_wiki_data(lang, lang_id, max_rows, skip_rows=0):
    """Stream one language of the Cohere Wikipedia embeddings into "WikipediaPQ".

    Reads the ``train`` split of ``Cohere/wikipedia-22-12-{lang}-embeddings`` in
    streaming mode, skips the first ``skip_rows`` rows and inserts the rows up
    to (but not including) row index ``max_rows``. Each object is stored with
    the dataset's ``emb`` value under the "text_vector" named vector.

    Relies on the notebook-level ``client`` (the Weaviate connection).

    Args:
        lang: Language code used in the dataset name and stored in the
            "lang" property.
        lang_id: Value stored in the "lang_id" property.
        max_rows: Row index at which the import stops; at most
            ``max_rows - skip_rows`` rows are inserted.
        skip_rows: Number of leading dataset rows to skip.
    """
    from itertools import islice

    print(f"Importing {max_rows} data items for {lang}")

    dataset = load_dataset(f"Cohere/wikipedia-22-12-{lang}-embeddings", split="train", streaming=True)
    dataset = dataset.skip(skip_rows)

    # Clamp at zero so max_rows <= skip_rows imports nothing instead of one row.
    rows_to_import = max(max_rows - skip_rows, 0)
    imported = 0

    wikipedia = client.collections.get("WikipediaPQ")

    with wikipedia.batch.fixed_size(batch_size=5000, concurrent_requests=2) as batch:
        # islice bounds the stream up front, so the progress bar finishes
        # cleanly and no extra row is pulled from the dataset.
        rows = islice(dataset, rows_to_import)
        for item in tqdm(rows, initial=skip_rows, total=max_rows):
            vector = item["emb"]
            data_to_insert = {
                "text": item["text"],
                "wiki_id": item["wiki_id"],
                "title": item["title"],
                "url": item["url"],
                "views": item["views"],
                "lang": lang,
                "lang_id": lang_id,
            }

            batch.add_object(
                properties=data_to_insert,

                # TODO: Note different syntax when we use named vectors
                vector={
                    "text_vector": vector
                }
            )

            imported += 1

    # check for errors at the end
    failed_objects = wikipedia.batch.failed_objects
    if len(failed_objects) > 0:
        print("Final error check")
        print(f"Some errors {len(failed_objects)}")
        print(failed_objects[-1])

    # Report rows actually sent to the batch (skipped rows are not counted).
    print(f"Imported {imported} items for {lang}")
    print("-----------------------------------")

In [None]:
import_per_country = 10_000

# (language code, language id) pairs to import, in order.
# Uncomment an entry to import that language as well.
languages = [
    ("en", 0),
    ("de", 1),
    ("fr", 2),
    # ("es", 3),
    # ("it", 4),
]

for lang_code, lang_number in languages:
    import_wiki_data(lang_code, lang_number, import_per_country, 0)

In [None]:
# Fetch a handle to the "WikipediaPQ" collection and aggregate over all of its
# objects; the result is the cell's last expression, so it is shown as output.
wikipedia = client.collections.get("WikipediaPQ")
wikipedia.aggregate.over_all()

In [None]:
# Close the Weaviate client connection now that the notebook is done with it.
client.close()