In [None]:
%store -r WEAVIATE_IP
from boto3 import Session

# Resolve AWS credentials via boto3's default session so they can be forwarded
# to Weaviate as request headers later in the notebook.
session = Session()
credentials = session.get_credentials()
if credentials is None:
    # Fail with an actionable message instead of an AttributeError on None.
    raise RuntimeError(
        "No AWS credentials found. Configure a profile, environment variables, "
        "or an IAM role before running this notebook."
    )
current_credentials = credentials.get_frozen_credentials()

AWS_ACCESS_KEY = current_credentials.access_key
AWS_SECRET_KEY = current_credentials.secret_key
# The token may be None when the resolved credentials are not temporary ones.
AWS_SECRET_TOKEN = current_credentials.token

# Never echo raw secrets: notebook outputs are saved with the file and are
# easily committed or shared. Show only enough to confirm what was loaded.
print(f"AWS_ACCESS_KEY:\t{'*' * 16}{AWS_ACCESS_KEY[-4:]}")
print(f"AWS_SECRET_KEY:\t{'<set>' if AWS_SECRET_KEY else '<missing>'}")
print(f"AWS_SECRET_TOKEN:\t{'<set>' if AWS_SECRET_TOKEN else '<not set>'}")
print(f"WEAVIATE_IP:\t{WEAVIATE_IP}")

## Connect

In [None]:
import weaviate

# AWS credentials are forwarded to Weaviate as request headers.
aws_headers = {
    "X-AWS-Access-Key": AWS_ACCESS_KEY,
    "X-AWS-Secret-Key": AWS_SECRET_KEY,
}
# Only send the session token when one exists; a None header value is not a
# valid HTTP header and the token is absent for non-temporary credentials.
if AWS_SECRET_TOKEN:
    aws_headers["X-AWS-Session-Token"] = AWS_SECRET_TOKEN

# Ports are passed as integers, matching the documented parameter types.
client = weaviate.connect_to_custom(
    http_host=WEAVIATE_IP, http_port=8080, http_secure=False,
    grpc_host=WEAVIATE_IP, grpc_port=50051, grpc_secure=False,
    headers=aws_headers,
)

# Last expression of the cell: displays whether the instance reports ready.
client.is_ready()

## Load Data from arXiv

1. Get chunks from paper - `get_chunks_from_paper`
2. Create a tenant for the paper - `create_tenant`
3. Batch import chunks - `batch_import_chunks`

### 1. Get chunks from paper - `get_chunks_from_paper`

In [None]:
!pip install -U -q distyll-info

In [None]:
from distyll.text import from_arxiv_paper
from distyll.utils import chunk_text

def get_chunks_from_paper(url):
    """Fetch a paper by URL and enrich it with text chunks and an ID.

    Args:
        url: Paper URL; the ID derivation expects the
            ``https://arxiv.org/pdf/<id>.pdf`` form.

    Returns:
        The mapping returned by ``from_arxiv_paper`` (must contain ``"text"``),
        with two keys added: ``"arxiv_id"`` (the URL with the arXiv PDF prefix
        and ``.pdf`` removed and dots replaced by hyphens) and ``"chunks"``
        (the result of ``chunk_text`` on the paper text).
    """
    paper = from_arxiv_paper(url)
    text_chunks = chunk_text(source_text=paper["text"])

    # Reduce the URL to a bare identifier, e.g. ".../pdf/2212.10496.pdf" -> "2212-10496".
    arxiv_id = url
    for old, new in (("https://arxiv.org/pdf/", ""), (".pdf", ""), (".", "-")):
        arxiv_id = arxiv_id.replace(old, new)

    paper["arxiv_id"] = arxiv_id
    paper["chunks"] = text_chunks
    return paper

### 2. Create a tenant for the paper - `create_tenant`

In [None]:
from weaviate.classes.tenants import Tenant
# Handle to the "PapersTen" collection, shared by the tenant-creation and
# batch-import helpers in this notebook.
# NOTE(review): assumes the collection already exists with multi-tenancy enabled — confirm.
papers = client.collections.get("PapersTen")

def create_tenant(chunked_paper):
    """Create a tenant in ``papers`` named after the paper's ``"arxiv_id"``.

    Args:
        chunked_paper: Mapping with an ``"arxiv_id"`` entry.

    Returns:
        The tenant name that was used.
    """
    name = chunked_paper["arxiv_id"]
    new_tenant = Tenant(name=name)
    papers.tenants.create([new_tenant])
    return name

### 3. Batch import chunks - `batch_import_chunks`

In [None]:
def batch_import_chunks(chunked_paper):
    """Import every chunk of a paper into the paper's own tenant.

    Each chunk becomes one object with the paper's ``title`` and ``url``, the
    chunk text, and its zero-based position (``chunk_no``).

    Args:
        chunked_paper: Mapping with ``"arxiv_id"``, ``"title"``, ``"url"`` and
            ``"chunks"`` (an iterable of chunk values).

    Prints a summary line, followed by each failed object if any failed.
    """
    ten = papers.with_tenant(chunked_paper["arxiv_id"])

    with ten.batch.dynamic() as batch:
        for chunk_no, chunk in enumerate(chunked_paper["chunks"]):
            batch.add_object({
                "title": chunked_paper["title"],
                "url": chunked_paper["url"],
                "chunk": chunk,
                "chunk_no": chunk_no,
            })

    # Read failures from the same tenant-scoped handle that ran the batch, so
    # the objects reported are the ones that were checked.
    failed = ten.batch.failed_objects
    if failed:
        print("Import complete with errors")
        for err in failed:
            print(err)
    else:
        print("Import complete with no errors")

## End-to-end paper load

In [None]:
def import_paper_with_tenants(url):
    """Load one paper end to end: chunk it, create its tenant, import its chunks.

    Args:
        url: Paper URL, passed to ``get_chunks_from_paper``.
    """
    cp = get_chunks_from_paper(url)
    # The tenant name is derived from cp["arxiv_id"] again inside
    # batch_import_chunks, so the return value is not needed here.
    create_tenant(cp)
    batch_import_chunks(cp)

In [None]:
# Load each paper into its own tenant, in the listed order.
for paper_url in (
    "https://arxiv.org/pdf/2212.10496.pdf",
    "https://arxiv.org/pdf/2401.00107.pdf",
):
    import_paper_with_tenants(paper_url)

## Close the client when done

In [None]:
# Close the Weaviate client opened in the "Connect" section.
client.close()