In [None]:
%store -r WEAVIATE_IP
from boto3 import Session

# Ask boto3 for the AWS credentials available to the default session
session = Session()
credentials = session.get_credentials()
if credentials is None:
    # Fail with an actionable message instead of an AttributeError on the next call
    raise RuntimeError(
        "No AWS credentials found - configure AWS credentials before running this notebook."
    )
current_credentials = credentials.get_frozen_credentials()

AWS_ACCESS_KEY = current_credentials.access_key
AWS_SECRET_KEY = current_credentials.secret_key
AWS_SECRET_TOKEN = current_credentials.token  # can be None - presumably only set for temporary credentials


def _redact(secret, visible=4):
    """Return a masked form of ``secret`` that shows only its last ``visible`` characters."""
    if not secret:
        return "<not set>"
    return "****" + secret[-visible:]


# Cell outputs are saved with the notebook, so only masked values are printed
print(f"AWS_ACCESS_KEY:\t{_redact(AWS_ACCESS_KEY)}")
print(f"AWS_SECRET_KEY:\t{_redact(AWS_SECRET_KEY)}")
print(f"AWS_SECRET_TOKEN:\t{_redact(AWS_SECRET_TOKEN)}")
print(f"WEAVIATE_IP:\t{WEAVIATE_IP}")

## Multimodal Scenario

### Vectorize images with CLIP
CLIP can only be used with a local deployment using Docker Compose.

## Connect

In [None]:
import weaviate

# Forward the AWS credentials to Weaviate as request headers.
# Entries without a value are left out: the session token may be None,
# and None is not a usable header value.
aws_headers = {
    header: value
    for header, value in {
        "X-AWS-Access-Key": AWS_ACCESS_KEY,
        "X-AWS-Secret-Key": AWS_SECRET_KEY,
        "X-AWS-Session-Token": AWS_SECRET_TOKEN,
    }.items()
    if value is not None
}

# Ports are passed as integers, matching the documented signature of connect_to_custom
client = weaviate.connect_to_custom(
    http_host=WEAVIATE_IP, http_port=8080, http_secure=False,
    grpc_host=WEAVIATE_IP, grpc_port=50051, grpc_secure=False,
    headers=aws_headers,
)

# Displayed as the cell output
client.is_ready()

## Create a new collection

In [None]:
from weaviate.classes.config import Configure, Property, DataType, Multi2VecField

# Remove the "MoviesMM" collection before (re)creating it below
client.collections.delete("MoviesMM")

# Explicit property definitions for the movie objects (optional)
movie_properties = [
    Property(name="title", data_type=DataType.TEXT),
    Property(name="overview", data_type=DataType.TEXT),
    Property(name="rating", data_type=DataType.NUMBER),
    Property(name="release_date", data_type=DataType.DATE),
    Property(name="tmdb_id", data_type=DataType.INT),
    Property(name="poster_url", data_type=DataType.TEXT),
    Property(name="poster", data_type=DataType.BLOB),
]

# Named vector "content": title and overview vectorized with the Titan model
content_vector = Configure.NamedVectors.text2vec_aws(
    name="content",
    source_properties=["title", "overview"],
    model="amazon.titan-embed-text-v1",
    region="us-west-2",
)

# Named vector "poster": the movie poster (image) vectorized with CLIP.
# Alternative that weights image and title together, kept for reference:
#   image_fields=[Multi2VecField(name="poster", weight=0.9)],
#   text_fields=[Multi2VecField(name="title", weight=0.1)],
poster_vector = Configure.NamedVectors.multi2vec_clip(
    name="poster",
    image_fields=["poster"],
)

client.collections.create(
    name="MoviesMM",
    properties=movie_properties,
    vectorizer_config=[content_vector, poster_vector],
)

## Load data

In [None]:
import pandas as pd

# Alternative input file, currently disabled:
# df = pd.read_json("./data/movies_data_1990_2024.json")
# Read the small movie JSON file into a DataFrame
df = pd.read_json("./data/movies_data_small.json")
# Show the first rows as the cell output
df.head()

### Example of loading images from the Internet
> We won't use this approach for the import in this project, because downloading every poster this way could get flagged by the TMDB servers as an attack.

In [None]:
import base64, requests

def url_to_base64(url, timeout=10):
    """Download the resource at ``url`` and return it as a base64 string.

    Args:
        url: Address of the file (e.g. an image) to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body, base64-encoded and decoded to a UTF-8 ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    image_response = requests.get(url, timeout=timeout)
    # Without this check an error page would be encoded as if it were the image
    image_response.raise_for_status()
    return base64.b64encode(image_response.content).decode("utf-8")

# Try the helper on a single poster URL; the cell output is the complete base64 string
url_to_base64("https://image.tmdb.org/t/p/w600_and_h900_bestv2/1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg")

In [None]:
from datetime import datetime, timezone

# Try the URL-based helper on the first three movies only
sample_movies = df.head(3).itertuples(index=False)
for movie in sample_movies:
    poster_path = f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
    poster = url_to_base64(poster_path)

    # Title and URL on their own lines, then the encoded poster and a blank line
    print(movie.title, poster_path, sep="\n")
    print(poster, "\n")

### Load poster images from a local folder

In [None]:
import base64

# Helper function to convert a file to base64 representation
def toBase64(path):
    """Read the file at ``path`` in binary mode and return its base64 text.

    Args:
        path: Location of the file to encode.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    with open(path, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
    
# Try the helper on one local poster file; the cell output is the complete base64 string
toBase64("./posters/162_poster.jpg")

In [None]:
from datetime import datetime, timezone
from pathlib import Path

# Try the local-file helper on the first three movies only
for movie in df.head(3).itertuples(index=False):
    poster_path = f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
    # The image itself is read from the posters folder, keyed by the movie id
    local_poster = f"./posters/{movie.id}_poster.jpg"
    posterb64 = toBase64(local_poster)

    # Title and URL on their own lines, then the encoded poster and a blank line
    print(movie.title, poster_path, sep="\n")
    print(posterb64, "\n")

### Insert with Batch

In [None]:
from datetime import datetime, timezone
from weaviate.util import generate_uuid5

movies = client.collections.get("MoviesMM")

# Rate-limited batch; per the Weaviate client docs the argument is requests per minute
with movies.batch.rate_limit(100) as batch:

    # Load the first 200 movie objects
    for i, movie in enumerate(df.head(200).itertuples(index=False)):
        print(i, movie.title)

        # Convert a JSON date to `datetime` and add time zone information
        release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace(
            tzinfo=timezone.utc
        )

        # The URL is stored as text; the image bytes come from the local posters folder
        poster_path = f"https://image.tmdb.org/t/p/w600_and_h900_bestv2{movie.poster_path}"
        posterb64 = toBase64(f"./posters/{movie.id}_poster.jpg")

        movie_obj = {
            "title": movie.title,
            "overview": movie.overview,
            "rating": movie.vote_average,
            "release_date": release_date,
            "tmdb_id": movie.id, # https://www.themoviedb.org/movie/{tmdb_id}
            # Key matches the `poster_url` property declared when the collection is created
            "poster_url": poster_path,
            "poster": posterb64
        }

        # Deterministic UUID derived from the movie id
        batch.add_object(
            properties=movie_obj,
            uuid=generate_uuid5(movie.id)
        )


### Check for batch errors

In [None]:
# Report every object the batch import could not store
failed_objects = movies.batch.failed_objects
failure_count = len(failed_objects)

if failure_count == 0:
    print("No errors")
else:
    print(f"Failed to import {failure_count} objects")
    for failed in failed_objects:
        print(f"e.g. Failed to import object with error: {failed.message}")

In [None]:
# Aggregate over the whole collection; the result is shown as the cell output
# (presumably used here to confirm how many objects were imported)
movies.aggregate.over_all()

In [None]:
# Close the Weaviate client connection now that the notebook is done with it
client.close()