# Ingest movies and genres into Weaviate

In [1]:
import pandas as pd
from weaviate import Client


# Setup

In [2]:
# open a client against the Weaviate instance reachable at weaviate:8080
WEAVIATE_URL = "http://weaviate:8080"
client = Client(WEAVIATE_URL)


In [3]:
# configure the client to send objects/references in batches of 100 using 1 worker
# (batch_size=1 would issue one request per object or reference);
# lower batch_size if the vectorizer times out on batches this large
client.batch.configure(batch_size=100, num_workers=1)


<weaviate.batch.crud_batch.Batch at 0x7fc98d59d330>

In [4]:
# remove every class currently in the Weaviate schema so the classes below
# can be created again when the notebook is re-run
schema_api = client.schema
schema_api.delete_all()


# Schemas

In [5]:
# the Genre class has a single property: the genre's name
genre_name_property = {
    "name": "name",
    "dataType": ["string"],
    "description": "The name of the genre",
}

genre_class_schema = {
    "class": "Genre",
    "description": "A genre of a movie",
    "vectorizer": "text2vec-transformers",
    "moduleConfig": {
        "text2vec-transformers": {"poolingStrategy": "masked_mean"},
    },
    "properties": [genre_name_property],
}

client.schema.create_class(genre_class_schema)


In [6]:
# title, plot and summary are all plain "text" properties that differ only by name
movie_text_properties = [
    {
        "name": field,
        "dataType": ["text"],
        "description": f"The {field} of the movie",
    }
    for field in ("title", "plot", "summary")
]

# cross-reference property pointing from a Movie to its Genre objects
movie_genres_property = {
    "name": "genres",
    "dataType": ["Genre"],
    "description": "The genres of the movie",
}

movie_class_schema = {
    "class": "Movie",
    "description": "A movie",
    "vectorizer": "text2vec-transformers",
    "moduleConfig": {
        "text2vec-transformers": {"poolingStrategy": "masked_mean"},
    },
    "properties": [*movie_text_properties, movie_genres_property],
}

client.schema.create_class(movie_class_schema)

In [7]:
# placeholder: no User class schema is defined yet, so nothing is created in Weaviate here
user_class_schema = None


# Documents

In [8]:
# load the postprocessed movies table from parquet and preview its first rows
MOVIES_PARQUET = "../data/movies_postprocessed.parquet"
movies = pd.read_parquet(MOVIES_PARQUET)
movies.head()


Unnamed: 0_level_0,title,plot,summary,genres,poster_url,imdb_url
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3309,"Dog's Life, A (1920)",The Little Tramp and his dog companion struggl...,Poor Charlie lives in a vacant lot. He tries t...,[Comedy],https://m.media-amazon.com/images/M/MV5BYWFkMj...,https://www.imdb.com/title/tt0009018/plotsummary
3132,Daddy Long Legs (1919),An orphan discovers that she has an anonymous ...,Wealthy Jervis Pendleton acts as benefactor fo...,[Comedy],https://m.media-amazon.com/images/M/MV5BMWYwYT...,https://www.imdb.com/title/tt0010040/plotsummary
2821,Male and Female (1919),Lady Mary Lasenby is a spoiled maiden who alwa...,"Lord Brockelhurst, his unwilling betrothed Lad...","[Adventure, Drama]",https://m.media-amazon.com/images/M/MV5BODE2ZT...,https://www.imdb.com/title/tt0010418/plotsummary
2823,"Spiders, The (Die Spinnen, 1. Teil: Der Golden...",Kay Hoog finds a message that indicates that s...,"In San Francisco, the sportsman Kay Hoog tells...","[Action, Drama]",https://m.media-amazon.com/images/M/MV5BMTY2MD...,https://www.imdb.com/title/tt0010726/plotsummary
3231,"Saphead, The (1920)",The simple-minded son of a rich financier must...,Nick Van Alstyne owns the Henrietta silver min...,[Comedy],https://m.media-amazon.com/images/M/MV5BZDNiOD...,https://www.imdb.com/title/tt0011652/plotsummary


# Ingest

## Genres

Get unique genres from the genres column:

In [9]:
# collect the distinct genre names across all movies:
#   - explode() yields one row per (movie, genre) pair
#   - dropna() discards the NaN that explode() emits for a movie whose genre
#     list is empty, so no bogus NaN "genre" is ingested below
#   - unique() keeps each genre once, in order of first appearance
#   - tolist() converts the result to a plain Python list
genres = movies["genres"].explode().dropna().unique().tolist()
genres


['Comedy',
 'Adventure',
 'Drama',
 'Action',
 'Horror',
 'War',
 'Crime',
 'Thriller',
 'Sci-Fi',
 'Romance',
 'Animation',
 "Children's",
 'Musical',
 'Documentary',
 'Mystery',
 'Western',
 'Film-Noir',
 'Fantasy']

In [10]:
# queue one Genre object per genre name and remember the value returned by
# add_data_object for each, keyed by genre name, for use when linking movies
with client.batch as batch:
    created_genres = {}
    for genre_name in genres:
        created_genres[genre_name] = batch.add_data_object(
            {"name": genre_name},
            class_name="Genre",
        )


## Movies

In [11]:
# ingest the full movies table; for a quick trial run, use the 5-row sample instead:
# df = movies.sample(5, random_state=42)
df = movies


In [12]:
# for every movie row: queue a Movie object with its text fields, then queue
# one Movie -> Genre reference per genre listed on that row
with client.batch as batch:
    for movie in df.itertuples(index=False):
        movie_properties = {
            "title": movie.title,
            "plot": movie.plot,
            "summary": movie.summary,
        }
        movie_uuid = batch.add_data_object(movie_properties, class_name="Movie")

        # resolve all genre names first so an unknown genre fails before any
        # reference for this movie is queued
        target_uuids = [created_genres[name] for name in movie.genres]

        for target_uuid in target_uuids:
            batch.add_reference(
                from_object_uuid=movie_uuid,
                from_object_class_name="Movie",
                from_property_name="genres",
                to_object_uuid=target_uuid,
                to_object_class_name="Genre",
            )


Confirm that all objects have been uploaded: 

In [27]:
def count_objects(class_name):
    """Return the object count Weaviate reports for ``class_name``.

    Runs an Aggregate query with a meta count against the module-level
    ``client`` and extracts the count from the response.
    """
    response = client.query.aggregate(class_name).with_meta_count().do()
    return response["data"]["Aggregate"][class_name][0]["meta"]["count"]


# expected counts: `df` (not `movies`) is the frame that was actually ingested,
# so the check stays valid when `df` is switched to a sample
num_movies = df.shape[0]
num_genres = len(genres)

# actual counts as reported by Weaviate
num_movie_objects = count_objects("Movie")
num_genre_objects = count_objects("Genre")

assert num_movie_objects == num_movies, (
    f"expected {num_movies} Movie objects, found {num_movie_objects}"
)
assert num_genre_objects == num_genres, (
    f"expected {num_genres} Genre objects, found {num_genre_objects}"
)