# Imports

In [1]:
import pandas as pd
from weaviate import Client
import numpy as np


# Data

In [2]:
# Load the post-processed movie metadata from parquet and preview the first
# rows (the frame is indexed by movieId, as the preview shows).
movies = pd.read_parquet("../data/movies_postprocessed.parquet")
movies.head()


Unnamed: 0_level_0,title,plot,summary,genres,poster_url,imdb_url
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3309,"Dog's Life, A (1920)",The Little Tramp and his dog companion struggl...,Poor Charlie lives in a vacant lot. He tries t...,[Comedy],https://m.media-amazon.com/images/M/MV5BYWFkMj...,https://www.imdb.com/title/tt0009018/plotsummary
3132,Daddy Long Legs (1919),An orphan discovers that she has an anonymous ...,Wealthy Jervis Pendleton acts as benefactor fo...,[Comedy],https://m.media-amazon.com/images/M/MV5BMWYwYT...,https://www.imdb.com/title/tt0010040/plotsummary
2821,Male and Female (1919),Lady Mary Lasenby is a spoiled maiden who alwa...,"Lord Brockelhurst, his unwilling betrothed Lad...","[Adventure, Drama]",https://m.media-amazon.com/images/M/MV5BODE2ZT...,https://www.imdb.com/title/tt0010418/plotsummary
2823,"Spiders, The (Die Spinnen, 1. Teil: Der Golden...",Kay Hoog finds a message that indicates that s...,"In San Francisco, the sportsman Kay Hoog tells...","[Action, Drama]",https://m.media-amazon.com/images/M/MV5BMTY2MD...,https://www.imdb.com/title/tt0010726/plotsummary
3231,"Saphead, The (1920)",The simple-minded son of a rich financier must...,Nick Van Alstyne owns the Henrietta silver min...,[Comedy],https://m.media-amazon.com/images/M/MV5BZDNiOD...,https://www.imdb.com/title/tt0011652/plotsummary


# Weaviate

In [3]:
# Connect to the Weaviate instance at weaviate:8080.
client = Client("http://weaviate:8080")

# Display the node status (shards and object counts per class) as a quick
# check that the instance is reachable.
client.cluster.get_nodes_status()


[{'gitHash': 'f818156',
  'name': 'node1',
  'shards': [{'class': 'ViewCos', 'name': 'oA1lqcb7qq7e', 'objectCount': 1},
   {'class': 'ViewL2', 'name': 'jygvB6K0jpmI', 'objectCount': 1},
   {'class': 'ViewMan', 'name': 'O89NxYwy1O3I', 'objectCount': 1},
   {'class': 'MovieCos', 'name': 'ff29Z3xcHcQr', 'objectCount': 3823},
   {'class': 'MovieHam', 'name': '5XqC41u3HyuX', 'objectCount': 3823},
   {'class': 'MovieL2', 'name': 'iSU6WKVgXtO1', 'objectCount': 3823},
   {'class': 'MovieMan', 'name': 'SX5w9G4JHPSF', 'objectCount': 3823},
   {'class': 'ViewDot', 'name': 'jB5LvKRyKHXV', 'objectCount': 1},
   {'class': 'ViewHam', 'name': 'DjcKX3sFcMt6', 'objectCount': 1},
   {'class': 'Genre', 'name': 'bhuJm2ZYdJ7m', 'objectCount': 18},
   {'class': 'MovieDot', 'name': 'tFqMwfQ3f4vm', 'objectCount': 3823}],
  'stats': {'objectCount': 19138, 'shardCount': 11},
  'status': 'HEALTHY',
  'version': '1.17.2'}]

In [4]:
# Show the server metadata (hostname and the enabled modules).
client.get_meta()


{'hostname': 'http://[::]:8080',
 'modules': {'ref2vec-centroid': {},
  'text2vec-transformers': {'model': {'_name_or_path': 'sentence-transformers/msmarco-distilroberta-base-v2',
    'add_cross_attention': False,
    'architectures': ['RobertaModel'],
    'attention_probs_dropout_prob': 0.1,
    'bad_words_ids': None,
    'bos_token_id': 0,
    'chunk_size_feed_forward': 0,
    'decoder_start_token_id': None,
    'diversity_penalty': 0,
    'do_sample': False,
    'early_stopping': False,
    'encoder_no_repeat_ngram_size': 0,
    'eos_token_id': 2,
    'finetuning_task': None,
    'forced_bos_token_id': None,
    'forced_eos_token_id': None,
    'gradient_checkpointing': False,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'hidden_size': 768,
    'id2label': {'0': 'LABEL_0', '1': 'LABEL_1'},
    'initializer_range': 0.02,
    'intermediate_size': 3072,
    'is_decoder': False,
    'is_encoder_decoder': False,
    'label2id': {'LABEL_0': 0, 'LABEL_1': 1},
    'layer_no

Get the movie classes:

In [5]:
# Collect the name of every class in the schema whose name starts with "Movie".
movie_classes = []
for class_definition in client.schema.get()["classes"]:
    class_name = class_definition["class"]
    if class_name.startswith("Movie"):
        movie_classes.append(class_name)

movie_classes


['MovieCos', 'MovieDot', 'MovieL2', 'MovieMan', 'MovieHam']

# Schema

Define a schema to represent a user's viewing history:

In [6]:
# Build a parallel list of View class names: one entry per Movie class, with
# "Movie" replaced by "View" (e.g. MovieCos -> ViewCos). movie_classes itself
# is left unchanged.
view_classes = []
for movie_class_name in movie_classes:
    view_classes.append(movie_class_name.replace("Movie", "View"))
view_classes


['ViewCos', 'ViewDot', 'ViewL2', 'ViewMan', 'ViewHam']

In [7]:
# Delete each View class if it already exists so the classes can be recreated
# from scratch below. Any exception raised by the delete call is printed and
# the loop continues with the next class.
for view_class in view_classes:
    try:
        client.schema.delete_class(view_class)
    except Exception as e:
        print(e)


In [8]:
# Create one View class per Movie class. A View object stores a user id plus
# cross-references to the movies the user watched; the ref2vec-centroid
# vectorizer is configured with method "mean" over the "movies" references.
for view_class, movie_class in zip(view_classes, movie_classes):
    user_id_property = {
        "dataType": ["string"],
        "name": "user_id",
        "description": "The user id",
    }

    # Cross-reference to the Movie class paired with this View class.
    watched_movies_property = {
        "dataType": [movie_class],
        "name": "movies",
        "description": "The movies the user has watched",
    }

    centroid_config = {
        "referenceProperties": ["movies"],
        "method": "mean",
    }

    view_class_schema = {
        "class": view_class,
        "description": "The movies a user has watched",
        "moduleConfig": {"ref2vec-centroid": centroid_config},
        "properties": [user_id_property, watched_movies_property],
        "vectorizer": "ref2vec-centroid",
    }

    # register the View class with Weaviate
    client.schema.create_class(view_class_schema)


In [9]:
# Inspect the stored definition of the ViewCos class.
client.schema.get("ViewCos")


{'class': 'ViewCos',
 'description': 'The movies a user has watched',
 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
  'cleanupIntervalSeconds': 60,
  'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
 'moduleConfig': {'ref2vec-centroid': {'method': 'mean',
   'referenceProperties': ['movies']}},
 'properties': [{'dataType': ['string'],
   'description': 'The user id',
   'moduleConfig': {'ref2vec-centroid': {}},
   'name': 'user_id',
   'tokenization': 'word'},
  {'dataType': ['MovieCos'],
   'description': 'The movies the user has watched',
   'moduleConfig': {'ref2vec-centroid': {}},
   'name': 'movies'}],
 'replicationConfig': {'factor': 1},
 'shardingConfig': {'virtualPerPhysical': 128,
  'desiredCount': 1,
  'actualCount': 1,
  'desiredVirtualCount': 128,
  'actualVirtualCount': 128,
  'key': '_id',
  'strategy': 'hash',
  'function': 'murmur3'},
 'vectorIndexConfig': {'skip': False,
  'cleanupIntervalSeconds': 300,
  'maxConnections': 64,
  'efConstr

# Demo

Define a dummy user with a viewing history:

In [10]:
# Look up candidate movies for the dummy user's history: keep every row whose
# title contains one of these phrases (case-insensitive).
title_pattern = "|".join(
    ["sleeping beauty", "cinderella", "peter pan", "tramp", "jungle book"]
)
title_matches = movies["title"].str.contains(title_pattern, case=False)
movies[title_matches]

Unnamed: 0_level_0,title,plot,summary,genres,poster_url,imdb_url
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1022,Cinderella (1950),When Cinderella's cruel stepmother prevents he...,"Beautiful Cinderella, a young woman with a hea...","[Animation, Children's, Musical]",https://m.media-amazon.com/images/M/MV5BMWE3Nz...,https://www.imdb.com/title/tt0042332/plotsummary
2087,Peter Pan (1953),Wendy and her brothers are whisked away to the...,An adaptation of J. M. Barrie's story about a ...,"[Animation, Children's, Fantasy, Musical]",https://m.media-amazon.com/images/M/MV5BMzIwMz...,https://www.imdb.com/title/tt0046183/plotsummary
2080,Lady and the Tramp (1955),The romantic tale of a sheltered uptown Cocker...,"Lady, a golden cocker spaniel, meets up with a...","[Animation, Children's, Comedy, Musical, Romance]",https://m.media-amazon.com/images/M/MV5BODk4YW...,https://www.imdb.com/title/tt0048280/plotsummary
2096,Sleeping Beauty (1959),"After being snubbed by the royal family, a mal...",After the beautiful Princess Aurora is born in...,"[Animation, Children's, Musical]",https://m.media-amazon.com/images/M/MV5BOTJmZj...,https://www.imdb.com/title/tt0053285/plotsummary
2078,"Jungle Book, The (1967)",Bagheera the Panther and Baloo the Bear have a...,"Abandoned after an accident, baby Mowgli is ta...","[Animation, Children's, Comedy, Musical]",https://m.media-amazon.com/images/M/MV5BMTg3MD...,https://www.imdb.com/title/tt0061852/plotsummary
362,"Jungle Book, The (1994)","Rudyard Kipling's classic tale of Mowgli, the ...",An adaptation of Rudyard Kipling's classic tal...,"[Adventure, Children's, Romance]",https://m.media-amazon.com/images/M/MV5BMjRjND...,https://www.imdb.com/title/tt0110213/plotsummary
1538,"Second Jungle Book: Mowgli & Baloo, The (1997)","The story of Mowgli, a young boy abandoned in ...",Pre-teen jungle boy Mowgli gets to human world...,"[Adventure, Children's]",https://m.media-amazon.com/images/M/MV5BMzZhOT...,https://www.imdb.com/title/tt0120087/plotsummary
2125,Ever After: A Cinderella Story (1998),The Brothers Grimm arrive at the home of a wea...,Andy Tennant directed this Cinderella variant....,"[Drama, Romance]",https://m.media-amazon.com/images/M/MV5BN2FhYT...,https://www.imdb.com/title/tt0120631/plotsummary


In [11]:
# Value stored in the `user_id` property of the dummy user's View objects.
user_id = "test_user"

# Viewing history of the dummy user, given as movie id strings (they are
# matched against the `movie_id` property with a string filter). Exactly one
# `movie_ids` assignment below should be active; the commented-out ones are
# alternative histories to experiment with.

# Sleeping Beauty, Aladdin, The Little Mermaid
# movie_ids = ["2096", "588", "2081"]

# Mortal Kombat, Mortal Kombat: Annihilation, Street Fighter (active)
movie_ids = ["44", "1681", "393"]


# The Running Man, Demolition Man, Assassins
# movie_ids = ["3698", "442", "23"]

# Cinderella, Peter Pan, Lady and the Tramp, Sleeping Beauty, The Jungle Book
# movie_ids = ["1022", "2087", "2080", "2096", "362"]



In [12]:
def get_movie_uuid(movie_id, movie_class="MovieCos"):
    """Return the Weaviate object id (uuid) of the movie with the given movie id.

    Args:
        movie_id: Value matched against the ``movie_id`` property (sent as
            ``valueString``).
        movie_class: Name of the Movie class to query.

    Returns:
        The ``_additional.id`` of the first matching object.

    Raises:
        RuntimeError: If the query response carries an ``errors`` entry.
        IndexError: If no object of ``movie_class`` matches ``movie_id``.
    """
    where_filter = {
        "path": ["movie_id"],
        "operator": "Equal",
        "valueString": movie_id,
    }

    # NOTE: movie details are identical across the Movie classes (only the
    # distance metric differs); the class is still a parameter so callers can
    # resolve the id within the class they are going to reference.
    result = (
        client.query
        .get(movie_class)
        .with_additional("id")
        .with_where(where_filter)
        .do()
    )

    # Surface query errors explicitly instead of failing further down with an
    # opaque AttributeError/TypeError on a missing "data" payload.
    if result.get("errors"):
        raise RuntimeError(
            f"Weaviate query on {movie_class} failed: {result['errors']}")

    matches = ((result.get("data") or {}).get("Get") or {}).get(movie_class) or []
    if not matches:
        raise IndexError(
            f"no {movie_class} object found with movie_id={movie_id!r}")

    return matches[0].get('_additional').get('id')


In [13]:
def build_user_view_history(user_id, movie_ids, view_class="ViewCos", movie_class="MovieCos"):
    """Create a View object for a user and link it to the watched movies.

    The object and its references are written with the non-batch
    ``client.data_object`` API, so no batch context is opened here.

    Args:
        user_id: Value stored in the ``user_id`` property of the View object.
        movie_ids: Movie ids of the watched movies; each is resolved to an
            object uuid with ``get_movie_uuid``.
        view_class: Name of the View class the object is created in.
        movie_class: Name of the Movie class the references point to.

    Returns:
        The value returned by ``client.data_object.create`` for the new View
        object (used as ``from_uuid`` of the references).
    """
    # Resolve every movie first so that a failed lookup raises before
    # anything has been written.
    movie_uuids = [get_movie_uuid(movie_id, movie_class=movie_class)
                   for movie_id in movie_ids]

    user_uuid = client.data_object.create(
        {"user_id": user_id}, class_name=view_class)

    # Add one "movies" reference per watched movie.
    for movie_uuid in movie_uuids:
        client.data_object.reference.add(
            from_uuid=user_uuid,
            from_property_name="movies",
            to_uuid=movie_uuid,
            from_class_name=view_class,
            to_class_name=movie_class)

    return user_uuid


In [14]:
# Create the dummy user's View object (and its movie references) in every
# View class, pairing each View class with its corresponding Movie class.
# NOTE(review): each call creates a new View object, so re-running this cell
# without recreating the View classes first presumably leaves duplicate
# users behind — confirm.
for view_class, movie_class in zip(view_classes, movie_classes):
    build_user_view_history(user_id, movie_ids, view_class, movie_class)


In [15]:
# TODO: check whether this is a bug in the client and/or the ref2vec module
# with client.batch() as batch:
#     # user_uuid = batch.add_data_object({
#     #     "user_id": user_id,
#     # }, class_name="View"
#     # )

#     user_uuid = client.data_object.create(
#         {"user_id": user_id}, class_name="View")

#     for movie_uuid in movie_uuids:
#         # batch.add_reference(
#         #     from_object_uuid=user_uuid,
#         #     from_object_class_name="View",
#         #     from_property_name="movies",
#         #     to_object_uuid=movie_uuid,
#         #     to_object_class_name="Movie"
#         # )
#         client.data_object.reference.add(
#             from_uuid=user_uuid,
#             from_property_name="movies",
#             to_uuid=movie_uuid,
#             from_class_name="View",
#             to_class_name="Movie"
#         )


Let's sanity check the embeddings that ref2vec has generated for the user:

In [16]:
def get_user_vector(user_id, view_class="ViewCos"):
    """Return the vector stored for a user in the given View class.

    Args:
        user_id: Value matched against the ``user_id`` property (sent as
            ``valueString``).
        view_class: Name of the View class to query.

    Returns:
        The ``_additional.vector`` of the first matching object.

    Raises:
        RuntimeError: If the query response carries an ``errors`` entry.
        IndexError: If no object of ``view_class`` matches ``user_id``.
    """
    where_filter = {
        "path": ["user_id"],
        "operator": "Equal",
        "valueString": user_id,
    }

    result = (
        client.query
        .get(view_class)
        .with_additional("vector")
        .with_where(where_filter)
        .do()
    )

    # Surface query errors explicitly instead of failing further down with an
    # opaque AttributeError/TypeError on a missing "data" payload.
    if result.get("errors"):
        raise RuntimeError(
            f"Weaviate query on {view_class} failed: {result['errors']}")

    matches = ((result.get("data") or {}).get("Get") or {}).get(view_class) or []
    if not matches:
        raise IndexError(
            f"no {view_class} object found with user_id={user_id!r}")

    # If several objects share the user id, the first one returned is used.
    return matches[0].get('_additional').get('vector')


In [17]:
def get_movie_vector(movie_id, movie_class="MovieCos"):
    """Return the vector stored for a movie in the given Movie class.

    Args:
        movie_id: Value matched against the ``movie_id`` property (sent as
            ``valueString``).
        movie_class: Name of the Movie class to query.

    Returns:
        The ``_additional.vector`` of the first matching object.

    Raises:
        RuntimeError: If the query response carries an ``errors`` entry.
        IndexError: If no object of ``movie_class`` matches ``movie_id``.
    """
    where_filter = {
        "path": ["movie_id"],
        "operator": "Equal",
        "valueString": movie_id,
    }

    result = (
        client.query
        .get(movie_class)
        .with_additional("vector")
        .with_where(where_filter)
        .do()
    )

    # Surface query errors explicitly instead of failing further down with an
    # opaque AttributeError/TypeError on a missing "data" payload.
    if result.get("errors"):
        raise RuntimeError(
            f"Weaviate query on {movie_class} failed: {result['errors']}")

    matches = ((result.get("data") or {}).get("Get") or {}).get(movie_class) or []
    if not matches:
        raise IndexError(
            f"no {movie_class} object found with movie_id={movie_id!r}")

    return matches[0].get('_additional').get('vector')


In [18]:
# One user embedding per View class, in the same order as view_classes.
user_embeddings = []

# Check 1: within each (View, Movie) class pair, the user's vector should be
# the mean of the vectors of the movies in the user's viewing history.
for view_class, movie_class in zip(view_classes, movie_classes):
    user_embedding = get_user_vector(user_id, view_class)
    user_embeddings.append(user_embedding)

    movie_embeddings = [get_movie_vector(movie_id, movie_class)
                        for movie_id in movie_ids]

    # element-wise mean over the watched movies
    mean_movie_embedding = np.mean(movie_embeddings, axis=0)

    # name the failing class pair so a failure is actionable
    assert np.allclose(mean_movie_embedding, user_embedding), (
        f"{view_class}: user vector is not the mean of the {movie_class} movie vectors")

# Check 2: the user embeddings should be the same across View classes.
# Comparing each embedding with its predecessor covers the whole list.
for i in range(1, len(user_embeddings)):
    assert np.allclose(user_embeddings[i-1], user_embeddings[i]), (
        f"user vectors differ between {view_classes[i-1]} and {view_classes[i]}")


Now we find the nearest movies to the `user_embedding`:

In [19]:
def build_movie_id_exclude_filter(movie_ids, genre=None):
    """Build a ``where`` filter that excludes the given movie ids.

    Args:
        movie_ids: Iterable of movie ids; each one yields a ``NotEqual``
            condition on the ``movie_id`` property.
        genre: Optional genre name. When truthy, an ``Equal`` condition on
            the ``genres -> Genre -> name`` path is appended.

    Returns:
        A dict with operator ``"And"`` whose operands are the conditions above.
    """
    conditions = []
    for excluded_id in movie_ids:
        conditions.append({
            "path": ["movie_id"],
            "operator": "NotEqual",
            "valueString": excluded_id,
        })

    if genre:
        conditions.append({
            "path": ["genres", "Genre", "name"],
            "operator": "Equal",
            "valueString": genre,
        })

    return {"operator": "And", "operands": conditions}


In [25]:
def get_recommendations(user_embedding, watched_movie_ids, genre_constraint = None, movie_class="MovieCos", top_k=10):
    """Return the movies nearest to a user embedding, skipping watched ones.

    Args:
        user_embedding: Vector used for the near-vector search.
        watched_movie_ids: Movie ids to exclude from the results.
        genre_constraint: Optional genre name passed on to the where filter.
        movie_class: Name of the Movie class to search.
        top_k: Maximum number of results requested.

    Returns:
        The list found under ``data -> Get -> movie_class`` in the response;
        each entry carries the requested properties plus
        ``_additional.distance``.
    """
    requested_properties = [
        "movie_id",
        "title",
        "plot",
        "genres {... on Genre {name}}",
        "poster_url",
    ]

    # Assemble the query step by step: vector search, exclusion filter,
    # distance in the output, result limit.
    query = client.query.get(movie_class, properties=requested_properties)
    query = query.with_near_vector({"vector": user_embedding})
    query = query.with_where(
        build_movie_id_exclude_filter(watched_movie_ids, genre=genre_constraint))
    query = query.with_additional("distance")
    query = query.with_limit(top_k)
    response = query.do()

    return response.get('data').get('Get').get(movie_class)


In [27]:
# Top-10 recommendations for the dummy user from the first Movie class.
# The result is bound to `recommendations` rather than `movies` so that the
# `movies` DataFrame loaded at the top of the notebook is not overwritten.
recommendations = get_recommendations(
    user_embeddings[0],
    watched_movie_ids=movie_ids,
    genre_constraint=None,
    top_k=10,
    movie_class=movie_classes[0],
)

for movie in recommendations:
    title = movie["title"]
    poster_url = movie["poster_url"]
    plot = movie["plot"]
    genres = [genre["name"] for genre in movie["genres"]]
    distance = movie["_additional"]["distance"]

    # print title, plot, genres, distance and poster URL, one per line
    print(f"{title}\n{plot}\n{genres}\n{distance}\n{poster_url}\n")


Quest, The (1996)
A group of gentlemen of fortune visits a legendary "Lost City", located in Tibet. They plan to steal a priceless statue "Golden Dragon" during the martial arts tournament.
['Action', 'Adventure']
0.39066333
https://m.media-amazon.com/images/M/MV5BMWIyYjMxZTMtZGUyNy00N2UwLTgwNjctOWQ1OGMzN2VlMDExXkEyXkFqcGdeQXVyNDc2NjEyMw@@._V1_.jpg

Starship Troopers (1997)
Humans in a fascist, militaristic future wage war with giant alien bugs.
['Action', 'Adventure', 'Sci-Fi', 'War']
0.41525584
https://m.media-amazon.com/images/M/MV5BNWExNzg3MmMtYjc3MS00MzFlLWJiOWQtNWYxZTgxNjhlZTQ2XkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_.jpg

Tales From the Crypt Presents: Demon Knight (1995)
High-level demons collect low-level demons as warriors in attempt to obtain a key containing the blood of Christ. The key is guarded by immortal warriors called Demon Knights.
['Horror']
0.41615254
https://m.media-amazon.com/images/M/MV5BNGM3N2VmNDQtNWMwNC00MDI5LThhNzYtNTlkZjkwZTJlNTRjXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_.

Same recommendation but with a genre filter:

In [29]:
# Same query as before, restricted to the "Children's" genre.
# The result is bound to `recommendations` rather than `movies` so that the
# `movies` DataFrame loaded at the top of the notebook is not overwritten.
recommendations = get_recommendations(
    user_embeddings[0],
    watched_movie_ids=movie_ids,
    genre_constraint="Children's",
    top_k=10,
    movie_class=movie_classes[0],
)

for movie in recommendations:
    title = movie["title"]
    poster_url = movie["poster_url"]
    plot = movie["plot"]
    genres = [genre["name"] for genre in movie["genres"]]
    distance = movie["_additional"]["distance"]

    # print title, plot, genres, distance and poster URL, one per line
    print(f"{title}\n{plot}\n{genres}\n{distance}\n{poster_url}\n")


Heavy Metal (1981)
A glowing green orb - which embodies ultimate evil - terrorizes a young girl with an anthology of bizarre and fantastic stories of dark fantasy, eroticism and horror.
['Action', 'Adventure', 'Animation', 'Horror', 'Sci-Fi']
0.45923924
https://m.media-amazon.com/images/M/MV5BOTc2NzM1ODgtM2RkYi00M2U4LWE4NzEtMDMzZGUzYmI1ZDUzL2ltYWdlXkEyXkFqcGdeQXVyNTAyODkwOQ@@._V1_.jpg

Pokémon: The First Movie (1998)
Scientists genetically create a new Pokémon, Mewtwo, but the results are horrific and disastrous.
['Animation', "Children's"]
0.4769203
https://m.media-amazon.com/images/M/MV5BZGM3MjQ3NTQtNzRiZi00MDUzLWFjYjEtZWJjMjUwYzExYjRiXkEyXkFqcGdeQXVyMjUzOTY1NTc@._V1_.jpg

Pokémon the Movie 2000 (2000)
Ash Ketchum must gather the three spheres of fire, ice and lightning in order to restore balance to the Orange Islands.
['Animation', "Children's"]
0.4773947
https://m.media-amazon.com/images/M/MV5BNzE1NjBiODAtNDVhNS00ZTI1LTg4ZjUtZTk3OWVhODljMjNjXkEyXkFqcGdeQXVyMzM4MjM0Nzg@._V1_.jpg

P

For comparison, here are the plots of the movies in the user's viewing history (`movie_ids`):

In [23]:
# Print the plot of every movie in the user's viewing history (from MovieCos).
for movie_id in movie_ids:
    plot_filter = {"path": ["movie_id"], "operator": "Equal", "valueString": movie_id}
    response = (
        client.query
        .get("MovieCos", properties=["plot"])
        .with_where(plot_filter)
        .do()
    )
    plot = response.get('data').get('Get').get('MovieCos')[0].get('plot')
    print(plot)

Three unknowing martial artists are summoned to a mysterious island to compete in a tournament whose outcome will decide the fate of the world.
A group of martial-arts warriors has only six days to save the Earth from an extra-dimensional invasion.
Col. Guile and various other martial arts heroes fight against the tyranny of Dictator M. Bison and his cohorts.
