In [74]:
import json
import os
import pathlib
import pinecone
import random
import statistics

In [44]:
# The API key is taken from the environment so it never appears in the
# notebook; a missing variable raises KeyError right here.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]

# Name of the Pinecone index used by this notebook.
INDEX_NAME = "multi-entity-recommender"
# Number of components per vector (used when building query vectors).
INDEX_DIMS = 1536

# Absolute path of the "data" directory, resolved from the current working
# directory.
data_dir = pathlib.Path("data").resolve()

In [2]:
# Configure the Pinecone client once for the session: API key plus the name of
# the environment to talk to.
pinecone.init(environment="us-west4-gcp-free", api_key=PINECONE_API_KEY)

In [52]:
def load_embeddings(path):
    """Load every embedding stored as a JSON ``.txt`` file directly under ``path``.

    Each file is parsed as one JSON document; the file name without its
    ``.txt`` suffix becomes the key of that document in the result.

    Args:
        path: Directory to scan, as a ``str`` or ``pathlib.Path``.

    Returns:
        dict mapping each id (file name minus ``.txt``) to the parsed JSON
        content of that file, inserted in sorted file-name order.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        json.JSONDecodeError: if a ``.txt`` file does not contain valid JSON.
    """
    directory = pathlib.Path(path)
    embeddings = {}

    # os.listdir returns entries in arbitrary order; sorting makes the key
    # order of the result deterministic across runs and machines.
    for name in sorted(os.listdir(directory)):
        entry = directory / name

        # Skip anything that is not an embedding file, e.g. ".DS_Store" or a
        # ".ipynb_checkpoints" directory, instead of crashing on it.
        if not name.endswith(".txt") or not entry.is_file():
            continue

        with open(entry, "r", encoding="utf-8") as f:
            embeddings[name.removesuffix(".txt")] = json.load(f)

    return embeddings


# One id -> embedding dict per entity type, each read from its own folder
# under data/embeddings (evaluated in the order books, movies, tv).
books, movies, tv = (
    load_embeddings(data_dir / "embeddings" / kind)
    for kind in ("books", "movies", "tv")
)

# Single combined mapping of every entity; on a duplicate id the later dict
# (movies over books, tv over both) wins.
# NOTE: this name shadows the builtin all(); it is kept because later cells
# reference it.
all = {**books, **movies, **tv}

In [53]:
# Create the index only when it does not exist yet, so re-running this cell is
# a no-op rather than an error that has to be caught and inspected.
# The dimension comes from INDEX_DIMS so the index and the query vectors built
# from the same constant cannot drift apart.
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(INDEX_NAME, dimension=INDEX_DIMS)

In [50]:
# Client handle for the index; every upsert and query below goes through it.
index = pinecone.Index(
    INDEX_NAME,
    pool_threads=50,
)

In [37]:
# Upload every (id, embedding) pair to the index, batch_size=10 at a time.
# The call's return value is the cell output.
index.upsert(
    batch_size=10,
    vectors=[(vector_id, values) for vector_id, values in all.items()],
)

Upserted vectors: 100%|██████████| 3000/3000 [01:20<00:00, 37.09it/s]


{'upserted_count': 3000}

In [43]:
# Ten nearest neighbours of "Requiem for a Dream", looked up by the id of its
# stored vector.
# https://www.imdb.com/title/tt0180093
index.query(top_k=10, id="imdb:movie:tt0180093")

{'matches': [{'id': 'imdb:movie:tt0180093', 'score': 1.0, 'values': []},
             {'id': 'imdb:movie:tt5109784', 'score': 0.898632884, 'values': []},
             {'id': 'imdb:movie:tt2229499', 'score': 0.875571072, 'values': []},
             {'id': 'imdb:movie:tt0338013', 'score': 0.870142698, 'values': []},
             {'id': 'imdb:movie:tt0414993', 'score': 0.870064735, 'values': []},
             {'id': 'imdb:movie:tt1959490', 'score': 0.863276184, 'values': []},
             {'id': 'imdb:movie:tt4550098', 'score': 0.862037241, 'values': []},
             {'id': 'imdb:movie:tt0947798', 'score': 0.860651672, 'values': []},
             {'id': 'imdb:movie:tt0257044', 'score': 0.859552681, 'values': []},
             {'id': 'imdb:movie:tt1125849',
              'score': 0.859465837,
              'values': []}],
 'namespace': ''}

In [42]:
# Ten nearest neighbours of "Harry Potter and the Half-Blood Prince", looked up
# by the id of its stored vector.
# https://www.goodreads.com/en/book/show/1
index.query(top_k=10, id="goodreads:book:1")

{'matches': [{'id': 'goodreads:book:1', 'score': 1.0, 'values': []},
             {'id': 'imdb:movie:tt0417741', 'score': 0.96148777, 'values': []},
             {'id': 'goodreads:book:8', 'score': 0.948156953, 'values': []},
             {'id': 'goodreads:book:15881', 'score': 0.934124351, 'values': []},
             {'id': 'goodreads:book:5', 'score': 0.932909071, 'values': []},
             {'id': 'goodreads:book:2', 'score': 0.931541264, 'values': []},
             {'id': 'imdb:movie:tt1201607', 'score': 0.929012239, 'values': []},
             {'id': 'imdb:movie:tt0926084', 'score': 0.921922088, 'values': []},
             {'id': 'imdb:movie:tt0330373', 'score': 0.921125114, 'values': []},
             {'id': 'imdb:movie:tt0295297',
              'score': 0.919870853,
              'values': []}],
 'namespace': ''}

0.02550607407157708

In [77]:
# Random query vector

# Sample standard deviation taken over every component of every loaded
# embedding, flattened into one list.
sigma = statistics.stdev(
    [
        component
        for embedding in all.values()
        for component in embedding
    ]
)

def rand():
    """Return one Gaussian sample (mean 0, std ``sigma``) clamped to [-1, 1].

    Reads the module-level ``sigma`` as the standard deviation. A draw above 1
    yields 1, a draw below -1 yields -1, anything else is returned unchanged.
    """
    sample = random.normalvariate(0, sigma)
    return 1 if sample > 1 else -1 if sample < -1 else sample

# Seed the global RNG so this cell builds the same query vector on every run;
# without it the "random baseline" result cannot be reproduced.
random.seed(42)

# One clamped Gaussian sample per index dimension.
xq = [rand() for _ in range(INDEX_DIMS)]

# Nearest neighbours of a vector that corresponds to no real item, as a
# baseline for the similarity scores of the id-based queries.
index.query(
    vector=xq,
    top_k=10,
)

{'matches': [{'id': 'goodreads:book:32929',
              'score': 0.0440304503,
              'values': []},
             {'id': 'goodreads:book:24178',
              'score': 0.0412061736,
              'values': []},
             {'id': 'goodreads:book:44652',
              'score': 0.0386374108,
              'values': []},
             {'id': 'goodreads:book:45102', 'score': 0.0380659, 'values': []},
             {'id': 'goodreads:book:5338', 'score': 0.0376384705, 'values': []},
             {'id': 'imdb:tv:tt6474378', 'score': 0.0373258218, 'values': []},
             {'id': 'goodreads:book:20564',
              'score': 0.0372788794,
              'values': []},
             {'id': 'imdb:tv:tt7462410', 'score': 0.0362671837, 'values': []},
             {'id': 'goodreads:book:903', 'score': 0.0361516587, 'values': []},
             {'id': 'goodreads:book:2956',
              'score': 0.0360366106,
              'values': []}],
 'namespace': ''}