In [75]:
!pip3 install torch torchvision torchaudio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [76]:
!pip install -qU \
          pinecone-text \
          datasets \
          pyarrow

In [77]:
!pip install openai nltk tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [78]:
import json
# import secret tokens
with open('drive/MyDrive/huberman-whisper/secrets.json') as f:
  secrets = json.load(f)
openai_api_key = secrets['openai-api-key']
hf_token = secrets['hf-token']
pinecone_api_key = secrets['pinecone-api-key']
pinecone_env = secrets['pinecone-env']

In [79]:
# Log in to HF Hub
from huggingface_hub import login
login(token=hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [80]:
# Load datasets
from datasets import load_dataset
huberman_metadata = load_dataset("hbattu/huberman-youtube-metadata")['train']
huberman_timestamped = load_dataset("hbattu/huberman-timestamped")['train']



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

In [81]:
# Process datasets
def modify_yt_url(row):
  row['url'] = 'https://youtu.be/' + row['video_id']
  return row
huberman_metadata = huberman_metadata.map(modify_yt_url)
huberman_timestamped = huberman_timestamped.filter(lambda sent: sent['end'] is not None)



In [82]:
from collections import defaultdict
# construct video metadata dict
channel_meta =  defaultdict(lambda: defaultdict(dict))
for row in huberman_metadata:
  channel_meta[row['video_id']] = row
# get list of episode ids
episode_ids = huberman_metadata['video_id']

In [83]:
import nltk
import nltk.data
nltk.download('punkt')
# init sent detector
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [84]:
import tiktoken
# Get number of tokens for each document
def num_tokens(chunk, model="gpt-3.5-turbo"):
  """Returns the number of tokens used by a list of messages."""
  try:
      encoding = tiktoken.encoding_for_model(model)
  except KeyError:
      encoding = tiktoken.get_encoding("cl100k_base")
  return len(encoding.encode(chunk))

In [85]:
def contextualize(content, title, keywords, description):
  # Contextualize each document with global context 
  # --> experiments found that this was NOT needed as it polluted documents with too much context
  title_sent = "Document's title: {}".format(title)
  keywords_sent = "Document's keywords: {}".format(', '.join(keywords))
  description_sent = "Document's description: {}".format(' '.join(sent_detector.tokenize(description)[:3]))
  content_text = "Document's content: {}".format(content)
  return ". ".join([title_sent, keywords_sent, description_sent]) + ' ' + content_text

In [86]:
def chunk(episode, episode_meta, window=15, stride=10, contextualized=True):
  # Chunk each episode into documents to be embedded using a rolling window of 'window' 
  # and an overlap of 'window - stride' sentences
  documents = []
  for s in range(0, len(episode), stride):
    doc = {}
    window_end = min(s+window, len(episode)) 
    content = ' '.join(episode[s:window_end]['text'])
    if contextualized: 
      doc['document'] = contextualize(content, episode_meta['title'], episode_meta['keywords'], episode_meta['description'])
    else:
      doc['document'] = content
    doc['content'] = content 
    doc['start'] = episode[s]['start']
    doc['end'] = episode[window_end - 1]['end']
    documents.append(doc)
  return documents

In [87]:
def build_vector_meta(doc, episode_meta):
  # Build vector metadata for each document
  metadata = {}
  metadata['title'] = episode_meta['title']
  metadata['published'] = episode_meta['published']
  metadata['thumbnail'] = episode_meta['thumbnail']
  metadata['video_id'] = episode_meta['video_id']
  metadata['content'] = doc['content']
  metadata['start'] = doc['start']
  metadata['end'] = doc['end']
  metadata['tokens'] = num_tokens(doc['content'])
  return metadata

In [88]:
import openai
import torch
from pinecone_text.sparse import SpladeEncoder
openai.api_key = openai_api_key
# Initialize Splade
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"running on {device}")
splade = SpladeEncoder(device=device)

running on cuda


In [89]:
def embed(documents, embedding='dense'):
  # Embed doucments using a hybrid model of sparse and dense embeddings
  if embedding == 'sparse':
    # Encode documents using Splade
    sparse_values = splade.encode_documents(documents)
    return sparse_values
  else:
    # Use ada-002 to encode documents
    openai_response = openai.Embedding.create(
        input = documents,
        model = "text-embedding-ada-002"
    )
    dense_values = [vec_data['embedding'] for vec_data in openai_response['data']]
    return dense_values

In [90]:
def batch_embed(documents, episode_meta, batch_size=8):
  # Embed using a batch of 'batch_size' documents into a Pinecone accepted format
  vectors = []
  for d in range(0, len(documents), batch_size):
    batch_end = min(d+batch_size, len(documents))
    vec_ids = [episode_meta['video_id'] + '_' + str(idx+1) for idx in range(d, batch_end)]
    vec_metas = [build_vector_meta(doc, episode_meta) for doc in documents[d:batch_end]]
    dense_vecs = embed([doc['document'] for doc in documents[d:batch_end]])
    sparse_vecs = embed([doc['document'] for doc in documents[d:batch_end]], embedding='sparse')
    batch_data = zip(vec_ids, vec_metas, dense_vecs, sparse_vecs)
    vec_batch = [{ 'id': id, 'values': dense_vec, 'metadata': meta, 'sparse_values':sparse_vec } for id, meta, dense_vec, sparse_vec in batch_data]
    vectors.extend(vec_batch)
  return vectors

In [91]:
from tqdm.auto import tqdm
def build_embeddings(episode_ids, channel_meta):
  # Build embeddings for each episode in the channel
  vectors = []
  for id in tqdm(episode_ids):
    episode = huberman_timestamped.filter(lambda ep: ep['id'] == id)
    episode_meta = channel_meta[id]
    documents = chunk(episode, episode_meta, contextualized=False)
    vectors.extend(batch_embed(documents, episode_meta))
  return vectors

In [92]:
hybrid_vectors = build_embeddings(episode_ids, channel_meta)

  0%|          | 0/129 [00:00<?, ?it/s]



In [93]:
len(hybrid_vectors)

16500

In [94]:
!pip install -qU "pinecone-client[grpc]"

In [95]:
import pinecone
# init connection to pinecone
pinecone.init(
    api_key=pinecone_api_key,
    environment=pinecone_env
)

In [96]:
pinecone.list_indexes()

['huberman-search']

In [97]:
index_name = "huberman-search"
dim = 1536    # num. of dimensions of returned embedding of ada-002 model
metadata_config = {
    "indexed": ["video_id"]
}
if index_name not in pinecone.list_indexes():
    # create the index
    pinecone.create_index(
      index_name,
      dimension=dim,
      metric="dotproduct",
      metadata_config=metadata_config,
      pod_type="s1"
    )
else: print('Index already exists.')

Index already exists.


In [98]:
index = pinecone.GRPCIndex(index_name)

In [99]:
def upsert_vectors(index, namespace, vectors, batch_size=64):
  # Insert embeddings to Pinecone by batching
  for i in tqdm(range(0, len(vectors), batch_size)):
    batch_end = min(i+batch_size, len(vectors))
    index.upsert(vectors=vectors[i:batch_end], namespace=namespace)

In [100]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.2,
 'namespaces': {'contextualized-default': {'vector_count': 16500}},
 'total_vector_count': 16500}

In [101]:
upsert_vectors(index, 'nocontext-default', hybrid_vectors)

  0%|          | 0/258 [00:00<?, ?it/s]

In [102]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.3,
 'namespaces': {'contextualized-default': {'vector_count': 16500},
                'nocontext-default': {'vector_count': 16500}},
 'total_vector_count': 33000}

In [103]:
from pinecone_text.hybrid import hybrid_convex_scale

In [120]:
query = "does alcohol affect testosterone?"
# create sparse and dense vectors
sparse = splade.encode_queries(query)
dense_response = openai.Embedding.create(
        input = query,
        model = "text-embedding-ada-002"
    )
dense = dense_response['data'][0]['embedding']
hybrid_dense, hybrid_sparse = hybrid_convex_scale(dense, sparse, alpha=0.6)

In [121]:
# search
result = index.query(
    top_k=10,
    vector=hybrid_dense,
    sparse_vector=hybrid_sparse,
    namespace='nocontext-default',
    include_metadata=True
)
result

{'matches': [{'id': 'DkS1pkKpILY_101',
              'metadata': {'content': 'And these can include things like '
                                      'diminished sex drive, increased fat '
                                      'storage, and a number of other things '
                                      'that I think most people would find to '
                                      'be negative effects. I once talked '
                                      'about the fact that drinking alcohol '
                                      'can increase the aromatization of '
                                      'testosterone to estrogen. I posted that '
                                      "online and I didn't get attacked, but I "
                                      'did get criticized for the fact that it '
                                      'has been shown, yes, has been shown '
                                      'that small amounts of alcohol '
                              