# Build Embeddings

## Dataset

First we need to download the YT transcriptions dataset:

In [1]:
from datasets import load_dataset

data = load_dataset(
    'jamescalam/youtube-transcriptions',
    split='train'
)
data

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration jamescalam--youtube-transcriptions-25e61ee3cea2d8c2
Reusing dataset json (/home/jupyter/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-25e61ee3cea2d8c2/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


Dataset({
    features: ['title', 'visibility', 'published', 'url', 'id', 'text', 'start', 'end'],
    num_rows: 27214
})

In [2]:
data[0]

{'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'visibility': 'Public',
 'published': '2021-07-06 13:00:03 UTC',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'id': '35Pdoyi6ZoQ-t0.0',
 'text': 'Hi, welcome to the video.',
 'start': 0.0,
 'end': 9.36}

In [3]:
data[1]

{'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'visibility': 'Public',
 'published': '2021-07-06 13:00:03 UTC',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'id': '35Pdoyi6ZoQ-t3.0',
 'text': 'So this is the fourth video in a Transformers',
 'start': 3.0,
 'end': 11.56}

In [4]:
data[2]

{'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'visibility': 'Public',
 'published': '2021-07-06 13:00:03 UTC',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'id': '35Pdoyi6ZoQ-t9.36',
 'text': 'from Scratch mini series.',
 'start': 9.36,
 'end': 15.84}

The sentences are all quite short at the moment, we need to merge them to create better chunks of text containing more meaning.

In [5]:
from tqdm.auto import tqdm

new_data = []

window = 6  # number of sentences to combine
stride = 3  # number of sentences to 'stride' over, used to create overlap

for i in tqdm(range(0, len(data), stride)):
    i_end = min(len(data)-1, i+window)
    if data[i]['title'] != data[i_end]['title']:
        # in this case we skip this entry as we have start/end of two videos
        continue
    text = ' '.join(data[i:i_end]['text'])
    new_data.append({
        'start': data[i]['start'],
        'end': data[i_end]['end'],
        'title': data[i]['title'],
        'text': text,
        'id': data[i]['id'],
        'url': data[i]['url'],
        'published': data[i]['published']
    })

100%|██████████| 9072/9072 [00:07<00:00, 1156.11it/s]


In [6]:
new_data[0]

{'start': 0.0,
 'end': 25.76,
 'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'text': "Hi, welcome to the video. So this is the fourth video in a Transformers from Scratch mini series. So if you haven't been following along, we've essentially covered what you can see on the screen. So we got some data.",
 'id': '35Pdoyi6ZoQ-t0.0',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'published': '2021-07-06 13:00:03 UTC'}

In [7]:
new_data[100]

{'start': 981.4,
 'end': 1009.52,
 'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'text': "Now, it has taken a long time. It's a few days later. And I made a few changes during training as well. So this definitely wasn't the cleanest training process, because I was kind of updating parameters as it was going along. So initially, well, first, we've trained",
 'id': '35Pdoyi6ZoQ-t981.4',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'published': '2021-07-06 13:00:03 UTC'}

In [8]:
new_data[500]

{'start': 1096.0,
 'end': 1112.0,
 'title': 'Training BERT #4 - Train With Next Sentence Prediction (NSP)',
 'text': "token type IDs. Let's go with number 0. Okay. So now we see okay the reason is because they're in the middle here.",
 'id': 'x1lAcT3xl5M-t1096.0',
 'url': 'https://youtu.be/x1lAcT3xl5M',
 'published': '2021-05-27 16:15:39 UTC'}

## Initialize Embedding Model

In [9]:
from sentence_transformers import SentenceTransformer

model_id = "multi-qa-mpnet-base-dot-v1"

model = SentenceTransformer(model_id)
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [10]:
dim = model.get_sentence_embedding_dimension()

In [11]:
import pinecone

index_id = "youtube-search"

pinecone.init(
    api_key="<<YOUR_API_KEY>>",  # app.pinecone.io
    environment="us-west1-gcp"
)

if index_id not in pinecone.list_indexes():
    pinecone.create_index(
        index_id,
        dim,
        metric="dotproduct"
    )

index = pinecone.Index(index_id)
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

Now let's begin building the embeddings...

In [12]:
from tqdm.auto import tqdm

# we encode and insert in batches of 64
batch_size = 64

# loop through in batches of 64
for i in tqdm(range(0, len(new_data), batch_size)):
    # find end position of batch (for when we hit end of data)
    i_end = min(len(new_data)-1, i+batch_size)
    # extract the metadata like text, start/end positions, etc
    batch_meta = [{
        "text": new_data[x]["text"],
        "start": new_data[x]["start"],
        "end": new_data[x]["end"],
        "url": new_data[x]["url"],
        "title": new_data[x]["title"]
    } for x in range(i, i_end)]
    # extract only text to be encoded by embedding model
    batch_text = [row['text'] for row in new_data[i:i_end]]
    # create the embedding vectors
    batch_embeds = model.encode(batch_text).tolist()
    # extract IDs to be attached to each embedding and metadata
    batch_ids = [row['id'] for row in new_data[i:i_end]]
    # 'upsert' (eg insert) IDs, embeddings, and metadata to index
    to_upsert = list(zip(batch_ids, batch_embeds, batch_meta))
    index.upsert(to_upsert)

# check everything has been added
index.describe_index_stats()

100%|██████████| 210/210 [00:56<00:00,  3.72it/s]


In [13]:
i_end

13392

In [14]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 13392}},
 'total_vector_count': 13392}

In [15]:
query = "how to contribute to open source?"

xq = model.encode(query).tolist()

index.query(xq, top_k=5, include_metadata=True)

{'matches': [{'id': '_OAU1kQdmgE-t521.52',
              'metadata': {'end': 562.96,
                           'start': 521.52,
                           'text': "point on projects it's also a very good "
                                   'idea to see if you can try and contribute '
                                   'to open source projects. Now this is a '
                                   'pretty typical piece of advice where '
                                   'people say okay you should contribute to '
                                   "open source and I'm including that as well "
                                   "because it's very good advice. If you can "
                                   'go ahead and start contributing to things '
                                   'there are a lot of things that will '
                                   "happen. First you're contributing",
                           'title': 'How to Learn Data Science | ML | '
                       

---