# We're going to build a question-and-answer bot

It allows you to search through YouTube transcripts using natural language.

In [None]:
pip install --quiet openai datasets lancedb

## Download the data
The dataset contains transcripts of 700 videos, split into 208,619 sentences.

In [1]:
from datasets import load_dataset

# Load the 'train' split of the YouTube transcription dataset, then display its summary.
data = load_dataset(path='jamescalam/youtube-transcriptions', split='train')
data

Found cached dataset json (/Users/changshe/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-08d889f6a5386b9b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


Dataset({
    features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],
    num_rows: 208619
})

## Prepare context

Create rolling context windows of 20 sentences each, with a stride of 4 sentences, grouped by video title.

In [2]:
from lancedb.context import contextualize

# Configure the windowing first: group rows by "title", read the "text" column,
# and use a window of 20 with a stride of 4.
windowed = (
    contextualize(data.to_pandas())
    .groupby("title")
    .text_col("text")
    .window(20)
    .stride(4)
)

# Materialize the windows as a DataFrame and preview the first row.
df = windowed.to_df()
df.head(1)

Unnamed: 0,title,published,url,video_id,channel_id,id,text,start,end
177622,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t0.0,Imagine an AI where all in the same model you ...,0.0,24.0


## Create embedding function
We'll call the OpenAI embeddings API to get embeddings

In [3]:
import openai
import os
from getpass import getpass

# When OPENAI_API_KEY is already set in the environment, nothing is assigned here.
if "OPENAI_API_KEY" not in os.environ:
    # Prompt for the key instead of pasting it into the notebook, so the secret
    # is never saved in the file or committed with it.
    openai.api_key = getpass("OpenAI API key: ")

# Fail fast if the key does not work: listing models must return at least one entry.
assert len(openai.Model.list()["data"]) > 0, "OpenAI returned no models; check the API key"

In [4]:
import numpy as np
def embed_func(c):
    """Embed ``c`` with OpenAI's ``text-embedding-ada-002`` engine.

    ``c`` is forwarded unchanged as the ``input`` of ``openai.Embedding.create``.
    Returns a list holding the ``"embedding"`` value of every record in the
    response's ``"data"`` field, in response order.
    """
    response = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
    vectors = []
    for item in response["data"]:
        vectors.append(item["embedding"])
    return vectors

## Create the LanceDB Table

In [5]:
import lancedb
from lancedb.embeddings import with_embeddings

# Run embed_func over the windowed DataFrame, showing a progress bar while it works.
data = with_embeddings(embed_func, df, show_progress=True)

# Preview the first row of the result.
first_row = data.to_pandas().head(1)
first_row

  0%|          | 0/49 [00:00<?, ?it/s]

Unnamed: 0,title,published,url,video_id,channel_id,id,text,start,end,vector
0,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t0.0,Imagine an AI where all in the same model you ...,0.0,24.0,"[-0.024402587, -0.00087673456, 0.016499246, -0..."


In [6]:
# The database lives at /tmp/lancedb (not the current directory).
db = lancedb.connect("/tmp/lancedb")
# mode="overwrite" replaces a "chatbot" table left behind by an earlier run,
# so this cell can be re-run without failing on an existing table.
tbl = db.create_table("chatbot", data, mode="overwrite")
len(tbl)

48935

In [7]:
# Read the table back as a DataFrame and show only its first row.
tbl.to_pandas().iloc[:1]

Unnamed: 0,title,published,url,video_id,channel_id,id,text,start,end,vector
0,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t0.0,Imagine an AI where all in the same model you ...,0.0,24.0,"[-0.024402587, -0.00087673456, 0.016499246, -0..."


## Create and answer the prompt

In [8]:
def create_prompt(query, context, limit=3750):
    """Build a completion prompt that answers ``query`` from retrieved passages.

    Parameters
    ----------
    query : str
        The user's question; placed after the context as ``Question: ...``.
    context : object with a ``text`` attribute
        Search results whose ``text`` attribute iterates over passage strings
        (e.g. a DataFrame with a ``text`` column), in the order they should appear.
    limit : int, optional
        Character budget for the joined context section. Passages are added in
        order while the joined length stays strictly below ``limit``.

    Returns
    -------
    str
        Instruction header, the selected passages separated by ``---`` lines,
        then the question and an ``Answer:`` cue. If no passage fits (or
        ``context`` is empty) the context section is empty.
    """
    separator = "\n\n---\n\n"
    prompt_start = (
        "Answer the question based on the context below.\n\n"
        "Context:\n"
    )
    prompt_end = f"\n\nQuestion: {query}\nAnswer:"

    selected = []
    used = 0  # length of separator.join(selected), tracked incrementally
    for passage in context.text:
        # Every passage after the first also costs one separator.
        added = len(passage) + (len(separator) if selected else 0)
        if used + added >= limit:
            # Stop at the first passage that would reach the budget; this check
            # also applies to the last passage.
            break
        selected.append(passage)
        used += added

    return prompt_start + separator.join(selected) + prompt_end

In [9]:
def complete(prompt, engine='text-davinci-003', max_tokens=400):
    """Send ``prompt`` to the OpenAI completion endpoint and return the answer text.

    Parameters
    ----------
    prompt : str
        The full prompt text to complete.
    engine : str, optional
        Completion model to query. Defaults to ``'text-davinci-003'`` to keep
        existing calls unchanged.
        NOTE(review): OpenAI has announced retirement of this model; confirm it
        is still served, or pass a currently available completion model.
    max_tokens : int, optional
        Upper bound on the number of generated tokens.

    Returns
    -------
    str
        The text of the first returned choice with surrounding whitespace removed.
    """
    res = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        temperature=0,  # same sampling settings as before: temperature 0, top_p 1
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )
    return res['choices'][0]['text'].strip()

# Sanity check: send a plain question (no retrieved context) straight to the completion API.
query = "who was the 12th person on the moon and when did they land?"
complete(prompt=query)

'The 12th person on the moon was Harrison Schmitt, and he landed on December 11, 1972.'

## Use LanceDB to find the answer and show the video at the right place

In [10]:
# The natural-language question to answer from the transcripts.
query = "Which training method should I use for sentence transformers when I only have pairs of related sentences?"

In [11]:
# Embed the question; embed_func returns a list, so keep its first entry.
query_vectors = embed_func(query)
emb = query_vectors[0]

In [12]:
# Ask LanceDB for the 3 rows that best match the question embedding.
top_hits = tbl.search(emb).limit(3)
context = top_hits.to_df()

In [13]:
# Build the context-augmented prompt, then get the answer from the completion API.
prompt = create_prompt(query=query, context=context)
complete(prompt=prompt)

'NLI with multiple negative ranking loss.'

In [14]:
from IPython.display import YouTubeVideo

# Take the first search result as the top match. The video id is the last path
# segment of its URL, and playback starts at the row's "start" offset.
top_match = context.iloc[0]
video_id = top_match["url"].rsplit("/", 1)[-1]
YouTubeVideo(video_id, start=top_match["start"])