<a href="https://colab.research.google.com/github/juancopi81/yannic-chatbot/blob/main/Chatbot_Ada_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets openai transformers tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tiktoken
  Downloading tiktoken-0.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.19.0
  Downloading requests-2.28.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.8/62.8 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting blobfile>=2
  Downloading blobfile-2.0.1-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 KB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodomex~=3.8
  Downloading pycryptodomex-3.16.0-cp35-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m49.4 MB

In [None]:
import os
from ast import literal_eval
from getpass import getpass

import numpy as np
import openai
import pandas as pd
import tiktoken
from datasets import load_dataset
from transformers import GPT2TokenizerFast

In [None]:
# The OpenAI API key is read from the OPENAI_API_KEY environment variable; if it
# is not set, the user is prompted for it without echoing the input.
# Never commit a key to the notebook: any key that was ever saved in a shared
# copy of this file should be treated as compromised and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Model used to embed queries.
EMBEDDING_MODEL = "text-embedding-ada-002"
# Model used to generate the final answer.
COMPLETIONS_MODEL = "text-davinci-003"
# Token budget for the context sections placed in the prompt.
MAX_SECTION_LEN = 2000
COMPLETIONS_API_PARAMS = {
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    "temperature": 0.0,
    "max_tokens": 500,
    "model": COMPLETIONS_MODEL,
}

# HuggingFace dataset holding the transcript chunks and their embeddings.
hf_ds = "juancopi81/yannic_ada_embeddings"
# Tokenizer used to count the tokens of each transcript chunk.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Instruction block placed at the top of every prompt, before the context sections.
HEADER = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "This is not covered in my videos." Try imitating the style of the provided context. \n\nContext:\n"""
# Text appended to the answer, followed by the referenced video titles and URLs.
RESPONSE_SOURCES = " For more information, check out my following videos: "

In [None]:
# Encoding used to measure the separator (cl100k_base is the encoding for
# text-embedding-ada-002).
ENCODING = "cl100k_base"
encoding = tiktoken.get_encoding(ENCODING)

# Marker inserted in front of every context section to help the model
# distinguish between separate pieces of text.
SEPARATOR = "\n* "
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

# Utils

In [None]:
def count_tokens(text: str) -> int:
    """Return the number of token ids the module-level `tokenizer` produces for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def load_embeddings(hf_ds: str) -> dict:
    """
    Read the document embeddings and their keys from a HuggingFace dataset.

    hf_ds is the name of the HF dataset with exactly these named columns:
        "TITLE", "URL", "TRANSCRIPTION", "transcription_length", "text", "ada_embedding"

    Returns a dict that maps (row index, TITLE, URL) to the parsed value of the
    row's "ada_embedding" column.
    """
    # Keep the dataset under its own name instead of overwriting the `hf_ds` argument.
    dataset = load_dataset(hf_ds, split="train")
    dataset.set_format("pandas")
    df = dataset[:]

    # The embedding column is parsed with literal_eval, i.e. each cell is expected
    # to hold the textual form of a Python literal (a list of floats).
    embeddings = df["ada_embedding"].apply(literal_eval)

    # Walk the columns in parallel rather than calling iterrows(), which builds
    # a Series per row and is considerably slower on large frames.
    return {
        (idx, title, url): embedding
        for idx, title, url, embedding in zip(df.index, df["TITLE"], df["URL"], embeddings)
    }

def create_dataframe(hf_ds: str):
    """
    Load the "train" split of the HuggingFace dataset `hf_ds` as a pandas DataFrame.

    A "num_tokens" column (token count of the "text" column, via `count_tokens`)
    and an "idx" column (copy of the row index) are added, and the result is
    indexed by ("idx", "TITLE", "URL").
    """
    dataset = load_dataset(hf_ds, split="train")
    dataset.set_format("pandas")
    frame = dataset[:]
    frame = frame.assign(
        num_tokens=frame["text"].map(count_tokens),
        idx=frame.index,
    )
    return frame.set_index(["idx", "TITLE", "URL"])

def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list:
    """
    Ask the OpenAI Embedding endpoint to embed `text` with `model`.

    Returns the "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def vector_similarity(x: list, y: list) -> float:
    """
    Compute the dot product of two vectors.

    For vectors of length 1 (OpenAI embeddings are normalized that way) the dot
    product is the same as the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(query: str, contexts: dict) -> list:
    """
    Rank the pre-computed document embeddings by similarity to `query`.

    The query is embedded with `get_embedding` and compared against every entry
    of `contexts` (key -> embedding) with `vector_similarity`.

    Returns a list of (similarity, key) tuples sorted in descending order.
    """
    query_embedding = get_embedding(query)

    scored_sections = []
    for doc_index, doc_embedding in contexts.items():
        score = vector_similarity(query_embedding, doc_embedding)
        scored_sections.append((score, doc_index))

    # Highest similarity first.
    scored_sections.sort(reverse=True)
    return scored_sections

def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> tuple:
    """
    Build the completion prompt for `question` from the most relevant sections.

    Sections are taken in descending order of similarity to the question and
    added until the running token count (section tokens plus separator tokens)
    would exceed MAX_SECTION_LEN; the first section that does not fit ends the
    selection.

    Returns a tuple (prompt, keys) where `keys` lists the selected section
    keys, each converted to a string with str().
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    context_parts = []
    selected_keys = []
    tokens_used = 0

    for _similarity, key in ranked_sections:
        row = df.loc[key]

        # Stop as soon as the next section would push us over the budget.
        tokens_used += row.num_tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        # Newlines inside a section are flattened so each section stays on one line.
        context_parts.append(SEPARATOR + row.text.replace("\n", " "))
        selected_keys.append(str(key))

    # Useful diagnostic information
    print(f"Selected {len(context_parts)} document sections:")
    print("\n".join(selected_keys))

    prompt = "".join([HEADER, *context_parts, "\n\n Q: ", question, "\n A:"])
    return (prompt, selected_keys)

def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict,
    show_prompt: bool = False
) -> str:
    """
    Answer `query` with the completion model, grounded in the most relevant sections.

    The prompt is built by `construct_prompt`; when `show_prompt` is true it is
    printed before the request is sent. The returned string is the stripped
    completion text followed by RESPONSE_SOURCES and up to two distinct
    "title url" references taken from the highest-ranked sections.
    """
    prompt, sources = construct_prompt(
        query,
        document_embeddings,
        df
    )

    if show_prompt:
        print(prompt)

    response = openai.Completion.create(
        prompt=prompt,
        **COMPLETIONS_API_PARAMS
    )

    # Several top-ranked sections often belong to the same video, so collect
    # references in rank order and skip any that was already cited.
    references = []
    for source in sources:
        # Each source is the string form of a (idx, title, url) tuple.
        # literal_eval only parses literals; unlike eval it cannot execute code
        # that might be embedded in dataset-provided titles.
        section_key = literal_eval(source)
        reference = section_key[1] + " " + section_key[2]
        if reference not in references:
            references.append(reference)
        if len(references) == 2:
            break

    res_sources = RESPONSE_SOURCES + "".join(" " + reference for reference in references)

    return response["choices"][0]["text"].strip(" \n") + res_sources

In [None]:
# Build the DataFrame indexed by (idx, TITLE, URL) with per-section token counts,
# then preview the first rows.
df = create_dataframe(hf_ds)
df.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (1085 > 1024). Running this sequence through the model will result in indexing errors


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TRANSCRIPTION,transcription_length,text,ada_embedding,num_tokens
idx,TITLE,URL,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models,https://www.youtube.com/watch?v=gwI6g1pBD84,"Hello there, today we'll look at Glide towards...",500,GLIDE: Towards Photorealistic Image Generation...,"[-0.023338761180639267, 0.01196559239178896, -...",644
1,GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models,https://www.youtube.com/watch?v=gwI6g1pBD84,I'm going to paint this area right here. And I...,500,GLIDE: Towards Photorealistic Image Generation...,"[-0.02665344625711441, 0.006354713812470436, 0...",625
2,GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models,https://www.youtube.com/watch?v=gwI6g1pBD84,model substantially to make very misleading pi...,500,GLIDE: Towards Photorealistic Image Generation...,"[-0.023775776848196983, -0.0021778957452625036...",625
3,GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models,https://www.youtube.com/watch?v=gwI6g1pBD84,"the cat, and predict and learn to predict the ...",500,GLIDE: Towards Photorealistic Image Generation...,"[-0.01872330904006958, 0.016276435926556587, -...",621
4,GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models,https://www.youtube.com/watch?v=gwI6g1pBD84,"at each step is small enough, the posterior is...",500,GLIDE: Towards Photorealistic Image Generation...,"[-0.016091231256723404, 0.017658470198512077, ...",621


In [None]:
# Load the embeddings from the same dataset as `df` by reusing the `hf_ds`
# constant instead of repeating the dataset name.
document_embeddings = load_embeddings(hf_ds)



In [None]:
# Peek at the first (key, embedding) pair: show the key, the first five
# components and the total number of components.
example_entry = list(document_embeddings.items())[0]
example_key, example_vector = example_entry
print(f"{example_key} : {example_vector[:5]}... ({len(example_vector)} entries)")

(0, 'GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models', 'https://www.youtube.com/watch?v=gwI6g1pBD84') : [-0.023338761180639267, 0.01196559239178896, -0.00887258630245924, 0.011793375946581364, 0.016050564125180244]... (1536 entries)


In [None]:
# Show the three sections most similar to a sample question.
order_document_sections_by_query_similarity("Is OpenAI 'open'?", document_embeddings)[:3]

[(0.8543818997558141,
  (1624,
   '[ML News] AI-generated patent approved | Germany gets an analog to OpenAI | ML cheats video games',
   'https://www.youtube.com/watch?v=SPOqoI0zOPQ')),
 (0.8488043317477918,
  (250,
   '[ML News] BLOOM: 176B Open-Source | Chinese Brain-Scale Computer | Meta AI: No Language Left Behind',
   'https://www.youtube.com/watch?v=W3mrgqtm5R4')),
 (0.8472144938090321,
  (148,
   "[ML News] OpenAI's Whisper | Meta Reads Brain Waves | AI Wins Art Fair, Annoys Humans",
   'https://www.youtube.com/watch?v=S-7r0-oysaU'))]

In [None]:
# Assemble the prompt for a sample question and print it after a marker line.
prompt, sources = construct_prompt("Is OpenAI 'open'?", document_embeddings, df)

print("===\n", prompt)

Selected 3 document sections:
(1624, '[ML News] AI-generated patent approved | Germany gets an analog to OpenAI | ML cheats video games', 'https://www.youtube.com/watch?v=SPOqoI0zOPQ')
(250, '[ML News] BLOOM: 176B Open-Source | Chinese Brain-Scale Computer | Meta AI: No Language Left Behind', 'https://www.youtube.com/watch?v=W3mrgqtm5R4')
(148, "[ML News] OpenAI's Whisper | Meta Reads Brain Waves | AI Wins Art Fair, Annoys Humans", 'https://www.youtube.com/watch?v=S-7r0-oysaU')
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "This is not covered in my videos." Try imitating the style of the provided context. 

Context:

* [ML News] AI-generated patent approved | Germany gets an analog to OpenAI | ML cheats video games: which is 27 million in real money in a series a founding co led by early bird VC, Lake star and UBC partners. The team says it will have a strong commitment to open source commun

In [None]:
# Format the two highest-ranked sources as "title url" references.
res = "For more information, check out my following videos: "
for source in sources[:2]:
    # Each source is the string form of a (idx, title, url) tuple; literal_eval
    # parses it without executing code, which eval would do.
    src_lst = literal_eval(source)
    title = src_lst[1]
    url = src_lst[2]
    final_src = title + " " + url
    res += " " + final_src

print(res)

For more information, check out my following videos:  [ML News] AI-generated patent approved | Germany gets an analog to OpenAI | ML cheats video games https://www.youtube.com/watch?v=SPOqoI0zOPQ [ML News] BLOOM: 176B Open-Source | Chinese Brain-Scale Computer | Meta AI: No Language Left Behind https://www.youtube.com/watch?v=W3mrgqtm5R4


In [None]:
# Run the whole pipeline (prompt construction plus completion) on a sample question.
answer = answer_query_with_context("Who is Yann LeCun", df, document_embeddings)

Selected 4 document sections:
(2944, '[Drama] Yann LeCun against Twitter on Dataset Bias', 'https://www.youtube.com/watch?v=n1SXlK5rhR8')
(2943, '[Drama] Yann LeCun against Twitter on Dataset Bias', 'https://www.youtube.com/watch?v=n1SXlK5rhR8')
(741, "[ML News] DeepMind controls fusion | Yann LeCun's JEPA architecture | US: AI can't copyright its art", 'https://www.youtube.com/watch?v=YOLL8dIhLJI')
(1958, 'Yann LeCun - Self-Supervised Learning: The Dark Matter of Intelligence (FAIR Blog Post Explained)', 'https://www.youtube.com/watch?v=Ag1bw8MfHGQ')


In [None]:
# Display the generated answer together with the appended video references.
answer

'Yann LeCun is a French computer scientist and AI researcher. He is the Silver Professor of the Courant Institute of Mathematical Sciences, a professor of computer science at the Center for Data Science, and a professor of neural science at the New York University. He is also the founding director of the NYU Center for Data Science and the director of AI Research at Facebook. For more information, check out my following videos:  [Drama] Yann LeCun against Twitter on Dataset Bias https://www.youtube.com/watch?v=n1SXlK5rhR8 [Drama] Yann LeCun against Twitter on Dataset Bias https://www.youtube.com/watch?v=n1SXlK5rhR8'