In [101]:
import os
import numpy as np
import pandas as pd
from typing import Optional
from tqdm.notebook import tqdm

import pytube
from pytube import YouTube
from langchain.docstore.document import Document
from youtube_transcript_api import YouTubeTranscriptApi

from langchain import hub
from llama_cpp import Llama
from langchain_chroma import Chroma
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter

### Download Video using PyTube

In [2]:
VIDEO_URL = 'https://youtu.be/QmOF0crdyRU?si=5PZa8XlWtyXlYopb'

In [3]:
def downloadYTvideo(url, save_dir="../data"):
    yt = YouTube(url)
    video_title = yt.title
    video_id = pytube.extract.video_id(VIDEO_URL)
    # Issue: https://github.com/pytube/pytube/issues/1626
    yt.streams.first()

    video_description = yt.description
    video = yt.streams.filter(progressive=True).get_highest_resolution()
    try:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        
        print(f"Downloading video titled: {video_title}...")
        # video.download(save_dir)
        print("Videp Downloaded.")

        print('Downloading transcript')
        transcript_info = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        df = pd.DataFrame.from_dict(transcript_info) 
        df.to_csv(os.path.join(save_dir, 'transcript.csv'), index = False)
        print('Transcript saved.')
        
    except:
        print("Failed to download video")

In [4]:
downloadYTvideo(VIDEO_URL)

Downloading video titled: Controlling Your Dopamine For Motivation, Focus & Satisfaction...
Videp Downloaded.
Downloading transcript
Transcript saved.


### Load Dataset

In [5]:
df = pd.read_csv('../data/transcript.csv')

In [6]:
df.head(5)

Unnamed: 0,text,start,duration
0,"- Welcome to the Huberman Lab Podcast,",0.053,2.247
1,where we discuss science\nand science-based tools,2.3,2.63
2,for everyday life.,4.93,1.201
3,[energetic music],6.131,3.399
4,"I'm Andrew Huberman,",9.53,0.97


In [7]:
corpus = df.text.str.cat(sep = ' ')

In [8]:
print(corpus)

- Welcome to the Huberman Lab Podcast, where we discuss science
and science-based tools for everyday life. [energetic music] I'm Andrew Huberman, and I'm a professor of
neurobiology and ophthalmology at Stanford School of Medicine. Today, we are going to
talk all about dopamine and what drives you to do
the things that you do. We're going to talk about
motivation and desire and craving, but also how dopamine
relates to satisfaction and our feelings of wellbeing. And of course, any
discussion about dopamine has to include a discussion
about the potential for dopamine induced addiction. Indeed, dopamine lies at
the heart of addiction to all things. But today we are mainly going to focus on, how what we do and how we do it, and how we conceptualize those things leads to changes in this amazing molecule in our brain and bodies
that we call dopamine. I'm going to teach you what
dopamine is and what it is not. There are a lot of myths
about the molecule dopamine. We often hear about
so-calle

In [9]:
print('Number of characters in the corpus: ', len(corpus))
print('Unique characters: ', len(set(corpus)))

Number of characters in the corpus:  132375
Unique characters:  73


### Recursive Chunking

In [22]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 512, chunk_overlap = 64, length_function = len)

In [23]:
document_corpus = splitter.split_text(corpus)

In [25]:
document_corpus[95]

"going to be variation there, but that's the average\nincrease in baseline dopamine caused by sex. Later, I will talk about\nhow the different aspects of the so-called arousal art, the different aspects of sex, believe it or not have a\ndifferential impact on dopamine. But for now as a general\ntheme or activity, sex doubles the amount\nof dopamine circulating in your blood. Nicotine. In particular, nicotine that is smoked like cigarettes and so forth, increases dopamine two and"

In [26]:
print('Number of document: ', len(document_corpus))

Number of document:  312


### Mistral7B For HyDe

In [27]:
class LLMHypothethicalQuestions:
    def __init__(
        self,
        gguf_ckpt: str,
        max_tokens: int = 1024,
        top_p: float = 0.97,
        temperature: float = 0.1,
        show_log: bool = False,
        prompt: Optional[str] = None,
    ):

        # Initialize the LLM
        self.llm = LlamaCpp(
            model_path=gguf_ckpt,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            verbose=show_log,
        )

        if prompt is None:
            INSTRUCTION_TEMPLATE = """You are a question designer, given a paragraph you generate a question from the paragraph.
            Let's Think Step By Step.
            Paragraph: {paragraph}
            Generated question:"""
            self.instruction_prompt = PromptTemplate(
                template=INSTRUCTION_TEMPLATE, input_variables=["paragraph"]
            )

        else:
            self.instruction_prompt = PromptTemplate(
                template=prompt, input_variables=["paragraph"]
            )

        # Build a chain
        self.llm_chain = self.instruction_prompt | self.llm

    def generate(self, paragraph: str) -> None:
        out = self.llm_chain.invoke({"paragraph": paragraph})
        return out

In [28]:
# Hyperparams
CONTEXT_LENGTH = 2048
MAXIMUM_BATCH_SIZE = 1048

In [45]:
mistral_cpp = Llama(model_path='../GGUF_models/mistral-7b-instruct-v0.1.Q5_K_M.gguf',
                    n_ctx = CONTEXT_LENGTH,
                    n_batch = MAXIMUM_BATCH_SIZE,
                    n_threads = 10,
                    verbose = False)

In [56]:
INSTRUCTION_TEMPLATE = """
You are a question designer, given a paragraph you generate a question from the paragraph.
Let's Think Step By Step. Only Generate one question and just return the question.
Paragraph: {paragraph}
Generated question:"""

In [69]:
instruction_prompt = INSTRUCTION_TEMPLATE.format(paragraph = document_corpus[200])

In [71]:
document_corpus[200]

"really improve their symptoms. But of course there's a\nlot of non-prescription, non-clinical use of\nthose compounds as well. And it stands to reason that\nthe use of those substances to increase dopamine\ncould very well provide the same sort of blockade\nof neuroplasticity that cocaine and amphetamine do. Because when you look at the\namount of dopamine increase that's triggered by those compounds, it's really comparable. So again, a cautionary note"

In [72]:
# Sample tokens
print(mistral_cpp.tokenize(b"Hello, world!", special = True))
print(mistral_cpp.tokenize(b"This is mistral", special = True))

[1, 22557, 28725, 1526, 28808]
[1, 851, 349, 5710, 1650]


In [73]:
MAXIMUM_TOKENS = 1048
TEMPERATURE = 0.1
TOP_P = 0.99

In [74]:
# Sample forward pass
response = mistral_cpp(
    prompt=instruction_prompt,
    max_tokens=MAXIMUM_TOKENS,
    temperature=TEMPERATURE,
    top_p=TOP_P,
)

In [75]:
# Output
response['choices'][0]['text'].strip()

'What is the potential impact of non-prescription use of certain compounds on neuroplasticity?'

In [76]:
ques_to_cntx = dict()

for doc in tqdm(document_corpus, desc="Generating hypothetical questions"):
    instruction_prompt = INSTRUCTION_TEMPLATE.format(paragraph=doc)
    response = mistral_cpp(
        prompt=instruction_prompt,
        max_tokens=MAXIMUM_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
    )
    question = response["choices"][0]["text"].strip()
    ques_to_cntx[question] = doc

Generating hypothetical questions:   0%|          | 0/312 [00:00<?, ?it/s]

In [77]:
hyde_df = pd.DataFrame(list(ques_to_cntx.items()), columns=['Question', 'Answer'])
hyde_df.to_csv('../data/hyde.csv', index = False)

In [78]:
hyde_df = pd.read_csv('../data/hyde.csv')
hyde_df

Unnamed: 0,Question,Answer
0,What is the relationship between dopamine and ...,"- Welcome to the Huberman Lab Podcast, where w..."
1,What is the relationship between dopamine and ...,discussion about dopamine has to include a dis...
2,What is a dopamine schedule?,about the molecule dopamine. We often hear abo...
3,"What is the relationship between caffeine, por...","caffeine, pornography, even some plant-based c..."
4,How can leveraging dopamine help sustain energ...,"with a lot of tools, how to leverage dopamine ..."
...,...,...
306,How can I leave a review or comment for the po...,"to us on Apple and Spotify. And on Apple, you ..."
307,What is the website address for supporting the...,tax deductible donation to the research in my ...
308,What is the reason for partnering with Thorne?,the supplements you use be of very high qualit...
309,What supplements does David take and where can...,"supplements that I take, you can go to Thorne,..."


In [79]:
ques_to_cntx = dict(zip(hyde_df['Question'], hyde_df['Answer']))

In [80]:
# Generate document
documents = list(map(lambda x: Document(page_content=x), ques_to_cntx.keys()))

In [81]:
documents[:10]

[Document(page_content='What is the relationship between dopamine and motivation?'),
 Document(page_content='What is the relationship between dopamine and addiction?'),
 Document(page_content='What is a dopamine schedule?'),
 Document(page_content='What is the relationship between caffeine, pornography, and plant-based compounds in terms of their effect on dopamine levels?'),
 Document(page_content='How can leveraging dopamine help sustain energy, drive, and motivation for important tasks over long periods of time?'),
 Document(page_content='How does the study published in the European Journal of Physiology investigate the effects of high increases in dopamine on the human body?'),
 Document(page_content='What was the effect of cold water exposure on norepinephrine, epinephrine (adrenaline), and dopamine levels in the body?'),
 Document(page_content='What is the effect of cold water therapy on metabolism and fat loss?'),
 Document(page_content='How does cold water exposure improve clar

### VectorDB

In [82]:
# Using nomic embeddings
# READ: https://blog.nomic.ai/posts/nomic-embed-text-v1
embd = Llama(model_path="../GGUF_models/nomic-embed-text-v1.Q8_0.gguf", embedding=True, verbose = False, normalize = True)

In [83]:
class GGUFEmbedding:
    def __init__(self, model_path):
        self.model = Llama(
            model_path=model_path,
            embedding=True,
            verbose=False
        )

    def embed_documents(self, query):
        return embd.embed(query)
    
    def embed_query(self, query):
        return embd.embed(query)

In [84]:
llamacpp_embd = GGUFEmbedding(model_path = '../GGUF_models/nomic-embed-text-v1.Q8_0.gguf')

In [85]:
out = llamacpp_embd.embed_documents('Hello world')
print('Shape of output embeddings: ', np.array(out).shape)

Shape of output embeddings:  (768,)


In [86]:
db = Chroma.from_documents(documents, llamacpp_embd)

In [92]:
retriever = db.as_retriever(similarity = 'similarity', search_kwargs = {
    'k' : 2,
})

In [96]:
retriever.invoke('What protocols to follow to increase dopamine baseline?')

[Document(page_content='What are some ways to increase dopamine levels?'),
 Document(page_content='What are some supplements that can increase baseline levels of dopamine without taking prescription pharmaceutical compounds?')]

## Talk To Youtube

In [97]:
TEMPERATURE = 0.1
TOP_P = 0.97
MAX_TOKENS = 16000

In [98]:
llama = LlamaCpp(
    model_path='../GGUF_models/llama-2-7b-chat.Q5_K_S.gguf',
    temperature=TEMPERATURE,
    max_tokens=MAX_TOKENS,
    top_p=TOP_P,
    verbose=False,
    n_threads = 10
)

In [102]:
chat_template = hub.pull("rlm/rag-prompt")


while True:
    user_input = input("USER: ")

    print("USER: ", user_input)

    documents = retriever.invoke(user_input)
    neigh_contexts = list(set(map(lambda x: ques_to_cntx[x.page_content], documents)))

    input_tokens = chat_template.format(
        question=user_input, context=".".join(neigh_contexts)
    )

    for chunk in llama.stream(input_tokens):
        print(chunk, end="", flush=True)

    print()

USER:  What to do when you get the urge of doing a bad habit?
 If you find yourself experiencing a drop in your baseline level of dopamine due to engagement with an activity or substance that led to big peaks, it's important to take steps to address the issue. This may involve going cold turkey or gradually tapering off the activity or substance to limit interactions and prevent further depletion of dopamine levels. It can also be helpful to seek professional help, such as a therapist or counselor, who can provide guidance and support in managing addictive tendencies.
USER:  What habits can raise the dopamine baseline?
 Activities that increase dopamine include exercise, learning new skills, and practicing mindfulness. Cold showers and ice baths are also known to increase dopamine levels. The exact amount by which these activities raise dopamine is not specified in the context provided.
USER:  By how much Cold showers increase baseline?
 Based on the context provided, cold showers can 

KeyboardInterrupt: Interrupted by user