In [54]:
import os
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import pytube
from pytube import YouTube
from langchain.docstore.document import Document
from youtube_transcript_api import YouTubeTranscriptApi

### Download Video using PyTube

In [2]:
VIDEO_URL = 'https://youtu.be/INHW_-HGCIs?si=T_4wZdVsryr5BLZo'

In [3]:
def downloadYTvideo(url, save_dir="../data"):
    yt = YouTube(url)
    video_title = yt.title
    video_id = pytube.extract.video_id(VIDEO_URL)
    # Issue: https://github.com/pytube/pytube/issues/1626
    yt.streams.first()

    video_description = yt.description
    video = yt.streams.filter(progressive=True).get_highest_resolution()
    try:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        
        print(f"Downloading video titled: {video_title}...")
        # video.download(save_dir)
        print("Videp Downloaded.")

        print('Downloading transcript')
        transcript_info = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        df = pd.DataFrame.from_dict(transcript_info) 
        df.to_csv(os.path.join(save_dir, 'transcript.csv'), index = False)
        print('Transcript saved.')
        
    except:
        print("Failed to download video")

In [None]:
downloadYTvideo(VIDEO_URL)

### Load Dataset

In [4]:
df = pd.read_csv('../data/transcript.csv')

In [5]:
df.head(5)

Unnamed: 0,text,start,duration
0,if you want to buy my course on how I,0.599,3.841
1,grew this YouTube following that's the,2.639,3.601
2,top Link in the description in this,4.44,3.9
3,video I'm going to teach you from my,6.24,4.56
4,current understanding how to love a,8.34,5.52


In [6]:
corpus = df.text.str.cat(sep = ' ')

In [7]:
print(corpus)

if you want to buy my course on how I grew this YouTube following that's the top Link in the description in this video I'm going to teach you from my current understanding how to love a woman and this gets so much deeper than just getting into a relationship and saying I love you to her it gets infinitely more deeper than the average relationship and if I can tell you a story of the last time that I took psychedelic mushrooms I felt 100 masculine I achieved 100 polarity at least that's what I believe with my woman and it was a beautiful magical experience we went up a mountain to a private part of like this place in Thailand and there we sat down we talked these like this mushroom like this dried thing which is like a psychedelic it's it's meant to enhance your consciousness and after around 10 15 minutes it starts the hit and if you've never experienced this before it's like the saturation of the world around you like the colors start to pop and it and brightness and feelings just wen

In [8]:
print('Number of characters in the corpus: ', len(corpus))
print('Unique characters: ', len(set(corpus)))

Number of characters in the corpus:  90462
Unique characters:  68


### Recursive Chunking

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [10]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 1024, chunk_overlap = 64, length_function = len)

In [11]:
document_corpus = splitter.split_text(corpus)

In [12]:
print('Number of document: ', len(document_corpus))

Number of document:  95


### Mistral7B For HyDe

In [13]:
from typing import Optional
from llama_cpp import Llama
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import PromptTemplate

In [14]:
class LLMHypothethicalQuestions:
    def __init__(
        self,
        gguf_ckpt: str,
        max_tokens: int = 1024,
        top_p: float = 0.97,
        temperature: float = 0.1,
        show_log: bool = False,
        prompt: Optional[str] = None,
    ):

        # Initialize the LLM
        self.llm = LlamaCpp(
            model_path=gguf_ckpt,
            temperature=temperature,
            max_tokens=max_tokens,
            top_p=top_p,
            verbose=show_log,
        )

        if prompt is None:
            INSTRUCTION_TEMPLATE = """You are a question designer, given a paragraph you generate a question from the paragraph.
            Let's Think Step By Step.
            Paragraph: {paragraph}
            Generated question:"""
            self.instruction_prompt = PromptTemplate(
                template=INSTRUCTION_TEMPLATE, input_variables=["paragraph"]
            )

        else:
            self.instruction_prompt = PromptTemplate(
                template=prompt, input_variables=["paragraph"]
            )

        # Build a chain
        self.llm_chain = self.instruction_prompt | self.llm

    def generate(self, paragraph: str) -> None:
        out = self.llm_chain.invoke({"paragraph": paragraph})
        return out

In [15]:
# Hyperparams
CONTEXT_LENGTH = 2048
MAXIMUM_BATCH_SIZE = 1048

In [16]:
mistral_cpp = Llama(model_path='../GGUF_models/mistral-7b-instruct-v0.1.Q5_K_M.gguf',
                    n_ctx = CONTEXT_LENGTH,
                    n_batch = MAXIMUM_BATCH_SIZE,
                    verbose = False)

In [17]:
INSTRUCTION_TEMPLATE = """
You are a question designer, given a paragraph you generate a question from the paragraph.
Let's Think Step By Step.
Paragraph: {paragraph}
Generated question:"""

In [18]:
instruction_prompt = INSTRUCTION_TEMPLATE.format(paragraph = document_corpus[12])

In [19]:
# Sample tokens
print(mistral_cpp.tokenize(b"Hello, world!", special = True))
print(mistral_cpp.tokenize(b"This is mistral", special = True))

[1, 22557, 28725, 1526, 28808]
[1, 851, 349, 5710, 1650]


In [20]:
MAXIMUM_TOKENS = 1048
TEMPERATURE = 0.1
TOP_P = 0.99

In [21]:
# Sample forward pass
response = mistral_cpp(
    prompt=instruction_prompt,
    max_tokens=MAXIMUM_TOKENS,
    temperature=TEMPERATURE,
    top_p=TOP_P,
)

In [24]:
response['choices'][0]['text'].strip()

'What is the point where you will have the girl, and why is it important to focus on self-improvement before that point?'

In [28]:
# TODO: Use TQDM 
ques_to_cntx = dict()

for index, doc in enumerate(document_corpus):
    print(f"Processing {index + 1} out of {len(document_corpus)}")
    instruction_prompt = INSTRUCTION_TEMPLATE.format(paragraph = doc)
    response = mistral_cpp(
        prompt=instruction_prompt,
        max_tokens=MAXIMUM_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
    )
    question = response['choices'][0]['text'].strip()
    ques_to_cntx[question] = doc

Processing 1 out of 95
Processing 2 out of 95
Processing 3 out of 95
Processing 4 out of 95
Processing 5 out of 95
Processing 6 out of 95
Processing 7 out of 95
Processing 8 out of 95
Processing 9 out of 95
Processing 10 out of 95
Processing 11 out of 95
Processing 12 out of 95
Processing 13 out of 95
Processing 14 out of 95
Processing 15 out of 95
Processing 16 out of 95
Processing 17 out of 95
Processing 18 out of 95
Processing 19 out of 95
Processing 20 out of 95
Processing 21 out of 95
Processing 22 out of 95
Processing 23 out of 95
Processing 24 out of 95
Processing 25 out of 95
Processing 26 out of 95
Processing 27 out of 95
Processing 28 out of 95
Processing 29 out of 95
Processing 30 out of 95
Processing 31 out of 95
Processing 32 out of 95
Processing 33 out of 95
Processing 34 out of 95
Processing 35 out of 95
Processing 36 out of 95
Processing 37 out of 95
Processing 38 out of 95
Processing 39 out of 95
Processing 40 out of 95
Processing 41 out of 95
Processing 42 out of 95
P

In [36]:
list(ques_to_cntx.keys())

['What is the experience of taking psychedelic mushrooms according to the speaker?',
 'What was your experience when you went on a picnic with your woman on a mountain and started feeling anxious, scared, and sick?',
 'What is the speaker describing in the paragraph?',
 'What would happen if you looked down and saw seven big, healthy-looking stray dogs on your side of the mountain while walking through a trip with someone?',
 'What would you do if a group of stray dogs were approaching you?',
 'What would you do if a pack of stray dogs were to run towards you?',
 'What visuals do you typically see when taking psychedelics?',
 'What is the speaker describing in this paragraph?',
 'What is the most meaningful way you have expressed your love to someone?',
 'What was your experience like four years ago when you were fapping multiple times while your girlfriend slept in the next room?',
 'What happened on Saturday night when you were supposed to see your girlfriend?',
 'What was the differ

In [56]:
documents = list(map(lambda x:  Document(page_content=x), ques_to_cntx.keys()))

### VectorDB

In [43]:
import torch.nn.functional as F
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

In [49]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)



In [58]:
db = Chroma.from_documents(documents, hf)