In [None]:
!pip install git+https://github.com/openai/whisper.git

In [None]:
import gc
import pickle

import torch
import whisper
from langchain import HuggingFaceHub, HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from pytube import YouTube
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Smoke test for the Apple-silicon (MPS) backend: when it is available, allocate a
# one-element tensor on it and print it; otherwise report that it is missing.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

In [None]:
# Switch for the expensive ingestion step. When True, the videos are downloaded,
# transcribed with Whisper and the result is pickled to "pamuk.pkl"; when False,
# the previously pickled transcriptions are loaded from that file instead.
DOWNLOAD_AND_PROCESS_VIDEOS = False

In [None]:
# Characters that are path separators or otherwise rejected in file names on
# common platforms; they are replaced by "_" when building the output name.
_UNSAFE_FILE_NAME_CHARS = frozenset('<>:"/\\|?*')


def _title_to_file_name(title: str) -> str:
    """Turn a video title into a lowercase, file-system-safe ``.mp3`` file name."""
    stem = "".join(
        "_"
        if ch == " " or ch in _UNSAFE_FILE_NAME_CHARS or not ch.isprintable()
        else ch
        for ch in title.lower()
    )
    # An empty title would otherwise produce the hidden file ".mp3".
    return (stem or "audio") + ".mp3"


def download_video_and_return_file_name(url: str) -> str:
    """Download the audio track of a YouTube video into the working directory.

    The file name is derived from the video title: lower-cased, with spaces and
    characters that are unsafe in file names (``/``, ``:``, ``?`` ...) replaced
    by underscores, and an ``.mp3`` suffix appended. Titles made only of letters,
    digits and spaces map to the same name as before.

    Args:
        url: Address of the YouTube video.

    Returns:
        The file name (relative to the current directory) the audio was saved as.
    """
    yt = YouTube(url=url, use_oauth=True, allow_oauth_cache=True)
    title = yt.streams[0].title

    # The last audio-only stream in pytube's listing is the one that gets saved.
    video_audio = yt.streams.filter(only_audio=True)[-1]

    # NOTE(review): the stream is stored under an ".mp3" name whatever its actual
    # container is — confirm the downstream consumer detects the format itself.
    file_name = _title_to_file_name(title)
    video_audio.download(filename=file_name)
    return file_name

In [None]:
# Whisper transcription results, one entry per video, in `video_urls` order.
raw_transcriptions = []

# Videos whose audio is transcribed to build the knowledge base.
video_urls = [
    "https://youtu.be/l8yx1MPtWBc?si=FG3khkaRT7uJ_V2C",
    "https://youtu.be/2HV6sKCqeZo?si=KD9cxOHOzGhdw8pH",
    "https://youtu.be/W7bsI4d4YKY?si=qqwmGzpWLnTxt1Ed",
    "https://youtu.be/Ul_o0hMJMZA?si=xEWJjFYVYITz1khF",
]

# Pickle file used to cache the transcriptions between notebook runs.
TRANSCRIPTIONS_CACHE = "pamuk.pkl"

if DOWNLOAD_AND_PROCESS_VIDEOS:
    # Step 1: fetch the audio of every video.
    videos_locations = []
    for url in video_urls:
        file_name = download_video_and_return_file_name(url)
        videos_locations.append(file_name)
        print(f"For the video {url} has been downloaded!")
        gc.collect()

    # Step 2: transcribe each audio file with Whisper.
    whisper_model = whisper.load_model("large-v2")
    for file_name in videos_locations:
        raw_transcriptions.append(
            whisper_model.transcribe(file_name, task="transcribe", language="English")
        )
        gc.collect()

    # Step 3: cache the results so later runs can skip download and transcription.
    with open(TRANSCRIPTIONS_CACHE, "wb") as f:
        pickle.dump(raw_transcriptions, f)
else:
    try:
        with open(TRANSCRIPTIONS_CACHE, "rb") as f:
            # NOTE: unpickling can execute arbitrary code; only load a cache file
            # that was produced by this notebook.
            raw_transcriptions = pickle.load(f)
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"Transcription cache '{TRANSCRIPTIONS_CACHE}' was not found; set "
            "DOWNLOAD_AND_PROCESS_VIDEOS = True to download and transcribe the videos first."
        ) from exc

In [None]:
# Merge the "text" field of every transcription into one space-separated string.
scripts = " ".join(entry["text"] for entry in raw_transcriptions)

In [None]:
# Display the number of characters in the combined transcript.
len(scripts)

In [None]:
# Split the combined transcript on "." into chunks of up to 500 characters with a
# 100-character overlap, then keep only the text of each resulting document.
splitter_options = dict(chunk_size=500, chunk_overlap=100, separators=["."])
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

documents = text_splitter.create_documents([scripts])
sentences = list(map(lambda doc: doc.page_content, documents))

In [None]:
# Sanity check of the chunking step: number of chunks and the first five of them.
print(f"Size of sentences: {len(sentences)}")
print("Examples:")  # plain string: there is nothing to interpolate here
print(sentences[:5])

In [None]:
# Embedding model used to vectorise the transcript chunks; it is passed to
# FAISS.from_texts when the vector store is built.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Build the FAISS vector store from the transcript chunks using `embeddings`;
# it backs both the similarity search and the retriever used by the QA chain.
vector_store = FAISS.from_texts(sentences, embeddings)

In [None]:
# Retrieval smoke test: search the vector store for one question and print the
# text of the first returned chunk.
question = "what is the most difficult part of writing?"
searched_docs = vector_store.similarity_search(question)
first_hit = searched_docs[0]
print(first_hit.page_content)

In [None]:
# Release memory before loading the language model. The MPS cache is only touched
# (and MPS only selected as the target device) when the backend is available, so
# the cell also runs on machines without Apple-silicon GPU support.
gc.collect()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
    llm_device = "mps"
else:
    llm_device = "cpu"

model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model)
llm = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # NOTE(review): trust_remote_code allows code shipped with the checkpoint to
    # run locally — confirm it is actually needed for this model.
    trust_remote_code=True,
    # Generation settings are passed to the pipeline itself, next to
    # max_new_tokens / repetition_penalty. `model_kwargs` is forwarded to the
    # model's from_pretrained(), which is for loading options, not sampling.
    max_new_tokens=256,  # sole length limit; the conflicting max_length was dropped
    repetition_penalty=1.6,
    do_sample=True,
    temperature=1.4,
    top_k=3,
    # top_p is a cumulative probability and must not exceed 1; the former value
    # of 1.2 was out of range. 1.0 keeps nucleus filtering disabled.
    top_p=1.0,
    model_kwargs={"device_map": llm_device},
)

In [None]:
# Prompt text with two placeholders: {context} and {question}.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know. Use one sentence only once.
Respond in the persona of a talented writer gives some writing advices to the writers.

{context}

Question: {question}
Answer: 
"""

prompt = PromptTemplate(template=template, input_variables=["context", "question"])

# Wrap the transformers pipeline for LangChain and expose the FAISS store as a retriever.
hf_llm = HuggingFacePipeline(pipeline=llm)
retriever = vector_store.as_retriever()

# Retrieval-QA chain of type "stuff" that uses the custom prompt above.
chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=hf_llm,
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=prompt),
)

collected = gc.collect()

In [None]:
%%time

# Send one question through the retrieval-QA chain and print its "result" field.
query = "How can I create my character for my new novel?"
result = chain.invoke(dict(query=query))
print(result["result"])

In [None]:
%%time

# Send one question through the retrieval-QA chain and print its "result" field.
query = "How can I prepare myself for my new novel?"
result = chain.invoke(dict(query=query))
print(result["result"])

In [None]:
%%time

# Send one question through the retrieval-QA chain and print its "result" field.
query = "How many pages should I write everyday?"
result = chain.invoke(dict(query=query))
print(result["result"])

In [None]:
%%time

# Send one question through the retrieval-QA chain and print its "result" field.
query = "How should I prepare myself to start a new novel?"
result = chain.invoke(dict(query=query))
print(result["result"])

In [None]:
%%time

# Send one question through the retrieval-QA chain and print its "result" field.
query = "How can I benefit from other novels?"
result = chain.invoke(dict(query=query))
print(result["result"])