In [82]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import YoutubeLoader

import pinecone
import os

from dotenv import load_dotenv
import json
# Load the environment variables
load_dotenv()

True

### Extract youtube transcripts

In [53]:
# from etl.videos functions
json_path = "./data/videos.json"
with open(json_path) as f:
    video_infos = json.load(f)


In [55]:
len(video_infos)

2

### Chuck transcripts in smaller documents

In [88]:
base_url = "https://www.youtube.com/watch?v="
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

documents = []
for video_info in video_infos:
    video_id = video_info["id"]
    loader = YoutubeLoader.from_youtube_url(f"{base_url}{video_id}",add_video_info=True)
    transcript = loader.load()
    docs = text_splitter.split_documents(transcript)
    documents.extend(docs)
documents

 Document(page_content="structure in our cells called the mitochondria mitochondrial dysfunction is responsible for the majority of non-infectious diseases dysfunctional mitochondria are what's behind cardiovascular disease cancer in fact the cancer you have normal cells that turn into cancer cells because of damaged mitochondria it's impossible to get cancer unless you have damage to the mitochondria Alzheimer's which is Type 3 diabetes kidney liver disease all of these General metabolic diseases if you look closely at what's happening as a Common Thread deep inside the cell we have dysfunctional mitochondria there are a lot of things that destroy the mitochondria but at the top of the list is sugar and foods that turn into sugar very quickly like refined carbohydrates like starches breads pasta cereal crackers biscuits waffles pancakes muffins things like that and when we talk about sugar the majority sugar in the U.S is beet sugar and the majority of beets grown in the U.S are GMO s

### Create the embedding of the documents for semantic search

In [77]:
# create embeddings with OpenAI
OPENAI_API_KEY=os.getenv("OPENAI_API_KEY")

# change the string into a vector space that represent different documents
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base='', openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization='', allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None, headers=None)

### Pinecone
We will store our embeddings in the cloud so that they can persist. Pinecone allows us to do that for free.
First, we need to create a new account to get the api_key and environment, so we initialize Pinecone.
Second, we must create an index for our vector with the following setup: Dimension: 1536 and Metric: Cosine.

In [83]:
# init 
PINECONE_API_KEY=os.getenv("PINECONE_API_KEY")
PINECONE_ENV=os.getenv("PINECONE_ENV")

pinecone.init(
    api_key=PINECONE_API_KEY, 
    environment=PINECONE_ENV
)
index_name = "drberg"

### Get the embeddings and then pass them over the Pinecone

In [90]:
docsearch =  Pinecone.from_texts([doc.page_content for doc in documents], embeddings, index_name=index_name)

In [91]:
docsearch

<langchain.vectorstores.pinecone.Pinecone at 0x1251ea9d0>

### This are the documents that have the highest cosine similarity according to Pinecone

In [101]:
query = "Why does sugar a risk for blood vessels?"
docs = docsearch.similarity_search(query)
docs

 Document(page_content="you become a lot less hungry. Why? Because you stabilize your blood sugars and now your cells can be actually fed because when you're living on sugar, because sugar is toxic to the body, the body starts rejecting it. That's called insulin resistance. So the body is blocking insulin because that controls sugar and what your body is really trying to do is limit the amount of\nsugar inside the cells. So the body doesn't\nconsider it a good thing. It's a bad thing. So when you give it up,\nthis thing could reverse. And not only can you now\nabsorb proper amounts of fuel. You'll absorb nutrients a lot better too because insulin resistance also\nblocks nutrients, minerals, vitamins, and that's one of\nthe functions of insulin. All right, number three. You can have less fatigue especially after eating a meal. When you're eating sugar\non a regular basis, usually you're gonna\nbe tired after you eat and that is a blood sugar thing. But you're gonna find that you're no l

### Get the answer in Natural Language (NL)

In [96]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

llm = OpenAI(temperature=0, openai_api_key= OPENAI_API_KEY)
chain =  load_qa_chain(llm, chain_type="stuff")

In [99]:
query = "Why does sugar a risk for blood vessels?"
docs = docsearch.similarity_search(query)

### Run the chain and get the NL answer 

In [100]:
chain.run(input_documents=docs, question = query)

' Sugar can increase the risk for blood vessels because it can lead to insulin resistance, which blocks the absorption of nutrients, minerals, and vitamins. It can also lead to glycation, which is the damage of proteins in the blood, and can lead to mitochondrial dysfunction, which can cause oxidative stress and inflammation.'