In [2]:
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQAWithSourcesChain
import pandas as pd

In [4]:
# Load the YouTube video dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run on another machine until it is pointed at a local copy of the file.
data_path = "/home/kamal/gitfolders/website/vector/selected_vid.csv"
vid_data = pd.read_csv(data_path)

In [5]:
# Preview the first two rows to check which columns were loaded.
vid_data.head(2)

Unnamed: 0,video_id,title,categoryId,thumbnail_link,description
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,22.0,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,20.0,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,"While running her own modding shop, Ramya Pare..."


In [42]:
# (rows, columns) of the frame.
vid_data.shape

(307, 5)

In [50]:
# Keep only the rows that actually have a description, so that every
# remaining description can be handed to the text splitter further down.
vid_data = vid_data[vid_data.description.notna()]

In [9]:
# Scratch check of the isinstance(value, str) test that the description-length
# filter in the next cell relies on.
isinstance('hi',str)

True

In [51]:
# How many videos have a description of 500 characters or more?
# `.str.len()` is the vectorised counterpart of calling len() on every row.
# It yields NaN for missing values; fillna(0) maps those to length 0 so such
# rows can never pass the threshold.
long_description = vid_data.description.str.len().fillna(0) >= 500

vid_data[long_description]

# Around 160+ videos have descriptions of 500 or more characters.

Unnamed: 0,video_id,title,categoryId,thumbnail_link,description
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,20.0,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,"While running her own modding shop, Ramya Pare..."
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,24.0,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,I left youtube for a month and this is what ha...
3,kXLn3HkpjaA,XXL 2020 Freshman Class Revealed - Official An...,10.0,https://i.ytimg.com/vi/kXLn3HkpjaA/default.jpg,Subscribe to XXL → http://bit.ly/subscribe-xxl...
4,VIUo6yapDbc,Ultimate DIY Home Movie Theater for The LaBran...,26.0,https://i.ytimg.com/vi/VIUo6yapDbc/default.jpg,Transforming The LaBrant Family's empty white ...
5,w-aidBdvZo8,I Haven't Been Honest About My Injury.. Here's...,24.0,https://i.ytimg.com/vi/w-aidBdvZo8/default.jpg,Subscribe To My Channel - https://www.youtube....
...,...,...,...,...,...
302,5WjcDji3xYc,Honest Trailers | Avatar: The Last Airbender,1.0,https://i.ytimg.com/vi/5WjcDji3xYc/default.jpg,►►Subscribe to ScreenJunkies!► https://fandom....
303,AMXT1ok5UBg,THIS IS THE END.,22.0,https://i.ytimg.com/vi/AMXT1ok5UBg/default.jpg,THIS IS THE END.Shop on The RealReal and get $...
304,9nidKH8cM38,TAXI CAB SLAYER KILLS 'TO KNOW HOW IT FEELS',27.0,https://i.ytimg.com/vi/9nidKH8cM38/default.jpg,The first 1000 people to click the link will g...
305,mLOe7vGI0YI,Beerus VS Sailor Galaxia (Dragon Ball VS Sailo...,24.0,https://i.ytimg.com/vi/mLOe7vGI0YI/default.jpg,"They live for destruction, but now one must de..."


In [52]:
# Descriptions are broken into smaller chunks before they are stored in
# FAISS; the intent is a better retrieval experience.

from langchain.text_splitter import RecursiveCharacterTextSplitter

descrip_split = RecursiveCharacterTextSplitter(
    chunk_size=300,        # target maximum length of a single chunk
    chunk_overlap=0,       # consecutive chunks share no text
    length_function=len,   # lengths are measured in characters
    add_start_index=True,  # store each chunk's offset as metadata['start_index']
)

In [21]:
# Embedding model for the description chunks: the all-MiniLM-L6-v2 checkpoint,
# loaded through LangChain's sentence-transformers wrapper. The same object is
# passed both when the FAISS store is built and when it is reloaded.
embedding = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [53]:
# Convert the frame into a list of per-row dicts (column name -> value) so the
# rows can be iterated as plain Python records.
vid_dict_array = vid_data.to_dict(orient='records')

In [54]:
# Inspect the first record to see which fields are available for a video.
vid_dict_array[0]

{'video_id': '3C66w5Z0ixs',
 'title': 'I ASKED HER TO BE MY GIRLFRIEND...',
 'categoryId': 22.0,
 'thumbnail_link': 'https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg',
 'description': 'SUBSCRIBE to BRAWADIS ▶ http://bit.ly/SubscribeToBrawadis\r\rFOLLOW ME ON SOCIAL\r▶ Twitter: https://twitter.com/Brawadis\r▶ Instagram: https://www.instagram.com/brawadis/\r▶ Snapchat: brawadis\r\rHi! I’m Brandon Awadis and I like to make dope vlogs, pranks, reactions, challenges and basketball videos. Don’t forget to subscribe and come be a part of the BrawadSquad!'}

In [56]:
# Trial run of the splitter on a single record (index 2): its description is
# chunked and every chunk is tagged with the video title under 'source'.
sample_vid = vid_dict_array[2]
descrip_split.create_documents(
    [sample_vid['description']],
    metadatas=[{'source': sample_vid['title']}],
)

[Document(page_content="I left youtube for a month and this is what happenedMY COFFEE COMPANY: https://twitter.com/TOTMCoffeeJoin the subreddit: https://www.reddit.com/r/jacksepticeye/MORE MEMES ► https://www.youtube.com/watch?v=wGdn6ldQTTg&list=PLMBYlcH3smRxmCZzsUyrxB0IyKSQAU0pPJacksepticeye's Funniest Home Videos:", metadata={'source': 'I left youtube for a month and THIS is what happened.', 'start_index': 0}),
 Document(page_content='https://www.youtube.com/watch?v=VqfLcdpBasY&list=PLMBYlcH3smRxOk7Cp_V2ar3QDfvljWvSyEdited by: https://twitter.com/DaveDelirious►Twitter : https://twitter.com/Jack_Septic_Eye►Instagram: http://instagram.com/jacksepticeye', metadata={'source': 'I left youtube for a month and THIS is what happened.', 'start_index': 294})]

In [55]:
# Chunk every video's description and gather all chunks in one flat list.
# Each chunk's metadata carries the video title under the 'source' key.
documents = []

for vid in vid_dict_array:
    chunks = descrip_split.create_documents(
        [vid['description']],
        metadatas=[{'source': vid['title']}],
    )
    documents += chunks

In [57]:
# Embed every chunk with `embedding` and build a FAISS vector store from them.
vector_db = FAISS.from_documents(documents=documents,embedding=embedding)

In [58]:
# Persist the store to the 'vector_db' folder (a relative path) so it can be
# reloaded later.
vector_db.save_local('vector_db')

In [62]:
# Reload the persisted store from the 'vector_db' folder, passing the same
# embedding object that was used to build it.
# NOTE(review): `load_db` is not used by the cells visible below, which keep
# working with `vector_db` — confirm whether that is intended.
# NOTE(review): LangChain's FAISS persistence is, as far as I know, partly
# pickle-based — only load folders from a trusted source; verify this for the
# installed LangChain version.
load_db = FAISS.load_local(folder_path="vector_db",embeddings=embedding)

In [59]:
# Wrap the vector store in a retriever with default settings — presumably for
# the RetrievalQAWithSourcesChain imported at the top; its use is not visible
# in this part of the notebook.
vec_retriever = vector_db.as_retriever()

In [60]:
# Spot-check retrieval: fetch the stored chunks most similar to a sample query.
vector_db.similarity_search("Explain about Coffee")

[Document(page_content='https://ijoc.org/index.php/ijoc/article/view/11777/2907https://apps.fas.usda.gov/newgainapi/api/report/downloadreportbyfilename?filename=Coffee%20Annual_Bogota_Colombia_5-14-2018.pdf Andy Jarvis https://link.springer.com/article/10.1007/s10584-012-0500-y Coffee is one of the most popular', metadata={'source': 'The global coffee crisis is coming', 'start_index': 749}),
 Document(page_content="changing. Now, experts estimate the amount of land that can sustain coffee will fall 50 percent by 2050. It's not just a crisis for consumers but for the millions who have made a livelihood out of growing coffee. Vox.com is a news website that helps you cut through the noise and understand what's", metadata={'source': 'The global coffee crisis is coming', 'start_index': 1333}),
 Document(page_content="commodities on Earth. It's grown by nearly 125 million farmers, from Latin America to Africa to Asia. But as man-made climate change warms the atmosphere, the notoriously parti