## Embeddings from Hugging Face

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader

# Sentence-transformers checkpoint, run on CPU, with embedding normalization
# switched off.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

# Embedding function handed to the vector store further down.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [None]:
from langchain.vectorstores import OpenSearchVectorSearch

# Vector store on the local OpenSearch node; texts are embedded with
# `embeddings` before being indexed or searched.
docsearch = OpenSearchVectorSearch(
    opensearch_url='https://localhost:9200',
    embedding_function=embeddings,
    # index_name='nextjs-video',
    # An explicit index is needed for ingestion: with index_name=None the
    # add_documents() call has no target index to write to. 'langchain-video'
    # is the index the "Test OpenSearch API" cells query and clear.
    index_name='langchain-video',
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=('admin', 'admin'),  # local test credentials only
    # NOTE(review): the URL scheme is https while use_ssl is False - confirm
    # which of the two the client is expected to honour.
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Adding some data

In [None]:
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=1000 and chunk_overlap=100, so
# neighbouring chunks share some text.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# transcript = YoutubeLoader('lG7Uxts9SXs&t=2373s').load()
# Load the transcript for the given YouTube video id, then cut it into chunks.
transcript = YoutubeLoader('H0vhkoXljq0').load()
docs = text_splitter.split_documents(transcript)

# Embed the chunks and store them in the OpenSearch vector store.
docsearch.add_documents(docs)

In [None]:
# Sanity check: fetch the 5 stored chunks most similar to the query "project".
# This rebinds `docs`, replacing the chunk list produced by the splitter.
docs = docsearch.similarity_search("project", k=5)
docs

## Import JSON data

In [None]:
from langchain.document_loaders import JSONLoader


def extract_metadata(record: dict, metadata: dict) -> dict:
    """Tag a record's metadata with the record's URL under the ``id`` key.

    Passed to ``JSONLoader`` as its ``metadata_func``.

    Args:
        record: One selected JSON record; must contain a ``url`` key.
        metadata: Metadata dict for that record; it is updated in place.

    Returns:
        The same ``metadata`` object, now carrying ``id``.

    Raises:
        KeyError: If ``record`` has no ``url`` key.
    """
    metadata.update(id=record['url'])
    return metadata

# Path of the JSON file to ingest, relative to the notebook's working directory.
JSON_PATH = './output.json'

# The jq schema selects every element of the top-level `commit_history` array;
# extract_metadata then copies each record's `url` into the metadata as `id`.
# NOTE(review): text_content=False presumably allows non-string records to be
# loaded - confirm against the JSONLoader documentation.
loader = JSONLoader(
    file_path=JSON_PATH,
    jq_schema='.commit_history[]',
    text_content=False,
    metadata_func=extract_metadata
)

# Read the file and build the documents.
documents = loader.load()

## LLM model

In [None]:
import os
from dotenv import load_dotenv
from langchain.llms.together import Together

# Pull variables from a .env file into the process environment, then read the
# Together API key from it (None when the variable is not set).
load_dotenv()

TOGETHER_API = os.environ.get('TOGETHER_API')

# Llama-2 13B chat served by Together, limited to 500 generated tokens.
model = Together(
    model='togethercomputer/llama-2-13b-chat',
    max_tokens=500,
    together_api_key=TOGETHER_API,
)

In [None]:
from palmLLM import PalmLLM


# Rebinds `model`: if this cell runs after the Together cell, the chains built
# afterwards use PalmLLM instead.
model = PalmLLM()

# YouTube transcript

In [None]:
# Retriever over the vector store, configured with search_kwargs k=5.
retriever = docsearch.as_retriever(search_kwargs={"k": 5})

In [None]:
def format_docs(docs):
    """Join the ``page_content`` of every document into one space-separated string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The contents in their original order, separated by single spaces;
        an empty string when ``docs`` is empty.
    """
    return " ".join(document.page_content for document in docs)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema import StrOutputParser
 
# Prompt telling the model to answer only from the supplied transcript text.
prompt = PromptTemplate(
    template='''
                            You are a helpful Youtube assistant that can answer questions about videos based on the video's transcript.
                            
                            Answer the following question : "{question}"
                            Based on the following transcript : "{docs}"
                            
                            Only use the factual information within the transcript.
                            ''',
    input_variables=['question', 'docs'],
)

# The incoming question is passed through unchanged as "question" and is also
# fed to the retriever, whose results format_docs joins into the "docs" string.
chain = (
    {"docs": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
question = "how many videos are there?"

# Print every streamed piece of the answer on its own line as soon as it arrives.
for chunk in chain.stream(question):
    print(chunk, flush=True)

# Using OpenSearch

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

# Answer the following question : {question}

# Transcript-QA prompt: the retrieved transcript comes first, then the question.
template = """
You are a helpful Youtube assistant that can answer questions about videos based on the video's transcript.

Based on the following transcript : "{docs}"

Answer the user question "{question}".

Only use the factual information within the transcript.
"""

prompt = PromptTemplate(template=template, input_variables=['docs', 'question'])

# "question" is the raw chain input; "docs" is the retriever output for that
# input, joined into one string by format_docs.
chain = (
    {"question": RunnablePassthrough(), "docs": retriever | format_docs}
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
question = "what's the projects discussed?"
# chain.invoke(question)
# Stream the answer, printing each piece on its own line as it is produced.
for chunk in chain.stream(question):
    print(chunk, flush=True)
    

## Test OpenSearch API

In [None]:
from opensearchpy import OpenSearch

# Low-level OpenSearch client used to inspect and reset the index directly.
auth = ('admin', 'admin') # For testing only. Don't store credentials in code.
# NOTE(review): the host URL uses https while use_ssl is False - confirm which
# of the two is intended. Certificate and hostname checks are disabled below.
client = OpenSearch(
    hosts = 'https://localhost:9200',
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

In [None]:
# Show the 'hits' part of a search on the "langchain-video" index (no query body is passed).
client.search(index="langchain-video")['hits']

In [None]:
# DESTRUCTIVE: the match_all query selects every document in "langchain-video"
# for deletion. Run only to reset the ingested data.
client.delete_by_query(index="langchain-video", body={"query": {"match_all": {}}})

## Conversational

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

from langchain.memory import ConversationBufferMemory
from langchain.chains import LLMChain
from operator import itemgetter
# Answer the following question : {question}

# Transcript-QA prompt with two placeholders: {docs} (retrieved transcript
# text) and {question} (the user's question).
template = """
You are a helpful Youtube assistant that can answer questions about videos based on the video's transcript.

Based on the following transcript : "{docs}"

Answer the user question "{question}".

Only use the factual information within the transcript.
"""

prompt = PromptTemplate(input_variables=['docs', 'question'], template=template)


# Retrieval chain without memory. The following cell rebinds `chain`, so this
# definition only matters if that cell is skipped.
chain = (
    {'docs': retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model 
    | StrOutputParser()
)



In [None]:
# Conversation buffer. return_messages=True makes it return message objects
# rather than one flattened string, which is what MessagesPlaceholder expects.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

# The plain `prompt` only has {docs} and {question}, so history supplied under
# a separate 'memory' key was never rendered into the model input. This chat
# prompt has an explicit slot for the history.
conversational_prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are a helpful Youtube assistant that can answer questions about "
        "videos based on the video's transcript.\n\n"
        'Based on the following transcript : "{docs}"\n\n'
        "Only use the factual information within the transcript.",
    ),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
])

chain = (
    {
        # Earlier turns from memory; the key name must match the placeholder.
        "chat_history": RunnableLambda(memory.load_memory_variables) | itemgetter("chat_history"),
        # Retrieved transcript chunks joined into one string.
        "docs": retriever | format_docs,
        # The raw user question, passed through unchanged.
        "question": RunnablePassthrough(),
    }
    | conversational_prompt
    | model
    | StrOutputParser()
)

In [None]:
# Interactive loop: answer questions until the user submits an empty line.
while True:
    question = input()
    if question == '':
        # The empty line is the exit signal; do not send it to the chain.
        break
    # resp = qa(question)
    resp = chain.invoke(question)
    print(resp)
    # Record the turn so that later questions can draw on it as chat history.
    memory.save_context({"input": question}, {"output": resp})