# LangChain QA Scrap Pad

In [1]:
import os
from typing import List
import pandas as pd
import redis
from langchain.vectorstores.redis import Redis
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

# Load variables from a .env file into the process environment so the
# os.getenv lookups in the configuration cell can see them.
load_dotenv()

True

In [2]:
# Runtime configuration, taken from environment variables.
# A variable that is not set resolves to None.
CACHE_TYPE = os.environ.get("CACHE_TYPE")
REDIS_URL = os.environ.get("REDIS_URL")  # connection URL handed to the Redis vector store
OPENAI_COMPLETIONS_ENGINE = os.environ.get("OPENAI_COMPLETIONS_ENGINE")
INDEX_NAME = os.environ.get("INDEX_NAME")

## 1. Set Up LangChain Components

We will use the index built in [this notebook](1.0.0-han-langchain-indexing.ipynb) for our Q&A bot.

In [3]:
# LLM that writes the answers, and the embedding model handed to the vector store.
llm = OpenAI()
embeddings = OpenAIEmbeddings()

# Attach to the Redis index that already exists (nothing is indexed here).
# Use the configured INDEX_NAME instead of ignoring it; fall back to the
# previously hard-coded 'chat_index' when the variable is unset or empty.
vectorstore = Redis.from_existing_index(
    redis_url=REDIS_URL,
    index_name=INDEX_NAME or 'chat_index',
    embedding=embeddings,
)

In [4]:
# Prompt text for the QA chain. It has two placeholders, {context} and
# {question}, and tells the model to admit when it does not know the answer
# rather than invent one.
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, say that you don't know, don't try to make up an answer.

This should be in the following format:

Question: [question here]
Answer: [answer here]

Begin!

Context:
---------
{context}
---------
Question: {question}
Answer:"""

In [5]:
# Wrap the raw template string; the declared variables match the
# {context} and {question} placeholders in prompt_template.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [6]:
# Retrieval QA chain: documents come from the Redis vector store's retriever,
# the "stuff" chain type is used with the custom prompt, and the retrieved
# source documents are returned alongside the answer.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [7]:
# Try the chain on one sample question and display the full response
# (query, result and source documents).
question = "What is the best way to train models for tabular data?"
chain({"query": question})

{'query': 'What is the best way to train models for tabular data?',
 'result': ' It depends on the dataset size and type. Generally, boosted decision trees (XGBoost, CatBoost, or LightGBM) are faster to train and on par with neural networks. However, it is possible to get better results with neural networks if the hyperparameter ranges are carefully chosen and the data is transformed to the model correctly.',
 'source_documents': [Document(page_content='U01HNA2UH60: Is anyone here doing tabular deep learning?U01CTELE17D: We do neural nets, but usually trees wins on our leader boards.U01HNA2UH60: hmm that’s interesting. i’m wondering if the scale of data is a factor?U01CTELE17D: Possibly, but with smart sampling and very limited hyper parameter tuning you can get very accurate model, why bother with neural nets in the first place? where in order to get similar or slightly worse results requires 10x more time and resources.', metadata={'channel_name': 'mlops-questions-answered', 'thread_

In [8]:
# Ask the same question again and show only the answer text.
# Each call queries the LLM afresh, so the wording may differ between runs.
chain({"query": question})["result"]

' It depends on the data, but it is often suggested that gradient-boosted tree ensembles still outperform deep learning models, and some have found success using a library called pytorch-widedeep. For time series data, some NN models have been seen to work well from the darts library.'

## 2. Answer Generation

In [9]:
# Load the list of questions to answer and preview the first rows.
df = pd.read_csv(filepath_or_buffer='../data/questions_list.csv')
df.head()

Unnamed: 0,question,answer
0,Has anyone used Erlang or Elixir in production...,
1,Is Object Oriented Programming (OOP) terrible ...,
2,How do you track inferences over time for ML o...,
3,Is there a blog or repo with code for a simple...,
4,What are some best practices or open source fr...,


In [10]:
def qa(question: str):
    """Answer a single question with the retrieval QA chain.

    Args:
        question: Question text passed to the chain under the 'query' key.

    Returns:
        The value stored under the 'result' key of the chain's response.
    """
    response = chain({'query': question})
    return response['result']

# Answer every question (one chain call per row) and store the result.
df['answer'] = df['question'].apply(qa)
# index=False: the frame already carries an 'Unnamed: 0' column from its
# source CSV; writing the index again would add another unnamed column to
# the output file.
df.to_csv('../data/result_list.csv', index=False)