<a href="https://colab.research.google.com/github/kashindra-mahato/NLP/blob/main/FAQbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prepare Environment
## Colab: Enable the GPU runtime

In [None]:
%%bash

# Show the GPU attached to this runtime. When nvidia-smi exits non-zero
# (e.g. no GPU runtime is enabled) the %%bash cell would raise
# CalledProcessError and interrupt "Run all", so print a hint instead.
nvidia-smi || echo "No GPU detected. In Colab: Runtime > Change runtime type > Hardware accelerator > GPU."

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



CalledProcessError: ignored

In [None]:
%%bash

# Upgrade pip first, then install Haystack (with its "colab" extra) straight
# from the GitHub repository.
pip install --upgrade pip
# The requirement is quoted so the shell never treats "[colab]" as a glob pattern.
# NOTE(review): this installs whatever is currently on the default branch;
# consider pinning a tag/commit so the notebook stays reproducible.
pip install "git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]"

## Logging
- We configure how logging messages are displayed and which log level is used before importing Haystack. Example log message: `INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt`. The default log level in `basicConfig` is `WARNING`, so the explicit `level` parameter is not strictly necessary, but spelling it out makes it easy to change:

In [None]:
import logging

# Root logger: compact "LEVEL - logger name -  message" lines, warnings and above.
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)

# Loggers under the "haystack" namespace additionally emit INFO messages.
haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

## Mount Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at the path below.
mount_point = '/content/drive'
drive.mount(mount_point)

## Create a simple DocumentStore

In [None]:
from haystack.document_stores import InMemoryDocumentStore

# Document store created with the library defaults. The retriever is bound to
# it and the indexing cell writes the embedded FAQ questions into it.
document_store = InMemoryDocumentStore()

## Create a Retriever using embeddings

## sentence-transformers/multi-qa-distilbert-cos-v1

In [None]:
from haystack.nodes import EmbeddingRetriever

# Retriever that embeds text with a sentence-transformers model and searches
# the document store created above.
retriever = EmbeddingRetriever(
    document_store=document_store,
    # Model identifier of the embedding model used for both stored and incoming questions.
    embedding_model="sentence-transformers/multi-qa-distilbert-cos-v1",
    # Request GPU execution.
    # NOTE(review): the nvidia-smi cell above reported no usable GPU — confirm how Haystack behaves in that case.
    use_gpu=True,
    # Presumably keeps raw similarity scores instead of rescaling them to [0, 1] — confirm in the Haystack docs.
    scale_score=False,
)

## Prepare & Index FAQ data

In [None]:
import pandas as pd

from haystack.utils import fetch_archive_from_http

# Directory that holds the FAQ workbook.
# NOTE(review): `doc_dir` is empty and the file name below is just ".xlsx" —
# these look like placeholders; set both before running the notebook.
doc_dir = ""
# Get dataframe with columns "question", "answer"
# data = pd.read_csv(f"{doc_dir}/question_answer_dataset.csv")
data = pd.read_excel(f"{doc_dir}/.xlsx")
# The sheet is expected to contain exactly two columns.
data.columns = ['questions', 'answers']

# Minimal cleaning of the question column only: missing cells become empty
# strings, every value is cast to str (Excel cells may be parsed as numbers,
# on which a plain `.strip()` would raise), and surrounding whitespace is removed.
df = pd.DataFrame(
    {"questions": data["questions"].fillna("").astype(str).str.strip()}
)
df.head()

In [None]:
# Display the loaded FAQ frame (columns were renamed to 'questions' / 'answers' above).
data

### Preprocessing

In [None]:
# Working frame for preprocessing: a single-column frame built from the cleaned questions.
new_df = pd.DataFrame(df['questions'])

In [None]:
# Normalise case: store every question in lower case, then show the frame.
lowercase_questions = new_df['questions'].str.lower()
new_df['questions'] = lowercase_questions
new_df

In [None]:
# # remove numbers
# new_df['questions'] = new_df['questions'].str.replace('\d+', '')
# new_df

In [None]:
# Strip punctuation: delete every run of characters that is neither a word
# character (\w) nor whitespace (\s).
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the questions unchanged.
new_df['questions'] = new_df['questions'].str.replace(r'[^\w\s]+', '', regex=True)
new_df

In [None]:
# Embed the preprocessed FAQ questions themselves: an incoming question is
# matched against the stored questions, so the question text is what gets
# embedded and stored as the document content.
questions = list(new_df["questions"].values)
question_embeddings = retriever.embed_queries(queries=questions)
new_df["embedding"] = question_embeddings.tolist()

# The question text becomes the "content" field of each record.
new_df = new_df.rename(columns={"questions": "content"})

# One dict per row, written to the DocumentStore.
docs_to_index = new_df.to_dict(orient="records")
document_store.write_documents(docs_to_index)

In [None]:
# Show the indexed frame: it now has the 'content' and 'embedding' columns.
new_df

In [None]:
from haystack.pipelines import FAQPipeline

# FAQ pipeline built around the embedding retriever; queries below go through `pipe.run(...)`.
pipe = FAQPipeline(retriever=retriever)

## Ask questions

In [None]:
# from haystack.utils import print_answers

# while True:
#   # Run any question and change top_k to see more or less answers
#   prediction = pipe.run(query=input("Enter your query: "), params={"Retriever": {"top_k": 1}})

#   print_answers(prediction, details="medium")

In [None]:
# Load the evaluation questions; the sheet is expected to have a single column.
# NOTE(review): the path is `doc_dir` with '.xlsx' appended and `doc_dir` is
# empty above — looks like a placeholder to fill in.
test_path = doc_dir + '.xlsx'
test_data = pd.read_excel(test_path)
test_data.columns = ['questions']

In [None]:
# Preview the first evaluation questions.
test_data.head()

In [None]:
# Query the FAQ pipeline once per test question, keeping only the best match
# (top_k=1); raise top_k to get more candidates per query.
# NOTE(review): the indexed questions were lower-cased and stripped of
# punctuation, while these queries are passed exactly as loaded — confirm
# that this is intended.
prediction_list = [
    pipe.run(query=question, params={"Retriever": {"top_k": 1}})
    for question in test_data['questions']
]


In [None]:
from haystack.schema import Answer

In [None]:
# Query text of every prediction. Iterating the list directly covers all
# entries; a `range(len(...) - 1)` bound would silently skip the last one.
query_list = [prediction['query'] for prediction in prediction_list]

In [None]:
# Context of the top answer for every prediction (presumably the text of the
# matched FAQ entry — confirm against Haystack's Answer schema). Iterating the
# list directly covers all entries, including the last one.
context_list = [prediction['answers'][0].context for prediction in prediction_list]

In [None]:
# Score of the top answer for every prediction. Iterating the list directly
# covers all entries, including the last one.
score_list = [prediction['answers'][0].score for prediction in prediction_list]

In [None]:
# One row per evaluated query: the query, the context of its top answer and
# that answer's score. zip() stops at the shortest of the three lists.
result_rows = list(zip(query_list, context_list, score_list))
result_df = pd.DataFrame(data=result_rows, columns=['query', 'context', 'score'])

In [None]:
# Show the per-query results (query, context, score).
result_df

In [None]:
# Summary statistics of the top-answer scores across all evaluated queries.
result_df['score'].describe()