# Q&A with LangChain

In [None]:
!pip install openai
!pip install langchain
!pip install pypdf2
!pip install faiss-cpu
!pip install docx2txt
!pip install tiktoken



In [None]:
from PyPDF2 import PdfReader
import os
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain   #FOR QNA
import openai

In [None]:
class Convert2Text:
    """Extract plain text from a ``.pdf`` or ``.docx`` file.

    Args:
        file_path: Path of the document to convert. Its extension
            (compared case-insensitively) selects the extraction routine.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def check_file_extension(self):
        """Return the lower-cased extension of ``self.file_path``, dot included."""
        _, file_extension = os.path.splitext(self.file_path)
        return file_extension.lower()

    def convert_to_text(self):
        """Return the text of ``self.file_path``.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
        """
        file_extension = self.check_file_extension()
        if file_extension == '.pdf':
            return self.pdftotext(self.file_path)
        if file_extension == '.docx':
            return self.doctotext(self.file_path)
        raise ValueError(f"Unsupported file format: {file_extension}. Only .pdf and .docx files are supported.")

    def doctotext(self, m):
        """Return the text of the ``.docx`` file at path ``m`` as one line.

        Tabs are replaced by spaces, empty lines are dropped and the remaining
        lines are joined with single spaces.
        """
        # Imported here because the notebook's import cell never brings
        # docx2txt into scope; without it every .docx raised NameError.
        import docx2txt

        temp = docx2txt.process(m)
        resume_text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
        return ' '.join(resume_text)

    def pdftotext(self, m):
        """Return the concatenated text of every page of the PDF at path ``m``."""
        # `with` guarantees the handle is closed even if parsing fails midway.
        with open(m, 'rb') as pdf_file:
            reader = PdfReader(pdf_file)
            # `or ''` is defensive: a page that yields no text must not break the join.
            return ''.join(page.extract_text() or '' for page in reader.pages)

In [None]:
# Read the OpenAI key from the environment (or prompt for it) so that no secret
# is ever stored in the notebook. The key that used to be hardcoded in this
# cell has been exposed and must be revoked.
import getpass

openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")

In [None]:
# Extract the raw text of the first paper; the file extension decides which
# extraction routine runs.
converter = Convert2Text(file_path='/content/Cheating Prevention.pdf')
text = converter.convert_to_text()

In [None]:
# Split the extracted text on newlines into overlapping chunks.
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=1000,
    chunk_overlap=100,
)
chunks = text_splitter.split_text(text)

# Embed every chunk with OpenAI and index the vectors with FAISS.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vector_db = FAISS.from_texts(texts=chunks, embedding=embeddings)

In [None]:
# Reuse the key loaded earlier in the notebook instead of embedding a second
# copy of the secret in this cell.
llm = ChatOpenAI(temperature=0.7, openai_api_key=openai_api_key)
chain = load_qa_chain(llm=llm, chain_type='stuff')

In [None]:
query = (
    "Tell me about Student Cheating Detection in Higher Education "
    "by Implementing Machine Learning and LSTM Techniques research paper"
)

# Fetch the chunks most similar to the question, then let the QA chain answer
# from them. The chain's answer is the cell output.
docs = vector_db.similarity_search(query)
chain.run(input_documents=docs, question=query)

'The research paper titled "Student Cheating Detection in Higher Education by Implementing Machine Learning and LSTM Techniques" by Waleed Alsabhan addresses the issue of academic dishonesty, specifically cheating, in online assessments. The paper explores the use of machine learning (ML) technology to develop a deep learning model using Long Short-Term Memory (LSTM) layers with dropout and dense layers to identify exam cheating among students.\n\nThe research aims to provide practical solutions for monitoring and eliminating cheating incidents in educational institutions. The dataset used in the study includes students\' grades in various exam portions, which are labeled as "normal" or "cheating". Despite having a smaller dataset compared to previous research, the model architecture achieved a training accuracy of 90% and a validation accuracy of 92%, outperforming models that used Convolutional Neural Network (CNN) and Recurrent Neural Network (RNN) layers.\n\nThe paper highlights th

# Embeddings for conversation chain

In [None]:
!pip install transformers



In [None]:
from langchain.chains import ConversationalRetrievalChain    #FOR CONVERSATION CHAIN
from langchain.memory import ConversationBufferWindowMemory   #FOR CONVERSATION CHAIN
from transformers import GPT2TokenizerFast
from langchain.text_splitter import RecursiveCharacterTextSplitter     #ESSENTIAL

In [None]:
# Extract the raw text of the paper used for the conversational chain.
converter = Convert2Text(file_path='/content/Intelligent systems for cheating prevention.pdf')
text = converter.convert_to_text()

In [None]:
# GPT-2 tokenizer, used here only to measure text length in tokens.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def count_tokens(text: str) -> int:
    """Return the number of tokens ``tokenizer`` produces for ``text``.

    Serves as a check that a piece of text stays within the token limit.
    """
    token_ids = tokenizer.encode(text)
    return len(token_ids)

In [None]:
# Chunk lengths are measured with count_tokens rather than the splitter's
# default length function.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=count_tokens,
)
chunks = text_splitter.split_text(text)

# Embed every chunk with OpenAI and index the vectors with FAISS.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)

In [None]:
from IPython.display import display
import ipywidgets as widgets

llm = ChatOpenAI(openai_api_key=openai_api_key, temperature=0.7)

# Conversational chain that answers from chunks served by the FAISS retriever.
conversation_chain = ConversationalRetrievalChain.from_llm(
    retriever=vectorstore.as_retriever(),
    llm=llm,
)

In [None]:
chat_history = []  # (question, answer) tuples passed back to the chain on every turn


def on_submit(_):
    """Handle one submission of ``input_box``.

    Reads and clears the box, ignores blank input, prints a goodbye message on
    ``exit`` (case-insensitive), and otherwise sends the question together with
    ``chat_history`` to ``conversation_chain``, records the exchange and
    displays both sides of it.

    Args:
        _: The widget passed by the submit callback; unused.
    """
    # Local import: the notebook's import cells do not bring `html` into scope.
    from html import escape

    query = input_box.value.strip()
    input_box.value = ""

    # Nothing typed: do not spend an API call on an empty question.
    if not query:
        return

    if query.lower() == 'exit':
        print("Thank you for using Smartbot!")
        return

    result = conversation_chain({"question": query, "chat_history": chat_history})
    answer = result['answer']
    chat_history.append((query, answer))

    # Escape both texts: they are arbitrary user/model output and would
    # otherwise be interpreted as markup by the HTML widget.
    display(widgets.HTML(f'<b>User:</b> {escape(query)}'))
    display(widgets.HTML(f'<b><font color="blue">Chatbot:</font></b> {escape(answer)}'))

# Build the text box and route each submission to on_submit.
input_box = widgets.Text(placeholder='Please enter your question:')
input_box.on_submit(on_submit)

# Greet the user, then render the box below the greeting.
print("Welcome to the Transformers chatbot! Type 'exit' to stop.")
display(input_box)

Welcome to the Transformers chatbot! Type 'exit' to stop.


Text(value='', placeholder='Please enter your question:')

HTML(value='<b>User:</b> Who are the authors of Intelligent system for cheating prevention research paper')

HTML(value='<b><font color="blue">Chatbot:</font></b> The authors of the research paper on Intelligent Systems…

HTML(value='<b>User:</b> Can you summarise the research pap')

HTML(value='<b><font color="blue">Chatbot:</font></b> The research paper focuses on developing an effective an…

Thank you for using Smartbot!
