In [None]:
!pip install openai==0.27.8
!pip install tiktoken
!pip install langchain
!pip install hnswlib

In [None]:
!pip install PyPDF2

In [3]:
# imports
import os
import re

import hnswlib
import langchain
import openai
import PyPDF2
import requests
from langchain.text_splitter import TextSplitter, CharacterTextSplitter

In [16]:
# Take the API key from the OPENAI_API_KEY environment variable so the secret
# never has to be pasted into (and saved with) the notebook. When the variable
# is unset the key is the empty string, exactly as the previous placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
# Keyword arguments forwarded verbatim to openai.ChatCompletion.create
# by generate_answer.
openai_params = {"model":"gpt-4-1106-preview",
                 "temperature":0.5,
                 "frequency_penalty":0.0,
                 "presence_penalty":0.0,
                 "max_tokens":1500,
                 "top_p":1}


In [17]:
def generate_answer(prompt,openai_params):
  """Send ``prompt`` as a single user message and return the model's reply text.

  :param prompt: text placed in the content of the one user message.
  :param openai_params: dict of keyword arguments passed through to
                        ``openai.ChatCompletion.create`` (model, temperature, ...).
  :return: the message content of the first choice in the response.
  """
  chat_messages = [dict(role="user", content=prompt)]
  completion = openai.ChatCompletion.create(messages=chat_messages, **openai_params)
  first_choice = completion.choices[0]
  return first_choice.message.content

In [18]:
def tokenize_text_gpt(content,chunk_size=120,splitter_pattern=""):
    """
    Split ``content`` into passages with a Langchain CharacterTextSplitter
    built from the ``cl100k_base`` tiktoken encoding.

    :param content: text to split.
    :param chunk_size: chunk size handed to the splitter (no overlap is used).
    :param splitter_pattern: separator to split on. When empty, the separator
        is picked from the content: the splitter's own default if the content
        contains a blank line, otherwise a single newline if one is present,
        otherwise a single space.
    :return: the passages returned by the splitter's ``split_text``.
    """
    splitter_kwargs = dict(chunk_size=chunk_size, chunk_overlap=0,
                           encoding_name="cl100k_base")

    if splitter_pattern:
        splitter_kwargs["separator"] = splitter_pattern
    elif "\n\n" not in content:
        # No paragraph breaks: fall back to line breaks, then to spaces.
        splitter_kwargs["separator"] = "\n" if "\n" in content else " "
    # Otherwise no separator is passed and the splitter's default applies.

    text_splitter_ = CharacterTextSplitter.from_tiktoken_encoder(**splitter_kwargs)
    return text_splitter_.split_text(content)


# tokenized_text = tokenize_text_by_page(text)
# for page, tokens in tokenized_text.items():
#     print(f"Page {page}: {tokens}\n")

In [19]:
def extract_text_from_pdf(pdf_file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    :param pdf_file_path: path of the PDF file to read.
    :return: a single string made of each page's extracted text, joined
             back to back with no separator between pages.
    """
    with open(pdf_file_path, 'rb') as pdf_handle:
        reader = PyPDF2.PdfReader(pdf_handle)
        # Extraction happens while the file is still open.
        return "".join(page.extract_text() for page in reader.pages)

In [20]:
def create_index2(text_chunks):
    """Embed ``text_chunks`` and load the vectors into a new hnswlib index.

    :param text_chunks: sequence of strings to embed and index.
    :return: the populated ``hnswlib.Index`` (L2 space), or ``None`` when
             there is nothing to index. A single value is returned on every
             path so callers can always write ``index = create_index2(...)``.
    """
    # Skip the embeddings request entirely when there is no text to embed.
    if len(text_chunks) == 0:
        print("No embeddings generated.")
        return None

    embeddings = get_embedding_list(text_chunks)
    if len(embeddings) == 0:
        print("No embeddings generated.")
        return None

    dimension = len(embeddings[0])  # Dynamically get the dimension of embeddings

    index1 = hnswlib.Index(space='l2', dim=dimension)
    # Size the index from what is actually inserted.
    index1.init_index(max_elements=len(embeddings), ef_construction=200, M=16)

    # Bulk adding to the index. No explicit ids are passed; the caller treats
    # the labels returned by queries as positions in text_chunks
    # (presumably hnswlib numbers items 0..N-1 in insertion order - confirm).
    index1.add_items(embeddings)

    # Query-time search breadth.
    index1.set_ef(50)
    return index1

In [21]:
def get_embedding_list(texts, model="text-embedding-ada-002"):
  """Embed a batch of texts with the OpenAI embeddings endpoint.

  Every run of newlines in a text is replaced by a single space before the
  request is sent.

  :param texts: iterable of strings to embed.
  :param model: name of the OpenAI embedding model to use.
  :return: list with the ``"embedding"`` of each item in the response data
           (presumably in input order - confirm against the API reference);
           an empty list when ``texts`` is empty.
  """
  texts = [re.sub("\n+", " ", text) for text in texts]
  if not texts:
    # Nothing to embed: return early instead of sending a request with an
    # empty input list.
    return []
  embedding_data = openai.Embedding.create(input = texts, model=model)['data']
  print("embeddings returned from openai")
  return [item["embedding"] for item in embedding_data]



def get_embedding(text, model="text-embedding-ada-002"):
  """Embed a single text with the OpenAI embeddings endpoint.

  :param text: string to embed; runs of newlines are replaced by one space.
  :param model: name of the OpenAI embedding model to use.
  :return: the ``"embedding"`` of the first item in the response data.
  """
  single_line = re.sub("\n+", " ", text)
  response = openai.Embedding.create(input = [single_line], model=model)
  first_item = response['data'][0]
  return first_item['embedding']

In [22]:
def search_similar_text2(query, index, top_k):
    """Return the labels of the ``top_k`` indexed items nearest to ``query``.

    :param query: text to search for; it is embedded with ``get_embedding``.
    :param index: object exposing ``knn_query(vector, k=...)`` that returns a
                  ``(labels, distances)`` pair (the index built by
                  ``create_index2``).
    :param top_k: number of neighbours to request.
    :return: flattened array of labels for the single query, or an empty list
             when the lookup fails, so callers can always iterate the result.
    """
    query_vector = get_embedding(query)
    try:
        labels, _distances = index.knn_query(query_vector, k=top_k)
        # Flatten the labels since we have a single query
        return labels.flatten()
    except Exception as e:
        # Report the failure and fall back to "no matches"; previously this
        # path fell through to `return labels` with `labels` never assigned.
        print(e)
        return []

In [23]:
# Read the whole PDF and split it into passages for retrieval.
# NOTE: despite its name, `text_dict` holds one string (the concatenated page
# text returned by extract_text_from_pdf), not a dict.
# NOTE(review): the absolute /content/ path looks Colab-specific - confirm
# before running elsewhere.
text_dict = extract_text_from_pdf("/content/230_lipnet_end_to_end_sentence_lev.pdf")
chunks = tokenize_text_gpt(text_dict)

In [24]:
# Sanity check: number of passages produced by tokenize_text_gpt.
len(chunks)

119

In [25]:
# `re` is used inside get_embedding_list / get_embedding, so it has to be
# imported before the first call that reaches them.
import re
# Embed every chunk and build the nearest-neighbour index over the vectors.
index = create_index2(chunks)

embeddings returned from openai


In [32]:
# Question to answer from the PDF: it is embedded for retrieval and later
# passed to qa_prompt as the instruction.
query = "What are the lipreading datasets used?"

In [33]:
# Labels of the 10 indexed chunks nearest to the query; they are used below
# as positions in `chunks`.
searched_index = search_similar_text2(query,index,10)

In [34]:
# Assemble the retrieved passages into one context string: each passage is
# preceded by a space and followed by a blank line.
context = "".join(" " + chunks[i] + "\n\n" for i in searched_index)

In [35]:
def qa_prompt(instruction, context):
    """Build the question-answering prompt sent to the chat model.

    The prompt is a fixed list of eight answering guidelines, a blank line,
    then the context and the instruction on their own labelled lines.

    :param instruction: the user's question or task.
    :param context: retrieved document text the answer should be based on.
    :return: the complete prompt string, ending with a newline.
    """
    guideline_lines = (
        "When responding to instructions, always ensure your answers:",
        "1.Condense essential information from provided documents to directly tackle the task.",
        "2.Analyze and address any disparities or inaccuracies between the instruction and document content.",
        "3.Highlight crucial details, eliminating extraneous information for clarity.",
        "4.Recognize any missing content, proposing alternatives or additional sources if needed.",
        "5.Uphold ethical standards by maintaining accuracy in line with document(s) content.",
        "6.Summarize pertinent insights from the document(s) pertinent to the instruction.",
        "7.Include references citing PDF names and page numbers for relevant excerpts supporting your response.",
        "8.Tailor your response to the instruction's nature, concentrating on precision and pertinence.",
    )
    prompt_sections = [
        "\n".join(guideline_lines),
        "",  # blank line between the guidelines and the context
        f"Context:{context}",
        f"Instruction:{instruction}",
        "",  # produces the trailing newline
    ]
    return "\n".join(prompt_sections)


In [36]:
# Combine the question and the retrieved passages into the final prompt.
fin_prompt = qa_prompt(query, context)
# The bare call on the last line makes the notebook display the model's reply
# as the cell output.
generate_answer(fin_prompt,openai_params)

'The lipreading datasets mentioned in the provided document excerpts are:\n\n1. AVICar\n2. AVLetters\n3. AVLetters2\n4. BBC TV\n5. CUAVE\n6. OuluVS1\n7. OuluVS2\n8. GRID corpus\n\nThese datasets are used for training and evaluating lipreading models, with the GRID corpus being highlighted as one containing a significant number of sentences (34 speakers producing 1000 sentences each) and used for state-of-the-art performance summaries in lipreading tasks.'