In [65]:
import os
import openai
openai.api_key = os.getenv("OPENAI_API_KEY", "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")

### Read the file and split into chunks

In [66]:
from tika import parser  
from langchain.text_splitter import RecursiveCharacterTextSplitter

# parsing the file
parse_file = parser.from_file("./example.pdf")
data = parse_file['content'] 

# splitting the data into chunks
splitter = RecursiveCharacterTextSplitter()
paragraphs = splitter.split_text(text=data)

# clean the data
def clean_text(text):
    cleaned_string = text.replace("\n","").replace('..',"")
    return cleaned_string

cleaned_paragraphs = [clean_text(para) for para in paragraphs]
print(cleaned_paragraphs)




['The aim of this assignment is to develop a Generative AI Application that extracts action  items from a given document and generates a new document in which it presents them in a  particular layout that is selected by user. The application will use a Large Language Model  (LLM) to process and learn the content of the document and extract key action items, which  will be presented to the user in a layout chosen from a list of predefined layouts.']


### Embedding

In [67]:
from sentence_transformers import SentenceTransformer,util
import faiss

#importing the model
model = SentenceTransformer('msmarco-distilbert-base-v4')

#embedding
para_embeddings = model.encode(cleaned_paragraphs)

print(f"para_embed: {para_embeddings.shape}")

para_embed: (1, 768)


### Indexing

In [68]:
d = 768 # size of vectors
k = 10 # we want 10 nearest neighbour vectors
#indexing of the embeddings
index = faiss.IndexFlatIP(d)
index.add(para_embeddings)

In [71]:
import tiktoken
def num_tokens_from_string(string, encoding_name):
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    
    return num_tokens

In [72]:
import numpy as np
from sentence_transformers import SentenceTransformer,CrossEncoder,util
import openai


#initialize models
# Model 1: msmarco-distilbert-base-v4 
# Model 2: all-MiniLM-L6-v2
model_vector = SentenceTransformer('msmarco-distilbert-base-v4')

# Model 1: cross-encoder/stsb-roberta-large
# Model 2: cross-encoder/ms-marco-MiniLM-L-12-v2
model_encoder = CrossEncoder("cross-encoder/stsb-roberta-large")

In [76]:
from IPython.display import HTML
from langchain.llms import OpenAI
import time
box ='Create red border boxes in each row with space in between kind of html template and italics font'
table ='Use table based html template and use bold text for headings'
list_ = 'create bullet list and align it left, use bold font'
simple='create a list with black background and white text, align it to the left'

In [77]:
while True:
    
    #get the query
    query = input("Please enter you query: ")
    if query==None:
        break
    
    print("Available layouts: ")
    print(" Box: Separate bordered boxes with the information.")
    print(" Table: Arranged in a table format.")
    print(" List: Bulleted list aligned to the left.")
    print(" Simple: List with black background and white color.")
    layout = input("Pick a layout: ")
    if layout.lower()=='box':
        layout = box
    elif layout.lower()=='table':
        layout = table
    elif layout.lower()=='list':
        layout = list_
    else:
        layout = simple
        
        
    
    # create query vector and fetch relevant indexes
    query_vector = model_vector.encode([query])
    D,I = index.search(query_vector, k)
    relevant_indexes=I.tolist()[0]
    
    # using indexes get relevant paragraphs
    relevant_paras = []
    for i in relevant_indexes:
        relevant_paras.append(cleaned_paragraphs[i])
        
        
    #create query-paragraph pairs and calculate similarity scores to rank paragraphs
    query_paras_combined = [[query, para] for para in relevant_paras]
    similarity_scores = model_encoder.predict(query_paras_combined)
    sim_scores_argsort = list(reversed(np.argsort(similarity_scores)))
    
    

    #build context -- max paragraphs allowed in context is set to 3 and max token length is set to 2700
    relevant_context = ""
    threshold = 3       
    for idx in sim_scores_argsort:
        if threshold > 0 and num_tokens_from_string(relevant_context, "p50k_base")+num_tokens_from_string(relevant_paras[idx], "p50k_base")<2700:
            relevant_context += relevant_paras[idx] + "\n\n"
            threshold = threshold - 1
        else:
            break
    
    #generate an input prompt
    refined_prompt = f"""
    Answer the question based on the contexts below. 
    If the question cannot be answered using the information 
    provided answer with "I don't know".
    Create answers as HTML code to pick action items based on the below contexts and put it into the below specified template.
    Also take main content from input query and include it in the answer as an h1 heading with bold font.
    
    Contexts:
    {relevant_context}

    ###

    Template:
    {layout}

    ###

    Question:{query}
    Answer:"""
    print(f"""Tokens in the prompt: {num_tokens_from_string(refined_prompt, "p50k_base")}""")

    print("Loading.....")
    #Feed input prompt to openai model
    response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=refined_prompt,
    temperature=0.0,
    max_tokens=1024,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
    )

        
    display(HTML(response["choices"][0]["text"]))
    
    confirm = input("Do you have any new query [y/N]: ")
    if confirm.lower() in ['yes', 'y']:
        continue
    else:
        break
        
    

Available layouts: 
 Box: Separate bordered boxes with the information.
 Table: Arranged in a table format.
 List: Bulleted list aligned to the left.
 Simple: List with black background and white color.
Tokens in the prompt: 413
Loading.....


KeyboardInterrupt: 

In [11]:
# Tell me steps to design url shortener
# Give me steps to design consistent hashing