## Import packages and initialize AOAI connection

In [1]:
import re
import os
import PyPDF2
import openai
import requests
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity
from transformers import GPT2TokenizerFast

API_KEY = os.getenv("OPENAI_API_EMBEDDING_KEY") 
RESOURCE_ENDPOINT = os.getenv("AZURE_OPENAI_EMBEDDING_ENDPOINT") 

openai.api_type = "azure"
openai.api_key = API_KEY
openai.api_base = RESOURCE_ENDPOINT
openai.api_version = "2022-12-01"

url = openai.api_base + "/openai/deployments?api-version=2022-12-01"

r = requests.get(url, headers={"api-key": API_KEY})

# print(r.text)


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


## Define the search engine and

In [2]:
# embeddings_engine = 'text-search-davinci-doc-001'
# search_engine = 'text-search-davinci-query-001'
embeddings_engine = 'text-search-curie-doc-001' 
search_engine = 'text-search-curie-query-001'
query_string = 'What are potential regulatory actions that can result from an inspection?' # 'what ports and connectors does my surface have?'
summarization_engine = "text-davinci-002"

## Read in .pdf

In [3]:
def read_pdf_into_df(pdf_file_path):
    # create a pdf file object
    pdfFileObj = open(pdf_file_path, 'rb') # rb means read binary mode 
    # create a pdf reader object
    pdfReader = PyPDF2.PdfReader(pdfFileObj) 
    # print the number of pages in pdf document 
    print('Number of pages in PDF: ', len(pdfReader.pages)) 

    # declare an empty dataframe with two columns: Page and Text (for page number and text of each page)  
    df = pd.DataFrame(columns=['page', 'text'])

    # loop through the number of pages in the document, get each page's text, write to a new row in dataframe  
    for i in range(0, len(pdfReader.pages)):      # for each page 

        pageObj = pdfReader.pages[i]     # get that page object from the pdf reader object using indexing  

        text = pageObj.extract_text()       # extract the text from that page object  

        df_temp = pd.DataFrame([[i+1,text]], columns=['page', 'text'])     # create temporary dataframe to hold information for each iteration (page number and text on that page)  

        # df = df.append(df_temp)            # append temporary dataframe to master dataframe at end of each iteration (loop through all pages of document)  
        df = pd.concat([df, df_temp], ignore_index=True)     # append temporary dataframe to master dataframe at end of each iteration (loop through all pages of document)

        print('Finished processing page %d' % (i+1))      # print statement so you can see what pages have been processed already while script is running 

    print('Finished processing all pages.')               # print statement when script has finished running 

    return df

In [4]:
# Path to download the PDF to run this exercise: 'https://download.microsoft.com/download/7/B/1/7B10C82E-F520-4080-8516-5CF0D803EEE0/surface-book-user-guide-EN.pdf'
# surface_user_guide_path = 'C:/Users/ianadams/OneDrive - Microsoft/surface-book-user-guide-EN.pdf'
# surface_user_guide_df = read_pdf_into_df(surface_user_guide_path)

fda_7348_810_path = "C:/Users/ianadams/OneDrive - Microsoft/CP 7348.810 Sponsors and CROs_FINAL.pdf"
surface_user_guide_df = read_pdf_into_df(fda_7348_810_path)

Number of pages in PDF:  66
Finished processing page 1
Finished processing page 2
Finished processing page 3
Finished processing page 4
Finished processing page 5
Finished processing page 6
Finished processing page 7
Finished processing page 8
Finished processing page 9
Finished processing page 10
Finished processing page 11
Finished processing page 12
Finished processing page 13
Finished processing page 14
Finished processing page 15
Finished processing page 16
Finished processing page 17
Finished processing page 18
Finished processing page 19
Finished processing page 20
Finished processing page 21
Finished processing page 22
Finished processing page 23
Finished processing page 24
Finished processing page 25
Finished processing page 26
Finished processing page 27
Finished processing page 28
Finished processing page 29
Finished processing page 30
Finished processing page 31
Finished processing page 32
Finished processing page 33
Finished processing page 34
Finished processing page 35
F

## Drop the cover and contents pages

In [5]:
rows_to_drop = list(range(0,2)) # Drops Cover and Copyright
# rows_to_drop = list(range(0,5)) # Drops Cover and Table of Contents
surface_user_guide_df = surface_user_guide_df.drop(rows_to_drop, axis=0)

## Normalize Text in the df

In [6]:
# s is input text
def normalize_text(s, sep_token = " \n "):
    s = re.sub(r'\s+',  ' ', s).strip()
    s = re.sub(r". ,","",s)
    # remove all instances of multiple spaces
    s = s.replace("..",".")
    s = s.replace(". .",".")
    s = s.replace("\n", "")
    s = s.strip()
    
    return s

In [7]:
surface_user_guide_df['text'] = surface_user_guide_df['text'].apply(lambda x : normalize_text(x))
# surface_user_df

## Get a token count for each page

In [8]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
surface_user_guide_df['n_tokens'] = surface_user_guide_df["text"].apply(lambda x: len(tokenizer.encode(x)))
# surface_user_df = surface_user_df[surface_user_df.n_tokens<2000]
len(surface_user_guide_df)
# surface_user_df

64

## Get the embeddings for each page

In [9]:
surface_user_guide_df['davinci_search'] = surface_user_guide_df["text"].apply(lambda x : get_embedding(x, engine = embeddings_engine))
# surface_user_df

## Define our search and execute

In [10]:
# search through the reviews for a specific product
def search_docs(df, user_query, search_engine, top_n=3, to_print=True):
    embedding = get_embedding(
        user_query,
        engine=search_engine
    )
    df["similarities"] = df.davinci_search.apply(lambda x: cosine_similarity(x, embedding))

    res = (
        df.sort_values("similarities", ascending=False)
        .head(top_n)
    )
    if to_print == True:
        display(res)
    return res


In [11]:
search_results = search_docs(surface_user_guide_df, query_string, search_engine, top_n=4)

Unnamed: 0,page,text,n_tokens,davinci_search,similarities
14,15,7348.810 7348.810 Date of Issuance: 09/ 15/202...,581,"[-0.006083174142986536, 0.0046507855877280235,...",0.373887
58,59,7348.810 7348.810 Date of Issuance: 09/ 15/202...,522,"[-0.00846640020608902, -0.0023893050383776426,...",0.371995
59,60,7348.810 7348.810 Date of Issuance: 09/ 15/202...,497,"[-0.014045056886970997, -0.008437714539468288,...",0.370509
16,17,7348.810 7348.810 Date of Issuance: 09/ 15/202...,509,"[-0.0002744222874753177, -0.00825151614844799,...",0.360411


In [12]:
search_results.reset_index(inplace=True)
search_results

Unnamed: 0,index,page,text,n_tokens,davinci_search,similarities
0,14,15,7348.810 7348.810 Date of Issuance: 09/ 15/202...,581,"[-0.006083174142986536, 0.0046507855877280235,...",0.373887
1,58,59,7348.810 7348.810 Date of Issuance: 09/ 15/202...,522,"[-0.00846640020608902, -0.0023893050383776426,...",0.371995
2,59,60,7348.810 7348.810 Date of Issuance: 09/ 15/202...,497,"[-0.014045056886970997, -0.008437714539468288,...",0.370509
3,16,17,7348.810 7348.810 Date of Issuance: 09/ 15/202...,509,"[-0.0002744222874753177, -0.00825151614844799,...",0.360411


## Pass text from top 3 results to AOAI for summarization

In [13]:
prompt_engineering = 'You must summarize the results of the ----SEARCH RESULTS---- section in a way that best answers the query listed in the ----USER QUERY--- section \n \n'
def create_final_prompt(df, user_query, prompt_engineering):
    res = prompt_engineering + '----USER QUERY----\n' + user_query + '\n\n' + '----SEARCH RESULTS----\n'

    for i in range(3):
        res += df['text'][i] + '\n'
    
    print(res)
    
    return res

In [14]:
final_prompt = create_final_prompt(search_results, query_string, prompt_engineering)

You must summarize the results of the ----SEARCH RESULTS---- section in a way that best answers the query listed in the ----USER QUERY--- section 
 
----USER QUERY----
What are potential regulatory actions that can result from an inspection?

----SEARCH RESULTS----
7348.810 7348.810 Date of Issuance: 09/ 15/2021 Page 15 of 66 FORM FDA 2438g (electronic -09/2003) PART III - INSPECTIONAL The primary focus of sponsor inspections is to evaluate the sponsor’s practices and procedures to determine compliance with applicable regulations and adherence to good clinical practice standards to ensure subject protection and data quality and integrity. These inspections may include, but are not limited to, a review of the sponsor’s practices and procedures related to clinical trial oversigh including activities such as site monitoring, vendor audit training, and data collection, handling, and management. The inspectional focus is not to scientifically evaluate the results of the study or the quality

In [15]:
def results_text_to_final_response(prompt, summarization_engine):
    # Passes the response from the question answering bot to the AOAI model

    # Format the output from the QA bot to include the signifier for a summarization from AOAI
    # bot_answer = answer + '\n\nTl;dr'

    # REVIEW: Print the length of the bot answer for reference - may want to cut off since summarization
    # doesn't work well with short answer.
    # print(len(bot_answer))

    # Submit the answer from the QA Bot to the AOAI model for summariation
    response = openai.Completion.create(
      engine=summarization_engine,
      prompt= prompt, 
      temperature=0.7,
      max_tokens=2048,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0,
      stop=None)

    return response

In [16]:
response = results_text_to_final_response(final_prompt, summarization_engine)

In [17]:
def print_formatted_response(open_ai_response):
        print("\n", "Azure Open AI response: ", open_ai_response['choices'][0]['text'])

print_formatted_response(response)


 Azure Open AI response:  7348

