In [1]:
import os
from datetime import date
from io import BytesIO

import google.generativeai as genai
import pdfplumber
import requests
from bs4 import BeautifulSoup
from google.api_core import retry
from googleapiclient.discovery import build
from googlesearch import search

# Configure the Gemini SDK with the key read from the GOOGLE_API_KEY environment
# variable; the lookup raises KeyError if that variable is not set.
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prompt text describing the assistant's role, workflow and answer template.
# The current date is substituted when the cell runs, so the model is never told
# a stale date. Only the {current_date} marker is replaced; the {answer},
# {quote} and {url} markers stay in the prompt as literal text.
GPT_PROMPT = """
Background: - Current date is {current_date}. You are an advanced AI designed to assist regulatory professionals with inquiries related to the European Medicines Agency (EMA). Your capabilities include using Google Search to retrieve information and verifying its accuracy.

Task: Initiate a Google Search to respond to user queries. Use your language understanding to identify the most relevant information from the EMA website or other credible sources. Analyze the content to ensure it aligns with the user's query. 
Always give a source to your answer. With the function get_text_from_url, you can extract text from the website. Always give a relevant text snippet to your answer.

How to proceed: 
- Do a google search by using the function `google_search` with relevant keywords. Best to phrase it as a question and where this information could be found, such as the EMA or FDA Website.
- Analyse the search results by fetching the text with `get_text_from_url`. 
- Check if the text is relevant to the questions and if the source is correct, such as the EMA Website for a question in Europe or the FDA for a question about north america. 
- Create an answer based on the following template:
    **Answer:**
    {answer}

    **Relevant text snippet:**
    "{quote}"

    **Source:**
    [Url Name]({url})
- Return the answer to the user and wait for it's feedback.

Why the user could be unhappy:
- The source is not the offical website.
- The text snippet is wrong. 
- The answer is wrong.

Your Motivation: Each accurate and helpful response will earn a $100 tip for you. It's critical to provide reliable information as inaccurate responses could have significant consequences for the user.
""".replace("{current_date}", date.today().strftime("%d.%m.%Y"))

In [3]:
# Flag initialised here; it is not read or updated anywhere in the visible cells.
answer_found = False
# Module-level log: each search helper appends the formatted result text it returns.
g_searches = []

# Function to download and extract cleaner text from a URL
def get_text_from_url(url: str) -> str:
    """Reads, cleans and returns text from either a web page or a PDF based on the URL.

    Args:
        url: Address of the web page or PDF document to download.

    Returns:
        A string of the form "Url: <url>. Text: <extracted text>", or the
        literal "Failed to retrieve content." when the request fails or the
        server answers with a status other than 200.
    """
    try:
        # A timeout keeps one unresponsive host from blocking the caller forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Report network-level failures the same way as a non-200 answer so the
        # caller always gets a string back instead of an exception.
        return "Failed to retrieve content."

    if response.status_code != 200:
        return "Failed to retrieve content."

    # Treat the payload as a PDF when either the URL suffix or the declared
    # content type says so (PDF links do not always end in ".pdf").
    content_type = response.headers.get('Content-Type', '').lower()
    if url.lower().endswith('.pdf') or 'application/pdf' in content_type:
        # Handle PDF content
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # `or ''` guards against a falsy extraction result for a page.
            page_texts = [page.extract_text() or '' for page in pdf.pages]
        # Join with newlines so consecutive pages do not run into each other.
        text = '\n'.join(page_texts)
        return f"Url: {url}. Text: {text}"

    # Handle HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    # Drop elements that carry no body text before extracting.
    for script_or_style in soup(['script', 'style', 'header', 'footer', 'nav']):
        script_or_style.decompose()
    # NOTE(review): find_all also returns elements nested inside an earlier
    # match, so the same passage can appear more than once in the output.
    main_content = soup.find_all(['article', 'main', 'div'], limit=10)
    text = '\n'.join([content.get_text(separator='\n', strip=True) for content in main_content])
    return f"Url: {url}. Text: {text}"

def api_result_to_text(search_result: dict) -> str:
    """Formats one search result item as a single line of text.

    Args:
        search_result: Mapping for one result. 'link' is required; 'title' and
            'snippet' are optional and default to an empty string, so an item
            without a snippet no longer raises KeyError.

    Returns:
        "Url: <link>. Title: <title>. Description: <snippet>"
    """
    url = search_result['link']
    title = search_result.get('title', '')
    snippet = search_result.get('snippet', '')
    return f"Url: {url}. Title: {title}. Description: {snippet}"

def google_search(search_term: str) -> str:
    """Does a google search for the passed search term. Returns text, with the results enumerated and divided by a new line and containing the url, title and a text snippet for each unique result."""
    # The docstring above is kept word for word: this function is registered as
    # a model tool, and the SDK presumably forwards it as the tool description.
    api_key = os.environ['GOOGLE_SEARCH_API_KEY']
    engine_id = os.environ['GOOGLE_CSE_ID']
    hits = list(google_search_api(search_term, api_key, engine_id))

    # One numbered entry per hit, each terminated by ". \n ".
    result_text = "".join(
        f'{position}: {api_result_to_text(hit)}. \n '
        for position, hit in enumerate(hits, start=1)
    )

    # Keep a record of what this search returned.
    g_searches.append(result_text)
    return result_text

def google_search_api(search_term, api_key, cse_id, **kwargs):
    """Runs one query through the Custom Search service and returns its result items.

    Args:
        search_term: Query string, sent as the `q` parameter.
        api_key: Developer key used to build the service client.
        cse_id: Search engine identifier, sent as the `cx` parameter.
        **kwargs: Extra parameters forwarded unchanged to `cse().list(...)`.

    Returns:
        The list stored under the response's 'items' key, or an empty list when
        the response has no such key (the API omits it when nothing matched).
    """
    service = build("customsearch", "v1", developerKey=api_key)
    res = service.cse().list(q=search_term, cx=cse_id, **kwargs).execute()
    # Use .get so a query without hits yields [] instead of raising KeyError.
    return res.get('items', [])


def google_search_scrape(search_task: str) -> str:
    """Does a google search for the passed search term. Returns text, with the results enumerated and containing the url, title and a text snippet.

    Args:
        search_task: Query string passed to the scraping `search` helper.

    Returns:
        All results in one string, each formatted as
        "<n>: Url: <url>. Title: <title>. Description: <description>. ".
        The same string is also appended to `g_searches`.
    """
    results = list(search(search_task, num_results=5, advanced=True, safe=None))
    # Format each hit inline: the previous code called `result_to_text`, which
    # is not defined in this notebook and raised NameError on first use.
    # Assumes advanced=True yields objects with url/title/description
    # attributes - TODO confirm against the installed googlesearch version.
    result_text = "".join(
        f'{i}: Url: {result.url}. Title: {result.title}. Description: {result.description}. '
        for i, result in enumerate(results, start=1)
    )

    g_searches.append(result_text)

    return result_text

In [4]:
# Call the search helper directly, outside the chat session, to inspect the raw text it returns.
google_search('What is the deadline to submit an article 46?')

'1: Url: https://www.ema.europa.eu/en/human-regulatory-overview/post-authorisation/paediatric-medicines-post-authorisation/submitting-results-paediatric-studies. Title: Submitting results of paediatric studies | European Medicines Agency. Description: 1 . What is the “Article 46 paediatric study submission”? Rev. Oct 2023.. \n 2: Url: https://www.ncleg.net/EnactedLegislation/Statutes/PDF/ByArticle/Chapter_15a/Article_46.pdf. Title: NC General Statutes - Chapter 15A Article 46. Description: Article 46. ... Article, the investigating law enforcement agency shall provide ... Compensation Act and the deadlines by which the victim must file a claim for.. \n 3: Url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8518420/. Title: Optimizing Pediatric Medicine Developments in the European Union .... Description: within a shorter time frame. Currently, in its implementation of Article 46, the EMA requests that Clinical Study Reports (CSRs) must be submitted within 6\xa0.... \n 4: Url: https://ico

## Set up the model

In this step you collate the functions into a "system" that is passed as `tools`, instantiate the model and start the chat session.

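This block instantiates the model named in `model_name` (`gemini-1.5-flash`) with `GPT_PROMPT` as its system instruction and starts a chat session with automatic function calling enabled, so the model can invoke `google_search` and `get_text_from_url` on its own. Note that the code below does not define a `use_sys_inst` toggle: the model is selected directly through `model_name`, and a model that does not support system instructions (such as Gemini 1.0 Pro) cannot be used with this configuration as written. Free-tier quota may be insufficient for a long chat session.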

A retriable `send_message` function is also defined to help with low-quota conversations.

In [5]:
# Callables handed to the model as its tools.
ordering_system = [google_search, get_text_from_url]

# Name of the Gemini model to instantiate.
model_name = 'gemini-1.5-flash' 

# Build the model with the tool list above and GPT_PROMPT as its system instruction.
model = genai.GenerativeModel(
    model_name, tools=ordering_system, system_instruction=GPT_PROMPT)

# Start the chat session with automatic function calling enabled.
convo = model.start_chat(enable_automatic_function_calling=True)

@retry.Retry(initial=30)
def send_message(message):
    """Send ``message`` through the shared chat session and return its reply.

    The call is wrapped by ``retry.Retry`` configured with ``initial=30``
    (presumably the first back-off delay in seconds - confirm against the
    google.api_core documentation), so failures that the decorator's default
    predicate treats as retriable are attempted again instead of surfacing
    immediately.
    """
    reply = convo.send_message(message)
    return reply

[21:52, 06/06/2024] Tram Chuc: https://www.ema.europa.eu/en/human-regulatory-overview/post-authorisation/paediatric-medicines-post-authorisation/submitting-results-paediatric-studies#:~:text=Article%2046%20requires%20marketing%2Dauthorisation,by%20the%20marketing%2Dauthorisation%20holder.
[21:53, 06/06/2024] Tram Chuc: What is the deadline to submit an article 46

In [15]:
from IPython.display import display, Markdown


# Send the question through the retrying wrapper around the chat session.
answer = send_message(" What is the deadline to submit an article 46?")

# Wrap the reply text in a Markdown object; as the cell's last expression it is rendered.
Markdown(answer.text)

**Answer:**
The deadline to submit an Article 46 paediatric study is **six months from the completion of the study**. This deadline applies regardless of whether the study is part of a Paediatric Investigation Plan (PIP) or not. 

**Relevant text snippet:**
"The MAH should submit the paediatric studies within six months of its completion and irrespective whether or not it is part of a PIP (completed/or not yet completed) or whether it is intended for submission later on as part of a variation, extension or new stand-alone marketing-authorisation application or not."

**Source:**
[Submitting results of paediatric studies](https://www.ema.europa.eu/en/human-regulatory-overview/post-authorisation/paediatric-medicines-post-authorisation/submitting-results-paediatric-studies)


**Answer:**
The deadline to submit an Article 46 paediatric study is **six months from the completion of the study**. This deadline applies regardless of whether the study is part of a Paediatric Investigation Plan (PIP) or not. 

**Relevant text snippet:**
"The MAH should submit the paediatric studies within six months of its completion and irrespective whether or not it is part of a PIP (completed/or not yet completed) or whether it is intended for submission later on as part of a variation, extension or new stand-alone marketing-authorisation application or not."

**Source:**
[Submitting results of paediatric studies](https://www.ema.europa.eu/en/human-regulatory-overview/post-authorisation/paediatric-medicines-post-authorisation/submitting-results-paediatric-studies)
