<a href="https://colab.research.google.com/github/imnoteinstien/1skills-introduction-to-github/blob/main/PharmaAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install beautifulsoup4 langchain chromadb requests pandas sqlalchemy pymupdf

Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.10.0-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting op

In [1]:
# Create base folder
!mkdir pharma-ai-chatbot

# Create subfolders
!mkdir pharma-ai-chatbot/app
!mkdir pharma-ai-chatbot/data
!mkdir pharma-ai-chatbot/models

In [2]:
# Basic requirements
!pip install openai python-dotenv gradio

# Optional API providers
!pip install google-generativeai deepseek-api

Collecting gradio
  Downloading gradio-5.13.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.6.0 (from gradio)
  Downloading gradio_client-1.6.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.9.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (from gradio)
  Downloading safehttpx-0.1.6-py3-none-any.whl.me

In [3]:
# Check folders
!ls pharma-ai-chatbot

# Check installed packages
!pip list | grep -E "openai|gradio|google-generativeai|deepseek"

app  data  models
google-generativeai                      0.8.4
gradio                                   5.13.1
gradio_client                            1.6.0
openai                                   1.59.9


In [35]:
# Create data subfolders
!mkdir -p pharma-ai-chatbot/data/fda_labels
!mkdir -p pharma-ai-chatbot/data/pubchem_compounds
!mkdir -p pharma-ai-chatbot/data/pmc_pdfs
!mkdir -p pharma-ai-chatbot/data/pmc_texts

In [76]:
%%writefile pharma-ai-chatbot/data/fda_scraper.py
import requests
import os
import time
import json
from tqdm import tqdm
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Configuration
MAX_RETRIES = 3
BACKOFF_FACTOR = 1
BATCH_SIZE = 1000  # Max allowed by FDA API
BASE_URL = "https://api.fda.gov/drug/label.json"
SAFE_FIELDS = "openfda.generic_name,indications_and_usage,dosage_and_administration,adverse_reactions"

def create_session():
    """Create a resilient requests session with retries"""
    session = requests.Session()
    retries = Retry(
        total=MAX_RETRIES,
        backoff_factor=BACKOFF_FACTOR,
        status_forcelist=[429, 500, 502, 503, 504]
    )
    session.mount('https://', HTTPAdapter(max_retries=retries))
    return session

def get_existing_ids():
    """Get set of already downloaded label IDs"""
    return set(f.split('.')[0] for f in os.listdir('pharma-ai-chatbot/data/fda_labels'))

def scrape_fda_labels():
    os.makedirs('pharma-ai-chatbot/data/fda_labels', exist_ok=True)
    session = create_session()

    try:
        # Get total count
        count_resp = session.get(
            BASE_URL,
            params={'count': 'openfda.generic_name.exact'},
            timeout=10
        )
        total = sum(item['count'] for item in count_resp.json().get('results', []))

        existing = len(get_existing_ids())
        print(f"📊 Total labels: {total} | Existing: {existing} | New: {total - existing}")

        params = {
            'limit': BATCH_SIZE,
            'skip': 0,
            'search': 'exists:openfda.generic_name',
            'sort': 'effective_time:desc',
            'fields': SAFE_FIELDS
        }

        downloaded = 0
        with tqdm(total=total-existing, desc="📥 Downloading", unit="label") as pbar:
            while True:
                try:
                    response = session.get(BASE_URL, params=params, timeout=15)
                    response.raise_for_status()
                    data = response.json()

                    if 'results' not in data:
                        print("⚠️ API structure changed - missing 'results' key")
                        break

                    batch_ids = {label.get('id') for label in data['results']}
                    existing_ids = get_existing_ids()
                    new_labels = [label for label in data['results']
                                if label.get('id') not in existing_ids]

                    # Save batch
                    for label in new_labels:
                        filename = f"pharma-ai-chatbot/data/fda_labels/{label['id']}.json"
                        with open(filename, 'w') as f:
                            json.dump(label, f)
                        downloaded += 1
                        pbar.update(1)

                    # Progress check
                    if len(new_labels) == 0 or params['skip'] + BATCH_SIZE > 10000:
                        break  # FDA API has skip limit of 10,000

                    params['skip'] += BATCH_SIZE
                    time.sleep(0.5)  # Conservative rate limiting

                except (requests.exceptions.RequestException, KeyError) as e:
                    print(f"⚠️ Temporary error: {str(e)} - Retrying...")
                    time.sleep(5)
                    continue

        # Update metadata
        metadata = {
            'last_run': datetime.now().isoformat(),
            'total_downloaded': downloaded,
            'next_start': params['skip']
        }
        with open('pharma-ai-chatbot/data/fda_labels/metadata.json', 'w') as f:
            json.dump(metadata, f)

        print(f"✅ Successfully downloaded {downloaded} new labels")

    except Exception as e:
        print(f"❌ Critical failure: {str(e)}")

if __name__ == "__main__":
    scrape_fda_labels()

Overwriting pharma-ai-chatbot/data/fda_scraper.py


In [60]:
!pip install requests tqdm pymupdf multiprocess



In [None]:
!python pharma-ai-chatbot/data/fda_scraper.py


📊 Total labels: 26588 | Existing: 62806 | New: -36218
📥 Downloading: 0label [00:00, ?label/s]⚠️ Temporary error: 400 Client Error: Bad Request for url: https://api.fda.gov/drug/label.json?limit=1000&skip=0&search=exists%3Aopenfda.generic_name&sort=effective_time%3Adesc&fields=openfda.generic_name%2Cindications_and_usage%2Cdosage_and_administration%2Cadverse_reactions - Retrying...
⚠️ Temporary error: 400 Client Error: Bad Request for url: https://api.fda.gov/drug/label.json?limit=1000&skip=0&search=exists%3Aopenfda.generic_name&sort=effective_time%3Adesc&fields=openfda.generic_name%2Cindications_and_usage%2Cdosage_and_administration%2Cadverse_reactions - Retrying...
⚠️ Temporary error: 400 Client Error: Bad Request for url: https://api.fda.gov/drug/label.json?limit=1000&skip=0&search=exists%3Aopenfda.generic_name&sort=effective_time%3Adesc&fields=openfda.generic_name%2Cindications_and_usage%2Cdosage_and_administration%2Cadverse_reactions - Retrying...
⚠️ Temporary error: 400 Client Er