# NDA Database generation

## Scraping the FDA website to populate a database with:
1. Drug name
2. Drug maker
3. Event date
4. Event type
5. Company Name

In [2]:
from bs4 import BeautifulSoup
import urllib.request
import numpy as np
import re
import pandas as pd
from IPython import embed
import random

In [3]:
def sourceGetter(urlSource=
                 'http://www.fda.gov/BiologicsBloodVaccines/DevelopmentApprovalProcess/BiologicalApprovalsbyYear/default.htm'):
    """Collect the URLs of the per-year FDA biologics approval pages.

    urlSource -- address of the index page that lists the links to the
                 individual year pages (defaults to the FDA biologics
                 "approvals by year" page).

    Returns a list of absolute URLs, one for every matching link found in
    the page's 'section-menu' element. Each URL is intended as input for
    blaGetter, which locates the event lists inside a year page.
    """
    # Download the raw html of the index page.
    with urllib.request.urlopen(urlSource) as response:
        pageHtml = response.read()

    # Parse the html into a tree and locate the menu holding the year links.
    menu = BeautifulSoup(pageHtml, 'lxml').find(id='section-menu')

    # Keep only anchors whose class list is exactly ['list-group-item'] and
    # prefix each href with the FDA host to obtain an absolute URL.
    return [
        'http://www.fda.gov' + anchor.get('href')
        for anchor in menu.find_all('a')
        if anchor.get('class') == ['list-group-item']
    ]


    

In [None]:
def blaGetter(blaSource = 
              'http://www.fda.gov/BiologicsBloodVaccines/DevelopmentApprovalProcess/BiologicalApprovalsbyYear/ucm482392.htm',
             n = 3):
    """Yield the absolute URLs listed on one yearly approvals page.

    blaSource -- URL of a single year page (one element of the list returned
                 by sourceGetter).
    n         -- currently unused; kept so existing callers keep working.

    This is a generator: the page is only downloaded when the first item is
    requested. It yields 'http://www.fda.gov' + href for each <li> of the
    first <ul> found in the page's <article> (after the article's <header>
    has been removed). According to the original notes these are the links
    to the BLA, supplement and NDA event lists of that year.

    Pages that lack the expected <article>/<ul> structure yield nothing, and
    list items without a link are skipped, so a consumer that only handles
    StopIteration is not hit by an AttributeError or TypeError from here.
    """
    with urllib.request.urlopen(blaSource) as response: # imports html from url
        html = response.read()

    article = BeautifulSoup(html, 'lxml').find('article') # parse and navigate the tree
    if article is None:
        return

    # Drop the header so that a list inside it is not mistaken for the
    # list of event links.
    header = article.find('header')
    if header is not None:
        header.extract()

    linkList = article.find('ul')
    if linkList is None:
        return

    for item in linkList.find_all('li'):
        anchor = item.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href is None:
            # No usable link in this list item; nothing to yield for it.
            continue
        yield 'http://www.fda.gov' + href

In [4]:
# Compile the list of year pages.
uL = sourceGetter()

# One list per event type; blaGetter yields the links of a year page in the
# order BLA approval, BLA supplement, NDA.
bL_BLAapproval = []
bL_BLAsupplement = []
bL_Bnda = []

for u in uL:
    # zip stops as soon as the generator runs dry, which covers years that do
    # not have all three regulation pages; at most three links are consumed.
    for bucket, B in zip((bL_BLAapproval, bL_BLAsupplement, bL_Bnda), blaGetter(u)):
        bucket.append(B)
        

In [13]:
def dataFinder(entry,n,eventType):
    """Turn one table row into a DataFrame holding that row's event data.

    Called by blaEntry for every <tr> of an event table.

    entry     -- parsed row; must offer find_all('td') and the .a shortcut
                 (a BeautifulSoup Tag in normal use).
    n         -- unused; kept so existing callers keep working.
    eventType -- label stored in the 'eventType' column. Either a plain
                 value or a list-like (the existing callers pass a
                 one-element list such as ['BLAapproval']).

    Returns a DataFrame with the columns drugName, STN, companyName,
    eventDate and eventType. A field that cannot be located (no link in the
    row, too few cells) is filled with the string 'error'.
    """
    # Failures expected while navigating a row: attribute access on a
    # missing element (None) and indexing past the last cell.
    missing = (AttributeError, IndexError)

    tds = entry.find_all('td') #navigating the tree and extracting information
    try:
        # extract() detaches the link text from the tree and returns it.
        drugName = entry.a.string.extract()
    except missing:
        drugName = 'error'
    try:
        STN = tds[2].string
    except missing:
        STN = 'error'
    try:
        companyName = tds[3].contents[0].string
    except missing:
        companyName = 'error'
    try:
        eventDate = tds[4].string
    except missing:
        eventDate = 'error'

    # DataFrame(dict) needs at least one list-like value to infer the row
    # count; wrap a plain eventType so scalar labels work as well.
    if not pd.api.types.is_list_like(eventType):
        eventType = [eventType]

    # populate a dict object for appending to the dataframe
    d = {'drugName':drugName, 
         'STN':STN,
         'companyName':companyName,
         'eventDate':eventDate,
         'eventType':eventType
        }

    return pd.DataFrame(data=d) #return the dict as a dataframe

In [None]:
def blaEntry(url,eventType):
    """Scrape one event-list page into a DataFrame.

    url       -- address of an event list page (as yielded by blaGetter).
    eventType -- label passed through unchanged to dataFinder so that it
                 ends up in the 'eventType' column of every row.

    Every <tr> of the page's <tbody> (or of its first <table> when there is
    no <tbody>) is handed to dataFinder, and the resulting frames are
    concatenated in document order. Row labels are not renumbered, exactly
    as before. If the page contains no table at all, a warning is issued and
    an empty frame with the expected columns is returned.
    """
    import warnings  # only needed for the no-table case below

    df = pd.DataFrame(columns = ('drugName','STN','companyName','eventDate','eventType'))
    with urllib.request.urlopen(url) as response: #html from url
        html = response.read()
    soup = BeautifulSoup(html,'lxml') #tree from html

    # Prefer the table body; fall back to the whole table when the page has
    # no explicit <tbody>.
    container = soup.tbody if soup.tbody is not None else soup.table
    if container is None:
        warnings.warn('blaEntry: no table found at ' + str(url))
        return df

    # Build all per-row frames first and concatenate once; concatenating
    # inside the loop copies the growing frame on every iteration.
    frames = [dataFinder(entry, 0, eventType) for entry in container.find_all('tr')]
    return pd.concat([df] + frames)

In [19]:
# Builds the master dataframe: every url collected via sourceGetter and
# blaGetter is scraped with blaEntry (which relies on dataFinder) and the
# resulting rows are appended in the order approvals, supplements, NDAs.
df = pd.DataFrame(columns = ('drugName','STN','companyName','eventDate','eventType'))

for urls, label in ((bL_BLAapproval, 'BLAapproval'),
                    (bL_BLAsupplement, 'BLAsupplement'),
                    (bL_Bnda, 'Bnda')):
    for b in urls:
        # The label is wrapped in a one-element list, as blaEntry's callers
        # have always passed it.
        dfT = blaEntry(b, [label])
        df = pd.concat([df, dfT])

# Write the collected data to the desktop as a csv file.
df.to_csv(path_or_buf = '/Users/Jonathan/Desktop/df.csv')