# Webscraping the data

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib3
import regex
import re
import os

### *Define Functions*

In [4]:
# define function to retrieve the links from the page(collapse above)

def getPUFLinks(URL, puf_ids=None):
    """Collect the hrefs of the wanted PUF entries from a results page.

    Every ``<tr id="Row1_neoTD3">`` row of the page is inspected; the href of
    its first ``<a>`` is kept when the text between the first and second
    ``=`` of that href is one of the wanted PUF identifiers.

    Parameters
    ----------
    URL : str
        Address of the results page to fetch.
    puf_ids : collection of str, optional
        PUF identifiers to keep. When omitted, the notebook-level ``PUFids``
        variable is used (looked up at call time), as before.

    Returns
    -------
    list of str
        Matching hrefs, in page order, exactly as written in the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    if puf_ids is None:
        puf_ids = PUFids  # fall back to the notebook global

    response = requests.get(URL, timeout=60)
    # fail loudly instead of scraping an error page and returning []
    response.raise_for_status()
    # name the parser explicitly so results do not depend on what is installed
    soup = BeautifulSoup(response.text, 'html.parser')
    PUFlinks = []

    for row in soup.find_all('tr', attrs={'id': 'Row1_neoTD3'}):
        anchor = row.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue  # row without a usable link
        parts = href.split('=')
        if len(parts) > 1 and parts[1] in puf_ids:
            PUFlinks.append(href)

    return PUFlinks

In [5]:
#alter function to retrieve SAS links
def getSASLinks(URL, suffix='ssp.zip'):
    """Collect the hrefs on a page whose target ends with ``suffix``.

    Every ``<tr id="faqRoll_neoTD3">`` row of the page is inspected and the
    href of its first ``<a>`` is kept when it ends with ``suffix``.

    Parameters
    ----------
    URL : str
        Address of the page to fetch.
    suffix : str, optional
        Required ending of the href; defaults to ``'ssp.zip'``.

    Returns
    -------
    list of str
        Matching hrefs, in page order, exactly as written in the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    response = requests.get(URL, timeout=60)
    # fail loudly instead of scraping an error page and returning []
    response.raise_for_status()
    # name the parser explicitly so results do not depend on what is installed
    soup = BeautifulSoup(response.text, 'html.parser')
    SASlinks = []

    for row in soup.find_all('tr', attrs={'id': 'faqRoll_neoTD3'}):
        anchor = row.find('a')
        href = anchor.get('href') if anchor is not None else None
        # skip rows with no anchor or no href instead of raising AttributeError
        if href and href.endswith(suffix):
            SASlinks.append(href)

    return SASlinks

In [6]:
#alter function to retrieve SASv9 links
def getV9Links(URL, suffix='v9.zip'):
    """Collect the hrefs on a page whose target ends with ``suffix``.

    Every ``<tr id="faqRoll_neoTD3">`` row of the page is inspected and the
    href of its first ``<a>`` is kept when it ends with ``suffix``.

    Parameters
    ----------
    URL : str
        Address of the page to fetch.
    suffix : str, optional
        Required ending of the href; defaults to ``'v9.zip'``.

    Returns
    -------
    list of str
        Matching hrefs, in page order, exactly as written in the page.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    response = requests.get(URL, timeout=60)
    # fail loudly instead of scraping an error page and returning []
    response.raise_for_status()
    # name the parser explicitly so results do not depend on what is installed
    soup = BeautifulSoup(response.text, 'html.parser')
    SASlinks = []

    for row in soup.find_all('tr', attrs={'id': 'faqRoll_neoTD3'}):
        anchor = row.find('a')
        href = anchor.get('href') if anchor is not None else None
        # skip rows with no anchor or no href instead of raising AttributeError
        if href and href.endswith(suffix):
            SASlinks.append(href)

    return SASlinks

In [7]:
#define function to download the linked data files
def dlFiles(list, dir):
    """Download every relative link in ``list`` into the directory ``dir``.

    Each entry is appended to the fixed prefix
    ``https://meps.ahrq.gov/mepsweb/data_stats/`` to form the download URL,
    and the response body is written to ``dir`` under the URL's base name.
    A failed request is reported with ``print`` and the loop moves on to the
    next entry, so one bad link no longer stops the remaining downloads.

    Parameters
    ----------
    list : iterable of str
        Links relative to the prefix above. (The name shadows the builtin
        inside this function only; it is kept for compatibility.)
    dir : str
        Target directory; created, with parents, when it does not exist.

    Returns
    -------
    None
    """
    prefix = 'https://meps.ahrq.gov/mepsweb/data_stats/'

    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(dir, exist_ok=True)

    for url in list:
        url = prefix + url
        # the try covers a single download so a failure only skips this file
        try:
            response = requests.get(url, timeout=60)
            response.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f'HTTP error occurred for {url}: {e}')
            continue
        except requests.exceptions.RequestException as e:
            print(f'Error occurred for {url}: {e}')
            continue

        filename = os.path.join(dir, os.path.basename(url))
        with open(filename, 'wb') as file:  # binary mode: the payload is a zip
            file.write(response.content)
         

### *Full Year Population Characteristics*

In [45]:
URL = 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=1%2CHousehold+Full+Year+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Population+Characteristics'
PUFids = ['HC-233', 'HC-224', 'HC-216', 'HC-209', 'HC-201', 'HC-192', 'HC-181', 'HC-171', 'HC-163', 'HC-155']

In [46]:
# get page link text
fpc = getPUFLinks(URL)

In [47]:
#turn link text into full links to pages

PUFlinks = (['https://meps.ahrq.gov/mepsweb/data_stats/' + word for word in fpc])
print(PUFlinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-233', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-224', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-216', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-209', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-201', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-192', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-181', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-171', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-163', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-155']


In [20]:
#get the links to the SAS files

SASids = []

for link in PUFlinks:
    SASids.extend(getSASLinks(link))
SASids

['../data_files/pufs/h233/h233ssp.zip',
 '../data_files/pufs/h224/h224ssp.zip',
 '../data_files/pufs/h216/h216ssp.zip',
 '../data_files/pufs/h209/h209ssp.zip',
 '../data_files/pufs/h201/h201ssp.zip',
 '../data_files/pufs/h192ssp.zip',
 '../data_files/pufs/h181ssp.zip',
 '../data_files/pufs/h171ssp.zip',
 '../data_files/pufs/h163ssp.zip',
 '../data_files/pufs/h155ssp.zip']

In [48]:
#get the links to the V9 files

V9ids = []

for link in PUFlinks:
    # accumulate in V9ids (not SASids) so the two link lists stay separate
    V9ids.extend(getV9Links(link))
V9ids

['../data_files/pufs/h233/h233v9.zip',
 '../data_files/pufs/h224/h224v9.zip',
 '../data_files/pufs/h216/h216v9.zip',
 '../data_files/pufs/h209/h209v9.zip',
 '../data_files/pufs/h201/h201v9.zip']

In [None]:
#remove the ones I already have SAS for

SASids = SASids[5:]
SASids

In [None]:
#put all ids in one list
# (V9ids holds the v9 zip links; no ZIPids variable is defined in this notebook)
allids = V9ids + SASids
allids

In [49]:
#call function to download files
# arguments are passed directly so the builtins `list` and `dir` are not
# rebound in the notebook namespace
dlFiles(allids, '../data/fpc')

### *Hospital Inpatient Stays*

In [65]:
URL = 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Hospital+Inpatient+Stays'
PUFids = ['HC-229D', 'HC-220D', 'HC-213D', 'HC-206D', 'HC-197D', 'HC-188D', 'HC-178D', 'HC-168D', 'HC-160D', 'HC-152D']

In [66]:
# get page link text
his = getPUFLinks(URL)

In [67]:
#turn link text into full links to pages

PUFlinks = (['https://meps.ahrq.gov/mepsweb/data_stats/' + word for word in his])
print(PUFlinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-229D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-220D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-213D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-206D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-197D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-188D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-178D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-168D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-160D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-152D']


In [68]:
#get the links to the SAS files

SASids = []

for link in PUFlinks:
    SASids.extend(getSASLinks(link))
SASids

['../data_files/pufs/h229d/h229dssp.zip',
 '../data_files/pufs/h220d/h220dssp.zip',
 '../data_files/pufs/h213d/h213dssp.zip',
 '../data_files/pufs/h206d/h206dssp.zip',
 '../data_files/pufs/h197d/h197dssp.zip',
 '../data_files/pufs/h188dssp.zip',
 '../data_files/pufs/h178dssp.zip',
 '../data_files/pufs/h168dssp.zip',
 '../data_files/pufs/h160dssp.zip',
 '../data_files/pufs/h152dssp.zip']

In [54]:
#get the links to the V9 files

V9ids = []

for link in PUFlinks:
    # accumulate in V9ids (not SASids) so the two link lists stay separate
    V9ids.extend(getV9Links(link))
V9ids

['../data_files/pufs/h229d/h229dv9.zip',
 '../data_files/pufs/h220d/h220dv9.zip',
 '../data_files/pufs/h213d/h213dv9.zip',
 '../data_files/pufs/h206d/h206dv9.zip',
 '../data_files/pufs/h197d/h197dv9.zip']

In [None]:
#remove the ones I already have SAS for

SASids = SASids[5:]
SASids

In [None]:
#put all ids in one list
# (V9ids holds the v9 zip links; no ZIPids variable is defined in this notebook)
allids = V9ids + SASids
allids

In [55]:
#call function to download files
# arguments are passed directly so the builtins `list` and `dir` are not
# rebound in the notebook namespace
dlFiles(allids, '../data/his')

### *Emergency Room Visits*

In [56]:
URL = 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Emergency+Room+Visits'
PUFids = ['HC-229E', 'HC-220E', 'HC-213E', 'HC-206E', 'HC-197E', 'HC-188E', 'HC-178E', 'HC-168E', 'HC-160E', 'HC-152E']

In [57]:
# get page link text
erv = getPUFLinks(URL)

In [58]:
#turn link text into full links to pages

PUFlinks = (['https://meps.ahrq.gov/mepsweb/data_stats/' + word for word in erv])
print(PUFlinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-229E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-220E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-213E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-206E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-197E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-188E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-178E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-168E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-160E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-152E']


In [None]:
#get the links to the SAS files

SASids = []

for link in PUFlinks:
    SASids.extend(getSASLinks(link))
SASids

In [59]:
#get the links to the V9 files

V9ids = []

for link in PUFlinks:
    # accumulate in V9ids (not SASids) so the two link lists stay separate
    V9ids.extend(getV9Links(link))
V9ids

['../data_files/pufs/h229e/h229ev9.zip',
 '../data_files/pufs/h220e/h220ev9.zip',
 '../data_files/pufs/h213e/h213ev9.zip',
 '../data_files/pufs/h206e/h206ev9.zip',
 '../data_files/pufs/h197e/h197ev9.zip']

In [None]:
#remove the ones I already have SAS for

SASids = SASids[5:]
SASids

In [None]:
#put all ids in one list
# (V9ids holds the v9 zip links; no ZIPids variable is defined in this notebook)
allids = V9ids + SASids
allids

In [60]:
#call function to download files
# arguments are passed directly so the builtins `list` and `dir` are not
# rebound in the notebook namespace
dlFiles(allids, '../data/erv')

### *Office-Based Medical Provider Visits*

In [2]:
URL = 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=2%2CHousehold+Event+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Office-Based+Medical+Provider+Visits'
PUFids = ['HC-229G', 'HC-220G', 'HC-213G', 'HC-206G', 'HC-197G', 'HC-188G', 'HC-178G', 'HC-168G', 'HC-160G', 'HC-152G']

In [8]:
# get page link text
obv = getPUFLinks(URL)

In [9]:
#turn link text into full links to pages

PUFlinks = (['https://meps.ahrq.gov/mepsweb/data_stats/' + word for word in obv])
print(PUFlinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-229G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-220G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-213G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-206G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-197G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-188G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-178G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-168G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-160G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-152G']


In [None]:
#get the links to the SAS files

SASids = []

for link in PUFlinks:
    SASids.extend(getSASLinks(link))
SASids

In [10]:
#get the links to the V9 files

V9ids = []

for link in PUFlinks:
    # accumulate in V9ids (not SASids) so the two link lists stay separate
    V9ids.extend(getV9Links(link))
V9ids

['../data_files/pufs/h229g/h229gv9.zip',
 '../data_files/pufs/h220g/h220gv9.zip',
 '../data_files/pufs/h213g/h213gv9.zip',
 '../data_files/pufs/h206g/h206gv9.zip',
 '../data_files/pufs/h197g/h197gv9.zip']

In [None]:
#remove the ones i have SAS for already

SASids = SASids[5:]
SASids

In [None]:
#put all ids in one list
allids = SASids + V9ids
allids

In [15]:
#call function to download files
# arguments are passed directly so the builtins `list` and `dir` are not
# rebound in the notebook namespace
dlFiles(allids, '../data/obv')