# Webscraping the codebooks

In [121]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import urllib3
import regex
import re
import os
from tqdm.notebook import tqdm
import pickle

To get the codebooks I need to:
- request the codebook html page
- get the html
- pull out the name for each variable
- drop them in a list
- pull out the description for each variable
- drop them in a list
- merge those lists into a dataframe

From there, I will need to:
- get the links for each variable name
- request each page
- get the values from the value column
- drop them in a list
- get the values from the unweighted column
- drop them in a list
- get the values from the weighted value column
- drop them in a list
- merge those lists into a dataframe

Finally:
- join the variable dfs onto the var+description df in a way where the variable name and description are repeated for each value pertaining to them

Then I will be able to use the codebooks to replace the coded values in the data with meaningful variable names and descriptions (hopefully).

### *Define Functions*

In [12]:
# define function to retrieve the links from the page(collapse above)

def getPUFLinks(URL, puf_ids=None):
    """Collect the detail-page links for the wanted PUFs from a search-results page.

    Parameters
    ----------
    URL : str
        Address of the results page to scrape.
    puf_ids : collection of str, optional
        PUF numbers to keep. When omitted, the notebook-level ``PUFids``
        variable is used, which is how the existing cells call this function.

    Returns
    -------
    list of str
        The ``href`` of every matching link, exactly as written in the page
        and in page order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    wanted = PUFids if puf_ids is None else puf_ids

    response = requests.get(URL)
    response.raise_for_status()  # do not silently parse an error page into an empty result
    # NOTE(review): no parser is named, so BeautifulSoup uses whichever one is
    # installed; left as-is so the parsed tree stays the same as before.
    soup = BeautifulSoup(response.text)
    PUFlinks = []

    for row in soup.find_all('tr', attrs={'id': 'Row1_neoTD3'}):
        anchor = row.find('a')
        if anchor is None:
            continue  # row without a link
        href = anchor.get('href')
        if not href:
            continue  # anchor without a target
        parts = href.split('=')
        if len(parts) < 2:
            continue  # no '=' in the link, so there is no PUF number to compare
        # The PUF number is the text between the first and second '='.
        if parts[1] in wanted:
            PUFlinks.append(href)

    return PUFlinks

In [75]:
#define function to get links to codebooks
def getCBLinks(URL):
    """Collect the codebook links found on a PUF detail page.

    Parameters
    ----------
    URL : str
        Address of the detail page to scrape.

    Returns
    -------
    list of str
        The ``href`` of the last anchor of each ``faqRoll_neoTD3`` row, kept
        only when it starts with ``download_data_files_codebook``. Links are
        returned exactly as written in the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(URL)
    response.raise_for_status()  # do not silently parse an error page into an empty result
    # NOTE(review): no parser is named, so BeautifulSoup uses whichever one is
    # installed; left as-is so the parsed tree stays the same as before.
    soup = BeautifulSoup(response.text)
    CBlinks = []
    prefix = 'download_data_files_codebook'

    for row in soup.find_all('tr', attrs={'id': 'faqRoll_neoTD3'}):
        anchors = row.find_all('a')
        if not anchors:
            continue  # row without any link
        href = anchors[-1].get('href')  # only the last link of the row is considered
        if href and href.startswith(prefix):
            CBlinks.append(href)

    return CBlinks

### *Population Characteristics*

In [331]:
# Get the detail-page links for the full-year population characteristics files.
# PUFids is read by getPUFLinks to decide which result rows to keep.
URL = 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp?cboDataYear=All&cboDataTypeY=1%2CHousehold+Full+Year+File&buttonYearandDataType=Search&cboPufNumber=All&SearchTitle=Population+Characteristics'
PUFids = ['HC-233', 'HC-224', 'HC-216', 'HC-209', 'HC-201', 'HC-192', 'HC-181', 'HC-171', 'HC-163', 'HC-155']

fpc = getPUFLinks(URL)
print(fpc)  # was print(fcd): an undefined name, so the cell failed on a fresh kernel

['download_data_files_detail.jsp?cboPufNumber=HC-233', 'download_data_files_detail.jsp?cboPufNumber=HC-224', 'download_data_files_detail.jsp?cboPufNumber=HC-216', 'download_data_files_detail.jsp?cboPufNumber=HC-209', 'download_data_files_detail.jsp?cboPufNumber=HC-201', 'download_data_files_detail.jsp?cboPufNumber=HC-192', 'download_data_files_detail.jsp?cboPufNumber=HC-181', 'download_data_files_detail.jsp?cboPufNumber=HC-171', 'download_data_files_detail.jsp?cboPufNumber=HC-163', 'download_data_files_detail.jsp?cboPufNumber=HC-155']


In [332]:
# Turn the relative detail-page links into absolute URLs by prefixing the site root.
site_root = 'https://meps.ahrq.gov/mepsweb/data_stats/'
PUFlinks = [site_root + word for word in fpc]
print(PUFlinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-233', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-224', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-216', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-209', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-201', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-192', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-181', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-171', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-163', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-155']


In [333]:
# Visit each detail page, collect its codebook link(s), and make them absolute.
site_root = 'https://meps.ahrq.gov/mepsweb/data_stats/'
cblinks = []

for link in PUFlinks:
    cbl = getCBLinks(link)
    cblinks += [site_root + word for word in cbl]
print(cblinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H233', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H224', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H216', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H209', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H201', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H192', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H181', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H171', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H163', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H155']


In [334]:
# Read every codebook page into a DataFrame, stored under 'cb_1', 'cb_2', ...
# in the same order as cblinks. A page that fails is reported and skipped.
fpcCBs = {}

for i, link in enumerate(cblinks, start=1):
    try:
        page = pd.read_html(link)      # every HTML table found on the page
        table = page[1]                # the second table is taken to be the codebook
    except Exception as e:
        print(f"Error processing {link}: {e}")
    else:
        key = f'cb_{i}'
        fpcCBs[key] = table

In [335]:
# Peek at the variable names of the first codebook, leaving out its last two rows.
fpcCBs['cb_1']['Name'].iloc[:-2]

0           ACCELI42
1           ACTDTY31
2           ACTDTY42
3           ACTDTY53
4           ACTLIM31
            ...     
1483        WRKLIM53
1484        YCHJ3142
1485        YCHJ4253
1486    YNOUSC42_M18
1487         YRSINUS
Name: Name, Length: 1488, dtype: object

In [163]:
# For every codebook, build the list of per-variable page URLs from its 'Name' column.
# PUFids_vars must be in the same order as the 'cb_<n>' keys of fpcCBs.
PUFids_vars = ['H233', 'H224', 'H216', 'H209', 'H201', 'H192', 'H181', 'H171', 'H163', 'H155']
varlinks = {}

# The loop variable used to be called `id`, which shadowed the builtin for the rest of the notebook.
for i, puf_id in enumerate(PUFids_vars, start=1):
    key = f'cb_{i}'
    # Was fcdCBs (undefined); the population-characteristics tables live in fpcCBs.
    # [:-2] leaves out the last two rows of the table
    # (presumably non-variable footer rows -- TODO confirm).
    varlinks[key] = ['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=' + puf_id
                     + '&varName=' + word for word in fpcCBs[key]['Name'][:-2]]


In [195]:
# Scrape the value table of every variable of ONE codebook (chosen by `key`) into a
# dictionary of DataFrames keyed 'var_0', 'var_1', ... A page that fails is
# reported and skipped. Run once per codebook/year.
key = 'cb_10'
varlinkslist = varlinks[key]

# NOTE(review): the name `dict` shadows the builtin; it is kept only because the
# pickling cell further down reads this variable by that name.
dict = {}
# tqdm wraps the list itself (not enumerate) so the bar knows the total and shows real progress.
for i, link in enumerate(tqdm(varlinkslist)):
    try:
        page = pd.read_html(link)      # every HTML table found on the page
        table = page[1]                # the second table is taken to be the value table
        k = f'var_{i}'
        dict[k] = table
    except Exception as e:
        print(f"Error processing {link}: {e}")

0it [00:00, ?it/s]

In [196]:
# Number of variable pages listed for codebook 'cb_10'.
len(varlinks['cb_10'])

1883

In [197]:
# Save the scraped tables to disk so the slow scraping loop never has to be rerun.
a = dict  # `dict` is the notebook variable filled by the scraping loop, not the builtin type

pickle_path = '../data/fpc_cbpickles/cb_10.pickle'
with open(pickle_path, 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [341]:
# Write columns 0 and 3 (variable name and description) of every codebook to its own CSV.
for cb, codebook in fpcCBs.items():
    codebook.iloc[:, [0, 3]].to_csv(f'../data/fpcCBcsvs/{cb}.csv', index=False)

### *Hospital Inpatient Stays*

In [278]:
# Get the detail-page links for the hospital inpatient stays event files.
# PUFids is read by getPUFLinks to decide which result rows to keep.
URL = (
    'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp'
    '?cboDataYear=All'
    '&cboDataTypeY=2%2CHousehold+Event+File'
    '&buttonYearandDataType=Search'
    '&cboPufNumber=All'
    '&SearchTitle=Hospital+Inpatient+Stays'
)
PUFids = [
    'HC-229D', 'HC-220D', 'HC-213D', 'HC-206D', 'HC-197D',
    'HC-188D', 'HC-178D', 'HC-168D', 'HC-160D', 'HC-152D',
]

his = getPUFLinks(URL)
print(his)

['download_data_files_detail.jsp?cboPufNumber=HC-229D', 'download_data_files_detail.jsp?cboPufNumber=HC-220D', 'download_data_files_detail.jsp?cboPufNumber=HC-213D', 'download_data_files_detail.jsp?cboPufNumber=HC-206D', 'download_data_files_detail.jsp?cboPufNumber=HC-197D', 'download_data_files_detail.jsp?cboPufNumber=HC-188D', 'download_data_files_detail.jsp?cboPufNumber=HC-178D', 'download_data_files_detail.jsp?cboPufNumber=HC-168D', 'download_data_files_detail.jsp?cboPufNumber=HC-160D', 'download_data_files_detail.jsp?cboPufNumber=HC-152D']


In [199]:
# Turn the relative detail-page links into absolute URLs by prefixing the site root.
site_root = 'https://meps.ahrq.gov/mepsweb/data_stats/'
PUFlinks = [site_root + word for word in his]
print(PUFlinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-229D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-220D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-213D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-206D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-197D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-188D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-178D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-168D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-160D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-152D']


In [200]:
# Visit each detail page, collect its codebook link(s), and make them absolute.
site_root = 'https://meps.ahrq.gov/mepsweb/data_stats/'
cblinks = []

for link in PUFlinks:
    cbl = getCBLinks(link)
    cblinks += [site_root + word for word in cbl]
print(cblinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H229D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H220D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H213D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H206D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H197D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H188D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H178D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H168D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H160D', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H152D']


In [206]:
# Read every codebook page into a DataFrame, stored under 'cb_1', 'cb_2', ...
# in the same order as cblinks. A page that fails is reported and skipped.
hisCBs = {}

for i, link in enumerate(cblinks, start=1):
    try:
        page = pd.read_html(link)      # every HTML table found on the page
        table = page[1]                # the second table is taken to be the codebook
    except Exception as e:
        print(f"Error processing {link}: {e}")
    else:
        key = f'cb_{i}'
        hisCBs[key] = table

In [212]:
# Peek at the variable names of the first codebook, leaving out its last two rows.
hisCBs['cb_1']['Name'].iloc[:-2]

0      ANYOPER
1     DSCHPMED
2         DUID
3     DUPERSID
4     EMERROOM
5     ERHEVIDX
6      EVENTRN
7      EVNTIDX
8      FFEEIDX
9     FFIPTYPE
10     IMPFLAG
11     IPBEGMM
12     IPBEGYR
13    IPDMD21X
14    IPDMR21X
15    IPDOF21X
16    IPDOT21X
17    IPDPV21X
18    IPDSF21X
19    IPDSL21X
20    IPDTC21X
21    IPDTR21X
22    IPDVA21X
23    IPDWC21X
24    IPDXP21X
25     IPENDMM
26     IPENDYR
27    IPFMD21X
28    IPFMR21X
29    IPFOF21X
30    IPFOT21X
31    IPFPV21X
32    IPFSF21X
33    IPFSL21X
34    IPFTC21X
35    IPFTR21X
36    IPFVA21X
37    IPFWC21X
38    IPFXP21X
39     IPTC21X
40     IPXP21X
41     MPCDATA
42    NUMNIGHX
43       PANEL
44    PERWT21F
45         PID
46    RSNINHOS
47    SPECCOND
48      VARPSU
49      VARSTR
Name: Name, dtype: object

In [208]:
# For every codebook, build the list of per-variable page URLs from its 'Name' column.
# PUFids_vars must be in the same order as the 'cb_<n>' keys of hisCBs.

PUFids_vars = ['H229D', 'H220D', 'H213D', 'H206D', 'H197D', 'H188D', 'H178D', 'H168D', 'H160D', 'H152D']
varlinks = {}

# The loop variable used to be called `id`, which shadowed the builtin for the rest of the notebook.
for i, puf_id in enumerate(PUFids_vars, start=1):
    key = f'cb_{i}'
    # [:-2] leaves out the last two rows of the table
    # (presumably non-variable footer rows -- TODO confirm).
    varlinks[key] = ['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=' + puf_id
                     + '&varName=' + word for word in hisCBs[key]['Name'][:-2]]


In [236]:
# Scrape the value table of every variable of ONE codebook (chosen by `key`) into a
# dictionary of DataFrames keyed 'var_0', 'var_1', ... A page that fails is
# reported and skipped. Run once per codebook/year.
key = 'cb_10'
varlinkslist = varlinks[key]

# NOTE(review): the name `dict` shadows the builtin; it is kept only because the
# pickling cell further down reads this variable by that name.
dict = {}
# tqdm wraps the list itself (not enumerate) so the bar knows the total and shows real progress.
for i, link in enumerate(tqdm(varlinkslist)):
    try:
        page = pd.read_html(link)      # every HTML table found on the page
        table = page[1]                # the second table is taken to be the value table
        k = f'var_{i}'
        dict[k] = table
    except Exception as e:
        print(f"Error processing {link}: {e}")

0it [00:00, ?it/s]

In [237]:
# Number of variable pages listed for codebook 'cb_10'.
len(varlinks['cb_10'])

69

In [238]:
# Save the scraped tables to disk so the slow scraping loop never has to be rerun.
a = dict  # `dict` is the notebook variable filled by the scraping loop, not the builtin type

pickle_path = '../data/his_cbpickles/cb_10.pickle'
with open(pickle_path, 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [342]:
# Write columns 0 and 3 (variable name and description) of every codebook to its own CSV.
for cb, codebook in hisCBs.items():
    codebook.iloc[:, [0, 3]].to_csv(f'../data/hisCBcsvs/{cb}.csv', index=False)

### *Emergency Room Visits*

In [277]:
# Get the detail-page links for the emergency room visits event files.
# PUFids is read by getPUFLinks to decide which result rows to keep.
URL = (
    'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp'
    '?cboDataYear=All'
    '&cboDataTypeY=2%2CHousehold+Event+File'
    '&buttonYearandDataType=Search'
    '&cboPufNumber=All'
    '&SearchTitle=Emergency+Room+Visits'
)
PUFids = [
    'HC-229E', 'HC-220E', 'HC-213E', 'HC-206E', 'HC-197E',
    'HC-188E', 'HC-178E', 'HC-168E', 'HC-160E', 'HC-152E',
]

erv = getPUFLinks(URL)
print(erv)

['download_data_files_detail.jsp?cboPufNumber=HC-229E', 'download_data_files_detail.jsp?cboPufNumber=HC-220E', 'download_data_files_detail.jsp?cboPufNumber=HC-213E', 'download_data_files_detail.jsp?cboPufNumber=HC-206E', 'download_data_files_detail.jsp?cboPufNumber=HC-197E', 'download_data_files_detail.jsp?cboPufNumber=HC-188E', 'download_data_files_detail.jsp?cboPufNumber=HC-178E', 'download_data_files_detail.jsp?cboPufNumber=HC-168E', 'download_data_files_detail.jsp?cboPufNumber=HC-160E', 'download_data_files_detail.jsp?cboPufNumber=HC-152E']


In [240]:
# Turn the relative detail-page links into absolute URLs by prefixing the site root.
site_root = 'https://meps.ahrq.gov/mepsweb/data_stats/'
PUFlinks = [site_root + word for word in erv]
print(PUFlinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-229E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-220E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-213E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-206E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-197E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-188E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-178E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-168E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-160E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-152E']


In [241]:
# Visit each detail page, collect its codebook link(s), and make them absolute.
site_root = 'https://meps.ahrq.gov/mepsweb/data_stats/'
cblinks = []

for link in PUFlinks:
    cbl = getCBLinks(link)
    cblinks += [site_root + word for word in cbl]
print(cblinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H229E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H220E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H213E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H206E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H197E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H188E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H178E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H168E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H160E', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H152E']


In [242]:
# Read every codebook page into a DataFrame, stored under 'cb_1', 'cb_2', ...
# in the same order as cblinks. A page that fails is reported and skipped.
ervCBs = {}

for i, link in enumerate(cblinks, start=1):
    try:
        page = pd.read_html(link)      # every HTML table found on the page
        table = page[1]                # the second table is taken to be the codebook
    except Exception as e:
        print(f"Error processing {link}: {e}")
    else:
        key = f'cb_{i}'
        ervCBs[key] = table

In [243]:
# Peek at the variable names of the first codebook, leaving out its last two rows.
ervCBs['cb_1']['Name'].iloc[:-2]

0             DUID
1         DUPERSID
2          EKG_M18
3         ERDATEMM
4         ERDATEYR
5         ERDMD21X
6         ERDMR21X
7         ERDOF21X
8         ERDOT21X
9         ERDPV21X
10        ERDSF21X
11        ERDSL21X
12        ERDTC21X
13        ERDTR21X
14        ERDVA21X
15        ERDWC21X
16        ERDXP21X
17        ERFMD21X
18        ERFMR21X
19        ERFOF21X
20        ERFOT21X
21        ERFPV21X
22        ERFSF21X
23        ERFSL21X
24        ERFTC21X
25        ERFTR21X
26        ERFVA21X
27        ERFWC21X
28        ERFXP21X
29        ERHEVIDX
30         ERTC21X
31         ERXP21X
32         EVENTRN
33         EVNTIDX
34         FFEEIDX
35        FFERTYPE
36         IMPFLAG
37     LABTEST_M18
38      MAMMOG_M18
39        MEDPRESC
40         MPCDATA
41         MRI_M18
42           PANEL
43        PERWT21F
44             PID
45      RCVVAC_M18
46    SONOGRAM_M18
47        SURGPROC
48          VARPSU
49          VARSTR
50        VSTCTGRY
51        VSTRELCN
52       XRA

In [244]:
# For every codebook, build the list of per-variable page URLs from its 'Name' column.
# PUFids_vars must be in the same order as the 'cb_<n>' keys of ervCBs.

PUFids_vars = ['H229E', 'H220E', 'H213E', 'H206E', 'H197E', 'H188E', 'H178E', 'H168E', 'H160E', 'H152E']
varlinks = {}

# The loop variable used to be called `id`, which shadowed the builtin for the rest of the notebook.
for i, puf_id in enumerate(PUFids_vars, start=1):
    key = f'cb_{i}'
    # Was obvCBs: that is the office-based-visits dictionary, which is only created
    # in a later section; the emergency-room tables live in ervCBs.
    # [:-2] leaves out the last two rows of the table
    # (presumably non-variable footer rows -- TODO confirm).
    varlinks[key] = ['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=' + puf_id
                     + '&varName=' + word for word in ervCBs[key]['Name'][:-2]]


In [272]:
# Scrape the value table of every variable of ONE codebook (chosen by `key`) into a
# dictionary of DataFrames keyed 'var_0', 'var_1', ... A page that fails is
# reported and skipped. Run once per codebook/year.
key = 'cb_10'
varlinkslist = varlinks[key]

# NOTE(review): the name `dict` shadows the builtin; it is kept only because the
# pickling cell further down reads this variable by that name.
dict = {}
# tqdm wraps the list itself (not enumerate) so the bar knows the total and shows real progress.
for i, link in enumerate(tqdm(varlinkslist)):
    try:
        page = pd.read_html(link)      # every HTML table found on the page
        table = page[1]                # the second table is taken to be the value table
        k = f'var_{i}'
        dict[k] = table
    except Exception as e:
        print(f"Error processing {link}: {e}")

0it [00:00, ?it/s]

In [273]:
# Number of variable pages listed for codebook 'cb_10'.
len(varlinks['cb_10'])

71

In [274]:
# Save the scraped tables to disk so the slow scraping loop never has to be rerun.
a = dict  # `dict` is the notebook variable filled by the scraping loop, not the builtin type

pickle_path = '../data/erv_cbpickles/cb_10.pickle'
with open(pickle_path, 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [343]:
# Write columns 0 and 3 (variable name and description) of every codebook to its own CSV.
for cb, codebook in ervCBs.items():
    codebook.iloc[:, [0, 3]].to_csv(f'../data/ervCBcsvs/{cb}.csv', index=False)

### *Office-Based Medical Provider Visits*

In [280]:
# Get the detail-page links for the office-based medical provider visits event files.
# PUFids is read by getPUFLinks to decide which result rows to keep.
URL = (
    'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_results.jsp'
    '?cboDataYear=All'
    '&cboDataTypeY=2%2CHousehold+Event+File'
    '&buttonYearandDataType=Search'
    '&cboPufNumber=All'
    '&SearchTitle=Office-Based+Medical+Provider+Visits'
)
PUFids = [
    'HC-229G', 'HC-220G', 'HC-213G', 'HC-206G', 'HC-197G',
    'HC-188G', 'HC-178G', 'HC-168G', 'HC-160G', 'HC-152G',
]

obv = getPUFLinks(URL)
print(obv)

['download_data_files_detail.jsp?cboPufNumber=HC-229G', 'download_data_files_detail.jsp?cboPufNumber=HC-220G', 'download_data_files_detail.jsp?cboPufNumber=HC-213G', 'download_data_files_detail.jsp?cboPufNumber=HC-206G', 'download_data_files_detail.jsp?cboPufNumber=HC-197G', 'download_data_files_detail.jsp?cboPufNumber=HC-188G', 'download_data_files_detail.jsp?cboPufNumber=HC-178G', 'download_data_files_detail.jsp?cboPufNumber=HC-168G', 'download_data_files_detail.jsp?cboPufNumber=HC-160G', 'download_data_files_detail.jsp?cboPufNumber=HC-152G']


In [281]:
# Turn the relative detail-page links into absolute URLs by prefixing the site root.
site_root = 'https://meps.ahrq.gov/mepsweb/data_stats/'
PUFlinks = [site_root + word for word in obv]
print(PUFlinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-229G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-220G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-213G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-206G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-197G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-188G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-178G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-168G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-160G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_detail.jsp?cboPufNumber=HC-152G']


In [282]:
# Visit each detail page, collect its codebook link(s), and make them absolute.
site_root = 'https://meps.ahrq.gov/mepsweb/data_stats/'
cblinks = []

for link in PUFlinks:
    cbl = getCBLinks(link)
    cblinks += [site_root + word for word in cbl]
print(cblinks)

['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H229G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H220G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H213G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H206G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H197G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H188G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H178G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H168G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H160G', 'https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=H152G']


In [283]:
# Read every codebook page into a DataFrame, stored under 'cb_1', 'cb_2', ...
# in the same order as cblinks. A page that fails is reported and skipped.
obvCBs = {}

for i, link in enumerate(cblinks, start=1):
    try:
        page = pd.read_html(link)      # every HTML table found on the page
        table = page[1]                # the second table is taken to be the codebook
    except Exception as e:
        print(f"Error processing {link}: {e}")
    else:
        key = f'cb_{i}'
        obvCBs[key] = table

In [284]:
# Peek at the variable names of the first codebook, leaving out its last two rows.
obvCBs['cb_1']['Name'].iloc[:-2]

0           DOCATLOC
1        DRSPLTY_M18
2               DUID
3           DUPERSID
4            EKG_M18
5            EVENTRN
6            EVNTIDX
7            FFBEF21
8            FFEEIDX
9           FFOBTYPE
10           FFTOT22
11           IMPFLAG
12       LABTEST_M18
13        MAMMOG_M18
14          MEDPRESC
15      MEDPTYPE_M18
16           MPCDATA
17           MPCELIG
18           MRI_M18
19          OBDATEMM
20          OBDATEYR
21           OBMD21X
22           OBMR21X
23           OBOF21X
24           OBOT21X
25           OBPV21X
26           OBSF21X
27           OBSL21X
28           OBTC21X
29           OBTR21X
30           OBVA21X
31           OBWC21X
32           OBXP21X
33             PANEL
34          PERWT21F
35               PID
36        RCVVAC_M18
37        SEEDOC_M18
38      SONOGRAM_M18
39          SURGPROC
40    TELEHEALTHFLAG
41            VARPSU
42            VARSTR
43         VISITTYPE
44          VSTCTGRY
45      VSTRELCN_M18
46         XRAYS_M18
Name: Name, d

In [285]:
# For every codebook, build the list of per-variable page URLs from its 'Name' column.
# PUFids_vars must be in the same order as the 'cb_<n>' keys of obvCBs.

PUFids_vars = ['H229G', 'H220G', 'H213G', 'H206G', 'H197G', 'H188G', 'H178G', 'H168G', 'H160G', 'H152G']
varlinks = {}

# The loop variable used to be called `id`, which shadowed the builtin for the rest of the notebook.
for i, puf_id in enumerate(PUFids_vars, start=1):
    key = f'cb_{i}'
    # [:-2] leaves out the last two rows of the table
    # (presumably non-variable footer rows -- TODO confirm).
    varlinks[key] = ['https://meps.ahrq.gov/mepsweb/data_stats/download_data_files_codebook.jsp?PUFId=' + puf_id
                     + '&varName=' + word for word in obvCBs[key]['Name'][:-2]]


In [314]:
# Scrape the value table of every variable of ONE codebook (chosen by `key`) into a
# dictionary of DataFrames keyed 'var_0', 'var_1', ... A page that fails is
# reported and skipped. Run once per codebook/year.
key = 'cb_10'
varlinkslist = varlinks[key]

# NOTE(review): the name `dict` shadows the builtin; it is kept only because the
# pickling cell further down reads this variable by that name.
dict = {}
# tqdm wraps the list itself (not enumerate) so the bar knows the total and shows real progress.
for i, link in enumerate(tqdm(varlinkslist)):
    try:
        page = pd.read_html(link)      # every HTML table found on the page
        table = page[1]                # the second table is taken to be the value table
        k = f'var_{i}'
        dict[k] = table
    except Exception as e:
        print(f"Error processing {link}: {e}")

0it [00:00, ?it/s]

In [315]:
# Number of variable pages listed for codebook 'cb_10'.
len(varlinks['cb_10'])

75

In [316]:
# Save the scraped tables to disk so the slow scraping loop never has to be rerun.
a = dict  # `dict` is the notebook variable filled by the scraping loop, not the builtin type

pickle_path = '../data/obv_cbpickles/cb_10.pickle'
with open(pickle_path, 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [344]:
# Write columns 0 and 3 (variable name and description) of every codebook to its own CSV.
# The loop variable stays named `cb` because a later cell reads it after this loop ends.
for cb, codebook in obvCBs.items():
    codebook.iloc[:, [0, 3]].to_csv(f'../data/obvCBcsvs/{cb}.csv', index=False)

In [340]:
# Show columns 0 and 3 of one office-based-visits codebook.
# NOTE(review): `cb` is not assigned in this cell; it holds whatever value an earlier
# `for cb in ...` loop left behind -- confirm which codebook is being displayed.
obvCBs[cb].iloc[:, [0, 3]]

Unnamed: 0,Name,Description
0,ANESTH,THIS VISIT DID P RECEIVE ANESTHESIA
1,CHEMOTH,THIS VISIT DID P HAVE CHEMOTHERAPY
2,DOCATLOC,ANY MD WORK AT LOCATION WHERE P SAW PROV
3,DRSPLTY,MVIS DOCTOR'S SPECIALTY
4,DRUGTRT,THIS VIS DID P HAVE TRT FOR DRUG/ALCOHOL
...,...,...
72,VSTCTGRY,BEST CATEGORY FOR CARE P RECV ON VST DT
73,VSTRELCN,THIS VST/PHONE CALL RELATED TO SPEC COND
74,XRAYS,THIS VISIT DID P HAVE X-RAYS
75,,
