## Obtaining other features

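### Opioid prescribing maps, 2014–2016
https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html

The 2015 and 2014 county tables come from the matching `rxcounty2015.html` and `rxcounty2014.html` pages and are scraped the same way.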

In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import pickle
import time

from bs4 import BeautifulSoup

In [2]:
prescription_map_2016_url = "https://www.cdc.gov/drugoverdose/maps/rxcounty2016.html"

def get_rx_map(url):
    """Scrape the first HTML table on a prescribing-rate page.

    Parameters
    ----------
    url : str
        Address of a page whose first ``<table>`` holds one record per row.

    Returns
    -------
    list of list of str
        One three-item record per data row, built from the row's ``<td>``
        cells at positions 1, 2 and 3 (zero-based). The notebook labels these
        state, INCITS code and prescribing rate. Values are raw cell text.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains no ``<table>`` element.
    """
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an error page from being parsed as if it were data.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html5lib")
    first_table = soup.find_all("table")[0]

    records = []
    # The first row is always discarded (presumably the header row).
    for row in first_table.find_all("tr")[1:]:
        cells = [cell.text for cell in row.find_all("td")]
        # Rows without at least four <td> cells (spacers, footnotes) cannot
        # supply positions 1-3, so skip them instead of raising IndexError.
        if len(cells) < 4:
            continue
        records.append([cells[1], cells[2], cells[3]])

    return records

In [3]:
# Scrape the 2016 page; each record is a three-item list of raw cell text.
rx_2016 = get_rx_map(prescription_map_2016_url)

In [4]:
# Name the three scraped fields. Every column, including the rate, still holds strings here.
opioid_rx_df = pd.DataFrame(rx_2016, columns=["state", "INCITS", "opioid_rx_rate_2016"])

In [5]:
# Repeat the scrape for the 2015 page; the result stays in its own frame
# until it is merged into opioid_rx_df on the INCITS code.
prescription_map_2015_url = "https://www.cdc.gov/drugoverdose/maps/rxcounty2015.html"

rx_2015 = get_rx_map(prescription_map_2015_url)

opioid_rx_2015 = pd.DataFrame(rx_2015, columns=["state", "INCITS", "opioid_rx_rate_2015"])

In [6]:
# Attach the 2015 rate by county code. pd.merge defaults to an inner join, so a
# county missing from either year's table is dropped from the result.
opioid_rx_df = pd.merge(opioid_rx_df, opioid_rx_2015[['INCITS','opioid_rx_rate_2015']], on='INCITS')

In [8]:
# Repeat the scrape for the 2014 page and merge its rate column in one cell.
prescription_map_2014_url = "https://www.cdc.gov/drugoverdose/maps/rxcounty2014.html"

rx_2014 = get_rx_map(prescription_map_2014_url)

opioid_rx_2014 = pd.DataFrame(rx_2014, columns=["state", "INCITS", "opioid_rx_rate_2014"])

# Inner join (the pd.merge default): only counties present in all merged years remain.
opioid_rx_df = pd.merge(opioid_rx_df, opioid_rx_2014[['INCITS','opioid_rx_rate_2014']], on='INCITS')

In [10]:
# Preview the merged frame; some rates show "–" instead of a number.
opioid_rx_df.head()

Unnamed: 0,state,INCITS,opioid_rx_rate_2016,opioid_rx_rate_2015,opioid_rx_rate_2014
0,AK,2013,–,–,–
1,AK,2016,–,–,–
2,AK,2020,66.3,68.2,68.0
3,AK,2050,–,–,–
4,AK,2060,–,–,–


In [13]:
# Convert the yearly rate columns from text to numbers. errors='coerce' turns
# anything that does not parse (such as the "–" placeholder) into NaN.
year_cols = ['opioid_rx_rate_2016', 'opioid_rx_rate_2015', 'opioid_rx_rate_2014']
for rate_col in year_cols:
    opioid_rx_df[rate_col] = pd.to_numeric(opioid_rx_df[rate_col], errors='coerce')

In [18]:
# Per-state mean of each yearly rate. Selecting year_cols keeps the string
# INCITS column out of the aggregation; pandas >= 2.0 no longer drops
# non-numeric columns silently and raises a TypeError instead.
state_avg = opioid_rx_df.groupby('state')[year_cols].mean()

# Fill each missing county rate with that state's mean for the same year.
# A state with no reported rate at all in a given year stays NaN.
# https://stackoverflow.com/questions/19966018/pandas-filling-missing-values-by-mean-in-each-group
for year in year_cols:
    state_mean = opioid_rx_df.groupby('state')[year].transform('mean')
    opioid_rx_df[year] = opioid_rx_df[year].fillna(state_mean)

In [19]:
# Preview after imputation: formerly missing rates now carry the state mean.
opioid_rx_df.head()

Unnamed: 0,state,INCITS,opioid_rx_rate_2016,opioid_rx_rate_2015,opioid_rx_rate_2014
0,AK,2013,56.366667,59.425,63.141667
1,AK,2016,56.366667,59.425,63.141667
2,AK,2020,66.3,68.2,68.0
3,AK,2050,56.366667,59.425,63.141667
4,AK,2060,56.366667,59.425,63.141667


In [20]:
# Persist the cleaned prescribing-rate frame to disk as a pickle.
with open('opioid_rx_df.pkl', mode='wb') as rx_pickle_out:
    pickle.dump(opioid_rx_df, rx_pickle_out)

## Age of PDMP in 2016

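The year each state's prescription drug monitoring program (PDMP) became operational, scraped from the state profile pages linked from http://www.pdmpassist.org/content/state-profiles.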

In [5]:
pdmp_url = "http://www.pdmpassist.org"
state_profiles = "/content/state-profiles"

# Fail fast on a stalled connection or an HTTP error instead of parsing an error page.
response = requests.get(pdmp_url + state_profiles, timeout=30)
response.raise_for_status()
page = response.text

soup = BeautifulSoup(page, "html5lib")

# Kept for reference; the link extraction below does not use it.
table = soup.select("table[class*=agenda]")

# The attribute value must be quoted: "/" is not allowed in an unquoted CSS
# attribute value, and current Beautiful Soup releases reject the bare form.
links = soup.select('a[href^="/content/"]')

# NOTE(review): the 32:92 window is hard-coded for the page layout at scrape
# time. Presumably it brackets the profile links; re-check it if the site changes.
profile_links = [link['href'] for link in links[32:92]]

In [6]:
# The scraped hrefs are site-relative, so prefix the host to get absolute URLs.
state_links = [pdmp_url + pl for pl in profile_links]

In [7]:
# Spot-check the first URL.
state_links[0]

'http://www.pdmpassist.org/content/alabama-state-profile'

In [8]:
# Maps the name taken from each profile page's <title> to the scraped
# "PMP operational" value (still a string at this point).
pmp_state_year_dict = {}

for state in state_links:
    # Fail fast on a stalled connection or an HTTP error page.
    response = requests.get(state, timeout=30)
    response.raise_for_status()
    page = response.text
    soup = BeautifulSoup(page, "html5lib")

    # Keep the part of the title before the word "State". A title without
    # that word is kept whole, which is why one key ends up as a full page title.
    title = soup.find("title")
    state_name = title.text.split("State")[0].strip()

    # The closing quote belongs before "]". The previous selector left the
    # attribute selector unterminated, which current Beautiful Soup releases reject.
    pmp_operational = soup.select('div[class*="field-field-pmp-operational"]')
    # NOTE(review): taking line 4 of the stringified contents depends on the
    # page markup at scrape time. Confirm it still holds before re-running.
    year_pmp_operational = str(pmp_operational[0].contents)
    year_pmp_operational = year_pmp_operational.split("\n")
    # Strip any HTML tags left on that line.
    year_pmp_operational = re.sub(r'\<.*?\>', "", year_pmp_operational[4]).strip()

    pmp_state_year_dict[state_name] = year_pmp_operational

    # Pause between requests to go easy on the server.
    time.sleep(0.5)

In [9]:
# Inspect the scraped mapping to look for entries that need manual cleanup.
pmp_state_year_dict

{'Alabama': '2006',
 'Idaho': '1967',
 'Missouri': '4-25-2017',
 'Pennsylvania': '1973',
 'Alaska': '2011',
 'Illinois': '1968',
 'Montana': '2012',
 'Rhode Island': '1979',
 'Arizona': '2008',
 'Indiana': '1998',
 'Nebraska': '2011',
 'South Carolina': '2008',
 'Arkansas': '2013',
 'Iowa': '2009',
 'Nevada': '1997',
 'South Dakota': '2011',
 'California': '1939',
 'Kansas': '2011',
 'New Hampshire': '2014',
 'Tennessee': '2006',
 'Colorado': '2007',
 'Kentucky': '1999',
 'New Jersey': '2011',
 'Texas': '1982',
 'Connecticut': '2008',
 'Louisiana': '2008',
 'New Mexico': '2005',
 'Utah': '1996',
 'Delaware': '2012',
 'Maine': '2004',
 'New York': '1973',
 'Vermont': '2009',
 'District of Columbia': '2016',
 'Maryland': '2013',
 'North Carolina': '2007',
 'Virginia': '2003',
 'Florida': '2011',
 'Massachusetts': '1994',
 'North Dakota': '2007',
 'Washington': '2011',
 'Georgia': '2013',
 'Michigan': '1989',
 'Ohio': '2006',
 'West Virginia': '1995',
 'Guam Territory Profile             

In [10]:
# This list is small enough that I'm going to manually remove Guam and also correct the year for Missouri.
# Two hand fixes, small enough to do manually: drop the Guam entry (its key is
# the whole page title) and reduce Missouri's dated value to a bare year.
guam_key = 'Guam Territory Profile                     | The PDMP Training and Technical Assistance Center'
del pmp_state_year_dict[guam_key]
pmp_state_year_dict.update({'Missouri': '2017'})

In [11]:
# Cast every stored year from text to int, updating the dict in place.
for region_name in list(pmp_state_year_dict):
    pmp_state_year_dict[region_name] = int(pmp_state_year_dict[region_name])

In [12]:
# Wrap the name -> year mapping in a Series so that the names become the index.
x = pd.Series(pmp_state_year_dict)

In [13]:
# Build a one-column frame of years, then move the names out of the index
# into an ordinary column, which pandas labels "index".
pmp_state_year_df = pd.DataFrame(x, columns=['year']).reset_index()

In [14]:
# Preview the frame: names are in the "index" column, start years in "year".
pmp_state_year_df.head()

Unnamed: 0,index,year
0,Alabama,2006
1,Idaho,1967
2,Missouri,2017
3,Pennsylvania,1973
4,Alaska,2011


In [15]:
# Years each program had been operational as of 2016. A program that started
# later (Missouri was set to 2017 above) comes out negative here.
pmp_state_year_df['pmp_age_in_2016'] = 2016 - pmp_state_year_df.year

In [16]:
# Floor negative ages at 0 for programs that started after 2016.
# Series.clip_lower was removed in pandas 1.0; clip(lower=...) is the
# equivalent call and also works on older releases.
pmp_state_year_df['pmp_age_in_2016'] = pmp_state_year_df['pmp_age_in_2016'].clip(lower=0)

In [17]:
# Persist the PDMP age frame to disk as a pickle.
with open('pmp_age.pkl', mode='wb') as pmp_pickle_out:
    pickle.dump(pmp_state_year_df, pmp_pickle_out)

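## Geographic divisions (per the U.S. Census Bureau)

U.S. Census Bureau regions and divisions for each state, taken from this [csv](https://github.com/cphalpert/census-regions/blob/master/us%20census%20bureau%20regions%20and%20divisions.csv). The division labels are categorical, so they need to be one-hot encoded ("dummified") before they can be used as numeric features.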

In [20]:
# Read the raw CSV straight from GitHub (needs network access on every run).
us_regions = pd.read_csv("https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv")

In [22]:
# Preview the lookup table: state name, two-letter code, region and division.
us_regions.head()

Unnamed: 0,State,State Code,Region,Division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


In [42]:
# One indicator column per Census division, next to the two-letter state code.
# dtype is pinned because pandas >= 2.0 returns boolean columns by default;
# uint8 keeps the 0/1 integer encoding that earlier releases produced.
us_regions_dummies = pd.concat(
    [
        us_regions['State Code'],
        pd.get_dummies(us_regions['Division'], prefix='Division', dtype='uint8'),
    ],
    axis=1,
)

In [43]:
# Preview the encoded frame: each row has exactly one division column set.
us_regions_dummies.head()

Unnamed: 0,State Code,Division_East North Central,Division_East South Central,Division_Middle Atlantic,Division_Mountain,Division_New England,Division_Pacific,Division_South Atlantic,Division_West North Central,Division_West South Central
0,AK,0,0,0,0,0,1,0,0,0
1,AL,0,1,0,0,0,0,0,0,0
2,AR,0,0,0,0,0,0,0,0,1
3,AZ,0,0,0,1,0,0,0,0,0
4,CA,0,0,0,0,0,1,0,0,0


In [44]:
# Persist the division indicator frame to disk as a pickle.
with open('us_regions_dummies.pkl', mode='wb') as regions_pickle_out:
    pickle.dump(us_regions_dummies, regions_pickle_out)