In [43]:
import pandas as pd
import urllib, json, requests

# Load the CT accidental overdose records, parse the Date column into
# datetimes, and derive a string Year column from the parsed dates.
deaths = pd.read_csv('data/Accidental_Drug_Related_Deaths_2012-2018.csv').assign(
    Date=lambda frame: pd.to_datetime(frame["Date"]),
    # evaluated after Date above, so this sees the converted datetimes
    Year=lambda frame: frame["Date"].dt.to_period('Y').astype('str'),
)

# Report how many rows have no death county / no residence county.
print(int(deaths["DeathCounty"].isna().sum()))
print(int(deaths["ResidenceCounty"].isna().sum()))

1100
797


In [44]:
def fill_county_by_geoloc_match(record):
    """
    Fill a missing county from the other county when both city geolocations match.

    When a record's DeathCityGeo equals its ResidenceCityGeo, a missing
    DeathCounty is copied from ResidenceCounty, and a missing ResidenceCounty
    is copied from DeathCounty. Records whose geolocations differ, or where
    either geolocation is missing, are returned unchanged.

    Parameters
    ----------
    record : pandas.Series
        One row carrying the "DeathCounty", "ResidenceCounty",
        "DeathCityGeo" and "ResidenceCityGeo" labels.

    Returns
    -------
    pandas.Series
        A copy of ``record`` with the county values filled where possible.
    """
    # DataFrame.apply does not support functions that mutate the row they are
    # handed, so all edits are made on a copy.
    record = record.copy()

    death_geo = record["DeathCityGeo"]
    residence_geo = record["ResidenceCityGeo"]

    # Guard against missing values first: NaN never equals NaN, and comparing
    # against pd.NA does not yield a plain boolean.
    same_city = (
        pd.notna(death_geo)
        and pd.notna(residence_geo)
        and death_geo == residence_geo
    )
    if not same_city:
        return record

    # pd.isna covers NaN, None and pd.NA, not just float NaN.
    if pd.isna(record["DeathCounty"]):
        record["DeathCounty"] = record["ResidenceCounty"]
    elif pd.isna(record["ResidenceCounty"]):
        record["ResidenceCounty"] = record["DeathCounty"]
    return record
        
# Run the geolocation-match fill over every row of the frame.
deaths = deaths.apply(fill_county_by_geoloc_match, axis=1)

# Report how many county values are still missing after the fill.
print(int(deaths["DeathCounty"].isna().sum()))
print(int(deaths["ResidenceCounty"].isna().sum()))

626
555


In [41]:
def find_county_by_geolocation(record, county_type, timeout=30):
    """
    Queries FCC API for county name based on geolocation coordinates.

    The coordinates are read from the text between the first "(" and the
    final character of the chosen geolocation string, split on commas into
    latitude and longitude.

    Parameters
    ----------
    record : pandas.Series
        Row carrying the "ResidenceCityGeo" and "DeathCityGeo" labels.
    county_type : str
        "res" reads ResidenceCityGeo; any other value reads DeathCityGeo.
    timeout : float, optional
        Seconds to wait for the FCC service before giving up (default 30).

    Returns
    -------
    str
        The "county_name" of the first result in the API response.

    Raises
    ------
    IndexError
        If fewer than two coordinates can be parsed, or the API returns an
        empty result list.
    """
    # The file only does `import urllib`, which does not guarantee that the
    # `request`/`parse` submodules are loaded; import them explicitly.
    import urllib.parse
    import urllib.request

    # Select the column by label rather than by position so the lookup does
    # not depend on column order or on how many columns were added earlier.
    geo_column = "ResidenceCityGeo" if county_type == "res" else "DeathCityGeo"
    geo_text = record[geo_column]

    coordinates = [
        part.strip()
        for part in geo_text[geo_text.find('(') + 1:-1].split(',')
    ]
    if len(coordinates) < 2:
        raise IndexError(
            "could not parse latitude/longitude from " + repr(geo_text)
        )

    # urlencode escapes the values instead of pasting them into the URL raw.
    query = urllib.parse.urlencode(
        {"lat": coordinates[0], "lon": coordinates[1], "format": "json"}
    )
    with urllib.request.urlopen(
        "https://geo.fcc.gov/api/census/area?" + query, timeout=timeout
    ) as url:
        data = json.loads(url.read().decode())

    results = data["results"]
    if not results:
        raise IndexError(
            "FCC area API returned no results for lat=" + coordinates[0]
            + ", lon=" + coordinates[1]
        )
    return results[0]["county_name"]

def fill_county_if_nan(record):
    """
    Fills DeathCounty and ResidenceCounty values (if missing) via find_county_by_geolocation().

    A county is looked up only when it is missing AND the matching
    geolocation value is a string; rows without usable geolocation text are
    left as they are instead of failing inside the lookup. Looked-up county
    names are stored upper-cased.

    Parameters
    ----------
    record : pandas.Series
        One row carrying the "ResidenceCounty", "ResidenceCityGeo",
        "DeathCounty" and "DeathCityGeo" labels.

    Returns
    -------
    pandas.Series
        A copy of ``record`` with missing counties filled where a lookup
        was possible.
    """
    # DataFrame.apply does not support functions that mutate the row they are
    # handed, so all edits are made on a copy.
    record = record.copy()

    # pd.isna covers NaN, None and pd.NA, not just float NaN.
    if pd.isna(record["ResidenceCounty"]) and isinstance(record["ResidenceCityGeo"], str):
        county = find_county_by_geolocation(record, 'res')
        record["ResidenceCounty"] = county.upper()

    # Same string guard as the residence branch: without geolocation text
    # there is nothing to send to the lookup.
    if pd.isna(record["DeathCounty"]) and isinstance(record["DeathCityGeo"], str):
        county = find_county_by_geolocation(record, 'death')
        record["DeathCounty"] = county.upper()

    return record
    
# Look up any county that is still missing, one row at a time.
deaths = deaths.apply(fill_county_if_nan, axis=1)

# Report how many county values remain missing after the lookups.
print(int(deaths["DeathCounty"].isna().sum()))
print(int(deaths["ResidenceCounty"].isna().sum()))

0
24


In [40]:
"""
Create new array with county death numbers by year.
"""

# Counties and years to tabulate. Each appended row is
# [county, year, rows with that residence county, rows with that death county].
counties = (
    'FAIRFIELD', 'HARTFORD', 'NEW HAVEN', 'NEW LONDON',
    'LITCHFIELD', 'MIDDLESEX', 'WINDHAM', 'TOLLAND',
)
years = ('2012', '2013', '2014', '2015', '2016', '2017', '2018')
deaths_county_and_year = []

for county in counties:
    # Narrow to this county once; the yearly counts are taken from these subsets.
    residence_county = deaths.loc[deaths["ResidenceCounty"] == county]
    death_county = deaths.loc[deaths["DeathCounty"] == county]

    for year in years:
        deaths_county_and_year.append([
            county,
            year,
            # summing the boolean match gives the number of matching rows
            int((residence_county["Year"] == year).sum()),
            int((death_county["Year"] == year).sum()),
        ])

# print to test
print(deaths_county_and_year)

[['FAIRFIELD', '2012', 51, 52], ['FAIRFIELD', '2013', 83, 94], ['FAIRFIELD', '2014', 76, 91], ['FAIRFIELD', '2015', 106, 103], ['FAIRFIELD', '2016', 159, 165], ['FAIRFIELD', '2017', 147, 165], ['FAIRFIELD', '2018', 148, 151], ['HARTFORD', '2012', 92, 101], ['HARTFORD', '2013', 119, 132], ['HARTFORD', '2014', 142, 166], ['HARTFORD', '2015', 189, 212], ['HARTFORD', '2016', 255, 284], ['HARTFORD', '2017', 284, 338], ['HARTFORD', '2018', 283, 339], ['NEW HAVEN', '2012', 90, 99], ['NEW HAVEN', '2013', 130, 135], ['NEW HAVEN', '2014', 140, 141], ['NEW HAVEN', '2015', 188, 211], ['NEW HAVEN', '2016', 233, 263], ['NEW HAVEN', '2017', 274, 288], ['NEW HAVEN', '2018', 239, 269], ['NEW LONDON', '2012', 43, 41], ['NEW LONDON', '2013', 51, 56], ['NEW LONDON', '2014', 49, 51], ['NEW LONDON', '2015', 59, 61], ['NEW LONDON', '2016', 85, 90], ['NEW LONDON', '2017', 85, 81], ['NEW LONDON', '2018', 89, 91], ['LITCHFIELD', '2012', 20, 17], ['LITCHFIELD', '2013', 35, 36], ['LITCHFIELD', '2014', 41, 36], ['

In [42]:
"""
Convert deaths data to dataframe and write to CSV
"""

# Tabulate the per-county, per-year rows under descriptive column names.
deaths_by_county = pd.DataFrame(
    deaths_county_and_year,
    columns=['Jurisdiction', 'Year', 'ResidentDeaths', 'DeathsInCounty'],
)
# Title-case the county names (e.g. 'NEW HAVEN' -> 'New Haven') before export.
deaths_by_county["Jurisdiction"] = deaths_by_county["Jurisdiction"].str.title()
deaths_by_county.to_csv('data/deaths_by_county.csv')