## Data

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import os

# Raw data: one row per county-year
dr = pd.read_csv('../Data/deathrate.csv')

# Give the CSV's unnamed leading column a usable name
dr = dr.rename(columns={"Unnamed: 0": "Number"})

# Drop McKean County and Kalawao County due to missing poverty data
dr = dr[~dr['County'].isin(['Mc Kean County, PA', 'Kalawao County, HI'])]

# Drop duplicate rows (identified by their record number)
dr = dr[~dr['Number'].isin([788, 3924, 7060, 10196])]

# Split "<county name>, <ST>" into two new columns.
# NOTE: 'City' actually holds the county name; the column name is kept
# because later cells and the exported CSV use it.
dr = dr.assign(
    State=dr['County'].str[-2:],
    City=dr['County'].str.split(',').str[0],
)

# Merge latitude and longitude with county
# Obtained from https://simplemaps.com/data/us-cities
cities = pd.read_csv('../Data/cities.csv')
cities = cities.drop(columns=['city_ascii', 'state_name', 'population', 'population_proper', 'density', 'source', 'incorporated', 'timezone', 'zips', 'id'])

# A county's coordinates are the mean over its cities. Select the numeric
# columns explicitly: averaging the remaining string columns raises a
# TypeError on pandas >= 2.0.
cities_mean = cities.groupby('county_fips')[['lat', 'lng']].mean()
dr = dr.merge(cities_mean, left_on='FIPS', right_on='county_fips', how='left')

# Manually insert missing latitude and longitude (FIPS -> (lat, lng))
# Obtained data from Google
manual_coordinates = {
    44001: (41.7258, -71.3112),
    46113: (43.2437, -102.6216),
    2270: (62.1458, -162.8919),
}
for fips_code, (county_lat, county_lng) in manual_coordinates.items():
    is_county = dr['FIPS'] == fips_code
    dr.loc[is_county, 'lat'] = county_lat
    dr.loc[is_county, 'lng'] = county_lng

dr.head(10)


Unnamed: 0,Number,Year,County,FIPS,Deathrate,Population,Poverty,State,City,lat,lng
0,1,1999,"Abbeville County, SC",45001,1,25921,3257.0,SC,Abbeville County,34.253043,-82.500214
1,2,1999,"Acadia Parish, LA",22001,7,58762,12461.0,LA,Acadia Parish,30.260009,-92.424273
2,3,1999,"Accomack County, VA",51001,5,37614,6107.0,VA,Accomack County,37.767581,-75.665943
3,4,1999,"Ada County, ID",16001,7,294292,24964.0,ID,Ada County,43.620729,-116.327886
4,5,1999,"Adair County, IA",19001,1,8298,697.0,IA,Adair County,41.28256,-94.48486
5,6,1999,"Adair County, KY",21001,5,17054,3656.0,KY,Adair County,37.110171,-85.307586
6,7,1999,"Adair County, MO",29001,3,24961,3284.0,MO,Adair County,40.1575,-92.5247
7,8,1999,"Adair County, OK",40001,3,20904,4385.0,OK,Adair County,35.877953,-94.649721
8,9,1999,"Adams County, CO",8001,9,354146,32040.0,CO,Adams County,39.844996,-104.87217
9,10,1999,"Adams County, IA",19003,1,4498,510.0,IA,Adams County,41.0445,-94.722


## Model

We model the spatio-temporal structure of our data using a graph $G = (V,E)$, where $V$ consists of nodes corresponding to each county-year pair and $E$ is the set of edges between nodes.

We consider the case of Kentucky in 1999. We say that two counties are connected by an edge in the graph $G$ if the squared Euclidean distance between their coordinates (in degrees of latitude and longitude) is less than 0.36, i.e. if they are less than 0.6 degrees apart.

In [None]:
# Generate the pairwise squared-distance matrix for Kentucky counties in 1999

# Restrict the working frame to the county set the graph is built on
# (later cells read this filtered `dr`).
dr = dr[(dr['Year'] == 1999) & (dr['State'] == "KY")]

# Each county must appear exactly once so that row/column k of the matrix
# corresponds to the k-th row of `dr`.
assert dr['County'].is_unique, "expected one row per county"

lat = dr['lat'].to_numpy(dtype=float)
lng = dr['lng'].to_numpy(dtype=float)

# matrix[i, j] = (lat_i - lat_j)**2 + (lng_i - lng_j)**2, computed for all
# pairs at once by broadcasting a column vector against a row vector.
# Note this is the *squared* distance in degrees; no square root is taken.
lat_diff = lat[:, None] - lat[None, :]
lng_diff = lng[:, None] - lng[None, :]
matrix = np.around(lat_diff**2 + lng_diff**2, decimals=3)

In [None]:
# Generate delta matrix (edge-difference operator of the neighbor graph)

NEIGHBOR_THRESHOLD = 0.36  # compared against the squared distances in `matrix`
n_counties = matrix.shape[0]

# Adjacency mask: pairs below the threshold, excluding each county with itself.
is_neighbor = matrix < NEIGHBOR_THRESHOLD
np.fill_diagonal(is_neighbor, False)

# matrix_neighbor: 0/1 adjacency off the diagonal, neighbor count (degree)
# on the diagonal.
matrix_neighbor = is_neighbor.astype(float)
np.fill_diagonal(matrix_neighbor, is_neighbor.sum(axis=1))

# One row of delta per directed edge (i -> j), ordered by i and then j:
# -1 in column i and +1 in column j, so (delta @ beta)[k] = beta_j - beta_i.
# Every undirected pair therefore contributes two rows (i -> j and j -> i).
# Edges are read from the boolean mask rather than from matrix_neighbor, so a
# county whose degree happens to be 1 is not mistaken for its own neighbor,
# and the number of rows always equals the number of edges.
edge_from, edge_to = np.nonzero(is_neighbor)
edge_rows = np.arange(edge_from.size)

delta = np.zeros([edge_from.size, n_counties])
delta[edge_rows, edge_from] = -1
delta[edge_rows, edge_to] = 1

In [None]:
# Display the edge-difference operator built in the previous cell
delta

## Ridge regression

$$
\min_{\beta} \|y - \beta \|_2^2 + \lambda \|\Delta \beta \|_2^2 
$$

In [13]:
def ridge(y, X, llambda, Sigma):
    """Solve the generalized ridge normal equations.

    Returns the vector ``beta`` satisfying
    ``(X.T X + llambda * Sigma.T Sigma) beta = X.T y``.

    Parameters
    ----------
    y : response vector
    X : design matrix
    llambda : scalar weight on the penalty term
    Sigma : penalty operator; the penalty is ``llambda * ||Sigma beta||^2``
    """
    gram = X.T @ X
    # `*` and `@` share precedence, so the scalar multiplies Sigma.T first.
    penalty = (llambda * Sigma.T) @ Sigma
    rhs = X.T @ y
    return np.linalg.solve(gram + penalty, rhs)

In [44]:
def get_beta(year, state, llambda):
    """Fit the graph-penalized ridge estimate for one year and state.

    Selects the rows of the global ``dr`` matching ``year`` and ``state``,
    takes their ``Deathrate`` as the response, uses an identity design matrix
    (one coefficient per selected row) and the global ``delta`` as the
    penalty operator. The number of selected rows must therefore equal the
    number of columns of ``delta``.

    Returns the coefficient vector produced by ``ridge``.
    """
    selected = dr.loc[(dr['Year'] == year) & (dr['State'] == state)]
    response = selected['Deathrate']
    design = np.eye(len(response))
    return ridge(response, design, llambda, Sigma=delta)

In [51]:
def get_df(year,state,llambda):
    """Return the rows of ``dr`` for ``year``/``state`` with a fitted ``beta`` column.

    The coefficients come from ``get_beta(year, state, llambda)`` and are
    attached in row order. A new DataFrame is returned; the global ``dr`` is
    not modified.
    """
    beta = get_beta(year, state, llambda)
    selected = dr[(dr['Year'] == year) & (dr['State'] == state)]
    # .assign builds a new frame instead of writing a column into a filtered
    # slice of `dr`, which avoids pandas' SettingWithCopyWarning.
    return selected.assign(beta=beta)

In [57]:
# Fit Kentucky 1999 with penalty weight 0.1 and write the result to disk
df = get_df(year=1999, state='KY', llambda=0.1)
df.to_csv('../Output/beta.csv', index=False)

In [58]:
# Display the fitted frame, including the new beta column
df

Unnamed: 0,Number,Year,County,FIPS,Deathrate,Population,Poverty,State,City,lat,lng,beta
5,6,1999,"Adair County, KY",21001,5,17054,3656.0,KY,Adair County,37.110171,-85.307586,5.856156
48,49,1999,"Allen County, KY",21003,5,17658,2458.0,KY,Allen County,36.729100,-86.261725,5.081116
60,61,1999,"Anderson County, KY",21005,5,18864,1527.0,KY,Anderson County,38.036667,-84.935367,6.006104
121,122,1999,"Ballard County, KY",21007,3,8225,1051.0,KY,Ballard County,37.033229,-88.960600,3.449327
136,137,1999,"Barren County, KY",21009,5,37687,5599.0,KY,Barren County,36.967600,-85.953612,5.238692
147,148,1999,"Bath County, KY",21011,9,10911,2209.0,KY,Bath County,38.149140,-83.764600,8.709408
170,171,1999,"Bell County, KY",21013,19,30026,7393.0,KY,Bell County,36.712443,-83.698657,15.498846
235,236,1999,"Boone County, KY",21015,7,83349,5194.0,KY,Boone County,38.992874,-84.707005,7.630050
247,248,1999,"Bourbon County, KY",21017,7,19340,2438.0,KY,Bourbon County,38.231433,-84.151450,7.446210
252,253,1999,"Boyd County, KY",21019,13,49816,7368.0,KY,Boyd County,38.410240,-82.685400,11.881015


## Fused Lasso

$$
\min_{\beta} \|y-\beta\|_2^2 + \lambda \|\Delta \beta \|_1
$$

CVX solution