In [61]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
import os

# Raw Data
import pandas as pd
dr = pd.read_csv('deathrate.csv')

# Give the unnamed first column a usable name
dr = dr.rename(columns={"Unnamed: 0": "Number"})

# Drop Mc Kean County and Kalawao County due to missing poverty data,
# and drop the known duplicate rows (identified by their 'Number' value).
# .copy() makes the filtered frame independent so the column assignments
# below do not trigger SettingWithCopyWarning.
dr = dr[
    ~dr['County'].isin(['Mc Kean County, PA', 'Kalawao County, HI'])
    & ~dr['Number'].isin([788, 3924, 7060, 10196])
].copy()

# Extract state and county name into two new columns.
# 'County' is formatted like "<name>, <2-letter state>", so the state is the
# last two characters and the name is everything before the first comma.
dr['State'] = dr['County'].str[-2:]
dr['City'] = dr['County'].str.split(',').str[0]

# Merge latitude and longitude with county
# Obtained from https://simplemaps.com/data/us-cities
cities = pd.read_csv('cities.csv')
cities = cities.drop(columns=['city_ascii', 'state_name', 'population', 'population_proper', 'density', 'source', 'incorporated', 'timezone', 'zips', 'id'])
# Average the city coordinates within each county. numeric_only=True keeps
# this working on pandas >= 2.0, where averaging the remaining text columns
# would otherwise raise a TypeError.
cities_mean = cities.groupby('county_fips').mean(numeric_only=True)
dr = dr.merge(cities_mean, left_on='FIPS', right_on='county_fips', how='left')

# Manually insert missing latitude and longitude
# Obtained data from Google
manual_coords = {
    44001: (41.7258, -71.3112),
    46113: (43.2437, -102.6216),
    2270: (62.1458, -162.8919),
}
for fips_code, (fill_lat, fill_lng) in manual_coords.items():
    fips_mask = dr['FIPS'] == fips_code
    dr.loc[fips_mask, 'lat'] = fill_lat
    dr.loc[fips_mask, 'lng'] = fill_lng


FileNotFoundError: File b'deathrate.csv' does not exist

In [None]:
from sklearn import linear_model

# Each "current" year is paired with the following year's death rate as target.
years = list(range(1999, 2014))
county_ids = dr.FIPS.unique()

# One row per year, one column per county. The width is taken from the data
# instead of a hard-coded county count, so it stays correct if rows are
# added to or dropped from `dr`.
matrix = np.zeros([len(years), len(county_ids)])

for column, county in enumerate(county_ids):
    dr_county = dr[dr.FIPS == county]
    for row, year in enumerate(years):
        # NOTE: the *_1999 / *_2000 names are historical; they hold the rows
        # for `year` and `year + 1` respectively.
        dr_1999 = dr_county[dr_county.Year == year]
        dr_2000 = dr_county[dr_county.Year == year + 1]
        dr_2000 = dr_2000.drop(
            columns=['Number', 'Year', 'County', 'Population', 'Poverty', 'State', 'City', 'lat', 'lng']
        ).rename(columns={'Deathrate': 'Target'})
        dr_1999_2000 = dr_1999.merge(dr_2000, left_on='FIPS', right_on='FIPS', how='left')
        # Not used by the model below; kept as an optional second feature.
        dr_1999_2000['PovertyRate'] = dr_1999_2000['Poverty'] / dr_1999_2000['Population']

        X = dr_1999_2000[['Deathrate']]
        Y = dr_1999_2000[['Target']]

        # No-intercept fit of next year's death rate on this year's.
        clf = linear_model.LinearRegression(fit_intercept=False)
        clf.fit(X, Y)
        # coef_ has shape (1, 1) here; extract the scalar explicitly rather
        # than relying on NumPy's deprecated array-to-scalar conversion.
        matrix[row, column] = clf.coef_.item()

rounded = np.around(matrix, decimals=3)
rounded

In [None]:
# Build the neighbour delta matrix (one -1/+1 row per ordered pair of nearby counties)
def get_delta_matrix(year, state, max_sq_dist=0.36, data=None):
    """Build the pairwise-difference (delta) matrix between neighbouring counties.

    Two counties are neighbours when the squared Euclidean distance between
    their (lat, lng) coordinates, rounded to 3 decimals, is strictly below
    ``max_sq_dist``. Every ordered neighbour pair (i, j) contributes one row
    holding -1 in column i and +1 in column j. Rows are ordered by i, then j.

    Parameters
    ----------
    year : value compared against the 'Year' column.
    state : value compared against the 'State' column.
    max_sq_dist : float, default 0.36
        Neighbour threshold on the squared coordinate distance.
    data : pandas.DataFrame, optional
        Frame with 'Year', 'State', 'lat' and 'lng' columns. Defaults to the
        notebook-level ``dr`` frame.

    Returns
    -------
    pandas.DataFrame
        Integer frame of shape (number of ordered neighbour pairs, number of
        selected rows). Columns follow the row order of the selection, which
        is assumed to contain one row per county.

    Notes
    -----
    Rows with missing coordinates never compare as "near" and therefore have
    no neighbours, instead of corrupting the neighbour counts.
    """
    if data is None:
        data = dr
    df = data[(data['Year'] == year) & (data['State'] == state)]
    n_counties = len(df)

    lat = df['lat'].to_numpy(dtype=float)
    lng = df['lng'].to_numpy(dtype=float)

    # Pairwise squared distances via broadcasting (symmetric n x n matrix).
    sq_dist = (lat[:, None] - lat[None, :]) ** 2 + (lng[:, None] - lng[None, :]) ** 2
    sq_dist = np.around(sq_dist, decimals=3)

    # NaN distances compare False, so counties without coordinates are isolated.
    is_neighbor = sq_dist < max_sq_dist
    # A county is not its own neighbour.
    is_neighbor &= ~np.eye(n_counties, dtype=bool)

    # np.nonzero walks the matrix row by row, giving pairs ordered by i then j.
    sources, targets = np.nonzero(is_neighbor)
    pair_ids = np.arange(len(sources))

    delta = np.zeros((len(sources), n_counties), dtype=int)
    delta[pair_ids, sources] = -1
    delta[pair_ids, targets] = 1

    return pd.DataFrame(delta).astype(int)
    

In [63]:
# Delta matrix for the rows of dr with Year == 2001 and State == "MA"
get_delta_matrix(2001, "MA")

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,-1,0,0,1,0,0,0,0,0,0,0,0,0,0
1,-1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,-1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,-1,0,0,0,0,0,0,0,1,0,0,0
4,0,0,-1,0,0,0,0,0,0,0,0,1,0,0
5,1,0,0,-1,0,0,0,0,0,0,0,0,0,0
6,0,0,0,-1,0,0,0,0,0,1,0,0,0,0
7,0,0,0,0,-1,0,0,0,1,0,0,0,0,0
8,0,0,0,0,-1,0,0,0,0,0,1,0,0,0
9,0,0,0,0,-1,0,0,0,0,0,0,1,0,0
