In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors

# 1. FL_IPUMS Feature Selection

In [45]:
ipums = pd.read_csv('../../data/processed/FL_IPUMS.csv')

## 1.1. Extracting Relevant Features

In [46]:
ipums_18plus = ipums[ipums['AGE'] >= 18]

In [47]:
ipums_18plus.head(6)

Unnamed: 0.1,Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,COUNTYFIP,PUMA,...,EMPSTAT,EMPSTATD,INCTOT,POVERTY,MIGRATE1,MIGRATE1D,is_owner,is_renter,owns_free_clear,owns_with_mortgage
0,0,2017,201701,265770,2017000000009,107.0,2017002657701,12,0,8611,...,1.0,10.0,8500.0,29,1,10.0,0,1,0,0
4,4,2017,201701,265770,2017000000009,107.0,2017002657701,12,0,8611,...,3.0,30.0,0.0,29,1,10.0,0,1,0,0
5,5,2017,201701,265771,2017000000011,76.0,2017002657711,12,11,1102,...,1.0,10.0,17400.0,238,1,10.0,1,0,0,1
6,6,2017,201701,265771,2017000000011,76.0,2017002657711,12,11,1102,...,1.0,10.0,17400.0,238,1,10.0,1,0,0,1
7,7,2017,201701,265772,2017000000021,97.0,2017002657721,12,9,902,...,3.0,30.0,8000.0,198,1,10.0,0,1,0,0
8,8,2017,201701,265772,2017000000021,97.0,2017002657721,12,9,902,...,3.0,30.0,21000.0,198,1,10.0,0,1,0,0


In [48]:
ipums_ft = ipums_18plus[['YEAR', 'SERIAL', 'COUNTYFIP', 'NCHILD', 
                         'SEX', 'AGE', 'HISPAN', 'RACE', 'EDUC',
                         'EMPSTAT', 'OWNERSHP', 'RENT', 'is_owner', 'is_renter', 'HHINCOME', 'INCTOT',
                         'VALUEH', 'MARST', 'MIGRATE1', 'GQ']]

In [49]:
ipums_ft.head(6)

Unnamed: 0,YEAR,SERIAL,COUNTYFIP,NCHILD,SEX,AGE,HISPAN,RACE,EDUC,EMPSTAT,OWNERSHP,RENT,is_owner,is_renter,HHINCOME,INCTOT,VALUEH,MARST,MIGRATE1,GQ
0,2017,265770,0,3,2,30,4,7,6,1.0,2.0,950,0,1,8500.0,8500.0,,6,1,1
4,2017,265770,0,3,1,41,4,7,6,3.0,2.0,950,0,1,8500.0,0.0,,3,1,1
5,2017,265771,11,0,1,78,0,1,10,1.0,1.0,0,1,0,34800.0,17400.0,60000.0,1,1,1
6,2017,265771,11,0,2,68,0,1,7,1.0,1.0,0,1,0,34800.0,17400.0,60000.0,1,1,1
7,2017,265772,9,0,1,73,0,1,11,3.0,2.0,700,0,1,29000.0,8000.0,,1,1,1
8,2017,265772,9,0,2,72,0,1,11,3.0,2.0,700,0,1,29000.0,21000.0,,1,1,1


## 1.2. Light cleaning

In [50]:
# We have an extra year (2023)
# should be removed for consistency with other datasets

ipums_ft['YEAR'].value_counts()
ipums_ft = ipums_ft[ipums_ft['YEAR'] != 2023]

In [None]:
# Filter on Duval County (COUNTYFIP 31, i.e. full state+county FIPS 12031)

ipums_ft_31 = ipums_ft[ipums_ft['COUNTYFIP'] == 31].copy()

In [None]:
# Export the county-31 subset under a file name that matches its contents.
# The previous name (ipums_ft_11.csv) labelled this county-31 frame as county 11.
ipums_ft_31.to_csv('../../data/IPUMS/ipums_ft_31.csv', index=False)

In [None]:
# Recoding the SEX column (it uses 1 and 2 instead of 0 and 1)
# Male = 1 (originally) = 1 (new)
# Female = 2 (originally) = 0 (new)
# NOTE: this cell is not idempotent -- running it a second time maps the
# already-recoded 0 to NaN, because 0 is not a key of the mapping dict.

ipums_ft_31['SEX'] = ipums_ft_31['SEX'].map({1: 1, 2: 0})
ipums_ft_31['SEX'].value_counts()

SEX
0    40133
1    36817
Name: count, dtype: int64

In [None]:
# NA values check

ipums_ft_31.isnull().sum()

# HHINCOME and VALUEH has lots of NA values

YEAR             0
SERIAL           0
COUNTYFIP        0
NCHILD           0
SEX              0
AGE              0
HISPAN           0
RACE             0
EDUC             0
EMPSTAT          0
OWNERSHP      2097
RENT             0
is_owner         0
is_renter        0
HHINCOME      2097
INCTOT           0
VALUEH       23328
MARST            0
MIGRATE1         0
GQ               0
dtype: int64

### 1.2.1. Missingness Diagnosis

In [None]:
hhincome_missing =ipums_ft_31.groupby('GQ').agg({
    'HHINCOME': lambda x: x.isnull().sum()})

hhincome_missing = hhincome_missing.rename(columns={'HHINCOME': 'missing HHINCOME'})
hhincome_missing['total n'] = ipums_ft_31.groupby('GQ').size()

hhincome_missing

Unnamed: 0_level_0,missing HHINCOME,total n
GQ,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,74783
2,0,70
3,1248,1248
4,849,849


Good to know the missingness is structural rather than random: GQ codes 3 and 4 represent group quarters living arrangements (dormitories, prisons, nursing homes, etc.), where household income (HHINCOME) is not recorded.

In [None]:
valueh_missing = ipums_ft_31.groupby('is_owner').agg({
    'VALUEH': lambda x: x.isnull().sum()})

valueh_missing = valueh_missing.rename(columns={'VALUEH': 'missing VALUEH'})
valueh_missing['total n'] = ipums_ft_31.groupby('is_owner').size()

valueh_missing

Unnamed: 0_level_0,missing VALUEH,total n
is_owner,Unnamed: 1_level_1,Unnamed: 2_level_1
0,23328,23328
1,0,53622


Thankfully, all the missing VALUEH values come from non-owners (is_owner = 0), which makes sense: there is no home value to report. No need to impute those values.

Still, the group-quarters records with missing HHINCOME (GQ codes 3 and 4) should be dropped.

In [None]:
ipums_ft_31 = ipums_ft_31[ipums_ft_31['GQ'].isin([1, 2])].copy()

# 2. IRS Migration Data

In [58]:
irs = pd.read_csv('../../data/processed/FL_hurricane_county_year_impacts.csv')

In [None]:
# FIPS 12031 is Duval County. The IPUMS merge in section 4.1 refers to this
# frame as `irs_31`, so it must be defined under that name for the notebook to
# run top-to-bottom on a fresh kernel.
irs_31 = irs[irs['fips'] == 12031]
irs_11 = irs_31  # legacy alias: the preview cell right below still uses this name

In [60]:
irs_11.head()

Unnamed: 0,fips,year,num_declarations,state,county_name,outflow_returns,inflow_returns,net_migration_returns,outflow_people,inflow_people,net_migration_people,outflow_agi,inflow_agi,max_wind_speed,storms
29,12011,2017,2,FL,Broward (County),945662.0,938691.0,-6971.0,1819259.0,1796960.0,-22299.0,62693616.0,64096409.0,155.0,HURRICANE IRMA
30,12011,2019,2,FL,Broward (County),953386.0,945057.0,-8329.0,1822477.0,1797113.0,-25364.0,73548626.0,74823577.0,160.0,HURRICANE DORIAN; HURRICANE DORIAN
31,12011,2020,1,FL,Broward (County),967336.0,946916.0,-20420.0,1857598.0,1810177.0,-47421.0,78133352.0,76750728.0,80.0,HURRICANE ISAIAS
32,12011,2022,4,FL,Broward (County),999987.0,994604.0,-5383.0,1808085.0,1786372.0,-21713.0,93860199.0,99337673.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...


# 3. FEMA NHS

In [61]:
nhs = pd.read_csv('../../data/processed/FL_NHS.csv')

## 3.1. Cleaning

### 3.1.1. Replacing "Blank" with NA

In [62]:
def replace_Blank_with_NA(column_list, df=None):
    """
    Replace literal "Blank" entries with np.nan in the requested columns.

    ARGS:
        column_list = list of columns to clean "Blank" entries from
        df = dataframe to clean (optional). When omitted, the notebook-level
             `nhs` dataframe is used, which keeps existing one-argument calls
             working unchanged.
    RETURNS:
        clean_nhs: cleaned copy of the dataframe with "Blank" entries in the
                   listed columns replaced with np.nan (the input is not modified)
    """

    # Resolve the input explicitly so the function no longer *requires* a
    # global; the global is only touched when no dataframe is passed in.
    source = nhs if df is None else df
    clean_nhs = source.copy()

    for col in column_list:
        clean_nhs[col] = clean_nhs[col].replace("Blank", np.nan)

    return clean_nhs

In [63]:
# Replacing Blanks with NA

cols_with_Blank = ['hrcn_prepactions', 'hrcn_safe_shelter', 'hrcn_impacts',
                   'hrcn_prep_specific', 'numchild_school', 'school_plan_aware',
                   'hazard_insurance', 'hrcn_action_1plus', 'hrcn_action_3plus',
                   'hrcn_influencer_aware', 'hrcn_influencer_exp',
                   'hrcn_influencer_efficacy']

nhs = replace_Blank_with_NA(cols_with_Blank)

### 3.1.2. Handling "Don't know" Responses

In [64]:
idk_counts = nhs.apply(lambda col: (col == "Don't know").sum())
print(idk_counts[idk_counts > 0].sort_values(ascending=False))

hrcn_preparedness_stage    15
homeowners_insurance       14
hrcn_shutters              10
hrcn_risk_perception        9
mortgage                    9
mortgage_4cat               9
hrcn_aware                  6
hrcn_experience             6
hrcn_prep_confidence        6
hrcn_influencer_risk        6
caretaker                   6
hrcn_prep_efficacy          5
numchild_school             2
hometype                    2
dtype: int64


In [65]:
# Cleaning "Don't know" entries

def clean_idk(column_list, df=None):
    """
    Drop every row that answered "Don't know" in any of the given columns.

    ARGS:
        column_list = list of columns to clean "Don't know" entries from
        df = dataframe to clean (optional). When omitted, the notebook-level
             `nhs` dataframe is used, which keeps existing one-argument calls
             working unchanged.
    RETURNS:
        clean_nhs: independent copy of the dataframe without the offending rows
        removed_cols: dict of the number of "Don't know" entries per column,
                      counted on the *unfiltered* input. A row that answered
                      "Don't know" in several columns is counted once per
                      column, so the counts can sum to more than the number
                      of rows actually dropped.
    """

    source = nhs if df is None else df

    removed_cols = {}
    keep = np.ones(len(source), dtype=bool)  # positional mask of rows to retain

    for col in column_list:
        is_idk = (source[col] == "Don't know").to_numpy()
        removed_cols[col] = int(is_idk.sum())
        keep &= ~is_idk

    # .copy() so later column assignments on the result do not hit
    # SettingWithCopyWarning.
    clean_nhs = source.loc[keep].copy()

    return clean_nhs, removed_cols

In [66]:
NotSureIfShouldDropDontKnows = ['caretaker', 'numchild_school', 'mortgage', 'hometype',
                                'ses_score', 'mortgage_4cat', 'homeowners_insurance']

idk_cols_demographics = []

idk_cols_hrcn = ['hrcn_aware', 'hrcn_risk_perception', 'hrcn_experience',
            'hrcn_shutters', 'hrcn_prep_efficacy', 'hrcn_prep_confidence',
            'hrcn_preparedness_stage']

nhs, removed_cols = clean_idk(idk_cols_hrcn)

In [67]:
idk_counts = nhs.apply(lambda col: (col == "Don't know").sum())
print(idk_counts[idk_counts > 0].sort_values(ascending=False))

homeowners_insurance    8
mortgage                6
mortgage_4cat           6
caretaker               2
numchild_school         2
hrcn_influencer_risk    1
dtype: int64


In [68]:
removed_cols

{'hrcn_aware': 6,
 'hrcn_risk_perception': 9,
 'hrcn_experience': 6,
 'hrcn_shutters': 10,
 'hrcn_prep_efficacy': 5,
 'hrcn_prep_confidence': 6,
 'hrcn_preparedness_stage': 15}

In [69]:
# NOTES FOR LATER

# Should I turn employment into binary (Yes/No)?
# 

## 3.2. Using St. Johns as Duval County Proxy

In [23]:
nhs['county'].value_counts()

# No Duval...

county
Miami-Dade      65
Broward         39
Volusia         18
Palm Beach      17
Brevard         14
Lee             13
St. Lucie       10
Seminole         9
Indian River     7
Escambia         6
Bay              6
Highlands        5
St. Johns        5
Charlotte        4
Alachua          4
Martin           3
Name: count, dtype: int64

In [1521]:
county_fips_map = {
    'Miami-Dade': 12086, 'Broward': 12011, 'Volusia': 12127,
    'Palm Beach': 12099, 'Brevard': 12009, 'Lee': 12071,
    'St. Lucie': 12111, 'Seminole': 12117, 'Indian River': 12061,
    'Escambia': 12033, 'Bay': 12005, 'St. Johns': 12109,
    'Highlands': 12055, 'Charlotte': 12015, 'Alachua': 12001,
    'Martin': 12085
}

nhs['fips'] = nhs['county'].map(county_fips_map)
nhs['fips'].value_counts() 

fips
12086    57
12011    34
12127    14
12009    14
12099    14
12071    13
12111     9
12117     7
12061     6
12033     6
12055     5
12109     5
12005     5
12015     4
12085     3
12001     1
Name: count, dtype: int64

In [1522]:
# Creating county-level aggregations

nhs_county = nhs.groupby('fips').agg({
    'hrcn_experience': lambda x: (x == 'Yes').mean(),
    'hrcn_risk_perception': lambda x: (x == 'Very likely').mean(),
    'hrcn_preparedness_stage': lambda x: x.str.contains('prepared for MORE', na=False).mean(),
    'respid': 'count'
}).reset_index()

nhs_county.columns = [
    'fips', 
    'hurricane_exp_%',
    'high_risk_%', 
    'well_prepared_%',
    'n'
] # Renaming to make it pretty

nhs_county.head()

Unnamed: 0,fips,hurricane_exp_%,high_risk_%,well_prepared_%,n
0,12001,1.0,0.0,0.0,1
1,12005,0.6,0.4,0.6,5
2,12009,0.928571,0.642857,0.785714,14
3,12011,0.794118,0.617647,0.558824,34
4,12015,1.0,0.25,1.0,4


We don't have Duval County in the NHS data. We can use either Nassau County or St. Johns County as a proxy.

In [1523]:
potential_sub = nhs_county[(nhs_county['fips'] == 12109) | (nhs_county['fips'] == 12089)]

potential_sub.head()

Unnamed: 0,fips,hurricane_exp_%,high_risk_%,well_prepared_%,n
12,12109,1.0,0.4,0.8,5


Nassau County is not in the NHS data either, so we will have to use the St. Johns values to impute Duval County's NHS features.

In [None]:
st_john = potential_sub[potential_sub['fips'] == 12109].iloc[0]
st_john.head()

NameError: name 'potential_sub' is not defined

In [1525]:
duval = pd.DataFrame({
    'fips': [12031],
    'hurricane_exp_%': [st_john['hurricane_exp_%']],
    'high_risk_%': [st_john['high_risk_%']],
    'well_prepared_%': [st_john['well_prepared_%']],
    'n': [0]
})

duval.head()

Unnamed: 0,fips,hurricane_exp_%,high_risk_%,well_prepared_%,n
0,12031,1.0,0.4,0.8,0


In [1526]:
nhs_county = pd.concat([nhs_county, duval], ignore_index=True)

nhs_county.tail()

Unnamed: 0,fips,hurricane_exp_%,high_risk_%,well_prepared_%,n
12,12109,1.0,0.4,0.8,5
13,12111,0.888889,0.444444,0.666667,9
14,12117,0.857143,0.857143,0.714286,7
15,12127,0.857143,0.785714,0.5,14
16,12031,1.0,0.4,0.8,0


## 3.3. Preparing NHS for merging

### 3.3.1. Mapping NHS categories to numeric values

Since NHS and IPUMS use different data types (and formats) for columns like age and income, we will have to change the way they are coded in the NHS data to match IPUMS.

In [1527]:
# Age

# Representative numeric age (in years) assigned to each NHS age bracket.
age_map = dict([
    ('18-19', 18.5),
    ('20-29', 25),
    ('30-39', 35),
    ('40-49', 45),
    ('50-59', 55),
    ('60-69', 65),
    ('70-79', 75),
    ('80+', 85),
])


def map_age(age):
    """Return the numeric age for an NHS age bracket label.

    Missing values and labels that are not keys of `age_map` both yield np.nan.
    """
    return np.nan if pd.isna(age) else age_map.get(age, np.nan)

nhs['age_mapped'] = nhs['age'].apply(map_age)

In [1528]:
# Income

# Representative income, in thousands of dollars, assigned to each NHS income bracket.
income_map = dict([
    ('Less than $10,000', 5),
    ('$10,000 to $14,999', 12.5),
    ('$15,000 to $24,999', 20),
    ('$25,000 to $34,999', 30),
    ('$35,000 to $49,999', 42.5),
    ('$50,000 to $74,999', 62.5),
    ('$75,000 to $99,999', 87.5),
    ('$100,000 to $149,999', 125),
    ('$150,000 to $199,999', 175),
    ('$200,000 or more', 225),
])


def map_income(income):
    """Return the numeric income (in $1,000s) for an NHS income bracket label.

    Missing values and labels that are not keys of `income_map` both yield np.nan.
    """
    return np.nan if pd.isna(income) else income_map.get(income, np.nan)

nhs['income_mapped_k'] = nhs['income'].apply(map_income)
nhs['income_mapped_k'].value_counts()

income_mapped_k
62.5     37
87.5     30
42.5     26
30.0     25
20.0     24
125.0    17
5.0      14
12.5     12
175.0    10
225.0     2
Name: count, dtype: int64

### 3.3.2. Other prep work

In [1529]:
nhs['has_children'].value_counts()

has_children
0    126
1     71
Name: count, dtype: int64

In [1530]:
# Turning homeownership column into binary 0 and 1

nhs['owns_home'] = (nhs['homeownership'] == 'Own').astype(int)
nhs['owns_home'].value_counts()

owns_home
1    108
0     89
Name: count, dtype: int64

In [1531]:
nhs['sex'].value_counts()

sex
Female                105
Male                   90
Third-Gender/Other      2
Name: count, dtype: int64

In [1532]:
# Binary coding sex column
# Have to remove 2 entries with "Other" unfortunately

# .copy() makes the filtered frame independent of the original, so the column
# assignment below writes to a real dataframe instead of a slice
# (avoids SettingWithCopyWarning).
nhs = nhs[nhs['sex'] != 'Third-Gender/Other'].copy()

nhs['sex_b'] = (nhs['sex'] == 'Male').astype(int)
nhs['sex_b'].value_counts()

sex_b
0    105
1     90
Name: count, dtype: int64

In [1533]:
merge_cols_ipums = ['age_mapped', 'income_mapped_k', 'owns_home', 'has_children', 
              'sex_b']

In [1534]:
# Ordinal encoding risk perception

risk_map = {"Unlikely": 0, "Likely": 1, "Very likely": 2}
nhs['risk_perception'] = nhs['hrcn_risk_perception'].map(risk_map)

nhs['risk_perception'].value_counts()

risk_perception
2    117
1     70
0      8
Name: count, dtype: int64

In [1535]:
# Binary hurricane experience

nhs['hrcn_experience_b'] = (nhs['hrcn_experience'] == 'Yes').astype(int)
nhs['hrcn_experience_b'].value_counts()

hrcn_experience_b
1    170
0     25
Name: count, dtype: int64

In [1536]:
# Binary preparedness stage

nhs['prepared_b'] = nhs['hrcn_preparedness_stage'].str.contains('prepared for MORE|prepared for LESS', case=False, na=False).astype(int)
nhs['prepared_b'].value_counts()

prepared_b
1    142
0     53
Name: count, dtype: int64

In [1537]:
# Ordinal encoding preparedness confidence

conf_map = {
    "Not at all confident": 0,
    "Slightly confident": 1,
    "Somewhat confident": 2,
    "Moderately confident": 3,
    "Extremely confident": 4
}

nhs['conf_level'] = nhs['hrcn_prep_confidence'].map(conf_map)
nhs['conf_level'].value_counts()

conf_level
4    87
3    72
2    29
1     4
0     3
Name: count, dtype: int64

### 3.3.3. Build matching Nearest Neighbor model

In [1538]:
X_nhs = nhs[merge_cols_ipums].values
scaler = StandardScaler()
X_nhs_scaled = scaler.fit_transform(X_nhs)

In [1539]:
nn_model = NearestNeighbors(n_neighbors=5, metric='euclidean')
nn_model.fit(X_nhs_scaled)

In [1540]:
print(f"Matching dataset: {len(nhs)} records")

Matching dataset: 195 records


# 4. Merging

## 4.1. Merging IPUMS with IRS Migration Data

In [1541]:
df = ipums_ft_31.merge(irs_31, left_on='YEAR', right_on='year', how='left')

In [1542]:
df.head()

Unnamed: 0,YEAR,SERIAL,COUNTYFIP,NCHILD,SEX,AGE,HISPAN,RACE,EDUC,EMPSTAT,...,outflow_returns,inflow_returns,net_migration_returns,outflow_people,inflow_people,net_migration_people,outflow_agi,inflow_agi,max_wind_speed,storms
0,2017,265776,31,1,0,47,0,1,11,1.0,...,466919.0,476385.0,9466.0,927237.0,929484.0,2247.0,28279105.0,28316064.0,155.0,HURRICANE IRMA
1,2017,265801,31,0,0,63,0,1,6,1.0,...,466919.0,476385.0,9466.0,927237.0,929484.0,2247.0,28279105.0,28316064.0,155.0,HURRICANE IRMA
2,2017,265801,31,1,1,59,0,1,6,3.0,...,466919.0,476385.0,9466.0,927237.0,929484.0,2247.0,28279105.0,28316064.0,155.0,HURRICANE IRMA
3,2017,265801,31,1,0,54,0,1,6,1.0,...,466919.0,476385.0,9466.0,927237.0,929484.0,2247.0,28279105.0,28316064.0,155.0,HURRICANE IRMA
4,2017,265801,31,0,1,27,0,1,6,1.0,...,466919.0,476385.0,9466.0,927237.0,929484.0,2247.0,28279105.0,28316064.0,155.0,HURRICANE IRMA


In [1543]:
df.tail()

Unnamed: 0,YEAR,SERIAL,COUNTYFIP,NCHILD,SEX,AGE,HISPAN,RACE,EDUC,EMPSTAT,...,outflow_returns,inflow_returns,net_migration_returns,outflow_people,inflow_people,net_migration_people,outflow_agi,inflow_agi,max_wind_speed,storms
37310,2022,387132,31,1,0,85,4,7,8,3.0,...,495466.0,515879.0,20413.0,924430.0,948488.0,24058.0,39625573.0,41875373.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...
37311,2022,387189,31,1,1,73,0,1,11,3.0,...,495466.0,515879.0,20413.0,924430.0,948488.0,24058.0,39625573.0,41875373.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...
37312,2022,387189,31,3,0,43,0,1,7,1.0,...,495466.0,515879.0,20413.0,924430.0,948488.0,24058.0,39625573.0,41875373.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...
37313,2022,387189,31,0,1,19,0,1,6,1.0,...,495466.0,515879.0,20413.0,924430.0,948488.0,24058.0,39625573.0,41875373.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...
37314,2022,387189,31,0,1,68,0,1,6,3.0,...,495466.0,515879.0,20413.0,924430.0,948488.0,24058.0,39625573.0,41875373.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...


In [1544]:
df.drop(columns=['COUNTYFIP', 'year', 'state', 'county_name'], inplace=True)

In [1545]:
df.dtypes

YEAR                       int64
SERIAL                     int64
NCHILD                     int64
SEX                        int64
AGE                        int64
HISPAN                     int64
RACE                       int64
EDUC                       int64
EMPSTAT                  float64
OWNERSHP                 float64
RENT                       int64
is_owner                   int64
is_renter                  int64
HHINCOME                 float64
INCTOT                   float64
VALUEH                   float64
MARST                      int64
MIGRATE1                   int64
GQ                         int64
fips                     float64
num_declarations         float64
outflow_returns          float64
inflow_returns           float64
net_migration_returns    float64
outflow_people           float64
inflow_people            float64
net_migration_people     float64
outflow_agi              float64
inflow_agi               float64
max_wind_speed           float64
storms    

In [1546]:
cat_cols = ['SEX', 'HISPAN', 'RACE', 'EDUC', 'EMPSTAT', 'is_owner', 'is_renter', 'MARST', 'MIGRATE1', 'storms']

for col in cat_cols:
    df[col] = df[col].astype('category')

In [1547]:
df['fips'] = df['fips'].replace(np.nan, 12031).astype(int)
df['fips'].isnull().sum()

np.int64(0)

In [1548]:
df.dtypes

YEAR                        int64
SERIAL                      int64
NCHILD                      int64
SEX                      category
AGE                         int64
HISPAN                   category
RACE                     category
EDUC                     category
EMPSTAT                  category
OWNERSHP                  float64
RENT                        int64
is_owner                 category
is_renter                category
HHINCOME                  float64
INCTOT                    float64
VALUEH                    float64
MARST                    category
MIGRATE1                 category
GQ                          int64
fips                        int64
num_declarations          float64
outflow_returns           float64
inflow_returns            float64
net_migration_returns     float64
outflow_people            float64
inflow_people             float64
net_migration_people      float64
outflow_agi               float64
inflow_agi                float64
max_wind_speed

In [1549]:
df.isnull().sum()

YEAR                         0
SERIAL                       0
NCHILD                       0
SEX                          0
AGE                          0
HISPAN                       0
RACE                         0
EDUC                         0
EMPSTAT                      0
OWNERSHP                     0
RENT                         0
is_owner                     0
is_renter                    0
HHINCOME                     0
INCTOT                       0
VALUEH                   11732
MARST                        0
MIGRATE1                     0
GQ                           0
fips                         0
num_declarations          6762
outflow_returns           6762
inflow_returns            6762
net_migration_returns     6762
outflow_people            6762
inflow_people             6762
net_migration_people      6762
outflow_agi               6762
inflow_agi                6762
max_wind_speed            6762
storms                    6762
dtype: int64

## 4.2. Merging above merge with NHS

In [1550]:
df.columns

Index(['YEAR', 'SERIAL', 'NCHILD', 'SEX', 'AGE', 'HISPAN', 'RACE', 'EDUC',
       'EMPSTAT', 'OWNERSHP', 'RENT', 'is_owner', 'is_renter', 'HHINCOME',
       'INCTOT', 'VALUEH', 'MARST', 'MIGRATE1', 'GQ', 'fips',
       'num_declarations', 'outflow_returns', 'inflow_returns',
       'net_migration_returns', 'outflow_people', 'inflow_people',
       'net_migration_people', 'outflow_agi', 'inflow_agi', 'max_wind_speed',
       'storms'],
      dtype='object')

In [1551]:
# Merging county-level aggregates

df = df.merge(nhs_county, on='fips', how='left')

In [1552]:
df['SEX'].value_counts()

SEX
0    19919
1    17396
Name: count, dtype: int64

In [1553]:
# Rebuild on the IPUMS side the same five features the NHS nearest-neighbour
# model was fitted on (the names must match `merge_cols_ipums`).
df['age_mapped'] = df['AGE']
df['income_mapped_k'] = df['HHINCOME'] / 1000  # dollars -> thousands of dollars
df['owns_home'] = df['is_owner'].astype(int)
df['has_children'] = (df['NCHILD'] > 0).astype(int)
# SEX was converted to category dtype earlier; cast back to int (as done for
# is_owner above) so the matching matrix stays purely numeric instead of
# degrading to object dtype when `.values` is taken.
df['sex_b'] = df['SEX'].astype(int)

In [1554]:
df = df[df['GQ'].isin([1, 2])].copy()  # Only households and non-institutional group quarters

In [1555]:
X_ipums = df[merge_cols_ipums]
X_ipums_scaled = scaler.transform(X_ipums.values)

In [1556]:
distances, indices = nn_model.kneighbors(X_ipums_scaled)

In [1557]:
# Average risk perception level of nearest neighbors for ONE INDIVIDUAL!
# `indices` holds, for every row of df, the positions of its nearest NHS
# respondents, so indexing the column's values with it yields one row of
# neighbour values per individual -- no per-row pandas lookup needed.
# nanmean keeps the NaN-skipping behaviour of pandas' Series.mean().

df['risk_perception'] = np.nanmean(
    nhs['risk_perception'].to_numpy(dtype=float)[indices], axis=1
)

# Score definition = Average risk perception level from 0 ("Unlikely") to 2 ("Very likely")

In [1558]:
# Average hurricane experience level of nearest neighbors for ONE INDIVIDUAL!
# Vectorised: index the NHS column's values with the neighbour-position matrix
# and average across each row (nanmean mirrors pandas' NaN-skipping mean).

df['hurricane_experience'] = np.nanmean(
    nhs['hrcn_experience_b'].to_numpy(dtype=float)[indices], axis=1
)

# We can use score for proportion with hurricane experience (0 to 1)

In [1559]:
# Average preparedness level of nearest neighbors for ONE INDIVIDUAL!
# Vectorised: index the NHS column's values with the neighbour-position matrix
# and average across each row (nanmean mirrors pandas' NaN-skipping mean).

df['preparedness_level'] = np.nanmean(
    nhs['prepared_b'].to_numpy(dtype=float)[indices], axis=1
)

# Score meaning = average preparedness level from 0 (not prepared) to 1 (prepared)

In [1560]:
# Average confidence level of nearest neighbors for ONE INDIVIDUAL!
# Vectorised: index the NHS column's values with the neighbour-position matrix
# and average across each row (nanmean mirrors pandas' NaN-skipping mean).
# Score ranges from 0 ("Not at all confident") to 4 ("Extremely confident").

df['conf_level'] = np.nanmean(
    nhs['conf_level'].to_numpy(dtype=float)[indices], axis=1
)

In [1561]:
# Adding one extra column for match quality check

df['knn_avg_distance'] = distances.mean(axis=1)

In [1562]:
# GQ column isn't helpful. Let's drop it.

df.drop(columns=['GQ'], inplace=True)

In [1563]:
# Checking for NA

df.isnull().sum()

YEAR                         0
SERIAL                       0
NCHILD                       0
SEX                          0
AGE                          0
HISPAN                       0
RACE                         0
EDUC                         0
EMPSTAT                      0
OWNERSHP                     0
RENT                         0
is_owner                     0
is_renter                    0
HHINCOME                     0
INCTOT                       0
VALUEH                   11732
MARST                        0
MIGRATE1                     0
fips                         0
num_declarations          6762
outflow_returns           6762
inflow_returns            6762
net_migration_returns     6762
outflow_people            6762
inflow_people             6762
net_migration_people      6762
outflow_agi               6762
inflow_agi                6762
max_wind_speed            6762
storms                    6762
hurricane_exp_%              0
high_risk_%                  0
well_pre

In [1564]:
# Duval had no hurricane declaration in 2018, hence the NA values.
# NOTE(review): the migration columns (outflow_*/inflow_*/net_migration_*) are
# zero-filled here as well. A 0 in those columns means "no row in the hurricane
# table for that year", not "nobody migrated" -- confirm this is acceptable for
# the downstream model.
hurricane_cols = ['num_declarations', 'outflow_returns', 'inflow_returns', 
                  'net_migration_returns', 'outflow_people', 'inflow_people',
                  'net_migration_people', 'outflow_agi', 'inflow_agi', 
                  'max_wind_speed']
df[hurricane_cols] = df[hurricane_cols].fillna(0)

# add_categories raises ValueError if the category already exists, so guard it
# to keep this cell safe to re-run.
if 'NO HURRICANE' not in df['storms'].cat.categories:
    df['storms'] = df['storms'].cat.add_categories(['NO HURRICANE'])
df['storms'] = df['storms'].fillna('NO HURRICANE')

In [1565]:
# For VALUEH, we should be able to impute it with 0, and then use the owns_home column

df['VALUEH'] = df['VALUEH'].fillna(0)

In [1566]:
df.isnull().sum()

YEAR                     0
SERIAL                   0
NCHILD                   0
SEX                      0
AGE                      0
HISPAN                   0
RACE                     0
EDUC                     0
EMPSTAT                  0
OWNERSHP                 0
RENT                     0
is_owner                 0
is_renter                0
HHINCOME                 0
INCTOT                   0
VALUEH                   0
MARST                    0
MIGRATE1                 0
fips                     0
num_declarations         0
outflow_returns          0
inflow_returns           0
net_migration_returns    0
outflow_people           0
inflow_people            0
net_migration_people     0
outflow_agi              0
inflow_agi               0
max_wind_speed           0
storms                   0
hurricane_exp_%          0
high_risk_%              0
well_prepared_%          0
n                        0
age_mapped               0
income_mapped_k          0
owns_home                0
h

## 4.3. Fixing data types

In [1567]:
df.dtypes

YEAR                        int64
SERIAL                      int64
NCHILD                      int64
SEX                      category
AGE                         int64
HISPAN                   category
RACE                     category
EDUC                     category
EMPSTAT                  category
OWNERSHP                  float64
RENT                        int64
is_owner                 category
is_renter                category
HHINCOME                  float64
INCTOT                    float64
VALUEH                    float64
MARST                    category
MIGRATE1                 category
fips                        int64
num_declarations          float64
outflow_returns           float64
inflow_returns            float64
net_migration_returns     float64
outflow_people            float64
inflow_people             float64
net_migration_people      float64
outflow_agi               float64
inflow_agi                float64
max_wind_speed            float64
storms        

In [1568]:
cat_cols = ['OWNERSHP', 'owns_home', 'has_children',
            'MARST']

for col in cat_cols:
    df[col] = df[col].astype('category')

df.dtypes

YEAR                        int64
SERIAL                      int64
NCHILD                      int64
SEX                      category
AGE                         int64
HISPAN                   category
RACE                     category
EDUC                     category
EMPSTAT                  category
OWNERSHP                 category
RENT                        int64
is_owner                 category
is_renter                category
HHINCOME                  float64
INCTOT                    float64
VALUEH                    float64
MARST                    category
MIGRATE1                 category
fips                        int64
num_declarations          float64
outflow_returns           float64
inflow_returns            float64
net_migration_returns     float64
outflow_people            float64
inflow_people             float64
net_migration_people      float64
outflow_agi               float64
inflow_agi                float64
max_wind_speed            float64
storms        

# 5. Exporting

In [1569]:
df.head()

Unnamed: 0,YEAR,SERIAL,NCHILD,SEX,AGE,HISPAN,RACE,EDUC,EMPSTAT,OWNERSHP,...,age_mapped,income_mapped_k,owns_home,has_children,sex_b,risk_perception,hurricane_experience,preparedness_level,conf_level,knn_avg_distance
0,2017,265776,1,0,47,0,1,11,1.0,2.0,...,47,98.0,0,1,0,1.8,0.8,0.4,3.2,0.716361
1,2017,265801,0,0,63,0,1,6,1.0,1.0,...,63,67.7,1,0,0,1.4,1.0,1.0,3.0,0.541057
2,2017,265801,1,1,59,0,1,6,3.0,1.0,...,59,67.7,1,1,1,1.6,0.8,0.8,3.0,0.840465
3,2017,265801,1,0,54,0,1,6,1.0,1.0,...,54,67.7,1,1,0,1.6,0.8,0.4,3.0,0.67026
4,2017,265801,0,1,27,0,1,6,1.0,1.0,...,27,67.7,1,0,1,1.8,1.0,1.0,3.4,0.611395


In [1570]:
df.isnull().sum()

YEAR                     0
SERIAL                   0
NCHILD                   0
SEX                      0
AGE                      0
HISPAN                   0
RACE                     0
EDUC                     0
EMPSTAT                  0
OWNERSHP                 0
RENT                     0
is_owner                 0
is_renter                0
HHINCOME                 0
INCTOT                   0
VALUEH                   0
MARST                    0
MIGRATE1                 0
fips                     0
num_declarations         0
outflow_returns          0
inflow_returns           0
net_migration_returns    0
outflow_people           0
inflow_people            0
net_migration_people     0
outflow_agi              0
inflow_agi               0
max_wind_speed           0
storms                   0
hurricane_exp_%          0
high_risk_%              0
well_prepared_%          0
n                        0
age_mapped               0
income_mapped_k          0
owns_home                0
h

In [1571]:
df.to_csv('../../notebooks/data4model/FL_31_data.csv', index=False)

In [1572]:
df.to_parquet('../../notebooks/data4model/FL_31_data.parquet', index=False)