In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import NearestNeighbors

# 1. FL_IPUMS Feature Selection

In [4]:
# Load the processed FL_IPUMS extract (path is relative to the notebook's working directory).
ipums = pd.read_csv('../../data/processed/FL_IPUMS.csv')

## 1.1. Extracting Relevant Features

In [5]:
# Adults only: keep records with AGE of 18 or more.
ipums_18plus = ipums.loc[ipums['AGE'].ge(18)]

In [6]:
ipums_18plus.head(6)

Unnamed: 0.1,Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STATEFIP,COUNTYFIP,PUMA,...,EMPSTAT,EMPSTATD,INCTOT,POVERTY,MIGRATE1,MIGRATE1D,is_owner,is_renter,owns_free_clear,owns_with_mortgage
0,0,2017,201701,265770,2017000000009,107.0,2017002657701,12,0,8611,...,1.0,10.0,8500.0,29,1,10.0,0,1,0,0
4,4,2017,201701,265770,2017000000009,107.0,2017002657701,12,0,8611,...,3.0,30.0,0.0,29,1,10.0,0,1,0,0
5,5,2017,201701,265771,2017000000011,76.0,2017002657711,12,11,1102,...,1.0,10.0,17400.0,238,1,10.0,1,0,0,1
6,6,2017,201701,265771,2017000000011,76.0,2017002657711,12,11,1102,...,1.0,10.0,17400.0,238,1,10.0,1,0,0,1
7,7,2017,201701,265772,2017000000021,97.0,2017002657721,12,9,902,...,3.0,30.0,8000.0,198,1,10.0,0,1,0,0
8,8,2017,201701,265772,2017000000021,97.0,2017002657721,12,9,902,...,3.0,30.0,21000.0,198,1,10.0,0,1,0,0


In [7]:
# Narrow the adult sample down to the columns used in the rest of the notebook.
ipums_ft = ipums_18plus.loc[:, [
    'YEAR', 'SERIAL', 'COUNTYFIP', 'NCHILD',
    'SEX', 'AGE', 'HISPAN', 'RACE', 'EDUC',
    'EMPSTAT', 'OWNERSHP', 'RENT', 'is_owner', 'is_renter',
    'HHINCOME', 'INCTOT',
    'VALUEH', 'MARST', 'MIGRATE1', 'GQ',
]]

In [8]:
ipums_ft.head(6)

Unnamed: 0,YEAR,SERIAL,COUNTYFIP,NCHILD,SEX,AGE,HISPAN,RACE,EDUC,EMPSTAT,OWNERSHP,RENT,is_owner,is_renter,HHINCOME,INCTOT,VALUEH,MARST,MIGRATE1,GQ
0,2017,265770,0,3,2,30,4,7,6,1.0,2.0,950,0,1,8500.0,8500.0,,6,1,1
4,2017,265770,0,3,1,41,4,7,6,3.0,2.0,950,0,1,8500.0,0.0,,3,1,1
5,2017,265771,11,0,1,78,0,1,10,1.0,1.0,0,1,0,34800.0,17400.0,60000.0,1,1,1
6,2017,265771,11,0,2,68,0,1,7,1.0,1.0,0,1,0,34800.0,17400.0,60000.0,1,1,1
7,2017,265772,9,0,1,73,0,1,11,3.0,2.0,700,0,1,29000.0,8000.0,,1,1,1
8,2017,265772,9,0,2,72,0,1,11,3.0,2.0,700,0,1,29000.0,21000.0,,1,1,1


## 1.2. Light cleaning

In [9]:
# The extract contains an extra year (2023);
# remove it for consistency with the other datasets.

ipums_ft = ipums_ft[ipums_ft['YEAR'] != 2023]

# Last expression of the cell, so it is actually displayed: the remaining years
# (2023 should no longer appear).
ipums_ft['YEAR'].value_counts()

In [10]:
# Hillsborough County only (county FIPS 57).
# .copy() gives an independent frame, so later column edits do not touch ipums_ft.

ipums_ft_57 = ipums_ft.loc[ipums_ft['COUNTYFIP'].eq(57)].copy()

In [11]:
# Save the Hillsborough subset. This runs before the SEX recode in the following cell,
# so the file on disk keeps the original SEX codes.
ipums_ft_57.to_csv('../../data/IPUMS/ipums_ft_57.csv', index=False)

In [12]:
# Recoding the SEX column (it uses 1 and 2 instead of 0 and 1)
# Male = 1 (originally) = 1 (new)
# Female = 2 (originally) = 0 (new)
#
# replace() rather than map(): map() turns every value that is missing from the
# dict into NaN, so re-running this cell on already-recoded data (0/1) would
# silently wipe out the 0s. replace({2: 0}) is a no-op on a second run.

ipums_ft_57['SEX'] = ipums_ft_57['SEX'].replace({2: 0})
ipums_ft_57['SEX'].value_counts()

SEX
0    35294
1    32287
Name: count, dtype: int64

In [13]:
# NA values check

ipums_ft_57.isnull().sum()

# OWNERSHP, HHINCOME and VALUEH have NA values (VALUEH has by far the most)

YEAR             0
SERIAL           0
COUNTYFIP        0
NCHILD           0
SEX              0
AGE              0
HISPAN           0
RACE             0
EDUC             0
EMPSTAT          0
OWNERSHP      2751
RENT             0
is_owner         0
is_renter        0
HHINCOME      2751
INCTOT           0
VALUEH       22241
MARST            0
MIGRATE1         0
GQ               0
dtype: int64

### 1.2.1. Missingness Diagnosis

In [14]:
# Missing-HHINCOME count per group-quarters (GQ) code, next to each group's size.
hhincome_missing = (
    ipums_ft_57['HHINCOME'].isna()
    .groupby(ipums_ft_57['GQ'])
    .sum()
    .to_frame('missing HHINCOME')
)
hhincome_missing['total n'] = ipums_ft_57.groupby('GQ').size()

hhincome_missing

Unnamed: 0_level_0,missing HHINCOME,total n
GQ,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,64676
2,0,122
3,884,884
4,1867,1867
5,0,32


Good to know: the missingness is structural rather than random. GQ codes 3 and 4 represent group-quarters living arrangements (dormitories, prisons, nursing homes, etc.) where household income data is not collected, and every record in those two groups is missing HHINCOME.

In [15]:
# Missing-VALUEH count per ownership flag (is_owner), next to each group's size.
valueh_missing = (
    ipums_ft_57['VALUEH'].isna()
    .groupby(ipums_ft_57['is_owner'])
    .sum()
    .to_frame('missing VALUEH')
)
valueh_missing['total n'] = ipums_ft_57.groupby('is_owner').size()

valueh_missing

Unnamed: 0_level_0,missing VALUEH,total n
is_owner,Unnamed: 1_level_1,Unnamed: 2_level_1
0,22241,22241
1,0,45340


All of the missing VALUEH values come from non-owners (is_owner = 0), which makes sense: non-owners have no home value to report. No need to impute those values.

Still, the group-quarters records (GQ codes 3 and 4) that are missing HHINCOME should be removed.

In [16]:
# Keep only GQ codes 1 and 2 (the larger groups that have no missing HHINCOME in the table above).
# NOTE(review): the "_71" suffix does not match the county filter used for this frame
# (COUNTYFIP 57), and the merge in section 5.1 starts from ipums_ft_57 and re-applies
# this same GQ filter later. Confirm whether this variable is still needed.
ipums_ft_71 = ipums_ft_57[ipums_ft_57['GQ'].isin([1, 2])].copy()

# 2. IRS Migration Data

In [17]:
# Load the processed county-by-year table of hurricane impacts and IRS migration flows.
irs = pd.read_csv('../../data/processed/FL_hurricane_county_year_impacts.csv')

In [18]:
# Hillsborough County rows only (full state+county FIPS 12057).
irs_57 = irs.loc[irs['fips'].eq(12057)]

In [19]:
irs_57.head()

Unnamed: 0,fips,year,num_declarations,state,county_name,outflow_returns,inflow_returns,net_migration_returns,outflow_people,inflow_people,net_migration_people,outflow_agi,inflow_agi,max_wind_speed,storms
136,12057,2017,2,FL,Hillsborough (County),673402.0,692195.0,18793.0,1354009.0,1395827.0,41818.0,45221903.0,46579934.0,155.0,HURRICANE IRMA
137,12057,2018,1,FL,Hillsborough (County),661559.0,678133.0,16574.0,1323045.0,1356011.0,32966.0,45694475.0,47337545.0,140.0,HURRICANE MICHAEL
138,12057,2019,1,FL,Hillsborough (County),680318.0,696606.0,16288.0,1354426.0,1381100.0,26674.0,51580563.0,53137137.0,160.0,HURRICANE DORIAN
139,12057,2020,1,FL,Hillsborough (County),699896.0,715736.0,15840.0,1404448.0,1431939.0,27491.0,55188737.0,55538065.0,130.0,HURRICANE ETA
140,12057,2021,1,FL,Hillsborough (County),727781.0,744751.0,16970.0,1382387.0,1418325.0,35938.0,56628435.0,58977564.0,75.0,TROPICAL STORM ELSA


# 3. FEMA NHS

In [20]:
# Load the processed FL_NHS survey file (the section header calls it "FEMA NHS").
nhs = pd.read_csv('../../data/processed/FL_NHS.csv')

## 3.1. Cleaning

### 3.1.1. Replacing "Blank" with NA

In [21]:
def replace_Blank_with_NA(column_list, df=None):
    """
    Replace literal "Blank" entries with np.nan in the given columns.

    ARGS:
        column_list = list of columns to clean "Blank" entries from
        df = dataframe to clean (optional). When omitted, the notebook-level
             `nhs` frame is used, which is what this function always did before
             the parameter existed.
    RETURNS:
        clean_nhs: cleaned copy of the dataframe with "Blank" entries replaced
                   with np.nan; the input frame itself is never modified
    """

    # Fall back to the global only when no frame is passed, so existing
    # one-argument calls keep working while new callers can be explicit.
    source = nhs if df is None else df
    clean_nhs = source.copy()

    for col in column_list:
        # Exact-match replacement: only cells equal to "Blank" are affected.
        clean_nhs[col] = clean_nhs[col].replace("Blank", np.nan)

    return clean_nhs

In [22]:
# Swap the literal "Blank" placeholder for NA in every column listed below.

cols_with_Blank = [
    'hrcn_prepactions',
    'hrcn_safe_shelter',
    'hrcn_impacts',
    'hrcn_prep_specific',
    'numchild_school',
    'school_plan_aware',
    'hazard_insurance',
    'hrcn_action_1plus',
    'hrcn_action_3plus',
    'hrcn_influencer_aware',
    'hrcn_influencer_exp',
    'hrcn_influencer_efficacy',
]

nhs = replace_Blank_with_NA(cols_with_Blank)

### 3.1.2. Handling "Don't know" Responses

In [23]:
# Tally literal "Don't know" answers per column; print only the columns that
# contain at least one, largest count first.
idk_counts = nhs.eq("Don't know").sum()
print(idk_counts.loc[idk_counts.gt(0)].sort_values(ascending=False))

hrcn_preparedness_stage    15
homeowners_insurance       14
hrcn_shutters              10
hrcn_risk_perception        9
mortgage                    9
mortgage_4cat               9
hrcn_aware                  6
hrcn_experience             6
hrcn_prep_confidence        6
hrcn_influencer_risk        6
caretaker                   6
hrcn_prep_efficacy          5
numchild_school             2
hometype                    2
dtype: int64


In [24]:
# Cleaning "Don't know" entries

def clean_idk(column_list, df=None):
    """
    Drop every row that answers "Don't know" in any of the given columns.

    ARGS:
        column_list = list of columns to clean "Don't know" entries from
        df = dataframe to clean (optional). When omitted, the notebook-level
             `nhs` frame is used, which is what this function always did before
             the parameter existed.
    RETURNS:
        clean_nhs: independent copy of the dataframe with the "Don't know" rows
                   removed; the input frame itself is never modified
        removed_cols: dict of number of "Don't know" entries per column, counted
                      on the input frame before any row is dropped (a row with
                      "Don't know" in two columns is counted once under each)
    """

    # Fall back to the global only when no frame is passed, so existing
    # one-argument calls keep working while new callers can be explicit.
    source = nhs if df is None else df

    # De-duplicate while keeping order: selecting the same label twice would
    # make `is_idk[col]` a frame instead of a Series.
    columns = list(dict.fromkeys(column_list))

    is_idk = source[columns].eq("Don't know")
    removed_cols = {col: int(is_idk[col].sum()) for col in columns}

    # Explicit copy so callers can add columns to the result without
    # triggering chained-assignment warnings.
    clean_nhs = source.loc[~is_idk.any(axis=1)].copy()

    return clean_nhs, removed_cols

In [25]:
# Columns where it is still undecided whether "Don't know" rows should be dropped;
# nothing is removed for these in this cell.
NotSureIfShouldDropDontKnows = [
    'caretaker',
    'numchild_school',
    'mortgage',
    'hometype',
    'ses_score',
    'mortgage_4cat',
    'homeowners_insurance',
]

idk_cols_demographics = []

# Hurricane-related columns: any row answering "Don't know" in one of these is dropped.
idk_cols_hrcn = [
    'hrcn_aware',
    'hrcn_risk_perception',
    'hrcn_experience',
    'hrcn_shutters',
    'hrcn_prep_efficacy',
    'hrcn_prep_confidence',
    'hrcn_preparedness_stage',
]

nhs, removed_cols = clean_idk(idk_cols_hrcn)

In [26]:
# Re-tally the remaining "Don't know" answers per column after the cleaning step.
idk_counts = nhs.eq("Don't know").sum()
print(idk_counts.loc[idk_counts.gt(0)].sort_values(ascending=False))

homeowners_insurance    8
mortgage                6
mortgage_4cat           6
caretaker               2
numchild_school         2
hrcn_influencer_risk    1
dtype: int64


In [27]:
removed_cols

{'hrcn_aware': 6,
 'hrcn_risk_perception': 9,
 'hrcn_experience': 6,
 'hrcn_shutters': 10,
 'hrcn_prep_efficacy': 5,
 'hrcn_prep_confidence': 6,
 'hrcn_preparedness_stage': 15}

In [28]:
# NOTES FOR LATER

# Should I turn employment into binary (Yes/No)?
# 

In [29]:
nhs['county'].value_counts()

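# Lee County is in the sample (13 respondents).
# NOTE(review): Hillsborough, the county used for the IPUMS and IRS data in this notebook, does not appear in this list.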

county
Miami-Dade      57
Broward         34
Volusia         14
Brevard         14
Palm Beach      14
Lee             13
St. Lucie        9
Seminole         7
Indian River     6
Escambia         6
Highlands        5
St. Johns        5
Bay              5
Charlotte        4
Martin           3
Alachua          1
Name: count, dtype: int64

In [30]:
# County name -> 5-digit FIPS code (state prefix 12 + county code),
# one entry per county present in the survey, listed alphabetically.
county_fips_map = {
    'Alachua': 12001,
    'Bay': 12005,
    'Brevard': 12009,
    'Broward': 12011,
    'Charlotte': 12015,
    'Escambia': 12033,
    'Highlands': 12055,
    'Indian River': 12061,
    'Lee': 12071,
    'Martin': 12085,
    'Miami-Dade': 12086,
    'Palm Beach': 12099,
    'Seminole': 12117,
    'St. Johns': 12109,
    'St. Lucie': 12111,
    'Volusia': 12127,
}

# Counties missing from the dict would come out as NaN.
nhs['fips'] = nhs['county'].map(county_fips_map)
nhs['fips'].value_counts()

fips
12086    57
12011    34
12127    14
12009    14
12099    14
12071    13
12111     9
12117     7
12061     6
12033     6
12055     5
12109     5
12005     5
12015     4
12085     3
12001     1
Name: count, dtype: int64

In [31]:
# County-level summary of the survey: shares of respondents per county plus the
# respondent count. Named aggregation sets the final column names directly.

nhs_county = (
    nhs.groupby('fips')
    .agg(**{
        # share answering 'Yes' to hurricane experience
        'hurricane_exp_%': ('hrcn_experience', lambda s: s.eq('Yes').mean()),
        # share rating hurricane risk as 'Very likely'
        'high_risk_%': ('hrcn_risk_perception', lambda s: s.eq('Very likely').mean()),
        # share whose preparedness stage text contains 'prepared for MORE'
        'well_prepared_%': (
            'hrcn_preparedness_stage',
            lambda s: s.str.contains('prepared for MORE', na=False).mean(),
        ),
        # number of non-null respondent ids
        'n': ('respid', 'count'),
    })
    .reset_index()
)

nhs_county.head()

Unnamed: 0,fips,hurricane_exp_%,high_risk_%,well_prepared_%,n
0,12001,1.0,0.0,0.0,1
1,12005,0.6,0.4,0.6,5
2,12009,0.928571,0.642857,0.785714,14
3,12011,0.794118,0.617647,0.558824,34
4,12015,1.0,0.25,1.0,4


In [32]:
nhs_county[(nhs_county['fips'] == 12057)]

Unnamed: 0,fips,hurricane_exp_%,high_risk_%,well_prepared_%,n


## 3.3. Preparing NHS for merging

### 3.3.1. Mapping NHS categories to IPUMS-compatible values

Since NHS and IPUMS have different data types (and formats) when it comes to columns like age and income, we will have to change the way it's coded in the NHS data to match IPUMS.

In [33]:
# Age

# Each survey age bracket is mapped to a single representative age.
age_map = {
    '18-19': 18.5,
    '20-29': 25,
    '30-39': 35,
    '40-49': 45,
    '50-59': 55,
    '60-69': 65,
    '70-79': 75,
    '80+': 85,
}

def map_age(age):
    """Return the numeric age for a bracket label; NaN for missing or unknown labels."""
    return np.nan if pd.isna(age) else age_map.get(age, np.nan)

# Element-wise conversion of the bracket labels to numeric ages.
nhs['age_mapped'] = nhs['age'].map(map_age)

In [34]:
# Income

# Each survey income bracket is mapped to a single value in thousands of dollars.
income_map = {
    'Less than $10,000': 5,
    '$10,000 to $14,999': 12.5,
    '$15,000 to $24,999': 20,
    '$25,000 to $34,999': 30,
    '$35,000 to $49,999': 42.5,
    '$50,000 to $74,999': 62.5,
    '$75,000 to $99,999': 87.5,
    '$100,000 to $149,999': 125,
    '$150,000 to $199,999': 175,
    '$200,000 or more': 225,
}

def map_income(income):
    """Return the income (in $1000s) for a bracket label; NaN for missing or unknown labels."""
    return np.nan if pd.isna(income) else income_map.get(income, np.nan)

# Element-wise conversion of the bracket labels to income in $1000s, then a quick tally.
nhs['income_mapped_k'] = nhs['income'].map(map_income)
nhs['income_mapped_k'].value_counts()

income_mapped_k
62.5     37
87.5     30
42.5     26
30.0     25
20.0     24
125.0    17
5.0      14
12.5     12
175.0    10
225.0     2
Name: count, dtype: int64

### 3.3.2. Other prep work

In [35]:
nhs['has_children'].value_counts()

has_children
0    126
1     71
Name: count, dtype: int64

In [36]:
# Binary home-ownership flag: 1 when homeownership is exactly 'Own', otherwise 0.

nhs['owns_home'] = nhs['homeownership'].eq('Own').astype(int)
nhs['owns_home'].value_counts()

owns_home
1    108
0     89
Name: count, dtype: int64

In [37]:
nhs['sex'].value_counts()

sex
Female                105
Male                   90
Third-Gender/Other      2
Name: count, dtype: int64

In [38]:
# Binary coding sex column (1 = Male, 0 = Female)
# Have to remove 2 entries with "Other" unfortunately

# .copy() makes the filtered frame independent, so the column assignment below
# writes to a real frame rather than a slice (no SettingWithCopyWarning).
nhs = nhs[nhs['sex'] != 'Third-Gender/Other'].copy()

nhs['sex_b'] = (nhs['sex'] == 'Male').astype(int)
nhs['sex_b'].value_counts()

sex_b
0    105
1     90
Name: count, dtype: int64

In [39]:
# Features used to match NHS respondents to IPUMS records; the same list (in the
# same order) is used to build both matrices.
merge_cols_ipums = [
    'age_mapped',
    'income_mapped_k',
    'owns_home',
    'has_children',
    'sex_b',
]

In [40]:
# Ordinal encoding risk perception: labels listed from lowest (0) to highest (2).

risk_map = {
    label: rank
    for rank, label in enumerate(("Unlikely", "Likely", "Very likely"))
}
nhs['risk_perception'] = nhs['hrcn_risk_perception'].map(risk_map)

nhs['risk_perception'].value_counts()

risk_perception
2    117
1     70
0      8
Name: count, dtype: int64

In [41]:
# Binary hurricane experience: 1 when the answer is exactly 'Yes', otherwise 0.

nhs['hrcn_experience_b'] = nhs['hrcn_experience'].eq('Yes').astype(int)
nhs['hrcn_experience_b'].value_counts()

hrcn_experience_b
1    170
0     25
Name: count, dtype: int64

In [42]:
# Binary preparedness stage: 1 when the stage text mentions being prepared
# (for MORE or for LESS; case-insensitive), otherwise 0. Missing text counts as 0.

prepared_pattern = 'prepared for MORE|prepared for LESS'
is_prepared = nhs['hrcn_preparedness_stage'].str.contains(prepared_pattern, case=False, na=False)
nhs['prepared_b'] = is_prepared.astype(int)
nhs['prepared_b'].value_counts()

prepared_b
1    142
0     53
Name: count, dtype: int64

In [43]:
# Ordinal encoding preparedness confidence: labels listed from lowest (0) to highest (4).

conf_map = {
    label: rank
    for rank, label in enumerate((
        "Not at all confident",
        "Slightly confident",
        "Somewhat confident",
        "Moderately confident",
        "Extremely confident",
    ))
}

nhs['conf_level'] = nhs['hrcn_prep_confidence'].map(conf_map)
nhs['conf_level'].value_counts()

conf_level
4    87
3    72
2    29
1     4
0     3
Name: count, dtype: int64

### 3.3.3. Build matching Nearest Neighbor model

In [44]:
# Standardise the NHS matching features; the fitted scaler is reused later so
# that the IPUMS features are put on the same scale.
X_nhs = nhs[merge_cols_ipums].to_numpy()
scaler = StandardScaler().fit(X_nhs)
X_nhs_scaled = scaler.transform(X_nhs)

In [45]:
# Index the scaled NHS respondents for 5-nearest-neighbour lookups (Euclidean distance).
nn_model = NearestNeighbors(metric='euclidean', n_neighbors=5).fit(X_nhs_scaled)
nn_model

In [46]:
print(f"Matching dataset: {len(nhs)} records")

Matching dataset: 195 records


# 4. 

# 5. Merging

## 5.1. Merging IPUMS with IRS Migration Data

In [47]:
# Attach the Hillsborough IRS/hurricane row for each person's survey year.
# validate='many_to_one' makes pandas raise a MergeError if irs_57 ever contains
# more than one row for a year, instead of silently duplicating person records.
df = ipums_ft_57.merge(
    irs_57,
    left_on='YEAR',
    right_on='year',
    how='left',
    validate='many_to_one',
)

In [48]:
df.head()

Unnamed: 0,YEAR,SERIAL,COUNTYFIP,NCHILD,SEX,AGE,HISPAN,RACE,EDUC,EMPSTAT,...,outflow_returns,inflow_returns,net_migration_returns,outflow_people,inflow_people,net_migration_people,outflow_agi,inflow_agi,max_wind_speed,storms
0,2017,265787,57,0,0,57,0,2,6,3.0,...,673402.0,692195.0,18793.0,1354009.0,1395827.0,41818.0,45221903.0,46579934.0,155.0,HURRICANE IRMA
1,2017,265794,57,3,1,59,0,1,11,1.0,...,673402.0,692195.0,18793.0,1354009.0,1395827.0,41818.0,45221903.0,46579934.0,155.0,HURRICANE IRMA
2,2017,265794,57,3,0,48,0,6,7,3.0,...,673402.0,692195.0,18793.0,1354009.0,1395827.0,41818.0,45221903.0,46579934.0,155.0,HURRICANE IRMA
3,2017,265794,57,0,1,19,0,8,7,3.0,...,673402.0,692195.0,18793.0,1354009.0,1395827.0,41818.0,45221903.0,46579934.0,155.0,HURRICANE IRMA
4,2017,265794,57,0,0,24,0,8,8,1.0,...,673402.0,692195.0,18793.0,1354009.0,1395827.0,41818.0,45221903.0,46579934.0,155.0,HURRICANE IRMA


In [49]:
df.tail()

Unnamed: 0,YEAR,SERIAL,COUNTYFIP,NCHILD,SEX,AGE,HISPAN,RACE,EDUC,EMPSTAT,...,outflow_returns,inflow_returns,net_migration_returns,outflow_people,inflow_people,net_migration_people,outflow_agi,inflow_agi,max_wind_speed,storms
67576,2022,387155,57,0,0,69,0,1,10,3.0,...,751349.0,770819.0,19470.0,1411631.0,1442015.0,30384.0,66819293.0,71196338.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...
67577,2022,387195,57,0,0,71,0,1,10,3.0,...,751349.0,770819.0,19470.0,1411631.0,1442015.0,30384.0,66819293.0,71196338.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...
67578,2022,387204,57,1,1,56,2,8,7,1.0,...,751349.0,770819.0,19470.0,1411631.0,1442015.0,30384.0,66819293.0,71196338.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...
67579,2022,387204,57,0,0,20,2,8,7,1.0,...,751349.0,770819.0,19470.0,1411631.0,1442015.0,30384.0,66819293.0,71196338.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...
67580,2022,387209,57,0,0,50,0,2,10,1.0,...,751349.0,770819.0,19470.0,1411631.0,1442015.0,30384.0,66819293.0,71196338.0,140.0,HURRICANE NICOLE; HURRICANE IAN; TROPICAL STOR...


In [50]:
# Remove columns that are constant or redundant after the merge.
df = df.drop(columns=['COUNTYFIP', 'year', 'state', 'county_name'])

In [51]:
df.dtypes

YEAR                       int64
SERIAL                     int64
NCHILD                     int64
SEX                        int64
AGE                        int64
HISPAN                     int64
RACE                       int64
EDUC                       int64
EMPSTAT                  float64
OWNERSHP                 float64
RENT                       int64
is_owner                   int64
is_renter                  int64
HHINCOME                 float64
INCTOT                   float64
VALUEH                   float64
MARST                      int64
MIGRATE1                   int64
GQ                         int64
fips                       int64
num_declarations           int64
outflow_returns          float64
inflow_returns           float64
net_migration_returns    float64
outflow_people           float64
inflow_people            float64
net_migration_people     float64
outflow_agi              float64
inflow_agi               float64
max_wind_speed           float64
storms    

In [52]:
# Coded / label columns that should be treated as categorical rather than numeric.
cat_cols = [
    'SEX', 'HISPAN', 'RACE', 'EDUC', 'EMPSTAT',
    'is_owner', 'is_renter', 'MARST', 'MIGRATE1', 'storms',
]

df = df.astype({name: 'category' for name in cat_cols})

In [53]:
# Any row without a matched IRS record gets the Hillsborough code (12057);
# the second line confirms that no nulls remain.
df['fips'] = df['fips'].fillna(12057).astype(int)
df['fips'].isnull().sum()

np.int64(0)

In [54]:
df.dtypes

YEAR                        int64
SERIAL                      int64
NCHILD                      int64
SEX                      category
AGE                         int64
HISPAN                   category
RACE                     category
EDUC                     category
EMPSTAT                  category
OWNERSHP                  float64
RENT                        int64
is_owner                 category
is_renter                category
HHINCOME                  float64
INCTOT                    float64
VALUEH                    float64
MARST                    category
MIGRATE1                 category
GQ                          int64
fips                        int64
num_declarations            int64
outflow_returns           float64
inflow_returns            float64
net_migration_returns     float64
outflow_people            float64
inflow_people             float64
net_migration_people      float64
outflow_agi               float64
inflow_agi                float64
max_wind_speed

In [55]:
df.isnull().sum()

YEAR                         0
SERIAL                       0
NCHILD                       0
SEX                          0
AGE                          0
HISPAN                       0
RACE                         0
EDUC                         0
EMPSTAT                      0
OWNERSHP                  2751
RENT                         0
is_owner                     0
is_renter                    0
HHINCOME                  2751
INCTOT                       0
VALUEH                   22241
MARST                        0
MIGRATE1                     0
GQ                           0
fips                         0
num_declarations             0
outflow_returns              0
inflow_returns               0
net_migration_returns        0
outflow_people               0
inflow_people                0
net_migration_people         0
outflow_agi                  0
inflow_agi                   0
max_wind_speed               0
storms                       0
dtype: int64

## 5.2. Merging above merge with NHS

In [56]:
df.columns

Index(['YEAR', 'SERIAL', 'NCHILD', 'SEX', 'AGE', 'HISPAN', 'RACE', 'EDUC',
       'EMPSTAT', 'OWNERSHP', 'RENT', 'is_owner', 'is_renter', 'HHINCOME',
       'INCTOT', 'VALUEH', 'MARST', 'MIGRATE1', 'GQ', 'fips',
       'num_declarations', 'outflow_returns', 'inflow_returns',
       'net_migration_returns', 'outflow_people', 'inflow_people',
       'net_migration_people', 'outflow_agi', 'inflow_agi', 'max_wind_speed',
       'storms'],
      dtype='object')

In [57]:
# Merging county-level aggregates
#
# validate='many_to_one' makes pandas raise a MergeError if nhs_county ever has
# more than one row per fips, instead of silently duplicating person records.
#
# NOTE(review): nhs_county has no row for fips 12057 (the lookup for that code
# earlier in the notebook is empty), so for this county the merged columns
# 'hurricane_exp_%', 'high_risk_%', 'well_prepared_%' and 'n' are entirely NaN.

df = df.merge(nhs_county, on='fips', how='left', validate='many_to_one')

In [58]:
df['SEX'].value_counts()

SEX
0    35294
1    32287
Name: count, dtype: int64

In [59]:
# Build the IPUMS-side matching features under the same names (and units) as
# the NHS features: income in $1000s, binary ownership / children / sex.
df = df.assign(
    age_mapped=df['AGE'],
    income_mapped_k=df['HHINCOME'].div(1000),
    owns_home=df['is_owner'].astype(int),
    has_children=df['NCHILD'].gt(0).astype(int),
    sex_b=df['SEX'],
)

In [60]:
# Only households and non-institutional group quarters (GQ codes 1 and 2)
df = df.loc[df['GQ'].isin([1, 2])].copy()

In [61]:
# IPUMS matching features, scaled with the scaler that was fitted on the NHS data.
X_ipums = df.loc[:, merge_cols_ipums]
X_ipums_scaled = scaler.transform(X_ipums.to_numpy())

In [62]:
# For every IPUMS row, look up its nearest NHS respondents in scaled feature space.
# `indices` holds row positions into the matrix nn_model was fitted on;
# `distances` holds the matching distances, one row per IPUMS record.
distances, indices = nn_model.kneighbors(X_ipums_scaled)

In [63]:
# Average risk perception level of nearest neighbors for ONE INDIVIDUAL!
#
# Vectorised: `indices` holds positional row numbers into `nhs`, so indexing the
# column's NumPy array with it returns one row of neighbour values per IPUMS
# record. This replaces one Python-level nhs.iloc[...] call per record.
# np.nanmean ignores missing values, like Series.mean() does.

df['risk_perception'] = np.nanmean(
    nhs['risk_perception'].to_numpy(dtype=float)[indices],
    axis=1,
)

# Score definition = Average risk perception level from 0 ("Unlikely") to 2 ("Very likely")

In [64]:
# Average hurricane experience level of nearest neighbors for ONE INDIVIDUAL!
#
# Vectorised: `indices` holds positional row numbers into `nhs`, so indexing the
# column's NumPy array with it returns one row of neighbour values per IPUMS
# record. This replaces one Python-level nhs.iloc[...] call per record.
# np.nanmean ignores missing values, like Series.mean() does.

df['hurricane_experience'] = np.nanmean(
    nhs['hrcn_experience_b'].to_numpy(dtype=float)[indices],
    axis=1,
)

# We can use score for proportion with hurricane experience (0 to 1)

In [65]:
# Average preparedness level of nearest neighbors for ONE INDIVIDUAL!
#
# Vectorised: `indices` holds positional row numbers into `nhs`, so indexing the
# column's NumPy array with it returns one row of neighbour values per IPUMS
# record. This replaces one Python-level nhs.iloc[...] call per record.
# np.nanmean ignores missing values, like Series.mean() does.

df['preparedness_level'] = np.nanmean(
    nhs['prepared_b'].to_numpy(dtype=float)[indices],
    axis=1,
)

# Score meaning = average preparedness level from 0 (not prepared) to 1 (prepared)

In [66]:
# Average confidence level of nearest neighbors for ONE INDIVIDUAL!
#
# Vectorised: `indices` holds positional row numbers into `nhs`, so indexing the
# column's NumPy array with it returns one row of neighbour values per IPUMS
# record. This replaces one Python-level nhs.iloc[...] call per record.
# np.nanmean ignores missing values, like Series.mean() does.

df['conf_level'] = np.nanmean(
    nhs['conf_level'].to_numpy(dtype=float)[indices],
    axis=1,
)

In [67]:
# Adding one extra column for match quality check

# Mean distance to the matched neighbours, one value per record (smaller = closer match).
df['knn_avg_distance'] = np.mean(distances, axis=1)

In [68]:
# GQ has served its purpose as a filter and is not needed as a feature; drop it.

df = df.drop(columns='GQ')

In [69]:
# Checking for NA

df.isnull().sum()

YEAR                         0
SERIAL                       0
NCHILD                       0
SEX                          0
AGE                          0
HISPAN                       0
RACE                         0
EDUC                         0
EMPSTAT                      0
OWNERSHP                     0
RENT                         0
is_owner                     0
is_renter                    0
HHINCOME                     0
INCTOT                       0
VALUEH                   19478
MARST                        0
MIGRATE1                     0
fips                         0
num_declarations             0
outflow_returns              0
inflow_returns               0
net_migration_returns        0
outflow_people               0
inflow_people                0
net_migration_people         0
outflow_agi                  0
inflow_agi                   0
max_wind_speed               0
storms                       0
hurricane_exp_%          64798
high_risk_%              64798
well_pre

In [70]:
# Defensive fill for person-years that found no IRS/hurricane row in the left merge:
# numeric impact columns become 0 and the storm label becomes 'NO HURRICANE'.
# In this Hillsborough (fips 12057) run the null check above reports no missing
# values in these columns, so the fill currently changes nothing.
hurricane_cols = ['num_declarations', 'outflow_returns', 'inflow_returns', 
                  'net_migration_returns', 'outflow_people', 'inflow_people',
                  'net_migration_people', 'outflow_agi', 'inflow_agi', 
                  'max_wind_speed']
df[hurricane_cols] = df[hurricane_cols].fillna(0)

# add_categories() raises ValueError when the category already exists, so only
# add it once; this keeps the cell safe to re-run.
if 'NO HURRICANE' not in df['storms'].cat.categories:
    df['storms'] = df['storms'].cat.add_categories(['NO HURRICANE'])
df['storms'] = df['storms'].fillna('NO HURRICANE')

In [71]:
# VALUEH is missing only where there is no owned home, so fill it with 0 and let
# the owns_home column carry the owner / non-owner distinction.

df = df.fillna(value={'VALUEH': 0})

In [72]:
df.isnull().sum()

YEAR                         0
SERIAL                       0
NCHILD                       0
SEX                          0
AGE                          0
HISPAN                       0
RACE                         0
EDUC                         0
EMPSTAT                      0
OWNERSHP                     0
RENT                         0
is_owner                     0
is_renter                    0
HHINCOME                     0
INCTOT                       0
VALUEH                       0
MARST                        0
MIGRATE1                     0
fips                         0
num_declarations             0
outflow_returns              0
inflow_returns               0
net_migration_returns        0
outflow_people               0
inflow_people                0
net_migration_people         0
outflow_agi                  0
inflow_agi                   0
max_wind_speed               0
storms                       0
hurricane_exp_%          64798
high_risk_%              64798
well_pre

## 5.3. Fixing data types

In [73]:
df.dtypes

YEAR                        int64
SERIAL                      int64
NCHILD                      int64
SEX                      category
AGE                         int64
HISPAN                   category
RACE                     category
EDUC                     category
EMPSTAT                  category
OWNERSHP                  float64
RENT                        int64
is_owner                 category
is_renter                category
HHINCOME                  float64
INCTOT                    float64
VALUEH                    float64
MARST                    category
MIGRATE1                 category
fips                        int64
num_declarations            int64
outflow_returns           float64
inflow_returns            float64
net_migration_returns     float64
outflow_people            float64
inflow_people             float64
net_migration_people      float64
outflow_agi               float64
inflow_agi                float64
max_wind_speed            float64
storms        

In [74]:
# Remaining coded / flag columns that should be categorical.
cat_cols = ['OWNERSHP', 'owns_home', 'has_children', 'MARST']

df = df.astype({name: 'category' for name in cat_cols})

df.dtypes

YEAR                        int64
SERIAL                      int64
NCHILD                      int64
SEX                      category
AGE                         int64
HISPAN                   category
RACE                     category
EDUC                     category
EMPSTAT                  category
OWNERSHP                 category
RENT                        int64
is_owner                 category
is_renter                category
HHINCOME                  float64
INCTOT                    float64
VALUEH                    float64
MARST                    category
MIGRATE1                 category
fips                        int64
num_declarations            int64
outflow_returns           float64
inflow_returns            float64
net_migration_returns     float64
outflow_people            float64
inflow_people             float64
net_migration_people      float64
outflow_agi               float64
inflow_agi                float64
max_wind_speed            float64
storms        

# 6. Exporting

In [75]:
df.head()

Unnamed: 0,YEAR,SERIAL,NCHILD,SEX,AGE,HISPAN,RACE,EDUC,EMPSTAT,OWNERSHP,...,age_mapped,income_mapped_k,owns_home,has_children,sex_b,risk_perception,hurricane_experience,preparedness_level,conf_level,knn_avg_distance
0,2017,265787,0,0,57,0,2,6,3.0,2.0,...,57,9.6,0,0,0,1.6,1.0,0.6,3.4,0.453774
1,2017,265794,3,1,59,0,1,11,1.0,1.0,...,59,103.0,1,1,1,1.8,0.8,1.0,3.6,0.875401
2,2017,265794,3,0,48,0,6,7,3.0,1.0,...,48,103.0,1,1,0,1.8,0.8,0.6,3.6,0.743254
3,2017,265794,0,1,19,0,8,7,3.0,1.0,...,19,103.0,1,0,1,1.6,1.0,1.0,3.6,0.863602
4,2017,265794,0,0,24,0,8,8,1.0,1.0,...,24,103.0,1,0,0,2.0,1.0,0.8,3.2,1.391514


In [76]:
df.isnull().sum()

YEAR                         0
SERIAL                       0
NCHILD                       0
SEX                          0
AGE                          0
HISPAN                       0
RACE                         0
EDUC                         0
EMPSTAT                      0
OWNERSHP                     0
RENT                         0
is_owner                     0
is_renter                    0
HHINCOME                     0
INCTOT                       0
VALUEH                       0
MARST                        0
MIGRATE1                     0
fips                         0
num_declarations             0
outflow_returns              0
inflow_returns               0
net_migration_returns        0
outflow_people               0
inflow_people                0
net_migration_people         0
outflow_agi                  0
inflow_agi                   0
max_wind_speed               0
storms                       0
hurricane_exp_%          64798
high_risk_%              64798
well_pre

In [77]:
# Write the final modelling table to CSV (row index omitted).
# NOTE(review): the null check above still reports 64798 missing values for
# 'hurricane_exp_%' and 'high_risk_%', so those columns are exported with missing
# values. CSV stores plain text, so the category dtypes set above must be
# re-applied after loading.
df.to_csv('../../notebooks/data4model/FL_57_data.csv', index=False)

In [78]:
# Optional Parquet export (disabled); file name aligned with the FL_57 CSV export above.
# df.to_parquet('../../notebooks/data4model/FL_57_data.parquet', index=False)