In [81]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from scipy.spatial.distance import euclidean

from mode_imputer import KNNImputerMode

In [82]:
data = pd.read_csv('./Financial Well-Being Survey Data/NFWBS_PUF_2016_data.csv')

print(f'The dataset contains {data.shape[0]} rows and {data.shape[1]} features.')

The dataset contains 6394 rows and 217 features.


# Data cleaning

In [83]:
data.set_index('PUF_ID', inplace=True)

## Dropping irrelevant features

We can remove features that were used to feature engineering

Engineered features: 
- FWBScore: created from FWB questions
- FSscore: created from FS questions
- LMscore: created from FINKKNOWL questions + FKcorrect
- KHscore: created from KHKNOWL questions + KHcorrect
- ON1correct: OBJNUMERACY1
- ON2correct: OBJNUMERACY2

In [84]:
FWB_cols = list(filter(lambda x: 'FWB' in x, data.columns.values))
FWB_drop = FWB_cols[1:]
data.drop(FWB_drop, axis = 1, inplace = True)

list(filter(lambda x: 'FWB' in x, data.columns.values))

['FWBscore']

In [85]:
FS_cols = list(filter(lambda x: 'FS' in x, data.columns.values))
FS_cols = FS_cols[1:]
data.drop(FS_cols, axis = 1, inplace = True)

list(filter(lambda x: 'FS' in x, data.columns.values))

['FSscore']

In [86]:
LM_cols = list(filter(lambda x: 'FINKNOWL' in x, data.columns.values))

data.drop(LM_cols, axis = 1, inplace = True)

list(filter(lambda x: 'FINKNOWL' in x, data.columns.values))

[]

In [87]:
FKcorrect_cols = ['FK1correct', 'FK2correct', 'FK3correct']

data.drop(FKcorrect_cols, axis = 1, inplace = True)

In [88]:
KH_cols = list(filter(lambda x: 'KHKNOWL' in x, data.columns.values))

data.drop(KH_cols, axis = 1, inplace = True)

list(filter(lambda x: 'KHKNOWL' in x, data.columns.values))

[]

In [89]:
KHcorrect_cols = ['KH1correct', 'KH2correct', 'KH3correct', 'KH4correct', 'KH5correct', 'KH6correct', 'KH7correct', 'KH8correct', 'KH9correct']

data.drop(KHcorrect_cols, axis = 1, inplace = True)

In [90]:
OBJNUMcols = list(filter(lambda x: 'OBJNUMERACY' in x, data.columns.values))

data.drop(OBJNUMcols, axis = 1, inplace = True)

list(filter(lambda x: 'OBJNUMERACY' in x, data.columns.values))

[]

In [91]:
data.head()

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FSscore,SUBKNOWL1,ACT1_1,ACT1_2,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10350,2,3,5,5,6,55,44,5,4,3,...,1,4,8,0,0,0,0,1,0,0.367292
7740,1,3,6,6,6,51,43,5,4,3,...,1,2,3,0,0,0,0,2,0,1.327561
13699,1,3,4,3,4,49,42,5,3,3,...,1,4,9,0,0,0,1,2,1,0.835156
7267,1,3,6,6,6,49,42,-1,-1,-1,...,1,3,7,0,0,0,0,1,0,1.410871
7375,1,3,4,4,4,49,42,4,3,3,...,1,2,4,0,0,1,0,4,1,4.260668


## Missing values

__-5: County not known__


Use PPREG9, PPINCIMP and PPEDUC to impute PCTLT200FPL
PPEDUC: ordinal
<br>
PPINCIMP: ordinal
<br>
PPREG9: categorical

distance to find nearest neighbors: euclidean for PPINC, PPEDUC  + one hot for PPREG9

In [92]:
# replace by missing values
data = data.replace(-5, np.nan)

In [93]:
nrows = data[data['PCTLT200FPL'].isna()].shape[0]
print(f'nr of rows with missing value: {nrows}')

nr of rows with missing value: 395


In [94]:
def distance(X, Y, missing_values = np.nan):
    
    X[np.isnan(X)] = 0
    Y[np.isnan(Y)] = 0

    eucl_dist = euclidean(X[[0, 1]], Y[[0, 1]])

    return eucl_dist + (X[2] != Y[2])

In [95]:
imputer = KNNImputerMode(n_neighbors = 5, metric = distance)

cols = ['PPEDUC', 'PPINCIMP', 'PPREG9', 'PCTLT200FPL']

new_data = imputer.fit_transform(data[cols])

new_data = pd.DataFrame(new_data, columns = cols, index = data.index)

In [96]:
new_data['PCTLT200FPL'].value_counts()

0.0    4842
1.0    1552
Name: PCTLT200FPL, dtype: int64

In [97]:
data.loc[data['PCTLT200FPL'].isna(), 'PCTLT200FPL'] = new_data.loc[data['PCTLT200FPL'].isna(), 'PCTLT200FPL'].copy()

__-4: Response not written to the database due to error__: SWB only (because FWB dropped) ->DROP

In [98]:
data[(data == -4).any(axis = 1)]

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FSscore,SUBKNOWL1,ACT1_1,ACT1_2,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12173,1,3,-4,-4,-4,-4,48,7,4,3,...,1,3,5,0,0,0,0,1,0.0,1.253964


In [99]:
data.drop(12173, axis = 0, inplace = True)

__-3: Invalid response/ Incoherent data__

- Drop the only row with KIDS_2 = -3
- Replace -3 with -2 for SOCSEC2 because -3 are caused by invalid age, therefore -2 is the adequate label.
- Replace -3 with the mode for SOCSEC3

KIDS_2 drop

In [100]:
data[data['KIDS_2'] == -3]

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FSscore,SUBKNOWL1,ACT1_1,ACT1_2,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13237,1,2,3,7,7,52,50,5,4,4,...,1,2,3,0,0,1,1,3,0.0,1.690435


In [101]:
data.drop(13237, axis = 0, inplace = True)

SOCSEC2

In [102]:
data['SOCSEC2'] = data['SOCSEC2'].replace(-3, -2).copy()

SOCSEC3

In [103]:
data.loc[data['SOCSEC3'].isna(), 'SOCSEC3'] = data['SOCSEC3'].mode()[0]

__8: I can't recall__

HSLOC: Where respondent attended high school
- 1: US and territories
- 2: Outside the US
- 8: Can't recall

What do to?
- impute with mode

In [104]:
data.loc[data['HSLOC'] == 8, 'HSLOC'] = data['HSLOC'].mode()[0]

In [105]:
data['HSLOC'].value_counts()

 1    5406
-1     644
 2     342
Name: HSLOC, dtype: int64

__-1: Not answered/ Left blank__
<br>
__98: Don't know__
<br>
__99: Prefer not to say__


- SWB: median
- SUBKNOWL1: median
- ACT: KNNImputerMode
- FINGOALS: KNNImputerMode
- MANAGE1_1: mode + knn
- SAVEHABIT: KNNImputerMode
- FRUGALITY: mode
- AUTOMATED: KNNImputerMode
- ASK: KNNImputerMode + mode
- SUBNUMERACY: KNNImputerMode + mode
- CHANGEABLE: mode
- GOALCONF: KNNImputerMode
- ENDSMEET: KNNImputerMode
- HOUSING:  KNNImputerMode
- LIVINGARRANGEMENT:
<br>
- HOUSERANGES:
- VALUERANGES:
- MORTGAGE:
- SAVINGSRANGES:
<br>
- CONSPROTECT:
- EARNERS:
- VOLATILITY:
- SNAP:
- MATHARDSHIP_6:
- COLLECT:
- REJECTED:
- ABSORBSHOCK:
- BENEFITS:
- FRAUD2:
- COVERCOSTS:
- BORROW:
- MANAGE2:
- PAIDHELP:
- HSLOC:
- PAREDUC:
- FINSOC:
- MATERIALISM:
- CONNECT:
- HEALTH:
- SCFHORIZON:
- DISCOUNT:
- MEMLOSS:
- DISTRESS:
- SELFCONTROL:
- OUTLOOK:
- PEM:
- HOUSESAT:
- SOCSEC:
- LIFEEXPECT:
- HHEDUC:
- KIDS:
<br>
- EMPLOY
<br>
- RETIRE:
- MILITARY:
- Military_Status:

First, let's replace the missing values with NaN

In [106]:
data = data.replace(-1, np.nan)
data = data.replace(98, np.nan)
data = data.replace(99, np.nan)

In [107]:
data[data.isna().sum(axis = 1) >= 5]

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FSscore,SUBKNOWL1,ACT1_1,ACT1_2,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7267,1,3,6.0,6.0,6.0,49.0,42.0,,,,...,1,3,7,0,0,0,0,1,0.0,1.410871
8303,1,3,7.0,7.0,7.0,43.0,58.0,4.0,3.0,3.0,...,1,3,7,0,0,0,1,3,0.0,2.497838
9182,1,3,6.0,,,50.0,43.0,4.0,4.0,3.0,...,1,4,9,0,0,0,0,2,0.0,0.647301
11082,1,3,6.0,6.0,5.0,62.0,55.0,4.0,5.0,4.0,...,1,3,5,0,0,0,0,3,0.0,0.455829
8309,2,3,3.0,4.0,2.0,49.0,37.0,,4.0,3.0,...,1,3,5,0,0,0,0,1,0.0,0.597089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11078,3,1,3.0,4.0,5.0,26.0,44.0,4.0,3.0,3.0,...,1,2,3,0,1,0,0,1,0.0,1.379959
12353,1,3,5.0,7.0,3.0,62.0,26.0,4.0,3.0,3.0,...,1,3,5,0,0,0,0,2,0.0,1.745759
12297,3,3,4.0,5.0,7.0,55.0,45.0,5.0,4.0,3.0,...,1,2,4,0,0,0,1,2,0.0,0.883213
13085,3,2,7.0,7.0,7.0,50.0,66.0,6.0,1.0,5.0,...,1,3,7,0,0,0,0,1,1.0,2.518084


Now let's drop the rows with more than 5 missing values

In [108]:
data.drop(data[data.isna().sum(axis = 1) >= 5].index, axis = 0, inplace = True)

In [109]:
data.shape

(6025, 171)

Now, let's see which columns have missing values

In [110]:
data.isna().sum(axis = 0).index

Index(['sample', 'fpl', 'SWB_1', 'SWB_2', 'SWB_3', 'FWBscore', 'FSscore',
       'SUBKNOWL1', 'ACT1_1', 'ACT1_2',
       ...
       'PPMSACAT', 'PPREG4', 'PPREG9', 'PPT01', 'PPT25', 'PPT612', 'PPT1317',
       'PPT18OV', 'PCTLT200FPL', 'finalwt'],
      dtype='object', length=171)

And how many missing values per column

In [111]:
# how many missing values per column that has missing values

data.isna().sum(axis = 0)[data.isna().sum(axis = 0) > 0]

SWB_1              12
SWB_2              37
SWB_3              38
SUBKNOWL1          34
ACT1_1             14
                   ..
KIDS_4              7
EMPLOY             44
RETIRE              2
MILITARY           15
Military_Status    31
Length: 79, dtype: int64

Get all columns with less than 100 missing values

In [112]:
# get all columns with less than 100 missing values

data.isna().sum(axis = 0)[(data.isna().sum(axis = 0) > 0) & (data.isna().sum(axis = 0) < 100)].index

Index(['SWB_1', 'SWB_2', 'SWB_3', 'SUBKNOWL1', 'ACT1_1', 'ACT1_2', 'FINGOALS',
       'SAVEHABIT', 'FRUGALITY', 'AUTOMATED_1', 'AUTOMATED_2', 'ASK1_1',
       'ASK1_2', 'SUBNUMERACY2', 'SUBNUMERACY1', 'CHANGEABLE', 'GOALCONF',
       'ENDSMEET', 'HOUSING', 'LIVINGARRANGEMENT', 'CONSPROTECT1',
       'CONSPROTECT2', 'CONSPROTECT3', 'EARNERS', 'VOLATILITY', 'SNAP',
       'MATHARDSHIP_6', 'COLLECT', 'REJECTED_1', 'REJECTED_2', 'ABSORBSHOCK',
       'BENEFITS_3', 'BENEFITS_4', 'BENEFITS_5', 'FRAUD2', 'COVERCOSTS',
       'MANAGE2', 'PAIDHELP', 'PAREDUC', 'FINSOC2_3', 'FINSOC2_5', 'FINSOC2_6',
       'MATERIALISM_1', 'MATERIALISM_2', 'MATERIALISM_3', 'HEALTH',
       'SCFHORIZON', 'DISCOUNT', 'MEMLOSS', 'DISTRESS', 'SELFCONTROL_1',
       'SELFCONTROL_2', 'SELFCONTROL_3', 'OUTLOOK_1', 'OUTLOOK_2', 'PEM',
       'HOUSESAT', 'SOCSEC1', 'SOCSEC2', 'SOCSEC3', 'HHEDUC', 'KIDS_1',
       'KIDS_2', 'KIDS_3', 'KIDS_4', 'EMPLOY', 'RETIRE', 'MILITARY',
       'Military_Status'],
      dtype='object'

In [113]:
data.isna().sum(axis = 0)[data.isna().sum(axis = 0) > 100].index

Index(['HOUSERANGES', 'VALUERANGES', 'MORTGAGE', 'SAVINGSRANGES', 'BORROW_1',
       'BORROW_2', 'HSLOC', 'CONNECT', 'LIFEEXPECT', 'KIDS_NoChildren'],
      dtype='object')

__Central tendency imputations__

SWB_1, SWB_2, SUBKNOWL1, ACT1_2, ASK1_1, SUBNUMERACY1, LIVINGARRANGEMENT, CONSPROTECT1, ABSORBSHOCK, PAREDUC, MATERIALISM_3, HEALTH, SELFCONTROL_1, SELFCONTROL_2, SELFCONTROL_3, OUTLOOK_1, OUTLOOK_2, PEM: skewed -> median

SWB_3, ACT1_1, SAVEHABIT, FRUGALITY, ENDSMEET, CONSPROTECT2, MATHARDSHIP_6, HOUSESAT: staricase -> mode

AUTOMATED_1, AUTOMATED_2, ASK1_2, SUBNUMERACY2, CHANGEABLE, GOALCONF, EARNERS, MATERIALISM,_1, MATERIALISM_2, SCFHORIZON, DISTRESS, HHEDUC, EMPLOY, RETIRE (cuidado com -2): gaussian distribution -> mode

Categorical:

FINGOALS, AUTOMATED_2, HOUSING, CONSPROTECT3, VOLATILITY, SNAP, COLLECT, REJECTED_1, REJECTED_2, BENEFITS_3, BENEFITS_4, BENEFITS_5, FRAUD2, COVERCOSTS, MANAGE2, PAIDHELP (cuidado com -2), FINSOC2_3, FINSOC2_5, FINSOC2_6, DISCOUNT, MEMLOSS, SOCSEC1, SOCSEC2, SOCSEC3 (cuidado com -3 e -2), KIDS_1, KIDS_2, KIDS_3, KIDS_4, EMPLOY, MILITARY, Military_Status: mode

In [114]:
data['PAIDHELP'].value_counts()

-2.0    3113
 0.0    2709
 1.0     201
Name: PAIDHELP, dtype: int64

In [115]:
skewed_ordinal = ['SWB_1', 'SWB_2', 'SUBKNOWL1', 'ACT1_2', 'ASK1_1', 'SUBNUMERACY1', 'LIVINGARRANGEMENT', 'CONSPROTECT1', 'ABSORBSHOCK', 'PAREDUC', 'MATERIALISM_3', 'HEALTH', 'SELFCONTROL_1', 'SELFCONTROL_2', 'SELFCONTROL_3', 'OUTLOOK_1', 'OUTLOOK_2', 'PEM']

for feat in skewed_ordinal:
    data.loc[data[feat].isna(), feat] = data[feat].median()

staircase_ordinal = ['SWB_3', 'ACT1_1', 'SAVEHABIT', 'FRUGALITY', 'ENDSMEET', 'CONSPROTECT2', 'MATHARDSHIP_6', 'HOUSESAT']

for feat in staircase_ordinal:
    data.loc[data[feat].isna(), feat] = data[feat].mode()[0]



# excluding RETIRE because it has -2 values
gaussian_ordinal = ['AUTOMATED_1', 'AUTOMATED_2', 'ASK1_2', 'SUBNUMERACY2', 'CHANGEABLE', 'GOALCONF', 'EARNERS', 'MATERIALISM_1', 'MATERIALISM_2', 'SCFHORIZON', 'DISTRESS', 'HHEDUC', 'EMPLOY']

for feat in gaussian_ordinal:
    data.loc[data[feat].isna(), feat] = data[feat].mode()[0]

valid_data = data[data['RETIRE'] != -2].copy()
data.loc[data['RETIRE'].isna(), 'RETIRE'] = valid_data['RETIRE'].mode()[0]



# excluding PAIDHELP, SOCSEC1, SOCSEC2 and SOCSEC3 because they have -2 values
categorical = ['FINGOALS', 'AUTOMATED_2', 'HOUSING', 'CONSPROTECT3', 'VOLATILITY', 'SNAP', 'COLLECT', 'REJECTED_1', 'REJECTED_2', 'BENEFITS_3', 'BENEFITS_4', 'BENEFITS_5', 'FRAUD2', 'COVERCOSTS', 'MANAGE2', 'FINSOC2_3', 'FINSOC2_5', 'FINSOC2_6', 'DISCOUNT', 'MEMLOSS', 'KIDS_1', 'KIDS_2', 'KIDS_3', 'KIDS_4', 'EMPLOY', 'MILITARY', 'Military_Status']

for feat in categorical:
    data.loc[data[feat].isna(), feat] = data[feat].mode()[0]

valid_data = data[data['PAIDHELP'] != -2].copy()
data.loc[data['PAIDHELP'].isna(), 'PAIDHELP'] = valid_data['PAIDHELP'].mode()[0]

valid_data = data[data['SOCSEC1'] != -2].copy()
data.loc[data['SOCSEC1'].isna(), 'SOCSEC1'] = valid_data['SOCSEC1'].mode()[0]

valid_data = data[data['SOCSEC2'] != -2].copy()
data.loc[data['SOCSEC2'].isna(), 'SOCSEC2'] = valid_data['SOCSEC2'].mode()[0]

valid_data = data[data['SOCSEC3'] != -2].copy()
data.loc[data['SOCSEC3'].isna(), 'SOCSEC3'] = valid_data['SOCSEC3'].mode()[0]


__SBW: Subjective well-being and optimism questions__

In [78]:
data[list(filter(lambda x: 'SWB' in x, data.columns.values))].isna().sum()

SWB_1    0
SWB_2    0
SWB_3    0
dtype: int64

In [72]:
data.loc[data['SWB_1'].isna(), 'SWB_1'] = data['SWB_1'].median()
data.loc[data['SWB_2'].isna(), 'SWB_2'] = data['SWB_2'].median()
data.loc[data['SWB_3'].isna(), 'SWB_3'] = data['SWB_3'].median()

__SUBKNOWL1: How would you assess your overall financial knowledge?__

In [73]:
data.loc[data['SUBKNOWL1'].isna(), 'SUBKNOWL1'] = data['SUBKNOWL1'].median()

__ACT__

ACT1_1: KNNImputer using FINGOALS, PROPPLAN_1, MANAGE1_1 and ASK1_1
<br>
ACT1_2: KNNImputer using FINGOALS, PORPPLAN_1, MANAGE1_2 and ASK1_1

FINGOALS: binary
<br>
PROPPLAN_1: ordinal
<br>
MANAGE1_1: ordinal
<br>
ASK_1: ordinal

distance: euclidean(ordinal) + hamming(binary)

In [74]:
data[list(filter(lambda x: 'ACT' in x, data.columns.values))].isna().sum()

ACT1_1    14
ACT1_2     8
dtype: int64

In [75]:
data.loc[data['ACT1_1'].isna(), ['FINGOALS', 'PROPPLAN_1', 'MANAGE1_1' ,'ASK1_1', 'ACT1_1']]

Unnamed: 0_level_0,FINGOALS,PROPPLAN_1,MANAGE1_1,ASK1_1,ACT1_1
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
11083,0.0,4.0,4.0,5.0,
9698,0.0,4.0,4.0,4.0,
13130,0.0,3.0,2.0,3.0,
9752,1.0,4.0,5.0,5.0,
12998,1.0,4.0,5.0,4.0,
13517,1.0,5.0,5.0,5.0,
13937,1.0,3.0,5.0,3.0,
10300,0.0,3.0,3.0,5.0,
11807,1.0,3.0,4.0,3.0,
11328,1.0,4.0,5.0,1.0,


In [82]:
def distance(X, Y, missing_values = np.nan):

    X[np.isnan(X)] = 0
    Y[np.isnan(Y)] = 0

    eucl_dist = euclidean(X[[1, 2, 3]], Y[[1, 2, 3]])

    return eucl_dist + (X[0] != Y[0])

In [83]:
imputer = KNNImputerMode(n_neighbors = 5, metric = distance)

new_data = imputer.fit_transform(data[['FINGOALS', 'PROPPLAN_1', 'MANAGE1_1' ,'ASK1_1', 'ACT1_1']])

new_data = pd.DataFrame(new_data, columns = ['FINGOALS', 'PROPPLAN_1', 'MANAGE1_1' ,'ASK1_1', 'ACT1_1'], index = data.index)

In [84]:
new_data.loc[data[data['ACT1_1'].isna()].index.values, 'ACT1_1']

PUF_ID
11083    4.0
9698     3.0
13130    3.0
9752     4.0
12998    5.0
13517    5.0
13937    4.0
10300    1.0
11807    3.0
11328    3.0
11712    5.0
11614    4.0
12230    4.0
12971    4.0
Name: ACT1_1, dtype: float64

In [85]:
data.loc[data['ACT1_1'].isna(), 'ACT1_1'] = new_data.loc[data['ACT1_1'].isna(), 'ACT1_1'].copy()

In [86]:
imputer = KNNImputerMode(n_neighbors = 5, metric = distance)

new_data = imputer.fit_transform(data[['FINGOALS', 'PROPPLAN_1', 'MANAGE1_2' ,'ASK1_1', 'ACT1_2']])

new_data = pd.DataFrame(new_data, columns = ['FINGOALS', 'PROPPLAN_1', 'MANAGE1_2' ,'ASK1_1', 'ACT1_2'], index = data.index)

In [87]:
data.loc[data['ACT1_2'].isna(), 'ACT1_2'] = new_data.loc[data['ACT1_2'].isna(), 'ACT1_2'].copy()

In [88]:
data[list(filter(lambda x: 'ACT' in x, data.columns.values))].isna().sum()

ACT1_1    0
ACT1_2    0
dtype: int64

__FINGOALS: Do you have a current or recent financial goal?__

Let's use ACT1_2, SAVEHABIT, PROPPLAN_3 and GOALCONF to find the K nearest neighbors to impute the missing values

ACT1_2: ordinal
<br>
SAVEHABIT: ordinal
<br>
PROPPLAN_3: ordinal
<br>
GOALCONF: ordinal

distance: euclidean

In [89]:
data[data['FINGOALS'].isna()].shape[0]

54

In [90]:
data['FINGOALS'].value_counts()

1.0    3882
0.0    2104
Name: FINGOALS, dtype: int64

In [91]:
imputer = KNNImputerMode(n_neighbors = 5)

new_data = imputer.fit_transform(data[['FINGOALS', 'ACT1_2', 'SAVEHABIT' ,'PROPPLAN_3', 'GOALCONF']])

new_data = pd.DataFrame(new_data, columns = ['FINGOALS', 'ACT1_2', 'SAVEHABIT' ,'PROPPLAN_3', 'GOALCONF'], index = data.index)

In [92]:
new_data['FINGOALS'].value_counts()

1.0    3916
0.0    2124
Name: FINGOALS, dtype: int64

In [93]:
data.loc[data['FINGOALS'].isna(), 'FINGOALS'] = new_data.loc[data['FINGOALS'].isna(), 'FINGOALS'].copy()

In [94]:
data['FINGOALS'].isna().sum()

0

__MANAGE: Financial responsability__

MANAGE1_1: drop + mode
MANAGE1_2: drop + mode
MANAGE1_3: drop
MANAGE1_4: drop
MANAGE2: KNN Mode Imputer

features:
MANAGE1_4: ordinal
ASK1_2: ordinal
LMscore: numerical

distance: euclidean

In [95]:
data[list(filter(lambda x: 'MANAGE' in x, data.columns.values))].isna().sum()

MANAGE1_1     0
MANAGE1_2     0
MANAGE1_3     0
MANAGE1_4     0
MANAGE2      24
dtype: int64

There are only 24 missing values for MANAGE2

'Who in household makes financial decisions'

In [96]:
imputer = KNNImputerMode(n_neighbors = 5)

new_data = imputer.fit_transform(data[['MANAGE1_4', 'ASK1_2', 'LMscore', 'MANAGE2']])

new_data = pd.DataFrame(new_data, columns = ['MANAGE1_4', 'ASK1_2', 'LMscore', 'MANAGE2'], index = data.index)

In [97]:
data.loc[data['MANAGE2'].isna(), 'MANAGE2'] = new_data.loc[data['MANAGE2'].isna(), 'MANAGE2'].copy()

__SAVEHABIT__

KNN Mode Imputer

features:
<br>
FINGOALS: categorical
<br>
SELFCONTROL_2: ordinal
<br>
PROPPLAN_3: ordinal
<br>
SAVINGSRANGES: ordinal
<br>
SAVEHABIT: ordinal

distance: euclidean + hamming

In [102]:
data[data['SAVEHABIT'].isna()]

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FSscore,SUBKNOWL1,ACT1_1,ACT1_2,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13428,2,3,7.0,7.0,6.0,78.0,69.0,5.0,5.0,5.0,...,1,4,8,0,0,0,0,2,0.0,0.42622
13828,1,3,5.0,7.0,7.0,52.0,59.0,7.0,4.0,4.0,...,0,2,3,0,0,0,0,1,1.0,0.490451
11308,1,3,4.0,4.0,5.0,52.0,40.0,4.0,3.0,4.0,...,1,3,7,0,0,0,1,2,0.0,0.945275


In [108]:
def distance(X, Y, missing_values = np.nan):

    X[np.isnan(X)] = 0
    Y[np.isnan(Y)] = 0

    eucl_dist = euclidean(X[[1, 2, 3]], Y[[1, 2, 3]])

    return eucl_dist + (X[0] != Y[0])

In [109]:
imputer = KNNImputerMode(n_neighbors = 5, metric = distance)

cols = ['FINGOALS', 'SELFCONTROL_2', 'PROPPLAN_3', 'SAVINGSRANGES', 'SAVEHABIT']

new_data = imputer.fit_transform(data[cols])

new_data = pd.DataFrame(new_data, columns = cols, index = data.index)

In [110]:
data.loc[data['SAVEHABIT'].isna(), 'SAVEHABIT'] = new_data.loc[data['SAVEHABIT'].isna(), 'SAVEHABIT'].copy()

In [111]:
data[data['SAVEHABIT'].isna()]

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FSscore,SUBKNOWL1,ACT1_1,ACT1_2,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


__FRUGALITY__

'If I can re-use an item I already have, there's no sense in buying something new'

In [112]:
data[data['FRUGALITY'].isna()]

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FSscore,SUBKNOWL1,ACT1_1,ACT1_2,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
13828,1,3,5.0,7.0,7.0,52.0,59.0,7.0,4.0,4.0,...,0,2,3,0,0,0,0,1,1.0,0.490451
12131,1,1,7.0,4.0,7.0,70.0,57.0,5.0,5.0,4.0,...,0,3,7,0,0,0,0,1,1.0,1.659776


In [113]:
data.loc[data['FRUGALITY'].isna(), 'FRUGALITY'] = data['FRUGALITY'].median()[0]

__AUTOMATED: Automatic savings__

Do you currently have money automaticaly transfered to:

'A Retirement Savings Account'
<br>
'A Non-Retirement Savings Account'

AUTOMATED_1: KNN Mode Imputer

features: PRODHAVE_1, PRODHAVE_4, SOCSEC1, SAVEHABIT

PRODHAVE_1: categorical
<br>
PRODHAVE_4: categorical
<br>
SAVEHABIT: ordinal

distance: euclidean(ordinal) + hamming(categorical) 


In [114]:
data [data['AUTOMATED_1'].isna() | data['AUTOMATED_2'].isna()][['AUTOMATED_1', 'AUTOMATED_2']].isna().sum()

AUTOMATED_1    53
AUTOMATED_2    64
dtype: int64

In [115]:
def distance(X, Y, missing_values = np.nan):
    X[np.isnan(X)] = 0
    Y[np.isnan(Y)] = 0

    eucl_dist = euclidean(X[2], Y[2])

    return eucl_dist + (X[0] != Y[0]) + (X[1] != Y[1])

In [116]:
imputer = KNNImputerMode(n_neighbors = 5, metric = distance)

cols = ['PRODHAVE_1', 'PRODHAVE_4', 'SAVEHABIT', 'AUTOMATED_1']

new_data = imputer.fit_transform(data[cols])

new_data = pd.DataFrame(new_data, columns = cols, index = data.index)

In [117]:
data.loc[data['AUTOMATED_1'].isna(), 'AUTOMATED_1'] = new_data.loc[data['AUTOMATED_1'].isna(), 'AUTOMATED_1'].copy()

In [118]:
imputer = KNNImputerMode(n_neighbors = 5, metric = distance)

cols = ['PRODHAVE_1', 'PRODHAVE_6', 'SAVEHABIT', 'AUTOMATED_2']

new_data = imputer.fit_transform(data[cols])

new_data = pd.DataFrame(new_data, columns = cols, index = data.index)

In [119]:
data.loc[data['AUTOMATED_2'].isna(), 'AUTOMATED_2'] = new_data.loc[data['AUTOMATED_2'].isna(), 'AUTOMATED_2'].copy()

In [120]:
data [data['AUTOMATED_1'].isna() | data['AUTOMATED_2'].isna()][['AUTOMATED_1', 'AUTOMATED_2']].isna().sum()

AUTOMATED_1    0.0
AUTOMATED_2    0.0
dtype: float64

__ASK__

ASK1_1: KNN Mode Imputer

variables to find NN: MANAGE1_2, MANAGE2, GOALCONF, KHscore, FRUGALITY discriminate well

MANAGE1_2: ordinal
<br>
MANAGE2: categorical
<br>
GOALCONF: ordinal
<br>
KHscore: numerical
<br>
FRUGALITY: ordinal
<br>

distance: euclidean(ordinal + numerical) + hamming (categorical)

ASK1_2: mode

In [121]:
data [data['ASK1_1'].isna() | data['ASK1_2'].isna()][['ASK1_1', 'ASK1_2']].isna().sum()

ASK1_1    6
ASK1_2    4
dtype: int64

In [122]:
def distance(X, Y, missing_values = np.nan):
    X[np.isnan(X)] = 0
    Y[np.isnan(Y)] = 0

    eucl_dist = euclidean(X[[0, 2, 3, 4]], Y[[0, 2, 3, 4]])

    return eucl_dist + (X[2] != Y[2])

In [123]:
imputer = KNNImputerMode(n_neighbors = 5, metric = distance)

new_data = imputer.fit_transform(data[['MANAGE1_2', 'MANAGE2', 'GOALCONF', 'KHscore', 'FRUGALITY', 'ASK1_1']])

new_data = pd.DataFrame(new_data, columns = ['MANAGE1_2', 'MANAGE2', 'GOALCONF', 'KHscore', 'FRUGALITY', 'ASK1_1'], index = data.index)

In [124]:
data.loc[data['ASK1_1'].isna(), 'ASK1_1'] = new_data.loc[data['ASK1_1'].isna(), 'ASK1_1'].copy()

In [125]:
data.loc[data['ASK1_2'].isna(), 'ASK1_2'] = data['ASK1_2'].mode()[0]

In [126]:
data [data['ASK1_1'].isna() | data['ASK1_2'].isna()][['ASK1_1', 'ASK1_2']].isna().sum()

ASK1_1    0.0
ASK1_2    0.0
dtype: float64

__SUBNUMERACY__

SUBNUMERACY1: 'How good are you at working with percentages?'

features:

ON1correct: categorical
<br>
ON1correct: categorical

distance: hamming

SUBNUMERACY2: 'Prefers words for expressions of probabilities'

mode

In [127]:
data[list(filter(lambda x: 'SUBNUMERACY' in x, data.columns.values))].isna().sum()

SUBNUMERACY2    16
SUBNUMERACY1    10
dtype: int64

In [128]:
def distance(X, Y, missing_values = np.nan):
    X[np.isnan(X)] = 0
    Y[np.isnan(Y)] = 0

    return (X[0] != Y[0]) + (X[1] != Y[1])

In [129]:
imputer = KNNImputerMode(n_neighbors = 5, metric = distance)

new_data = imputer.fit_transform(data[['ON1correct', 'ON2correct', 'SUBNUMERACY1']])

new_data = pd.DataFrame(new_data, columns = ['ON1correct', 'ON2correct', 'SUBNUMERACY1'], index = data.index)

In [130]:
data.loc[data['SUBNUMERACY1'].isna(), 'SUBNUMERACY1'] = new_data.loc[data['SUBNUMERACY1'].isna(), 'SUBNUMERACY1'].copy()

In [131]:
data.loc[data['SUBNUMERACY2'].isna(), 'SUBNUMERACY2'] = data['SUBNUMERACY2'].mode()[0]

__CHANGEABLE__

'Belief that ability to manage money is NOT changeable'

In [133]:
data[data['CHANGEABLE'].isna()].shape[0]

19

In [134]:
data.loc[data['CHANGEABLE'].isna(), 'CHANGEABLE'] = data['CHANGEABLE'].mode()[0]

__GOALCONF__

'Confidence in own ability to achieve financial goals'

features:
FWBscore: numerical
<br>
SAVEHABIT: ordinal
<br>
ASK1_1: ordinal
<br>
ENDSMEET: ordinal
<br>
SELFCONTROL_3: ordinal

distance: euclidean

In [136]:
data[data['GOALCONF'].isna()].shape[0]

7

In [137]:
imputer = KNNImputerMode(n_neighbors = 5)

cols = ['FWBscore', 'SAVEHABIT', 'ASK1_1', 'ENDSMEET', 'SELFCONTROL_3', 'GOALCONF']

new_data = imputer.fit_transform(data[cols])

new_data = pd.DataFrame(new_data, columns = cols, index = data.index)

In [138]:
data.loc[data['GOALCONF'].isna(), 'GOALCONF'] = new_data.loc[data['GOALCONF'].isna(), 'GOALCONF'].copy()

__ENDSMEET__

'Difficulty of covering monthly expenses and bills'

features:

SWB_1: ordinal
<br>
FWBscore: numerical
<br>
GOALCONF: ordinal
<br>
MATHARDSHIP_1: ordinal
<br>
REJECTED_1: categorical
<br>
HHEDUC: ordinal
<br>
PPEDUC: ordinal
<br>
PPINCIMP: ordinal

distance: euclidean + hamming

In [140]:
data[data['ENDSMEET'].isna()].shape[0]

12

In [141]:
def distance(X, Y, missing_values = np.nan):
    X[np.isnan(X)] = 0
    Y[np.isnan(Y)] = 0

    eucl_dist = euclidean(X[[0, 1, 2, 3, 5, 6, 7]], Y[[0, 1, 2, 3, 5, 6, 7]])

    return eucl_dist + (X[4] != Y[4])

In [142]:
imputer = KNNImputerMode(n_neighbors = 5, metric = distance)

cols = ['SWB_1', 'FWBscore', 'GOALCONF', 'MATHARDSHIP_1', 'REJECTED_1', 'HHEDUC', 'PPEDUC', 'PPINCIMP', 'ENDSMEET']

new_data = imputer.fit_transform(data[cols])

new_data = pd.DataFrame(new_data, columns = cols, index = data.index)

In [143]:
data.loc[data['ENDSMEET'].isna(), 'ENDSMEET'] = new_data.loc[data['ENDSMEET'].isna(), 'ENDSMEET']

__HOUSING__

'Which one of the following best describes your housing situation?'

Variables:

ENDSMEET: ordinal
<br>
SAVINGSRANGES: ordinal
<br>
agecat: ordinal
<br>
HOUSESAT: ordinal
<br>
MATHARDSHIP_3: ordinal

distance = euclidean

In [146]:
data[data['HOUSING'].isna()].shape[0]

12

In [147]:
imputer = KNNImputerMode(n_neighbors = 5)

cols = ['ENDSMEET', 'SAVINGSRANGES', 'agecat', 'HOUSESAT', 'MATHARDSHIP_3', 'HOUSING']

new_data = imputer.fit_transform(data[cols])

new_data = pd.DataFrame(new_data, columns = cols, index = data.index)

In [148]:
data.loc[data['HOUSING'].isna(), 'HOUSING'] = new_data.loc[data['HOUSING'].isna(), 'HOUSING']

In [149]:
data[data['HOUSING'].isna()].shape[0]

0

__LIVINGARRANGEMENT__

Current living arrangements


In [150]:
data[data['LIVINGARRANGEMENT'].isna()]

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FSscore,SUBKNOWL1,ACT1_1,ACT1_2,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11388,1,3,7.0,6.0,1.0,73.0,78.0,5.0,5.0,5.0,...,0,2,4,0,0,0,0,2,0.0,0.744356
11989,2,3,1.0,3.0,2.0,32.0,49.0,4.0,3.0,2.0,...,1,1,1,0,0,0,0,1,0.0,0.618375


In [153]:
data.loc[data['LIVINGARRANGEMENT'].isna(), 'LIVINGARRANGEMENT'] = data['LIVINGARRANGEMENT'].mode()[0]

__HOUSERANGES__

'About how much do you pay for your home each month?'

In [154]:
data[data['HOUSERANGES'].isna()]

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FSscore,SUBKNOWL1,ACT1_1,ACT1_2,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11079,1,3,5.0,4.0,7.0,51.0,54.0,5.0,5.0,3.0,...,1,2,4,0,0,1,1,3,0.0,1.370522
7852,1,2,5.0,6.0,7.0,50.0,49.0,5.0,4.0,4.0,...,1,3,7,0,0,0,1,3,1.0,1.292576
11221,1,1,3.0,5.0,4.0,37.0,32.0,4.0,5.0,5.0,...,1,4,9,0,0,1,0,3,0.0,1.025192
12990,2,2,7.0,7.0,7.0,67.0,79.0,7.0,5.0,5.0,...,1,3,5,0,0,0,0,1,0.0,0.866999
10549,1,3,6.0,5.0,5.0,62.0,48.0,5.0,4.0,3.0,...,1,2,3,0,0,0,0,2,0.0,0.616246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10348,1,3,5.0,7.0,6.0,56.0,42.0,5.0,4.0,3.0,...,0,2,4,0,0,0,0,2,0.0,0.328092
12971,3,3,4.0,5.0,2.0,52.0,43.0,5.0,4.0,2.0,...,1,4,9,0,0,0,0,1,0.0,0.545312
12095,3,2,5.0,5.0,6.0,67.0,50.0,4.0,5.0,5.0,...,1,2,3,1,1,0,1,2,0.0,0.690902
13939,3,3,7.0,7.0,7.0,70.0,70.0,7.0,5.0,5.0,...,1,3,7,1,1,0,0,2,0.0,0.830442
