# Financial Well-Being Project: Data Exploration

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import requests
from pathlib import Path

In [2]:
# Option 1: Use if loading data from local folder in Jupyter Notebook
path = Path('../resources/NFWBS_PUF_2016_data.csv')

In [None]:
# Option 2: Use if loading data from Google Drive in Google Collab
from google.colab import drive
drive.mount('/content/drive')

path = Path('/content/drive/My Drive/Bootcamp/Project-4/NFWBS_PUF_2016_data.csv')

# Load financial well-being survey data
survey_df = pd.read_csv(path, index_col=0)

In [34]:
# Load survey data from CSV file
survey_df = pd.read_csv(path, index_col=0)

# Display sample data
print(f'Records: {len(survey_df)}')
survey_df.head()


Records: 6394


Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FWB1_1,FWB1_2,FWB1_3,FWB1_4,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10350,2,3,5,5,6,55,3,3,3,3,...,1,4,8,0,0,0,0,1,0,0.367292
7740,1,3,6,6,6,51,2,2,3,3,...,1,2,3,0,0,0,0,2,0,1.327561
13699,1,3,4,3,4,49,3,3,3,3,...,1,4,9,0,0,0,1,2,1,0.835156
7267,1,3,6,6,6,49,3,3,3,3,...,1,3,7,0,0,0,0,1,0,1.410871
7375,1,3,4,4,4,49,3,3,3,3,...,1,2,4,0,0,1,0,4,1,4.260668


In [25]:
# Store columns for each scale in a variable
fwb_cols = ['FWBscore', 'FWB1_1', 'FWB1_2', 'FWB1_3', 'FWB1_4', 'FWB1_5', 'FWB1_6',
            'FWB2_1', 'FWB2_2', 'FWB2_3', 'FWB2_4']
fs_cols = ['FSScore', 'FS1_1', 'FS1_2', 'FS1_3', 'FS1_4', 'FS1_5', 'FS1_6', 'FS1_7',
            'FS2_1', 'FS2_2', 'FS2_3']
lm_cols = ['LMscore', 'FINKNOWL1', 'FINKNOWL2', 'FINKNOWL3',
           'FK1correct', 'FK2correct', 'FK3correct']
kh_cols = ['KHscore', 'KHKNOWL1', 'KHKNOWL2','KHKNOWL3','KHKNOWL4','KHKNOWL5',
           'KHKNOWL6','KHKNOWL7','KHKNOWL8','KHKNOWL9', 'KH1correct', 'KH2correct',
           'KH3correct', 'KH4correct', 'KH5correct', 'KH6correct', 'KH7correct',
           'KH8correct', 'KH9correct']

In [15]:
survey_df.dtypes

sample           int64
fpl              int64
SWB_1            int64
SWB_2            int64
SWB_3            int64
                ...   
PPT612           int64
PPT1317          int64
PPT18OV          int64
PCTLT200FPL      int64
finalwt        float64
Length: 216, dtype: object

In [47]:
# Check basic stats against research report
fwb_score = survey_df['FWBscore']
final_wt = survey_df['finalwt']
fwb_score_wt = fwb_score * final_wt
print(f"FWB Mean Score: {fwb_score.mean()}")
print(f"FWB Mean Score (Weighted): {fwb_score_wt.mean()}")
print(f"FWB Percentiles: \n{fwb_score.quantile([.1, .25, .5, .75, .9])}")
print(f"FWB Percentiles (Weighted): \n{fwb_score_wt.quantile([.1, .25, .5, .75, .9])}")

FWB Mean Score: 56.03409446355959
FWB Mean Score (Weighted): 54.20231649953065
FWB Percentiles: 
0.10    38.0
0.25    48.0
0.50    56.0
0.75    65.0
0.90    74.0
Name: FWBscore, dtype: float64
FWB Percentiles (Weighted): 
0.10    24.753388
0.25    32.990588
0.50    46.171876
0.75    66.921936
0.90    91.481102
dtype: float64
PUF_ID
10350     20.201053
7740      67.705596
13699     40.922635
7267      69.132679
7375     208.772739
            ...    
11220     31.872719
13118     59.897899
8709      67.039953
8515      56.347287
8516      30.697964
Length: 6394, dtype: float64


In [19]:
# Code Ref: https://stackoverflow.com/questions/64144977/how-to-find-pandas-columns-with-one-or-more-negative-values
int_columns = [col for col in survey_df.columns if survey_df[col].dtype.name == 'int64']

survey_df[survey_df[int_columns] < 0].count()

sample           0
fpl              0
SWB_1           31
SWB_2           57
SWB_3           65
              ... 
PPT612           0
PPT1317          0
PPT18OV          0
PCTLT200FPL    395
finalwt          0
Length: 216, dtype: int64

In [40]:
fwb_df = survey_df[fwb_cols]
fwb_df[fwb_df < 0].count()

FWBscore     5
FWB1_1      11
FWB1_2      11
FWB1_3       9
FWB1_4      11
FWB1_5      13
FWB1_6       9
FWB2_1      10
FWB2_2      14
FWB2_3      11
FWB2_4      12
dtype: int64

In [39]:
survey_df[(survey_df[fwb_cols] < 0).all(axis=1)][fwb_cols]

Unnamed: 0_level_0,FWBscore,FWB1_1,FWB1_2,FWB1_3,FWB1_4,FWB1_5,FWB1_6,FWB2_1,FWB2_2,FWB2_3,FWB2_4
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
12173,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4
7938,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7305,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7982,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7197,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
