# Financial Well-Being Project: Data Exploration

Survey was administered by the Consumer Financial Protection Bureau (CFPB)

In [2]:
# Import dependencies
import pandas as pd
import numpy as np
from pathlib import Path
import hvplot.pandas
from sklearn.linear_model import LinearRegression
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Option 1: Use if loading data from local folder in Jupyter Notebook
path = Path('../resources/NFWBS_PUF_2016_data.csv')

In [None]:
# Option 2: Use if loading data from Google Drive in Google Collab
from google.colab import drive
drive.mount('/content/drive')

path = Path('/content/drive/My Drive/Bootcamp/Project-4/NFWBS_PUF_2016_data.csv')

# Load financial well-being survey data
survey_df = pd.read_csv(path, index_col=0)

In [9]:
# Load survey data from CSV file
survey_df = pd.read_csv(path, index_col=0)

# Display sample data
print(f'Records: {len(survey_df)}')
survey_df.head()


Records: 6394


Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FWB1_1,FWB1_2,FWB1_3,FWB1_4,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10350,2,3,5,5,6,55,3,3,3,3,...,1,4,8,0,0,0,0,1,0,0.367292
7740,1,3,6,6,6,51,2,2,3,3,...,1,2,3,0,0,0,0,2,0,1.327561
13699,1,3,4,3,4,49,3,3,3,3,...,1,4,9,0,0,0,1,2,1,0.835156
7267,1,3,6,6,6,49,3,3,3,3,...,1,3,7,0,0,0,0,1,0,1.410871
7375,1,3,4,4,4,49,3,3,3,3,...,1,2,4,0,0,1,0,4,1,4.260668


In [37]:
# Create lists to store columns for research tools administered as part of the survey

# Financial Well-Being Scale developed by CPFB
fwb_scale = ['FWBscore', 'FWB1_1', 'FWB1_2', 'FWB1_3', 'FWB1_4', 'FWB1_5', 'FWB1_6',
            'FWB2_1', 'FWB2_2', 'FWB2_3', 'FWB2_4']

# Financial Skills Scale developed by CPFB
fs_scale = ['FSscore', 'FS1_1', 'FS1_2', 'FS1_3', 'FS1_4', 'FS1_5', 'FS1_6', 'FS1_7',
            'FS2_1', 'FS2_2', 'FS2_3']

# Knoll and Houts Financial Knowledge Scale 
lm_scale = ['LMscore', 'FINKNOWL1', 'FINKNOWL2', 'FINKNOWL3',
           'FK1correct', 'FK2correct', 'FK3correct']

# Knoll and Houts Financial Knowledge Scale 
kh_scale = ['KHscore', 'KHKNOWL1', 'KHKNOWL2','KHKNOWL3','KHKNOWL4','KHKNOWL5',
           'KHKNOWL6','KHKNOWL7','KHKNOWL8','KHKNOWL9', 'KH1correct', 'KH2correct',
           'KH3correct', 'KH4correct', 'KH5correct', 'KH6correct', 'KH7correct',
           'KH8correct', 'KH9correct']

In [46]:
# Create lists to store needed columns in groups/measures
#  as defined in the survey report and documentation

# Individual characteristics
ind_measures = ['PPEDUC', 'agecat', 'HEALTH', 'PPETHM', 'PPGENDER']

# Household and family characteristics
house_fam_measures = ['PPMARIT', 'PPREG4', 'HOUSING', 'HOUSESAT', 
                      'KIDS_NoChildren','KIDS_1', 'KIDS_2', 'KIDS_3', 'KIDS_3']

# Income and employment
inc_emp_measures = ['fpl', 'PPINCIMP', 'VOLATILITY', 
                    'EMPLOY1_1', 'EMPLOY1_2', 'EMPLOY1_3', 'EMPLOY1_4',
                    'EMPLOY1_5', 'EMPLOY1_6', 'EMPLOY1_7', 'EMPLOY1_8', 'EMPLOY1_9', 
                    'BENEFITS_1', 'BENEFITS_2', 'BENEFITS_3',
                    'BENEFITS_4', 'BENEFITS_5', 'MILITARY']
# Note: columns removed from inc_emp: 'SOCSEC1', 'SOCSEC2', 'SOCSEC3' 

# Savings and safety nets
sav_safety_measures = ['SAVINGSRANGES', 'PRODHAVE_3', 'PRODHAVE_6',
                       'ABSORBSHOCK', 'BORROW_1', 'BORROW_2' ]

# Financial circumsstances
fin_circ_measures = ['ENDSMEET', 'MATHARDSHIP_1', 'MATHARDSHIP_2', 'MATHARDSHIP_3',
                     'MATHARDSHIP_4', 'MATHARDSHIP_5', 'MATHARDSHIP_6']

# Financial behaviors, skills and attitudes
fin_beh_measures = ['PROPPLAN_1', 'PROPPLAN_2', 'PROPPLAN_3', 'PROPPLAN_4',
                    'MANAGE1_1', 'MANAGE1_2', 'MANAGE1_3', 'MANAGE1_4',
                    'SAVEHABIT', 'GOALCONF', 'SCFHORIZON']

In [13]:
# Check basic stats against research report
fwb_score = survey_df['FWBscore']
final_wt = survey_df['finalwt']
fwb_score_wt = fwb_score * final_wt
print(f"FWB Mean Score: {fwb_score.mean()}")
print(f"FWB Mean Score (Weighted): {fwb_score_wt.mean()}")
print(f"FWB Percentiles: \n{fwb_score.quantile([.1, .25, .5, .75, .9])}")
print(f"FWB Percentiles (Weighted): \n{fwb_score_wt.quantile([.1, .25, .5, .75, .9])}")

FWB Mean Score: 56.03409446355959
FWB Mean Score (Weighted): 54.20231649953065
FWB Percentiles: 
0.10    38.0
0.25    48.0
0.50    56.0
0.75    65.0
0.90    74.0
Name: FWBscore, dtype: float64
FWB Percentiles (Weighted): 
0.10    24.753388
0.25    32.990588
0.50    46.171876
0.75    66.921936
0.90    91.481102
dtype: float64


In [22]:
# Check
# Code Ref: https://stackoverflow.com/questions/64144977/how-to-find-pandas-columns-with-one-or-more-negative-values
int_columns = [col for col in survey_df.columns if survey_df[col].dtype.name == 'int64']

survey_df[survey_df[int_columns].values < 0].groupby('PUF_ID').first()

Unnamed: 0_level_0,sample,fpl,SWB_1,SWB_2,SWB_3,FWBscore,FWB1_1,FWB1_2,FWB1_3,FWB1_4,...,PPMSACAT,PPREG4,PPREG9,PPT01,PPT25,PPT612,PPT1317,PPT18OV,PCTLT200FPL,finalwt
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7123,1,2,-1,3,3,39,3,-1,5,-1,...,1,4,9,0,0,0,0,1,0,0.685003
7125,1,3,7,7,7,35,5,5,4,4,...,1,4,9,0,0,0,0,2,0,1.093935
7132,1,3,4,4,3,43,3,3,5,3,...,1,4,9,0,0,0,0,3,1,2.248802
7133,1,3,3,3,3,52,3,3,2,3,...,1,1,2,0,0,0,0,1,0,1.011039
7140,3,1,7,7,7,26,5,5,5,5,...,1,3,5,0,0,0,0,1,-5,1.053616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14395,2,1,6,4,5,61,3,4,2,3,...,1,1,2,0,0,0,0,2,0,0.978883
14397,1,2,3,3,3,51,3,3,3,2,...,1,2,3,1,0,0,0,3,0,0.571152
14398,1,3,4,4,3,24,1,1,5,1,...,1,1,2,0,0,1,0,2,0,0.693519
14399,1,3,5,5,6,52,2,3,2,3,...,1,3,6,0,0,0,0,2,1,0.548649


In [44]:
# Check for -1 (refused to answer) and -4 (response not entered in database)
fwb_df = survey_df[fwb_scale]
fwb_df[fwb_df.values < 0].groupby('PUF_ID').min()

Unnamed: 0_level_0,FWBscore,FWB1_1,FWB1_2,FWB1_3,FWB1_4,FWB1_5,FWB1_6,FWB2_1,FWB2_2,FWB2_3,FWB2_4
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
7123,39,3,-1,5,-1,5,3,3,-1,4,-1
7197,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7214,50,3,3,3,3,3,3,-1,-1,-1,-1
7259,62,-1,5,-1,-1,-1,-1,-1,-1,-1,-1
7305,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7938,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
7982,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
9050,37,1,-1,4,-1,-1,4,4,-1,-1,-1
9338,58,-1,3,2,3,3,2,2,3,2,2
11974,63,3,-1,-1,3,-1,-1,1,3,1,3


## Preprocessing

In [83]:
# Identify columns for dataset

data_cols = (ind_measures + house_fam_measures + inc_emp_measures + sav_safety_measures
             + fin_circ_measures + fin_beh_measures + ['FWBscore'])
data_cols

['PPEDUC',
 'agecat',
 'HEALTH',
 'PPETHM',
 'PPGENDER',
 'PPMARIT',
 'PPREG4',
 'HOUSING',
 'HOUSESAT',
 'KIDS_NoChildren',
 'KIDS_1',
 'KIDS_2',
 'KIDS_3',
 'KIDS_3',
 'fpl',
 'PPINCIMP',
 'VOLATILITY',
 'EMPLOY1_1',
 'EMPLOY1_2',
 'EMPLOY1_3',
 'EMPLOY1_4',
 'EMPLOY1_5',
 'EMPLOY1_6',
 'EMPLOY1_7',
 'EMPLOY1_8',
 'EMPLOY1_9',
 'BENEFITS_1',
 'BENEFITS_2',
 'BENEFITS_3',
 'BENEFITS_4',
 'BENEFITS_5',
 'MILITARY',
 'SAVINGSRANGES',
 'PRODHAVE_3',
 'PRODHAVE_6',
 'ABSORBSHOCK',
 'BORROW_1',
 'BORROW_2',
 'ENDSMEET',
 'MATHARDSHIP_1',
 'MATHARDSHIP_2',
 'MATHARDSHIP_3',
 'MATHARDSHIP_4',
 'MATHARDSHIP_5',
 'MATHARDSHIP_6',
 'PROPPLAN_1',
 'PROPPLAN_2',
 'PROPPLAN_3',
 'PROPPLAN_4',
 'MANAGE1_1',
 'MANAGE1_2',
 'MANAGE1_3',
 'MANAGE1_4',
 'SAVEHABIT',
 'GOALCONF',
 'SCFHORIZON',
 'FWBscore']

In [84]:
data_df = survey_df[data_cols]
data_df.describe()

Unnamed: 0,PPEDUC,agecat,HEALTH,PPETHM,PPGENDER,PPMARIT,PPREG4,HOUSING,HOUSESAT,KIDS_NoChildren,...,PROPPLAN_3,PROPPLAN_4,MANAGE1_1,MANAGE1_2,MANAGE1_3,MANAGE1_4,SAVEHABIT,GOALCONF,SCFHORIZON,FWBscore
count,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,...,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0,6394.0
mean,3.160776,4.450422,3.414451,1.622771,1.475759,2.042071,2.644823,1.414295,3.303409,0.478886,...,3.672036,3.254301,4.533,3.807163,3.535033,4.20441,4.36894,3.213012,3.063341,56.034094
std,1.178349,2.120741,1.00567,1.077631,0.499451,1.393808,1.032583,0.672763,0.863732,0.65939,...,0.923921,1.027885,0.924622,1.084656,1.583223,1.071351,1.48672,0.778633,1.377079,14.154676
min,1.0,1.0,-1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-4.0
25%,2.0,3.0,3.0,1.0,1.0,1.0,2.0,1.0,3.0,0.0,...,3.0,3.0,4.0,3.0,2.0,4.0,4.0,3.0,2.0,48.0
50%,3.0,4.0,4.0,1.0,1.0,1.0,3.0,1.0,3.0,1.0,...,4.0,3.0,5.0,4.0,4.0,5.0,5.0,3.0,3.0,56.0
75%,4.0,6.0,4.0,2.0,2.0,3.0,3.0,2.0,4.0,1.0,...,4.0,4.0,5.0,5.0,5.0,5.0,6.0,4.0,4.0,65.0
max,5.0,8.0,5.0,4.0,2.0,5.0,4.0,3.0,4.0,1.0,...,5.0,5.0,5.0,5.0,5.0,5.0,6.0,4.0,5.0,95.0


In [85]:
# Get count of negative values in each column
# Code Ref: https://stackoverflow.com/questions/36155942/need-count-of-negative-values-in-a-dataframe
data_df.lt(0).sum()

PPEDUC               0
agecat               0
HEALTH              53
PPETHM               0
PPGENDER             0
PPMARIT              0
PPREG4               0
HOUSING             42
HOUSESAT            62
KIDS_NoChildren    592
KIDS_1               1
KIDS_2               3
KIDS_3               1
KIDS_3               1
fpl                  0
PPINCIMP             0
VOLATILITY          64
EMPLOY1_1            0
EMPLOY1_2            0
EMPLOY1_3            0
EMPLOY1_4            0
EMPLOY1_5            0
EMPLOY1_6            0
EMPLOY1_7            0
EMPLOY1_8            0
EMPLOY1_9            0
BENEFITS_1          35
BENEFITS_2          34
BENEFITS_3          42
BENEFITS_4          43
BENEFITS_5          39
MILITARY            50
SAVINGSRANGES       38
PRODHAVE_3           0
PRODHAVE_6           0
ABSORBSHOCK         37
BORROW_1           176
BORROW_2           269
ENDSMEET            44
MATHARDSHIP_1       27
MATHARDSHIP_2       27
MATHARDSHIP_3       26
MATHARDSHIP_4       27
MATHARDSHIP

In [86]:
# Display rows with negative values in each column
data_to_drop = data_df[data_df.values < 0].groupby('PUF_ID').min()
data_to_drop

Unnamed: 0_level_0,PPEDUC,agecat,HEALTH,PPETHM,PPGENDER,PPMARIT,PPREG4,HOUSING,HOUSESAT,KIDS_NoChildren,...,PROPPLAN_3,PROPPLAN_4,MANAGE1_1,MANAGE1_2,MANAGE1_3,MANAGE1_4,SAVEHABIT,GOALCONF,SCFHORIZON,FWBscore
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7123,3,2,-1,4,1,3,4,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,39
7132,2,1,-1,4,2,4,4,3,-1,-1,...,3,3,3,3,3,3,3,4,-1,43
7133,3,4,-1,1,2,1,1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,52
7140,3,2,5,2,2,5,3,1,4,-1,...,5,5,5,5,5,5,6,4,1,26
7149,5,8,-1,2,2,3,1,2,2,1,...,3,4,3,3,4,3,4,3,4,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14361,4,8,4,1,2,1,3,1,4,1,...,3,3,5,5,5,5,5,3,5,77
14375,3,1,4,1,1,4,3,3,3,1,...,2,2,4,3,1,3,4,3,3,64
14379,2,5,4,1,1,3,4,1,4,-1,...,4,4,5,3,5,5,5,4,3,56
14395,2,8,4,1,1,1,1,1,3,-1,...,4,4,5,4,5,4,5,3,4,61


In [87]:
# Remove rows with negative values
data_df.drop(index=data_to_drop.index, inplace=True)
data_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df.drop(index=data_to_drop.index, inplace=True)


Unnamed: 0_level_0,PPEDUC,agecat,HEALTH,PPETHM,PPGENDER,PPMARIT,PPREG4,HOUSING,HOUSESAT,KIDS_NoChildren,...,PROPPLAN_3,PROPPLAN_4,MANAGE1_1,MANAGE1_2,MANAGE1_3,MANAGE1_4,SAVEHABIT,GOALCONF,SCFHORIZON,FWBscore
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7740,2,3,3,1,1,3,2,1,3,1,...,2,1,4,4,1,4,1,3,3,51
7375,2,2,3,3,1,1,2,2,3,1,...,3,3,3,3,3,3,4,3,3,49
10910,4,2,5,1,1,1,2,1,4,1,...,3,4,5,3,5,5,4,4,1,67
11079,4,3,3,4,2,1,2,1,2,0,...,3,4,5,3,5,5,5,3,3,51
7741,1,2,2,1,2,4,2,3,2,1,...,3,2,1,1,1,1,2,2,2,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11220,5,7,5,2,2,1,2,1,4,0,...,3,3,5,3,3,4,4,3,3,61
13118,2,6,4,2,1,1,3,1,4,0,...,4,4,4,4,3,5,4,3,1,59
8709,5,2,3,1,2,1,1,2,3,1,...,4,4,5,5,5,4,5,3,5,59
8515,2,2,3,4,1,5,4,2,3,0,...,3,3,3,3,3,3,4,4,4,46


In [88]:
# Define features set
X = data_df.copy()
X.drop(columns='FWBscore', axis=1, inplace=True)
X.head()

Unnamed: 0_level_0,PPEDUC,agecat,HEALTH,PPETHM,PPGENDER,PPMARIT,PPREG4,HOUSING,HOUSESAT,KIDS_NoChildren,...,PROPPLAN_2,PROPPLAN_3,PROPPLAN_4,MANAGE1_1,MANAGE1_2,MANAGE1_3,MANAGE1_4,SAVEHABIT,GOALCONF,SCFHORIZON
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7740,2,3,3,1,1,3,2,1,3,1,...,2,2,1,4,4,1,4,1,3,3
7375,2,2,3,3,1,1,2,2,3,1,...,3,3,3,3,3,3,3,4,3,3
10910,4,2,5,1,1,1,2,1,4,1,...,4,3,4,5,3,5,5,4,4,1
11079,4,3,3,4,2,1,2,1,2,0,...,3,3,4,5,3,5,5,5,3,3
7741,1,2,2,1,2,4,2,3,2,1,...,3,3,2,1,1,1,1,2,2,2


In [89]:
y = data_df['FWBscore'].values.reshape(-1, 1)
y[:5]

array([[51],
       [49],
       [67],
       [51],
       [47]], dtype=int64)

## Buiding the Linear Regression Model

In [90]:
# Create a model with scikit-learn
model = LinearRegression()

In [91]:
# Fit the data into the model
model.fit(X, y)

In [92]:
# Display model information
print(f"Model's slope: {model.coef_}\n")
print(f"Model's y-intercept: {model.intercept_}\n")
print(f"Model's formula: y = {model.intercept_} + {model.coef_[0]}X\n")

Model's slope: [[-1.54257674e-01  5.76700057e-01  9.80613569e-01  2.91341779e-01
   3.71014600e-01 -2.15131797e-01  2.38267196e-01  4.16624810e-01
   1.59957437e+00  9.28166589e-01  3.20589802e-01 -8.43904263e-02
   1.00131484e-01  1.00131484e-01 -1.39814405e+00  5.77378800e-01
  -2.32219097e-01  6.18693883e-01  5.31810408e-01 -4.10681839e-01
   1.87055041e+00  2.16314712e-01  1.62207878e+00  6.43301514e-01
   3.02996687e+00  1.07685212e+00 -5.17624836e-01  4.56040753e-01
   6.71949604e-01  8.53887400e-02 -2.39538888e-01  4.90063868e-01
   3.62012859e-04 -8.40006356e-02  1.18917360e+00  4.98150255e-01
  -1.30684227e-01  4.78860826e-01 -6.50522242e+00 -5.48739405e-01
  -8.23815716e-01 -4.89129557e-01 -1.64742789e+00 -4.87293297e-01
   2.52164792e-01 -9.52060502e-01 -4.83326924e-01  1.60291422e-01
   2.71663502e-01 -2.22465277e-01  8.77455218e-01  5.57673083e-01
  -1.47445352e-01  9.47329682e-01  3.68175111e+00  7.40142931e-01]]

Model's y-intercept: [35.71540778]

Model's formula: y = [

In [93]:
# Make predictions using the X set
predicted_y_values = model.predict(X)

In [94]:
# Create a copy of the original data
predicted_df = data_df.copy()

# Add a column with the predicted salary values
predicted_df["predicted_FWBscore"] = predicted_y_values

# Display sample data
predicted_df

Unnamed: 0_level_0,PPEDUC,agecat,HEALTH,PPETHM,PPGENDER,PPMARIT,PPREG4,HOUSING,HOUSESAT,KIDS_NoChildren,...,PROPPLAN_4,MANAGE1_1,MANAGE1_2,MANAGE1_3,MANAGE1_4,SAVEHABIT,GOALCONF,SCFHORIZON,FWBscore,predicted_FWBscore
PUF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7740,2,3,3,1,1,3,2,1,3,1,...,1,4,4,1,4,1,3,3,51,43.870831
7375,2,2,3,3,1,1,2,2,3,1,...,3,3,3,3,3,4,3,3,49,47.544341
10910,4,2,5,1,1,1,2,1,4,1,...,4,5,3,5,5,4,4,1,67,60.084203
11079,4,3,3,4,2,1,2,1,2,0,...,4,5,3,5,5,5,3,3,51,50.348425
7741,1,2,2,1,2,4,2,3,2,1,...,2,1,1,1,1,2,2,2,47,27.164525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11220,5,7,5,2,2,1,2,1,4,0,...,3,5,3,3,4,4,3,3,61,56.937169
13118,2,6,4,2,1,1,3,1,4,0,...,4,4,4,3,5,4,3,1,59,55.910064
8709,5,2,3,1,2,1,1,2,3,1,...,4,5,5,5,4,5,3,5,59,60.274990
8515,2,2,3,4,1,5,4,2,3,0,...,3,3,3,3,3,4,4,4,46,50.508876


## Linear Regression Model Assessment

In [95]:
# Import relevant metrics from scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [96]:
# Compute metrics for the linear regression model: score, r2, mse, rmse, std
score = model.score(X, y, sample_weight=None)
r2 = r2_score(y, predicted_y_values)
mse = mean_squared_error(y, predicted_y_values)
rmse = np.sqrt(mse)
std = np.std(y)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.6559594687342374.
The r2 is 0.6559594687342374.
The mean squared error is 67.5127647564172.
The root mean squared error is 8.216615164191225.
The standard deviation is 14.008387345029595.


## Decision Tree

In [97]:
y = pd.qcut(data_df['FWBscore'], 3, labels=['<48', '48-65', '>65'])
y = y.values.reshape(-1, 1)
y[:5]

[['48-65'], ['<48'], ['>65'], ['48-65'], ['<48']]
Categories (3, object): ['<48' < '48-65' < '>65']

In [98]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating StandardScaler instance
scaler = StandardScaler()

# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Decision Tree Model

In [104]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier(max_depth=7, min_samples_split=5, min_samples_leaf=3)

In [105]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

## Making Predictions using Tree Model

In [106]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

## Model Evaluation

In [107]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual <48", "Actual 48-65", "Actual >65"],
    columns=["Predicted <48", "Predicted 48-65", "Predicted >65"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [108]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted <48,Predicted 48-65,Predicted >65
Actual <48,217,92,136
Actual 48-65,103,309,10
Actual >65,115,12,330


Accuracy Score : 0.6465256797583081
Classification Report
              precision    recall  f1-score   support

       48-65       0.50      0.49      0.49       445
         <48       0.75      0.73      0.74       422
         >65       0.69      0.72      0.71       457

    accuracy                           0.65      1324
   macro avg       0.65      0.65      0.65      1324
weighted avg       0.65      0.65      0.65      1324

