# Imports

In [394]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from os.path import exists

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

import warnings
warnings.filterwarnings("ignore")


# Acquire

In [395]:
raw = pd.read_csv('aug22pub.csv')

In [396]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127932 entries, 0 to 127931
Columns: 388 entries, hrhhid2 to pternh2
dtypes: float64(337), int64(51)
memory usage: 378.7 MB


In [397]:
raw.columns = raw.columns.str.lower()

## Desired Columns

'hryear4', 'hurespli', 'hufinal', 'hetenure', 'hehousut', 'hetelavl', 'hefaminc', 'hrnumhou', 'hrhtype', hrlonglk, hrhhid2, hubus, hubusl1, hubusl2, hubusl4, gereg, gediv, gestfips, gtcbsast, gtmetsta, gtcbsasz, prtage, partfage, pemaritl, peafever, peafnow, peeduca, ptdtrace, prdthsp, pehspnon, prmarsta, prpertyp, penatvty, pemntvty, pefntvty, prcitshp, prinusyr, pemlr, puwk, pubus1, pubus2ot, pudis, peret1, pudis1, pudis2, puabsot, pulay, peabsrsn, peabspdo, pemjot, pemjnum, pehrusl1, pehrusl2, perhrftpt, pehruslt, pehrwant, pehrrsn1, pehrrsn3, purhoff1, purhoff2, puhrot1, pehract1, pehract2, pehravl, pelayfto, pelkavl, pulkavr, pelkll10, pelkdur, pelkfto, pedwldo, pedwwk, pejhwko, pejhrsn, pejhwant, prabsrea, prcibvlf, prdisc, premphrs, prempnot, prexplf, prftlf, prhrusl, prjobsea, prpthrs, prptrea, prwksch, prwkstat, prwntjob, puiodp1, puiodp2, peio1cow, puio2cow, puio2mfg, prioelg, pragna, prcow1, prcow2, prdtcow1, prdtcow2, prdtind1, prdtind3, prdtocc1, prdtocc2, premp, prmjind1, prmjind2, prmjocc1, prmjocc2, prmjocgr, prnagpws, prnagws, prsjms, peernuot, peernper, peernrt, peernhry, pternh1c, pternh2, pternh1o, pternhly, pthr, peernhro, pternwa, ptwk, ptern, ptot, peernwkp, peernlab, peerncov, penlfjh, penlfret, penlfact, pescenr, peschft, peschlvl, prnlfsch, prchld, prnmchld, pedipged, pehgcomp, pecyc, peio1icd, ptio1ocd, peio2icd, ptio2ocd, primind1, primind2, pecert1

In [398]:
wanted = 'hrlonglk, hrhhid2, hubus, hubusl1, hubusl2, hubusl4, gereg, gediv, gestfips, gtcbsast, gtmetsta, gtcbsasz, prtage, prtfage, pemaritl, peafever, peafnow, peeduca, ptdtrace, prdthsp, pehspnon, prmarsta, prpertyp, penatvty, pemntvty, pefntvty, prcitshp, pemlr, puwk, pubus1, pubus2ot, pudis, peret1, pudis1, pudis2, puabsot, pulay, peabsrsn, peabspdo, pemjot, pemjnum, pehrusl1, pehrusl2, pehrftpt, pehruslt, pehrwant, pehrrsn1, pehrrsn2, pehrrsn3, puhroff1, puhroff2, puhrot1, pehract1, pehract2, pehravl, pelayfto, pelkavl, pulkavr, pelkll1o, pelkdur, pelkfto, pedwlko, pedwwk, pejhwko, pejhrsn, pejhwant, prabsrea, prcivlf, prdisc, premphrs, prempnot, prexplf, prftlf, prhrusl, prjobsea, prpthrs, prptrea, prwksch, prwkstat, prwntjob, puiodp1, puiodp2, peio1cow, peio2cow, puio2mfg, prioelg, pragna, prcow1, prcow2, prdtcow1, prdtcow2, prdtind1, prdtind2, prdtocc1, prdtocc2, premp, prmjind1, prmjind2, prmjocc1, prmjocc2, prmjocgr, prnagpws, prnagws, prsjmj, peernuot, peernper, peernrt, peernhry, pternh1c, pternh2, pternh1o, pternhly, pthr, peernhro, pternwa, ptwk, ptern, ptot, peernwkp, peernlab, peerncov, penlfjh, penlfret, penlfact, peschenr, peschft, peschlvl, prnlfsch, prchld, prnmchld, pedipged, pehgcomp, pecyc, peio1icd, ptio1ocd, peio2icd, ptio2ocd, primind1, primind2, pecert1'
wanted_list = wanted.split(', ')
wanted_list = ['hryear4', 'hurespli', 'hufinal', 'hehousut', 'hetelavl', 'hefaminc', 'hrnumhou', 'hrhtype', 'pesex'] + wanted_list

column_not_in_frame = []

for col in wanted_list:
    if col not in raw.columns.str.lower().to_list():
        column_not_in_frame.append(col)
        print(col)

## Takeaways 
* fixed typos in list of desired columns
* Check original data dictionary for:
    - hetenure not in ddrame --> need to drop from desired list
    - prsjmj - prsjms not in dframe --> prsjmj contains the information
    - prinusyr not in dframe - Immigrant's year of entry
    

In [399]:
# df.loc[:, wanted_list]
df = raw[wanted_list]

In [400]:
print(df.columns.to_list())

['hryear4', 'hurespli', 'hufinal', 'hehousut', 'hetelavl', 'hefaminc', 'hrnumhou', 'hrhtype', 'pesex', 'hrlonglk', 'hrhhid2', 'hubus', 'hubusl1', 'hubusl2', 'hubusl4', 'gereg', 'gediv', 'gestfips', 'gtcbsast', 'gtmetsta', 'gtcbsasz', 'prtage', 'prtfage', 'pemaritl', 'peafever', 'peafnow', 'peeduca', 'ptdtrace', 'prdthsp', 'pehspnon', 'prmarsta', 'prpertyp', 'penatvty', 'pemntvty', 'pefntvty', 'prcitshp', 'pemlr', 'puwk', 'pubus1', 'pubus2ot', 'pudis', 'peret1', 'pudis1', 'pudis2', 'puabsot', 'pulay', 'peabsrsn', 'peabspdo', 'pemjot', 'pemjnum', 'pehrusl1', 'pehrusl2', 'pehrftpt', 'pehruslt', 'pehrwant', 'pehrrsn1', 'pehrrsn2', 'pehrrsn3', 'puhroff1', 'puhroff2', 'puhrot1', 'pehract1', 'pehract2', 'pehravl', 'pelayfto', 'pelkavl', 'pulkavr', 'pelkll1o', 'pelkdur', 'pelkfto', 'pedwlko', 'pedwwk', 'pejhwko', 'pejhrsn', 'pejhwant', 'prabsrea', 'prcivlf', 'prdisc', 'premphrs', 'prempnot', 'prexplf', 'prftlf', 'prhrusl', 'prjobsea', 'prpthrs', 'prptrea', 'prwksch', 'prwkstat', 'prwntjob', 'p

In [401]:
second_cut = ['hryear4','hufinal', 'hehousut', 'hefaminc', 'hrnumhou', 'hrhtype', 'hrhhid2', 'hubus',\
'gereg', 'gediv', 'gestfips', 'gtcbsast', 'gtmetsta', 'gtcbsasz', 'prtage', 'prtfage', \
'pemaritl', 'pesex', 'peafever', 'peafnow', 'peeduca', 'ptdtrace', 'pehspnon', 'prmarsta', 'penatvty', 'pemntvty',\
'pefntvty', 'prcitshp', 'pemlr', 'pubus1', 'pubus2ot', 'pudis', 'peret1', 'pudis2', \
'pemjot', 'pemjnum', 'pehrftpt', 'pehruslt', 'pehrwant', 'pehrrsn2',\
'puhroff1', 'puhroff2', 'puhrot1', 'pehract1', 'pehract2', 'pehravl', 'pelayfto', 'pelkavl', 'pedwlko', 'pedwwk', 'pejhwant', 'prempnot',\
'prwntjob', 'prcow1', 'prcow2', \
'prmjind1', 'prmjind2', 'prmjocc1', 'prmjocc2', 'prmjocgr', 'peernuot', 'peernhro', 'peernwkp',\
'penlfjh', 'penlfret', 'penlfact', 'peschenr', 'peschft', 'peschlvl', 'prnlfsch','pehgcomp', 'prchld', 'prnmchld',\
'pedipged', 'pecert1']

In [402]:
df = df[second_cut]

In [403]:
#big change in numbers here
df.hufinal.value_counts(dropna=False)

201    94635
218    13250
1       7309
226     6440
216     1046
225      890
219      726
203      598
228      437
229      435
259      298
227      281
217      270
231      221
213      140
223      112
4        106
248      101
2        101
230       90
240       90
233       82
232       70
243       62
241       57
214       39
244       20
245       16
5          3
258        2
224        2
242        2
247        1
Name: hufinal, dtype: int64

In [404]:
#decided to dropna's cleared up problem with target variable
df = df.dropna()

#artifacts from trying to work figure out how to handle na's
#big change in numbers here
# df.hufinal.value_counts(dropna=False) --> column dropped
# df.columns[df.isnull().any()]
# df[df.hufinal != (218)][['pemlr', 'prexplf', 'prempnot']].value_counts(dropna=False)
# df[['prmjind1', 'prmjind2','prmjocc1', 'prmjocc2', 'prmjocgr']].value_counts(dropna=False)
# df[['pemlr', 'prexplf', 'prempnot']].value_counts(dropna=False)
# df[['pemlr', 'prexplf', 'prempnot']].value_counts(normalize=True, dropna=False)

## Takeaways
* Still need to verify the industry recodes
* eliminated additional columns that had greater change of leakage
----
* need to verify contents of industry recodes for viability
* look into hufinal and use that to drop records that refused to respond --> changing strategy
* drop any records that have 'NaN' for target variable, indicates incomplete survey/data
----
* decided to just do a straight dropna()

# Prepare
* Goals:
    - Get down to 30 columns
* Takeaways/fixes
    - Get ride of prtage == 0  or impute 
    - fix peafever --> add answers from peafnow to capture active duty 
    - ptdtrace, pehspnon --> hispanic/non needs to be added to race and race needs to be handled --> disregard
    - pemlr, prexplf, prempnot --> use information to create disposition


In [405]:
# remove prtfage. is top code for ages above 80
df[df.prtfage == 1]['prtage'].value_counts()

80.0    2525
85.0    2183
Name: prtage, dtype: int64

In [406]:
# fixing peafnow  --> get rid of peafnow
df.iloc[np.where(df.peafnow == 1)]['peafever'] = 1
df[['peafever']].value_counts(dropna=False)

peafever
 2.0        76019
-1.0        20685
 1.0         6045
dtype: int64

In [407]:
# create function to flatten race in less categories
def flatten_race(val):
    if val == 1:
        val = 'white'
    elif val == 2:
        val = 'black'
    elif val == 3:
        val = 'AI/NA'
    elif val == 4:
        val = 'asian'
    elif val == 5:
        val = 'HI/PI'
    elif 5 < val < 10:
        val = 'mixed_white'
    elif 10 <= val < 27 :
        val = 'mixed_other'
    return val

df['ptdtrace'] = df['ptdtrace'].apply(flatten_race)

#artifact from trying to combine with Hispanic status
# df[['ptdtrace','pehspnon']].sort_values('pehspnon').value_counts()

In [408]:
# remove responses from people not in the labor force by means other that discouragement
df = df[df.prempnot != 4]

In [409]:
# remove responses from unqualified respondents (children and active armed forces members)
df = df[df.prempnot != (-1)]

In [410]:
# binary recode of premmpnot to include repsondents discourage from workforce participation as unemployed
df.prempnot = np.where(df.prempnot == 1, 1, 0)

In [411]:
df.prempnot.value_counts()

1    48921
0     1885
Name: prempnot, dtype: int64

In [412]:
final_list = ['hehousut', 'hefaminc', 'hrnumhou', 'hrhtype','hubus',\
'gediv', 'gestfips', 'gtmetsta', 'gtcbsasz', 'prtage', 'pesex', \
'pemaritl', 'peafever', 'peeduca', 'ptdtrace', 'pehspnon', 'penatvty', 'pemntvty','pehrftpt',\
'pefntvty', 'prcitshp', 'pubus1', 'pudis2', 'pehruslt', 'pelkavl', 'pedwlko', 'pedwwk', 'pejhwant', 'prempnot',\
'prmjind1', 'prmjocc1', 'peernuot', 'peschenr', 'prchld', 'pecert1']

In [413]:
df = df[final_list]

# Prepare - Part II
* Goals:
    - Change column names
* Takeaways
    - `usual_35_hours_more`/`pehrftpt` can be removed, and data used to impute values in `usual_hours_worked`
    - use apply function to take handle --> back to before column renaming for flow purposes. leave as is for now

In [414]:
df = df.rename(columns={'hehousut': 'housing_type',
'hefaminc': 'family_income',
'hrnumhou': 'household_num',
'hrhtype': 'household_type',
'hubus': 'own_bus_or_farm',
'gediv': 'country_region',
'gestfips': 'state',
'gtmetsta': 'metropolitan',
'gtcbsasz': 'metro_area_size',
'prtage': 'age',
'pesex': 'sex',
'pemaritl': 'marital_status',
'peafever': 'veteran',
'peeduca': 'education',
'ptdtrace': 'race',
'pehspnon': 'hispanic_or_non',
'penatvty': 'birth_country',
'pemntvty': 'mother_birth_country',
'pefntvty': 'father_birth_country',
'prcitshp': 'citizenship',
'pubus1': 'upaid_work_last_week',
'pudis2': 'disability_preventing_work_in_next_6_months',
'pehrftpt' : 'usual_35_hours_more',
'pehruslt': 'usual_hours_worked',
'pelkavl': 'could_have_started_job',
'pedwlko': 'sought_work_last_12_months',
'pedwwk': 'worked_last_12_months',
'pejhwant': 'intend_to_look_next_12_months',
'prempnot': 'employed',
'prmjind1': 'industry',
'prmjocc1': 'occupation',
'peernuot': 'usual_ot_tips_commis',
'peschenr': 'enrolled_in_school',
'prchld': 'children_in_household',
'pecert1': 'professional_certification'})

In [417]:
#work done to fix usual_hours_worked with manual imputation 
# *****MUSTBEDONEAFTERNAMECHANGEFORNOW****

df[df.usual_hours_worked == -4].usual_35_hours_more.value_counts()
df.usual_35_hours_more.value_counts()
#mean hours worked for those less that work less than 35
more_than_35 = round(df[df.usual_hours_worked > 35].usual_hours_worked.mean())
#mean hours worked for those less that work less than 35
less_than_35 = round(df[(df.usual_hours_worked > 0) & (df.usual_hours_worked < 35)].usual_hours_worked.mean())
df[['usual_35_hours_more', 'usual_hours_worked']].value_counts()
df[df.usual_35_hours_more == 1]['usual_hours_worked'] = more_than_35
df[df.usual_35_hours_more == 2]['usual_hours_worked'] = less_than_35

df = df.drop(columns='usual_35_hours_more')

In [418]:
df.columns.to_list()

['housing_type',
 'family_income',
 'household_num',
 'household_type',
 'own_bus_or_farm',
 'country_region',
 'state',
 'metropolitan',
 'metro_area_size',
 'age',
 'sex',
 'marital_status',
 'veteran',
 'education',
 'race',
 'hispanic_or_non',
 'birth_country',
 'mother_birth_country',
 'father_birth_country',
 'citizenship',
 'upaid_work_last_week',
 'disability_preventing_work_in_next_6_months',
 'usual_hours_worked',
 'could_have_started_job',
 'sought_work_last_12_months',
 'worked_last_12_months',
 'intend_to_look_next_12_months',
 'employed',
 'industry',
 'occupation',
 'usual_ot_tips_commis',
 'enrolled_in_school',
 'children_in_household',
 'professional_certification']

In [419]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50806 entries, 1 to 127917
Data columns (total 34 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   housing_type                                 50806 non-null  int64  
 1   family_income                                50806 non-null  int64  
 2   household_num                                50806 non-null  int64  
 3   household_type                               50806 non-null  int64  
 4   own_bus_or_farm                              50806 non-null  int64  
 5   country_region                               50806 non-null  int64  
 6   state                                        50806 non-null  int64  
 7   metropolitan                                 50806 non-null  int64  
 8   metro_area_size                              50806 non-null  int64  
 9   age                                          50806 non-null  float64
 1

# Prepare - Part III
* Goals
    - identify final categorical vs numerical column disposition
    - binary encode where able
    - ~~translate to numerical when able~~ leave numerical alone
    - flatten necessary categorical columns 
        * possible save until after exploration phase. may be unnecessary
        * housing_type, household_type, marital_status, birth_country, mother_birth_country, father_birth_country


In [428]:
#fixing types on categorical columns
categorical_cols = ['housing_type','family_income','household_type',
                    'country_region','state','metropolitan','metro_area_size',
                    'marital_status','education','race','birth_country',
                    'mother_birth_country','father_birth_country','citizenship',
                    'industry','occupation']

#for loop to handle assignment as object
for col in categorical_cols:
    df[col] = (df[col].astype('object'))

In [451]:
#fixing binary columns freom -1, 1, 2 --> 0, 1
binary_cols = ['own_bus_or_farm', 'sex', 'veteran','hispanic_or_non', 
                'upaid_work_last_week','could_have_started_job',
                'sought_work_last_12_months', 'worked_last_12_months',
                'intend_to_look_next_12_months','employed', 'usual_ot_tips_commis',
                'enrolled_in_school','professional_certification']


In [457]:
# changed all non-affirmative answers to negative. tips, school, most affected
for col in binary_cols:
    df[col] = np.where(df[col] == 1, 1, 0)

In [462]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50806 entries, 1 to 127917
Data columns (total 34 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   housing_type                                 50806 non-null  object 
 1   family_income                                50806 non-null  object 
 2   household_num                                50806 non-null  int64  
 3   household_type                               50806 non-null  object 
 4   own_bus_or_farm                              50806 non-null  object 
 5   country_region                               50806 non-null  object 
 6   state                                        50806 non-null  object 
 7   metropolitan                                 50806 non-null  object 
 8   metro_area_size                              50806 non-null  object 
 9   age                                          50806 non-null  float64
 1

# Prep: Part IV 
* Goal:
    - Create functions that will handle sections of prep 
    - Big wrangle Function --> prep_aug()
    - don't forget to add docstring placeholders

In [472]:
def acquire_aug():
    ''' 
    Purpose:

    ---

    Parameters:

    ---

    Output:
    
    ---
    '''

    df = pd.read_csv('aug22pub.csv')

    df.columns = df.columns.str.lower()
    
    return df

# create function to flatten race in less categories
def flatten_race(val):
    ''' 
    Purpose:

    ---

    Parameters:

    ---

    Output:
    
    ---
    '''
    if val == 1:
        val = 'white'
    elif val == 2:
        val = 'black'
    elif val == 3:
        val = 'AI/NA'
    elif val == 4:
        val = 'asian'
    elif val == 5:
        val = 'HI/PI'
    elif 5 < val < 10:
        val = 'mixed_white'
    elif 10 < 27 :
        val = 'mixed_other'
    return val


def prep_values(df):
    ''' 
    Purpose:

    ---

    Parameters:

    ---

    Output:
    
    ---
    '''

    #decided to dropna's cleared up problem with target variable
    df = df.dropna()

    # fixing peafnow  --> get rid of peafnow
    df.iloc[np.where(df.peafnow == 1)]['peafever'] = 1

    #work done to fix usual_hours_worked with manual imputation 
    #mean hours worked for those less that work less than 35
    more_than_35 = round(df[df.pehruslt > 35].pehruslt.mean())
    #mean hours worked for those less that work less than 35
    less_than_35 = round(df[(df.pehruslt > 0) & (df.pehruslt < 35)].pehruslt.mean())
    df[['pehrftpt', 'pehruslt']].value_counts()

    df[df.pehrftpt == 1]['pehruslt'] = more_than_35
    df[df.pehrftpt == 2]['pehruslt'] = less_than_35

    # remove responses from people not in the labor force by means other that discouragement
    df = df[df.prempnot != 4]
    # remove responses from unqualified respondents (children and active armed forces members)
    df = df[df.prempnot != (-1)]
    # binary recode of premmpnot to include repsondents discourage from workforce participation as unemployed
    df.prempnot = np.where(df.prempnot == 1, 1, 0)

    return df


def prep_columns(df): 
    ''' 
    Purpose:

    ---

    Parameters:

    ---

    Output:
    
    ---
    '''

    df['ptdtrace'] = df['ptdtrace'].apply(flatten_race)

    final_list = ['hehousut', 'hefaminc', 'hrnumhou', 'hrhtype','hubus',\
    'gediv', 'gestfips', 'gtmetsta', 'gtcbsasz', 'prtage', 'pesex', \
    'pemaritl', 'peafever', 'peeduca', 'ptdtrace', 'pehspnon', 'penatvty', 'pemntvty',
    'pefntvty', 'prcitshp', 'pubus1', 'pudis2', 'pehruslt', 'pelkavl', 'pedwlko', 'pedwwk', 'pejhwant', 'prempnot',\
    'prmjind1', 'prmjocc1', 'peernuot', 'peschenr', 'prchld', 'pecert1']

    df = df[final_list]

    df = df.rename(columns={'hehousut': 'housing_type',
        'hefaminc': 'family_income',
        'hrnumhou': 'household_num',
        'hrhtype': 'household_type',
        'hubus': 'own_bus_or_farm',
        'gediv': 'country_region',
        'gestfips': 'state',
        'gtmetsta': 'metropolitan',
        'gtcbsasz': 'metro_area_size',
        'prtage': 'age',
        'pesex': 'sex',
        'pemaritl': 'marital_status',
        'peafever': 'veteran',
        'peeduca': 'education',
        'ptdtrace': 'race',
        'pehspnon': 'hispanic_or_non',
        'penatvty': 'birth_country',
        'pemntvty': 'mother_birth_country',
        'pefntvty': 'father_birth_country',
        'prcitshp': 'citizenship',
        'pubus1': 'upaid_work_last_week',
        'pudis2': 'disability_preventing_work_in_next_6_months',
        'pehruslt': 'usual_hours_worked',
        'pelkavl': 'could_have_started_job',
        'pedwlko': 'sought_work_last_12_months',
        'pedwwk': 'worked_last_12_months',
        'pejhwant': 'intend_to_look_next_12_months',
        'prempnot': 'employed',
        'prmjind1': 'industry',
        'prmjocc1': 'occupation',
        'peernuot': 'usual_ot_tips_commis',
        'peschenr': 'enrolled_in_school',
        'prchld': 'children_in_household',
        'pecert1': 'professional_certification'})

    #fixing types on categorical columns
    categorical_cols = ['housing_type','family_income','household_type',
                        'country_region','state','metropolitan','metro_area_size',
                        'marital_status','education','race','birth_country',
                        'mother_birth_country','father_birth_country','citizenship',
                        'industry','occupation']

    binary_cols = ['own_bus_or_farm', 'sex', 'veteran','hispanic_or_non', 
                'upaid_work_last_week','could_have_started_job',
                'sought_work_last_12_months', 'worked_last_12_months',
                'intend_to_look_next_12_months','employed', 'usual_ot_tips_commis',
                'enrolled_in_school','professional_certification']  

    #for loop to handle assignment as object
    for col in categorical_cols:
        df[col] = (df[col].astype('object'))

    # changed all non-affirmative answers to negative. tips, school, most affected
    for col in binary_cols:
        df[col] = np.where(df[col] == 1, 1, 0)                                 

    return df

In [473]:
def prep_aug():
    ''' 
    Purpose:

    ---

    Parameters:

    ---

    Output:
    
    ---
    '''

    df = acquire_aug()

    df = prep_values(df)

    df = prep_columns(df)

    return df

In [474]:
test = prep_aug()

In [475]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50806 entries, 1 to 127917
Data columns (total 34 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   housing_type                                 50806 non-null  object 
 1   family_income                                50806 non-null  object 
 2   household_num                                50806 non-null  int64  
 3   household_type                               50806 non-null  object 
 4   own_bus_or_farm                              50806 non-null  int64  
 5   country_region                               50806 non-null  object 
 6   state                                        50806 non-null  object 
 7   metropolitan                                 50806 non-null  object 
 8   metro_area_size                              50806 non-null  object 
 9   age                                          50806 non-null  float64
 1