In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the person-subset CSV into the raw frame; a later cell works on a copy of it.
person_raw = pd.read_csv(filepath_or_buffer='../data/person-subset-2.5percent.csv')

In [3]:
# Work on an independent (deep) copy so person_raw is never modified by the cleaning steps.
df = person_raw.copy()

In [4]:

# Clean important features
### Recode raw numeric survey codes into labelled / typed columns

# Citizenship status: turn each CIT code into its string form and swap it for a
# readable label in a single pass. Codes that have no entry in the mapping are
# left unchanged (as their string form).
cit_labels = {
    '1': 'Born - USA',
    '2': 'Born - USA Territory',
    '3': 'Born - Abroad (American Parents)',
    '4': 'Citizen - Naturalization',
    '5': 'Not Citizen',
}
df['CIT_CAT'] = df['CIT'].astype('str').replace(cit_labels)

# Own-child flag: any non-zero OC value becomes True.
# NOTE(review): a bool cast would also turn a missing OC value into True —
# confirm OC has no NaNs in the source data.
df['OC_CAT'] = df['OC'].astype(bool)

# English ability: keep the numeric codes as categories, but mark the
# categorical as ordered (ascending by code) so it really behaves as an ordinal
# variable: comparisons, min/max and sorting follow the code order.
# Missing values stay missing.
df['ENG_CAT'] = df['ENG'].astype('category').cat.as_ordered()

important_features = [
    'PINCP',    # Total person's income (signed)
    'POVPIP',   # Income-to-poverty ratio recode (continuous)
    'JWMNP',    # Travel time to work (continuous)
    'AGEP',     # Age of person (continuous 0-95)
    'PWGTP',    # Person's weight (continuous)
    'CIT_CAT',  # Citizenship status (categorical - string)
    'OC_CAT',   # Own child (Boolean)
    'ENG_CAT'   # Ability to speak English (ordered categorical, codes 1-4)
]

In [5]:
# Keep only the columns listed in important_features (all rows, same column order as the list).
df = df.loc[:, important_features]

In [6]:
# Print the column dtypes, non-null counts and memory usage of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78320 entries, 0 to 78319
Data columns (total 8 columns):
PINCP      64498 non-null float64
POVPIP     75204 non-null float64
JWMNP      33117 non-null float64
AGEP       78320 non-null int64
PWGTP      78320 non-null int64
CIT_CAT    78320 non-null object
OC_CAT     78320 non-null bool
ENG_CAT    13310 non-null category
dtypes: bool(1), category(1), float64(3), int64(2), object(1)
memory usage: 3.7+ MB


In [7]:
# Summary statistics for the numeric columns; the int64 cast drops the
# fractional part of every statistic to keep the table compact.
(
    df.loc[:, ['PINCP', 'POVPIP', 'JWMNP', 'PWGTP', 'AGEP']]
    .describe()
    .astype('int64')
)

Unnamed: 0,PINCP,POVPIP,JWMNP,PWGTP,AGEP
count,64498,75204,33117,78320,78320
mean,35980,304,26,101,40
std,53893,165,22,81,23
min,-12200,0,1,1,0
25%,7000,159,10,53,20
50%,21000,307,20,78,41
75%,45900,501,30,123,59
max,1027000,501,167,1505,95


In [8]:
# Count / unique / top / freq summary for the recoded non-numeric columns.
(
    df.loc[:, ['CIT_CAT', 'OC_CAT', 'ENG_CAT']]
    .describe()
)

Unnamed: 0,CIT_CAT,OC_CAT,ENG_CAT
count,78320,78320,13310.0
unique,5,2,4.0
top,Born - USA,False,1.0
freq,68411,63727,7914.0
