In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import os

In [2]:
#LOAD INTERVIEW DATA
#read the raw CSV into a DataFrame and display it
#NOTE(review): the rendered frame shows an 'Unnamed: 0' column, which suggests the CSV
#was saved together with its index; index_col=0 would avoid it, but confirm that no
#positional column slicing relies on the current layout before changing this
path = 'adult19.csv'
interviewData = pd.read_csv(filepath_or_buffer=path)
interviewData

Unnamed: 0,URBRRL,RATCAT_A,INCGRP_A,INCTCFLG_A,FAMINCTC_A,IMPINCFLG_A,PPSU,PSTRAT,HISPALLP_A,RACEALLP_A,...,PROXYREL_A,PROXY_A,AVAIL_A,HHSTAT_A,INTV_QRT,RECTYPE,WTFA_A,WTIA_A,HHX,POVRATTC_A
0,4,9,3,0,60000,2,2,122,3,2,...,,,1,1,1,10,13177.008,7601.336,H048109,2.96
1,4,9,3,0,50000,0,2,122,2,1,...,,,1,1,1,10,6140.552,3344.434,H027044,2.97
2,4,12,3,0,65000,1,2,122,2,1,...,,,1,1,1,10,9191.061,6949.498,H058855,4.28
3,4,14,5,0,120000,0,2,122,1,8,...,,,1,1,1,10,7900.035,6446.327,H031993,7.13
4,1,4,1,0,30000,0,2,115,2,1,...,,,1,1,1,10,10875.772,8646.586,H007122,1.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31992,4,14,5,0,116204,0,100,114,2,1,...,,,1,1,4,10,2679.016,1965.416,H046022,7.65
31993,4,8,3,0,68000,0,100,114,2,1,...,,,1,1,4,10,6726.495,3920.208,H046232,2.02
31994,4,13,3,0,60000,0,100,114,2,1,...,,,1,1,4,10,1246.306,864.743,H043765,4.98
31995,4,14,5,0,101000,0,100,114,1,1,...,,,1,1,4,10,3427.198,2595.209,H017695,6.00


In [3]:
#LOAD VARIABLE DICTIONARY (DATA CLEANING INSTRUCTIONS)
#each row names a column (COLUMN_NAME) together with its NANs and ENCODING instructions
#NOTE(review): the rendered frame also shows an 'Unnamed: 0' column -- presumably a saved
#index; confirm before relying on column positions
path = 'VariableDictionary.csv'
data_cleaning_inst = pd.read_csv(filepath_or_buffer=path)
data_cleaning_inst

Unnamed: 0,COLUMN_INDEX,COLUMN_NAME,DESCRIPTION,DATA_TYPE,NANs,ENCODING,Code_1,Code_1_Meaning,Code_2,Code_2_Meaning,...,Code_4_Meaning,Code_5,Code_5_Meaning,Code_6,Code_6_Meaning,Code_7,Code_7_Meaning,Code_8,Code_8_Meaning,Notes
0,402,POVRATTC_A,SA family poverty ratio,Numerical,drop_nans,none,00.00-09.99,00.00-09.99,10.0,10.00+,...,,,,,,,,,,
1,403,HHX,Randomly assigned household number unique to a...,Recordkeeping,drop_nans,none,,,,,...,,,,,,,,,,
2,404,WTIA_A,Weight - annual pre-post stratification calibr...,Recordkeeping,drop_col,drop_col,,,,,...,,,,,,,,,,
3,405,WTFA_A,Weight - Final Annual,Recordkeeping,drop_col,drop_col,,,,,...,,,,,,,,,,
4,406,RECTYPE,Record type,Recordkeeping,drop_col,drop_col,10,Sample Adult,20.0,Sample Child,...,Sample Child Income,50.0,Paradata,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,529,PRRXCOV2_A,Plan pays for prescription drug - plan 2,Categorical - Nominal,replace,one_hot,1,Yes,2.0,No,...,Not Ascertained,9.0,Don't Know,,,,,,,
128,530,PRDNCOV1_A,Plan pays for dental care - plan 1,Categorical - Nominal,replace,one_hot,1,Yes,2.0,No,...,Not Ascertained,9.0,Don't Know,,,,,,,
129,531,PRDNCOV2_A,Plan pays for dental care - plan 2,Categorical - Nominal,replace,one_hot,1,Yes,2.0,No,...,Not Ascertained,9.0,Don't Know,,,,,,,
130,532,PRVSCOV1_A,Plan pays for vision care - plan 1,Categorical - Nominal,replace,one_hot,1,Yes,2.0,No,...,Not Ascertained,9.0,Don't Know,,,,,,,


In [4]:
#DROP IRRELEVANT COLUMNS
#columns whose NANs instruction is 'drop_col' are the ones to remove
cols = data_cleaning_inst.loc[data_cleaning_inst['NANs'] == 'drop_col', 'COLUMN_NAME'].tolist()
#work on a copy so the raw interviewData frame stays untouched
cleanData = interviewData.copy().drop(columns=cols)
cleanData.shape

(31997, 530)

In [5]:
#DROP ROWS WITH NANS
#columns whose NANs instruction is 'drop_nans' must not contain missing values
cols = data_cleaning_inst['COLUMN_NAME'][data_cleaning_inst['NANs'].eq('drop_nans')].tolist()
#remove every row that has a NaN in at least one of those columns
cleanData = cleanData.dropna(axis=0, how='any', subset=cols)
cleanData.shape

(31997, 530)

In [6]:
#REPLACE NANS WITH CODE: 10 = NOT APPLICABLE
#columns whose NANs instruction is 'replace' get their gaps filled
cols = data_cleaning_inst.loc[data_cleaning_inst['NANs'].eq('replace'), 'COLUMN_NAME'].tolist()
#fill NaNs in those columns with 10 and write them back (no rows are dropped here)
cleanData[cols] = cleanData[cols].fillna(value=10)
cleanData.shape

(31997, 530)

In [7]:
#CHECK THERE ARE NO NANs LEFT
#only columns from this position onward are inspected, in both frames
#NOTE(review): a positional slice selects the same columns in both frames only if no
#column before this offset was dropped -- confirm, or switch to a name-based selection
first_checked_col = 403
#NaN count in the raw data (for comparison) and in the cleaned data
nans_before = interviewData.iloc[:, first_checked_col:].isna().sum().sum()
nans_after = cleanData.iloc[:, first_checked_col:].isna().sum().sum()
print(nans_before)
print(nans_after)
#fail loudly instead of relying on someone reading the printed number
assert nans_after == 0, f"{nans_after} NaNs remain in the checked columns of cleanData"

1917479
0


In [8]:
#ONE-HOT ENCODE CATEGORICAL DATA
#columns whose ENCODING instruction is 'one_hot'
cols = data_cleaning_inst.loc[data_cleaning_inst['ENCODING'] == 'one_hot', 'COLUMN_NAME'].tolist()
#fit the encoder on those columns and convert the encoded result to a dense array
enc = OneHotEncoder()
clean_onehot_data = enc.fit_transform(cleanData[cols]).toarray()
#labels for the encoded columns, derived by the encoder from the input column names
onehot_features = enc.get_feature_names_out(cols)
#reuse cleanData's index so the concat below lines up row for row
onehot_df = pd.DataFrame(clean_onehot_data, index=cleanData.index, columns=onehot_features)
#swap the original categorical columns for their encoded counterparts
cleanData = pd.concat([cleanData.drop(columns=cols), onehot_df], axis=1)
cleanData.shape

(31997, 1027)