In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
import os

In [2]:
#DATA CLEANING

#load data
#set HHX (unique housing ID) as index
path = 'adult19.csv'
interviewData = pd.read_csv (path, index_col='HHX')

#get columns with no NANS
interview_noNANs = interviewData.dropna(axis=1)

#remove rows with "not ascertained" as answer i.e. essentially missing a target
interview_noNANs = interview_noNANs[interview_noNANs.PHQCAT_A != 8]

#remove columns with irrelevant features (e.g type of record type, survey year -> all the same)
irr_features = ['IMPINCFLG_A', 'PPSU', 'PSTRAT', 'SRVY_YR', 'ASTATNEW', 'HIKIND10_A', 'HIKIND09_A', 'HIKIND08_A', 
                'HIKIND07_A', 'HIKIND06_A', 'HIKIND05_A', 'HIKIND04_A', 'HIKIND03_A', 'HIKIND02_A', 'HIKIND01_A', 
                'HHSTAT_A', 'RECTYPE', 'WTFA_A', 'WTIA_A', 'WTIA_A']

interview_noNANs = interview_noNANs.drop(columns = irr_features)

In [3]:
#NUMERICAL DATA

#remove rows with missing data codes
def remove_missing_code (df, code_list, col_name_list):
    '''remove rows if they have a missing data code
    df: original dataset
    code_list: list of missing data codes
    col_name_list: list of column names with missing data code'''
    df_new = df.copy()
    for col in col_name_list:
        df_new = df_new[~df_new[col].isin(code_list)]
    return (df_new)

#remove all rows with missing code 8
clean_num_data = interview_noNANs.copy()

columns = ['PCNT18UPTC', 'PCNTLT18TC', 'PCNTKIDS_A', 'PCNTADLT_A', 'PCNTFAM_A']
code_list = [8]

clean_num_data = remove_missing_code(clean_num_data, code_list, columns)
    
#remove all rows with missing code 9
columns = ['PCNTTC']
code_list = [9]

clean_num_data = remove_missing_code(clean_num_data, code_list, columns)

#remove all rows with missing code 7, 8, 9
columns = ['URGNT12MTC_A', 'EMERG12MTC_A', 'NUMCAN_A'] 
code_list = [7, 8, 9]

clean_num_data = remove_missing_code(clean_num_data, code_list, columns)

#remove all rows with missing code 96, 97, 98, 99
columns = ['HEIGHTTC_A']
code_list = [96, 97, 98, 99]

clean_num_data = remove_missing_code(clean_num_data, code_list, columns)

#remove all rows with missing code 97, 98, 99
columns = ['AGEP_A']
code_list = [97, 98, 99]

clean_num_data = remove_missing_code(clean_num_data, code_list, columns)

#remove all rows with missing code 997, 998, 999
columns = ['RATCAT_A', 'WEIGHTLBTC_A'] 
code_list = [997, 998, 999]

clean_num_data = remove_missing_code(clean_num_data, code_list, columns)

In [4]:
#ORDINAL DATA

#remove all rows with missing code 8
clean_ord_data = clean_num_data.copy ()

columns = ['GADCAT_A', 'PCNTADTWFP_A', 'FDSCAT4_A', 'FDSCAT3_A', 'GAD71_A']
code_list = [8]

clean_ord_data = remove_missing_code(clean_ord_data, code_list, columns)

#remove all rows with missing code 9
columns = ['BMICAT_A']
code_list = [9]

clean_ord_data = remove_missing_code(clean_ord_data, code_list, columns)

#remove all rows with missing code 7, 8, 9
columns = ['HOUYRSLIV_A', 'FDSBALANCE_A', 'FDSLAST_A', 'FDSRUNOUT_A', 'GAD77_A', 'GAD76_A', 'GAD75_A', 'GAD74_A', 
           'GAD73_A', 'GAD72_A', 'PHQ88_A', 'PHQ87_A', 'PHQ86_A', 'PHQ85_A', 'PHQ84_A', 'PHQ83_A', 'PHQ82_A', 
           'PHQ81_A', 'DEPFREQ_A', 'PAYWORRY_A', 'SOCSCLPAR_A', 'SOCERRNDS_A', 'UPPOBJCT_A', 'UPPRAISE_A', 
          'UPPSLFCR_A', 'COMDIFF_A', 'DIFF_A', 'HEARINGDF_A', 'VISIONDF_A', 'PHSTAT_A', 'COGMEMDFF_A', 
          'DIABLAST_A', 'CHOLLAST_A', 'BPLAST_A', 'LASTDR_A', 'DENPREV_A', 
           'ANXFREQ_A'] 

code_list = [7, 8, 9]

clean_ord_data = remove_missing_code(clean_ord_data, code_list, columns)

#change code to 0 to 7 to keep in monotonic order
clean_ord_data = clean_ord_data.replace({'DIABLAST_A': 0, 'CHOLLAST_A': 0, 'BPLAST_A': 0, 'LASTDR_A': 0, 'DENPREV_A': 0}, 7)

# change code 5 to 0 to keep in monotonic order
clean_ord_data.ANXFREQ_A = clean_ord_data.ANXFREQ_A.replace(5, 0)

In [5]:
#BINARY DATA 

#remove all rows with missing code 7, 8, 9

clean_bin_data = clean_ord_data.copy()

columns = ['NOTCOV_A', 'IHS_A', 'HISP_A', 'FSNAP12M_A', 'INCINTER_A', 'INCWRKO_A', 'SCHCURENR_A', 'NATUSBORN_A', 
           'AFVET_A', 'SMOKELSEV_A', 'PIPEEV_A', 'CIGAREV_A', 'ECIGEV_A', 'SMKEV_A', 'MHTHND_A', 'MHTHDLY_A', 
           'MHTHRPY_A', 'DEPMED_A', 'ANXMED_A', 'HOMEHC12M_A', 'THERA12M_A', 'EYEEX12M_A', 'SHTPNUEV_A', 
           'SHTFLU12M_A', 'RXDG12M_A', 'RX12M_A', 'MEDNG12M_A', 'MEDDL12M_A', 'HOSPONGT_A', 'DENNG12M_A', 
           'DENDL12M_A', 'PAYBLL12M_A', 'SINCOVRX_A', 'SINCOVVS_A', 'SINCOVDE_A', 'HICOV_A', 'SOCWRKLIM_A',
           'EQUIP_A', 'HEARAID_A', 'WEARGLSS_A', 'DEPEV_A', 'ANXEV_A', 'DEMENEV_A', 'ARTHEV_A', 'COPDEV_A', 
           'DIBEV_A', 'PREDIB_A', 'CANEV_A', 'ASEV_A', 'STREV_A', 'MIEV_A', 'ANGEV_A', 'CHDEV_A', 'CHLEV_A', 
           'HYPEV_A', 'CITZNSTP_A', 'EMPWRKLSWK_A', 'DISAB3_A', 
           'OTHGOV_A', 'OTHPUB_A', 'MILITARY_A', 'CHIP_A', 'MEDICAID_A', 'MEDICARE_A', 'PRIVATE_A'] 
code_list = [7, 8, 9]

clean_bin_data = remove_missing_code(clean_bin_data, code_list, columns)

#change false code to 0
columns = ['NOTCOV_A', 'IHS_A', 'HISP_A', 'FSNAP12M_A', 'INCINTER_A', 'INCWRKO_A', 'SCHCURENR_A', 'NATUSBORN_A', 
           'AFVET_A', 'SMOKELSEV_A', 'PIPEEV_A', 'CIGAREV_A', 'ECIGEV_A', 'SMKEV_A', 'MHTHND_A', 'MHTHDLY_A', 
           'MHTHRPY_A', 'DEPMED_A', 'ANXMED_A', 'HOMEHC12M_A', 'THERA12M_A', 'EYEEX12M_A', 'SHTPNUEV_A', 
           'SHTFLU12M_A', 'RXDG12M_A', 'RX12M_A', 'MEDNG12M_A', 'MEDDL12M_A', 'HOSPONGT_A', 'DENNG12M_A', 
           'DENDL12M_A', 'PAYBLL12M_A', 'SINCOVRX_A', 'SINCOVVS_A', 'SINCOVDE_A', 'HICOV_A', 'SOCWRKLIM_A',
           'EQUIP_A', 'HEARAID_A', 'WEARGLSS_A', 'DEPEV_A', 'ANXEV_A', 'DEMENEV_A', 'ARTHEV_A', 'COPDEV_A', 
           'DIBEV_A', 'PREDIB_A', 'CANEV_A', 'ASEV_A', 'STREV_A', 'MIEV_A', 'ANGEV_A', 'CHDEV_A', 'CHLEV_A', 
           'HYPEV_A', 'CITZNSTP_A', 'EMPWRKLSWK_A', 'DISAB3_A']
for col in columns:
    clean_bin_data[col] = clean_bin_data[col].replace(2, 0)
    

#change second true code to 1 and false code to 0
columns = ['OTHGOV_A', 'OTHPUB_A', 'MILITARY_A', 'CHIP_A', 'MEDICAID_A', 'MEDICARE_A', 'PRIVATE_A'] 
for col in columns:
    clean_bin_data[col] = clean_bin_data[col].replace(2, 1)
    clean_bin_data[col] = clean_bin_data[col].replace(3, 0)

In [6]:
# ONE-HOT DATA

cols = ['HISPALLP_A', 'RACEALLP_A', 'LEGMSTAT_A', 'MARSTAT_A', 'SMKECIGST_A', 'SMKCIGST_A', 'PARSTAT_A', 
           'SAPARENTSC_A', 'HISDETP_A', 'REGION', 'SEX_A', 'HOUTENURE_A', 'MARITAL_A', 'ORIENT_A', 'USUALPL_A', 
           'AVAIL_A']

#isolate data to encode as one-hot
clean_onehot_data = clean_bin_data.loc[:, cols]
#create object
enc = OneHotEncoder()
#fit encoder
enc.fit(clean_onehot_data)
#transform data
clean_onehot_data = enc.transform(clean_onehot_data).toarray()
#get name of new columns
onehot_features = enc.get_feature_names_out(cols)
#join with previous data 
onehot_df = pd.DataFrame(clean_onehot_data, columns = onehot_features, index = clean_bin_data.index)
clean_bin_data.drop(columns = cols)
clean_data = pd.concat([clean_bin_data, onehot_df], axis=1)

In [7]:
#isolate target column
target = clean_data.PHQCAT_A

#remove target from features
features = clean_data.drop(columns = ['PHQCAT_A'])

#split into test and train 
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=0)

In [8]:
#NORMALIZATION

#create object and fit to training data
normalizer = Normalizer().fit(x_train)
#normalize training data
x_train_norm = normalizer.transform (x_train)
#normalize test data using same transformer
x_test_norm = normalizer.transform (x_test)

In [9]:
#STORE DATA

#turn back to dataframes 
train = pd.DataFrame(x_train_norm, index = x_train.index, columns = x_train.columns)
train['PHQCAT_A'] = y_train

test = pd.DataFrame(x_test_norm, index = x_test.index, columns = x_test.columns)
test['PHQCAT_A'] = y_test

#make directory 
dataDir = 'CleanData'
if not os.path.exists(dataDir):
    os.mkdir(dataDir)

#store training data
filepath = os.path.join(dataDir, 'train.csv')
train.to_csv(filepath) 

#store testing data
filepath = os.path.join(dataDir, 'test.csv')
test.to_csv(filepath) 

In [10]:
#TEST TO MAKE SURE DATA CAN BE READ
filepath = os.path.join(dataDir, 'test.csv')
train = pd.read_csv (filepath)
train

Unnamed: 0,HHX,URBRRL,RATCAT_A,INCGRP_A,INCTCFLG_A,FAMINCTC_A,HISPALLP_A,RACEALLP_A,DISAB3_A,CITZNSTP_A,...,USUALPL_A_1,USUALPL_A_2,USUALPL_A_3,USUALPL_A_7,USUALPL_A_9,AVAIL_A_1,AVAIL_A_2,AVAIL_A_3,AVAIL_A_8,PHQCAT_A
0,H063832,0.000078,0.000272,0.000039,0.0,0.999982,0.000078,0.000039,0.0,0.000039,...,0.000039,0.000000,0.0,0.0,0.0,0.000039,0.0,0.000000,0.0,1
1,H064018,0.000044,0.000118,0.000044,0.0,0.999995,0.000029,0.000015,0.0,0.000015,...,0.000015,0.000000,0.0,0.0,0.0,0.000015,0.0,0.000000,0.0,1
2,H003505,0.000333,0.000222,0.000111,0.0,0.999662,0.000222,0.000111,0.0,0.000111,...,0.000111,0.000000,0.0,0.0,0.0,0.000111,0.0,0.000000,0.0,1
3,H013931,0.000100,0.000267,0.000033,0.0,0.999974,0.000033,0.000267,0.0,0.000033,...,0.000000,0.000033,0.0,0.0,0.0,0.000033,0.0,0.000000,0.0,1
4,H020330,0.000102,0.000153,0.000051,0.0,0.999957,0.000051,0.000408,0.0,0.000051,...,0.000051,0.000000,0.0,0.0,0.0,0.000051,0.0,0.000000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4849,H045242,0.000050,0.000150,0.000050,0.0,0.999983,0.000075,0.000050,0.0,0.000025,...,0.000025,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000025,0.0,1
4850,H034491,0.000008,0.000108,0.000042,0.0,0.999999,0.000025,0.000017,0.0,0.000008,...,0.000008,0.000000,0.0,0.0,0.0,0.000008,0.0,0.000000,0.0,1
4851,H061542,0.000115,0.000269,0.000038,0.0,0.999976,0.000154,0.000115,0.0,0.000000,...,0.000038,0.000000,0.0,0.0,0.0,0.000038,0.0,0.000000,0.0,1
4852,H024156,0.000020,0.000087,0.000033,0.0,0.999999,0.000013,0.000007,0.0,0.000007,...,0.000007,0.000000,0.0,0.0,0.0,0.000007,0.0,0.000000,0.0,1
