In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Normalizer

In [2]:
#LOAD NONHOT DATA
#both feature files live in the same folder, so the location is set once
dataDir = 'AnalysisData'
dataSubDir = 'CleanDataNonhot'

#testing data (first CSV column is used as the row index)
filepath = os.path.join(dataDir, dataSubDir, 'testFeaturesNonHot.csv')
test_x_pd_nonhot = pd.read_csv(filepath, index_col=0)
test_x_nonhot = test_x_pd_nonhot.to_numpy()
#remember the shape so later cells can check it is unchanged
original_shape = test_x_pd_nonhot.shape
print(original_shape)

#training data (first CSV column is used as the row index)
filepath = os.path.join(dataDir, dataSubDir, 'trainFeaturesNonHot.csv')
train_x_pd_nonhot = pd.read_csv(filepath, index_col=0)
train_x_nonhot = train_x_pd_nonhot.to_numpy()
print(train_x_nonhot.shape)

(5556, 505)
(22222, 505)


In [3]:
#LOAD CSV WITH DATA CLEANING INSTRUCTIONS
#later cells read its COLUMN_NAME, DATA_TYPE, NANs and ENCODING columns
path = 'VariableNanDetails.csv'
data_cleaning_inst = pd.read_csv(filepath_or_buffer=path)

In [4]:
#LOAD ONE-HOT ENCODED TESTING DATA TO MAKE SURE SIZE MATCHES
dataDir = 'AnalysisData'
dataSubDir = 'CleanDataFinal'

#testing features, indexed by the HHX column
path = os.path.join(dataDir, dataSubDir, 'testFeaturesFinal.csv')
test_x_pd = pd.read_csv(path, index_col='HHX')
test_x = test_x_pd.to_numpy()

#shape the imputed and encoded data is compared against at the end
shape_wanted = test_x.shape
print(shape_wanted)

(5556, 5290)


## REMOVE DATA RANDOMLY

In [5]:
# FUNCTION TO REMOVE DATA RANDOMLY
def remove_random_data(complete_data, data_percent, random_state=None):
    '''Replace randomly chosen entries (single features from a row) with NaN.

    complete_data: array-like containing the full dataset; it is never modified
    data_percent: percent (0 to 100) of all entries to replace with NaN
    random_state: optional seed for a private generator, making the removed
        positions reproducible; when None (default) NumPy's global random
        state is used, as before
    returns: array with the same shape as complete_data in which
        int(size*data_percent/100) entries are NaN; integer and boolean
        input is returned as float because those dtypes cannot store NaN
    raises: ValueError if data_percent is not between 0 and 100'''
    if not 0 <= data_percent <= 100:
        raise ValueError(
            'data_percent must be between 0 and 100, got {}'.format(data_percent))

    source = np.asarray(complete_data)
    #flatten() always returns a copy, so the caller's data stays untouched
    inc_data = source.flatten()
    #assigning NaN into an integer/boolean array raises, so promote to float first
    if inc_data.dtype.kind in 'biu':
        inc_data = inc_data.astype(float)

    #calculate number of entries to replace
    entries_to_remove = int(inc_data.size*data_percent/100)
    if entries_to_remove > 0:
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        #choose distinct random flat indices and blank them in one vectorised step
        i = rng.choice(inc_data.size, entries_to_remove, replace=False)
        inc_data[i] = np.nan

    #reshape data back to original shape
    return inc_data.reshape(source.shape)

## DATA IMPUTATION

In [6]:
#remove 15% of data
data_percent = 15
#seed NumPy's global generator so the same entries are removed on every run
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
test_x_inc = remove_random_data(test_x_nonhot, data_percent)
#turn to dataframe, restoring the original column names and row index
test_x_inc_pd = pd.DataFrame(
    test_x_inc,
    columns=test_x_pd_nonhot.columns,
    index=test_x_pd_nonhot.index,
)
#shape must be unchanged by the removal
print(original_shape)
print(test_x_inc_pd.shape)

(5556, 505)
(5556, 505)


In [7]:
#ISOLATE NUMERICAL AND CATEGORICAL FEATURES
#numerical features: typed 'numerical' in the instructions and not marked 'drop_col'
is_numerical = data_cleaning_inst.DATA_TYPE == 'numerical'
is_kept = data_cleaning_inst.NANs != 'drop_col'
col_num = data_cleaning_inst.COLUMN_NAME[is_numerical & is_kept]
data_num = test_x_inc_pd.loc[:, col_num]
print(data_num.shape)

#categorical features: every remaining column
data_cat = test_x_inc_pd.drop(columns=col_num)
print(data_cat.shape)

(5556, 4)
(5556, 501)


## UNIVARIATE IMPUTATION

In [8]:
#NUMERICAL DATA
#turn to array
data_num_array = data_num.to_numpy()
#median imputer for NaN entries
num_imp = SimpleImputer(missing_values=np.nan, strategy='median')
#fit on the incomplete data and fill the gaps in one step, then wrap as dataframe
data_num_imp = pd.DataFrame(
    num_imp.fit_transform(data_num_array),
    columns=data_num.columns,
    index=test_x_pd_nonhot.index,
)
data_num_imp.shape

(5556, 4)

In [9]:
#CATEGORICAL DATA
#turn to array
data_cat_array = data_cat.to_numpy()
#most-frequent-value imputer for NaN entries
cat_imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
#fit on the incomplete data and fill the gaps in one step, then wrap as dataframe
data_cat_imp = pd.DataFrame(
    cat_imp.fit_transform(data_cat_array),
    columns=data_cat.columns,
    index=test_x_pd_nonhot.index,
)
#shape before and after imputation
print(data_cat.shape)
print(data_cat_imp.shape)

(5556, 501)
(5556, 501)


## NORMALIZE NUMERICAL DATA

In [10]:
#FIT NORMALIZER ON NUM TRAINING DATA, THEN NORMALIZE IMPUTED NUM TEST DATA
#NOTE(review): sklearn's Normalizer rescales each row to unit norm and its fit()
#learns nothing from the training data; if per-feature scaling was intended a
#scaler would be needed instead - confirm against the pipeline that produced
#testFeaturesFinal.csv before changing anything

#isolate numerical training data
numeric_mask = (data_cleaning_inst.DATA_TYPE == 'numerical') & (data_cleaning_inst.NANs != 'drop_col')
col_num = data_cleaning_inst.COLUMN_NAME[numeric_mask]
num_train = train_x_pd_nonhot.loc[:, col_num]

#create object and fit to training data
normalizer = Normalizer()
normalizer.fit(num_train)

#transform the imputed test data and wrap as dataframe
num_norm_imp = normalizer.transform(data_num_imp)
num_norm_imp_pd = pd.DataFrame(
    num_norm_imp,
    columns=data_num_imp.columns,
    index=data_num_imp.index,
)
num_norm_imp_pd.shape

(5556, 4)

## ONE-HOT ENCODING

In [11]:
#USE ALL FEATURES TO TRAIN ENCODER
#stack test rows on top of train rows
frames_to_stack = [test_x_pd_nonhot, train_x_pd_nonhot]
x_all = pd.concat(frames_to_stack, axis=0)
x_all.shape

(27778, 505)

In [12]:
#ONE-HOT ENCODE CATEGORICAL DATA
#columns flagged 'one_hot' in the instructions, leaving out PHQCAT_A
wants_onehot = data_cleaning_inst.ENCODING == 'one_hot'
not_phqcat = data_cleaning_inst.COLUMN_NAME != 'PHQCAT_A'
cols = data_cleaning_inst.COLUMN_NAME[wants_onehot & not_phqcat].tolist()

#fit the encoder on the complete (train + test) data
x_all_onehot = x_all[cols]
enc = OneHotEncoder()
enc.fit(x_all_onehot)

#encode the imputed categorical data with the fitted encoder
cat_to_onehot = data_cat_imp[cols]
cat_onehot = enc.transform(cat_to_onehot).toarray()
onehot_features = enc.get_feature_names_out(cols)
#turn to df, keeping the row index of the imputed data
onehot_df = pd.DataFrame(
    cat_onehot,
    columns=onehot_features,
    index=cat_to_onehot.index,
)
onehot_df.shape

(5556, 5246)

## JOIN DATA AND SAVE

In [13]:
#join one-hot columns, remaining categorical columns and normalized numerical columns
cat_nonhot = data_cat_imp.drop(columns=cols)
parts = [onehot_df, cat_nonhot, num_norm_imp_pd]
x_train_imp = pd.concat(parts, axis=1)

#a few checks: shape obtained, then shape of the reference one-hot test set
print(x_train_imp.shape)
print(shape_wanted)
#total number of NaNs left
print(x_train_imp.isna().sum().sum())

x_train_imp

(5556, 5290)
(5556, 5290)
0


Unnamed: 0_level_0,AVAIL_A_1,AVAIL_A_2,AVAIL_A_3,AVAIL_A_8,PROXY_A_1.0,PROXY_A_10.0,PROXYREL_A_1.0,PROXYREL_A_2.0,PROXYREL_A_3.0,PROXYREL_A_4.0,...,COGMEMDFF_A,COMDIFF_A,DIFF_A,HEARINGDF_A,VISIONDF_A,PHSTAT_A,POVRATTC_A,INTV_QRT,FAMINCTC_A,AGEP_A
HHX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
H038561,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,2.0,1.0,2.0,0.000050,0.000020,1.000000,0.000135
H043980,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,2.0,0.000045,0.000018,1.000000,0.000314
H052102,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,2.0,1.0,0.000032,0.000033,0.999999,0.001017
H015803,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,2.0,0.000045,0.000014,1.000000,0.000318
H000267,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,2.0,1.0,1.0,1.0,1.0,3.0,0.000051,0.000021,1.000000,0.000885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
H035143,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,2.0,0.000064,0.000036,1.000000,0.000782
H035351,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.000050,0.000015,1.000000,0.000265
H019908,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.000023,0.000013,1.000000,0.000348
H038324,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.000039,0.000041,1.000000,0.000449


In [14]:
#STORE DATA

#make directory (no error if it already exists)
dataDir = 'ImputedData'
os.makedirs(dataDir, exist_ok=True)

#store imputed testing data
#the file suffix is derived from data_percent so the name always matches the
#fraction of data that was removed (it was hard-coded to '_05' while 15% was removed)
filename = 'SimpleImputedFeatures_{:02d}.csv'.format(int(data_percent))
filepath = os.path.join(dataDir, filename)
x_train_imp.to_csv(filepath)