In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
#LOAD NONHOT DATA
#both feature files live in the same folder
dataDir = 'AnalysisData'
dataSubDir = 'CleanDataNonhot'

#testing data (testFeaturesNonHot.csv); first CSV column is used as the index
filepath = os.path.join(dataDir, dataSubDir, 'testFeaturesNonHot.csv')
test_x_pd_nonhot = pd.read_csv(filepath, index_col=0)
test_x_nonhot = test_x_pd_nonhot.to_numpy()
#shape of the complete test set, printed again after entries are removed
original_shape = test_x_pd_nonhot.shape
print(original_shape)

#training data (trainFeaturesNonHot.csv); first CSV column is used as the index
filepath = os.path.join(dataDir, dataSubDir, 'trainFeaturesNonHot.csv')
train_x_pd_nonhot = pd.read_csv(filepath, index_col=0)
train_x_nonhot = train_x_pd_nonhot.to_numpy()
print(train_x_nonhot.shape)

(5556, 505)
(22222, 505)


In [3]:
#READ THE TABLE OF PER-VARIABLE CLEANING INSTRUCTIONS
#later cells read its COLUMN_NAME, DATA_TYPE, NANs and ENCODING columns
path = 'VariableNanDetails.csv'
data_cleaning_inst = pd.read_csv(path)

In [4]:
#LOAD ONE-HOT ENCODED TESTING DATA; ITS SHAPE IS THE REFERENCE THE IMPUTED DATA IS COMPARED AGAINST
dataDir = 'AnalysisData'
dataSubDir = 'CleanDataFinal'

#testing features, indexed by the HHX column
path = os.path.join(dataDir, dataSubDir, 'testFeaturesFinal.csv')
test_x_pd = pd.read_csv(path, index_col='HHX')
test_x = test_x_pd.to_numpy()
#shape the imputed + encoded test set is expected to have
shape_wanted = test_x.shape
print(shape_wanted)

(5556, 5290)


## REMOVE DATA RANDOMLY

In [5]:
# FUNCTION TO REMOVE DATA RANDOMLY
def remove_random_data(complete_data, data_percent, seed=None):
    '''Replace randomly chosen entries (single features of a row) with NaN.

    complete_data: array-like containing the full dataset; it is not modified
    data_percent: percent (0-100) of the entries to remove
    seed: optional seed for a dedicated random generator; when None the
        global numpy random state is used (so np.random.seed still applies)
    returns: array with the same shape as complete_data in which
        int(size*data_percent/100) entries have been replaced with NaN.
        Integer and boolean input is returned as float because those
        dtypes cannot hold NaN.
    raises: ValueError if data_percent is outside the range 0-100'''
    if not 0 <= data_percent <= 100:
        raise ValueError(
            'data_percent must be between 0 and 100, got {}'.format(data_percent))

    full_data = np.asarray(complete_data)
    original_shape = full_data.shape
    #flatten() always returns a copy, so the caller's array is left untouched
    inc_data = full_data.flatten()
    #integer and boolean arrays cannot store NaN; promote them to float
    if inc_data.dtype.kind in 'iub':
        inc_data = inc_data.astype(float)

    #calculate number of entries to replace
    entries_to_remove = int(inc_data.size*data_percent/100)

    #choose random indices (without repetition) of the entries to replace
    rng = np.random if seed is None else np.random.default_rng(seed)
    i = rng.choice(inc_data.size, entries_to_remove, replace=False)
    #replace all chosen entries in a single vectorized assignment
    inc_data[i] = np.nan
    #reshape data back to original shape
    return inc_data.reshape(original_shape)

## DATA IMPUTATION

In [6]:
#percent of entries to replace with NaN (0 = keep the test data complete)
data_percent = 0
#seed the global numpy random state so the same entries are removed on every run
np.random.seed(0)
test_x_inc = remove_random_data(test_x_nonhot, data_percent)
#turn to dataframe, restoring the original column names and row index
test_x_inc_pd = pd.DataFrame(test_x_inc, columns=test_x_pd_nonhot.columns, index=test_x_pd_nonhot.index)
#removing entries must not change the shape
print(original_shape)
print(test_x_inc_pd.shape)

(5556, 505)
(5556, 505)


In [7]:
#SPLIT THE FEATURES INTO NUMERICAL AND CATEGORICAL PARTS
#numerical features: names of columns typed 'numerical' and not marked 'drop_col'
col_num = data_cleaning_inst.loc[
    (data_cleaning_inst.DATA_TYPE == 'numerical') & (data_cleaning_inst.NANs != 'drop_col'),
    'COLUMN_NAME']
data_num = test_x_inc_pd.loc[:, col_num]
print(data_num.shape)
#categorical features: every column that is not in the numerical list
data_cat = test_x_inc_pd.drop(col_num, axis=1)
print(data_cat.shape)

(5556, 4)
(5556, 501)


## UNIVARIATE IMPUTATION

In [8]:
#NUMERICAL DATA: fill each missing value with the median of its column
#imputer that treats NaN as missing
num_imp = SimpleImputer(strategy='median', missing_values=np.nan)
#work on the raw array
data_num_array = data_num.to_numpy()
#fit the imputer and fill the gaps in one step, then rebuild the dataframe
data_num_imp = pd.DataFrame(
    num_imp.fit_transform(data_num_array),
    columns=data_num.columns,
    index=test_x_pd_nonhot.index,
)
data_num_imp.shape

(5556, 4)

In [9]:
#CATEGORICAL DATA: fill each missing value with the most frequent value of its column
#imputer that treats NaN as missing
cat_imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
#work on the raw array
data_cat_array = data_cat.to_numpy()
#fit the imputer and fill the gaps in one step, then rebuild the dataframe
data_cat_imp = pd.DataFrame(
    cat_imp.fit_transform(data_cat_array),
    columns=data_cat.columns,
    index=test_x_pd_nonhot.index,
)
#shape before and after imputation
print(data_cat.shape)
print(data_cat_imp.shape)

(5556, 501)
(5556, 501)


In [10]:
#STACK TRAINING AND IMPUTED TEST CATEGORICAL DATA SO THE ONE-HOT ENCODER IS FITTED ON BOTH

#names of the numerical columns (typed 'numerical' and not marked 'drop_col')
col_num = data_cleaning_inst.loc[
    (data_cleaning_inst.DATA_TYPE == 'numerical') & (data_cleaning_inst.NANs != 'drop_col'),
    'COLUMN_NAME']
#categorical part of the training data
data_cat_train = train_x_pd_nonhot.drop(col_num, axis=1)
print(data_cat_train.shape)

#training rows first, imputed test rows after them
x_all = pd.concat([data_cat_train, data_cat_imp])
print(x_all.shape)

(22222, 501)
(27778, 501)


In [11]:
#ONE-HOT ENCODE CATEGORICAL DATA
#make list of columns to one-hot encode
#NOTE(review): PHQCAT_A is excluded explicitly; the reason is not visible in this notebook
cols = data_cleaning_inst.COLUMN_NAME[(data_cleaning_inst.ENCODING == 'one_hot') &
                                      (data_cleaning_inst.COLUMN_NAME != 'PHQCAT_A')].tolist()
#isolate data to encode as one-hot (train + test rows, so the encoder is fitted on both)
clean_onehot_data = x_all.loc[:, cols]
#categorical columns that are not one-hot encoded are carried over unchanged
nonhot_cat = data_cat_imp.drop(columns=cols)
#create object
enc = OneHotEncoder()
#fit encoder on train + test rows
enc.fit(clean_onehot_data)

#transform only the test rows: x_all was built as [train, test], so they are the
#last len(data_cat_imp) rows. Selecting them by position avoids densifying the
#encoded training rows and cannot pick up extra rows if an index label repeats.
test_HHX = test_x_pd_nonhot.index
n_test = len(data_cat_imp)
test_onehot_data = clean_onehot_data.iloc[len(clean_onehot_data) - n_test:]
clean_onehot_data_enc = enc.transform(test_onehot_data).toarray()

#get name of new columns
onehot_features = enc.get_feature_names_out(cols)
#turn to df
onehot_df = pd.DataFrame(clean_onehot_data_enc, columns=onehot_features, index=test_onehot_data.index)

#join with previous data
test_x_imp = pd.concat([data_num_imp, onehot_df, nonhot_cat], axis=1)
print(test_x_imp.shape)
print(shape_wanted)
#the imputed data must line up with the reference one-hot encoded test set
assert test_x_imp.shape == shape_wanted, 'imputed data shape does not match the reference test set'

(5556, 5290)
(5556, 5290)


In [None]:
#STORE DATA

#make directory; exist_ok avoids the separate existence check and is safe on re-runs
dataDir = 'ImputedData'
os.makedirs(dataDir, exist_ok=True)

#store imputed testing data
filepath = os.path.join(dataDir, 'SimpleImputedFeatures_02.csv')
test_x_imp.to_csv(filepath)