## Test Data Organizing

- This notebook follows a very similar format to the data organization used for the training data.
- This is purposeful, to ensure that the transformations remain consistent between the two datasets.
- Additionally, looking forward, this really illuminates how object-oriented design (OOD) will be necessary for larger projects as I continue through the program. A custom 'Order & Transform' class with reusable methods could have saved me time and effort. I will be sure to employ this in Project 3.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn import model_selection #import train_test_split
from sklearn import linear_model #LinearRegression
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
pd.options.mode.chained_assignment = None

In [2]:
# Load the raw test set; the first column of the file is used as the index on read.
data = pd.read_csv('../datasets/test.csv', index_col=0)

In [3]:
# Move that index back into an ordinary column so it is renamed with the others below.
data = data.reset_index()

In [4]:
# Normalise column names: spaces become underscores and letters are lower-cased.
data.columns = data.columns.str.replace(' ', '_').str.lower()

In [5]:
# Keep the ids of the test rows (on data's index) in their own CSV file.
test_ids = pd.DataFrame({'id': data.id})
test_ids.to_csv('../datasets/test_ids.csv')

In [6]:
# Load the cleaned metadata table; its column_name / variable_type / impute_* fields
# drive the selections made in the cells below.
metadata = pd.read_csv('../datasets/metadata-cleaned.csv', index_col=0)

In [7]:
# Leave the 'id' row out of the metadata.
metadata = metadata[(metadata.column_name != 'id')]

In [8]:
# Leave the 'pid' row out of the metadata as well.
metadata = metadata[(metadata.column_name != 'pid')]

In [9]:
# Columns removed from the test set before any further processing.
dropped_columns = [
    'pool_qc',
    'pool_area',
    'alley',
    'fence',
    'garage_qual',
    'mas_vnr_type',
    'misc_feature',
]
data = data.drop(columns=dropped_columns)

In [10]:
# Renumber the metadata rows after the filtering above (old index is discarded).
metadata = metadata.reset_index(drop=True)

In [12]:
# Metadata rows whose variable_type is 'Nominal'.
all_nominals_names = metadata[metadata.variable_type == 'Nominal']

In [13]:
# The matching columns of the test data.
nominals = data[all_nominals_names.column_name]

In [14]:
# Metadata rows whose variable_type is 'Numerical'.
numeric_names = metadata[metadata.variable_type == 'Numerical']

In [15]:
# The matching columns of the test data.
numerics = data[numeric_names.column_name]

### Transform Method from Ordinal Null Handling

In [16]:
def ord_transform(df, col, transform):
    '''Map an ordinal column to numeric ranks and record the mapping in metadata.

    Parameters
    ----------
    df : DataFrame that holds the column; it is modified in place and returned.
    col : name of the column to convert.
    transform : non-empty dict from raw category value to numeric rank.
        Column values that are not keys of the dict come out as NaN
        (standard Series.map behaviour).

    Returns
    -------
    The same DataFrame, with df[col] replaced by the mapped values.

    Raises
    ------
    ValueError
        If transform is empty or None. Raised before anything is modified.
    '''
    # Explicit check rather than `assert`, which is skipped under `python -O`.
    if not transform:
        raise ValueError('ord_transform needs a non-empty mapping for column %r' % (col,))
    df[col] = df[col].map(transform)
    # Store the mapping as text in the module-level metadata table so that
    # check_if_transformed() reports this column as handled.
    metadata.loc[metadata['column_name'] == col, 'ord_transform_map'] = str(transform)
    return df

In [17]:
def check_if_transformed(variable):
    '''Tell whether metadata already holds an ord_transform_map entry for `variable`.

    Returns False when every metadata row matching `variable` has a null
    ord_transform_map (this includes the case where no row matches), and
    True otherwise.
    '''
    matching_rows = metadata['column_name'] == variable
    recorded = metadata.loc[matching_rows, 'ord_transform_map']
    return not recorded.isnull().all()

In [18]:
def mark_dont_transform(variable):
    '''Flag `variable` in metadata as needing no ordinal mapping.

    Writes the string 'No Transform' into ord_transform_map for the metadata
    rows whose column_name equals `variable`. Returns nothing.
    '''
    is_target = metadata['column_name'] == variable
    metadata.loc[is_target, 'ord_transform_map'] = 'No Transform'

In [19]:
# Lookup tables turning ordinal category labels into integer ranks.
# Tables that contain an np.nan key send missing values to 0.
popular_ord_transform = {
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1,
    np.nan: 0,
}
garage_finish_dict = {
    'Fin': 3,
    'RFn': 2,
    'Unf': 1,
    np.nan: 0,
}
basement_finish_dict = {
    'GLQ': 6,
    'ALQ': 5,
    'BLQ': 4,
    'Rec': 3,
    'LwQ': 2,
    'Unf': 1,
    np.nan: 0,
}
# 'No' and a missing value both map to 0.
basement_exp_dict = {
    'Gd': 3,
    'Av': 2,
    'Mn': 1,
    'No': 0,
    np.nan: 0,
}
slope_dict = {'Gtl': 3, 'Mod': 2, 'Sev': 1}
countour_dict = {'Lvl': 4, 'Low': 3, 'Bnk': 2, 'HLS': 1}
lot_shape_dict = {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1}
# Ranks run 1..8 in the listed order.
functional_dict = {
    level: rank
    for rank, level in enumerate(
        ['Typ', 'Min1', 'Min2', 'Mod', 'Maj1', 'Maj2', 'Sev', 'Sal'],
        start=1,
    )
}
binary_dict = {'Y': 1, 'N': 0}

In [20]:
# Metadata rows whose variable_type is 'Ordinal'.
ordinal_names = metadata[metadata.variable_type == 'Ordinal']

In [21]:
# The matching columns of the test data; used below to iterate over ordinal column names.
ordinals = data[ordinal_names.column_name]

In [22]:
# Ordinal columns already stored as int64 need no mapping; note that in the metadata.
already_numeric = [name for name in ordinals.columns if data[name].dtype == 'int64']
for name in already_numeric:
    mark_dont_transform(name)

In [23]:
def get_ordinals_untransformed():
    '''List the ordinal columns for which no transform has been recorded yet.

    Returns a list of (column name, unique values of that column in data)
    tuples, in the order of ordinals.columns.
    '''
    return [
        (name, data[name].unique())
        for name in ordinals.columns
        if not check_if_transformed(name)
    ]

### Applying All Translations/Transformations to Ordinal Data

In [24]:
# Each call below replaces one ordinal column with its numeric ranks and records
# the mapping in metadata. popular_ord_transform is the Ex/Gd/TA/Fa/Po scale (NaN -> 0).
data = ord_transform(data,'fireplace_qu',popular_ord_transform)

In [25]:
data = ord_transform(data,'bsmt_qual',popular_ord_transform)

In [26]:
data = ord_transform(data,'bsmt_cond',popular_ord_transform)

In [27]:
# Basement finish type has its own scale (GLQ=6 ... Unf=1, NaN -> 0).
data = ord_transform(data,'bsmtfin_type_1',basement_finish_dict)

In [28]:
# 'No' and NaN both map to 0 in basement_exp_dict.
data = ord_transform(data,'bsmt_exposure',basement_exp_dict)

In [29]:
data = ord_transform(data,'garage_finish',garage_finish_dict)

In [30]:
data = ord_transform(data,'garage_cond',popular_ord_transform)

In [31]:
data = ord_transform(data,'exter_cond',popular_ord_transform)

In [32]:
data = ord_transform(data,'heating_qc',popular_ord_transform)

In [33]:
data = ord_transform(data,'exter_qual',popular_ord_transform)

In [34]:
data = ord_transform(data,'kitchen_qual',popular_ord_transform)

In [35]:
# The remaining mappings have no np.nan key.
data = ord_transform(data,'land_slope',slope_dict)

In [36]:
data = ord_transform(data,'land_contour',countour_dict)

In [37]:
data = ord_transform(data,'lot_shape',lot_shape_dict)

In [38]:
# Mapping given inline rather than through a named dict.
data = ord_transform(data,'paved_drive',{'Y':3,'P':2,'N':1})

In [39]:
data = ord_transform(data,'functional',functional_dict)

In [40]:
# Y/N flag encoded as 1/0.
data = ord_transform(data,'central_air',binary_dict)

### Data Imputing

In [41]:
def impute(column_names, task, x):
    '''Fill missing values in the given columns with a per-column statistic.

    Parameters
    ----------
    column_names : iterable of column names in `x` to fill.
    task : one of 'impute_mean', 'impute_median' or 'impute_mode'.
    x : DataFrame to fill; it is modified in place and also returned.

    Returns
    -------
    The same DataFrame `x`, with NaNs in each listed column replaced by that
    column's mean, median or first mode. A column with no non-missing values
    is left unchanged for every task.

    Raises
    ------
    ValueError
        If `task` is not one of the three supported names. The check runs
        before any column is touched, so a bad task never leaves `x`
        partially imputed and is reported even when `column_names` is empty.
    '''
    if task not in ('impute_mean', 'impute_median', 'impute_mode'):
        raise ValueError('insert verified task')
    for col in column_names:
        series = x[col]
        if task == 'impute_mean':
            fill_value = series.mean()
        elif task == 'impute_median':
            fill_value = series.median()
        else:
            modes = series.mode()
            # An all-NaN column has no mode; skip it, which matches what
            # mean/median do (their NaN fill value changes nothing).
            if modes.empty:
                continue
            fill_value = modes.iloc[0]
        x[col] = series.fillna(fill_value)
    return x

In [42]:
# Names of the columns flagged impute_mean == 1 in the metadata.
mean_cols = metadata[metadata['impute_mean'] == 1].column_name.values

In [43]:
data = impute(mean_cols,'impute_mean',data)

In [44]:
# Names of the columns flagged impute_mode == 1 in the metadata.
mode_cols = metadata[metadata['impute_mode'] == 1].column_name.values

In [45]:
data = impute(mode_cols,'impute_mode',data)

In [46]:
# Names of the columns flagged impute_median == 1 in the metadata.
median_cols = metadata[metadata['impute_median'] == 1].column_name.values

In [47]:
# Fill the median-flagged columns with their median (not their mode).
data = impute(median_cols,'impute_median',data)

#### One-Hot Encoding Nominal Data

In [48]:
nominal_columns = metadata[metadata.variable_type == 'Nominal'].column_name.values
#just the names of the columns that are nominal

In [49]:
# One-hot encode the nominal columns; drop_first=True leaves out the first level of each.
# NOTE(review): the dummy columns come from the categories present in this file only -
# confirm they line up with the columns produced for the training data.
data_encoded = pd.get_dummies(data, columns = nominal_columns,drop_first=True)

#### Normalizing Numeric Data into z-scores Based on the Test Column Distributions

In [50]:
# Names of the columns the metadata marks as 'Numerical'.
numeric_columns = metadata[metadata.variable_type == 'Numerical'].column_name

In [51]:
data_numeric = data[numeric_columns]

In [52]:
scaler = StandardScaler()

In [53]:
# NOTE(review): the scaler is fitted on the test data itself, so the scaling statistics
# come from this file - confirm this is intended rather than reusing the training scaler.
scaler.fit(data_numeric)

StandardScaler()

In [54]:
# transform() returns a plain array; wrap it in a DataFrame with the numeric column names
# and a fresh default index.
data_num_transformed = scaler.transform(data_numeric)
data_num_trans_df = pd.DataFrame(data_num_transformed,columns=numeric_columns,index=None)

In [55]:
# Everything except the numeric columns (encoded nominals, ordinals, id, pid).
data_not_numeric = data_encoded.drop(columns=numeric_columns,axis=1)

In [56]:
# Give this half a fresh 0..n-1 index so it joins row-for-row with the scaled numerics.
data_not_numeric = data_not_numeric.reset_index(drop=True)

In [57]:
# Join the two halves side by side on their index.
data_final = data_not_numeric.merge(data_num_trans_df,left_index=True, right_index=True)

In [58]:
# Remove the two identifier columns from the prepared output.
data_final = data_final.drop(columns = ['id','pid'])

In [59]:
# Write the prepared test set to disk (the index is written as the first column).
data_final.to_csv('../datasets/test_Prepared.csv')