# House Prices - Advanced Regression Techniques (Kaggle competition)
## Predict on Hold Out Set Notebook

In [49]:
# import necessary packages
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import pickle

# silence all warnings for the rest of the session
# (blanket filter: no message/module pattern, category defaults to Warning)
import warnings
warnings.simplefilter('ignore')

In [50]:
# read the hold-out feature table from disk
df_test = pd.read_csv(filepath_or_buffer="./data/df_test.csv")

In [51]:
# restore the pickled artifacts needed for prediction
# NOTE: unpickling can execute arbitrary code; only load files you trust.

# fitted model (used below via model.predict)
with open("./data/pickle/model_final", mode='rb') as model_fh:
    model = pickle.load(model_fh)

# k best cols (used below to select columns of df_test)
with open("./data/pickle/k_best_cols", mode='rb') as cols_fh:
    k_best_cols = pickle.load(cols_fh)

## Data Cleaning 

In [52]:
# impute numeric
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
imputer = SimpleImputer(missing_values=np.nan, strategy="median")

# Replace missing values in every numeric column with that column's median.
# NOTE(review): the imputer is re-fit on the hold-out data for each column, so the
# medians come from df_test itself — confirm this matches how training data was imputed.
for col in df_test.columns:
    if np.issubdtype(df_test[col].dtype, np.number):
        # SimpleImputer works on 2-D input: reshape to a single-column array,
        # then pull that column back out as 1-D.
        df_test[col] = imputer.fit_transform(df_test[col].values.reshape(-1, 1))[:, 0]

In [53]:
# any value still missing anywhere in the frame becomes the literal label "Unknown"
df_test = df_test.fillna(value="Unknown")

## Outliers

In [54]:
# cap extreme high values in every numeric column
for col in df_test.columns:
    # leave non-numeric columns untouched
    if not np.issubdtype(df_test[col].dtype, np.number):
        continue

    column = df_test[col]
    # ceiling: six standard deviations above the column mean
    upper_cap = column.mean() + 6 * column.std()

    # values above the ceiling are pulled down to it; everything else is kept
    df_test[col] = np.where(column.values > upper_cap, upper_cap, column)

## Predict Holdout

In [55]:
# score the hold-out rows using only the columns listed in k_best_cols
holdout_features = df_test[k_best_cols]
final_predictions = model.predict(holdout_features)

In [56]:
# flatten the predictions to one dimension
final_predictions = final_predictions.ravel()

# assemble the answer frame: sequential Ids next to the predicted prices
# NOTE(review): Ids are generated starting at 1461 rather than read from the data —
# this assumes the rows of df_test are in their original order; confirm.
final_answer_df = pd.DataFrame(
    {
        'Id': np.arange(1461, 1461 + final_predictions.shape[0]),
        'SalePrice': final_predictions,
    },
    columns=['Id', 'SalePrice'],
)

final_answer_df.head()

Unnamed: 0,Id,SalePrice
0,1461,111671.985552
1,1462,163324.15631
2,1463,179220.874058
3,1464,184540.460514
4,1465,216483.380081


In [58]:
# write the Id/SalePrice table to CSV, leaving out the DataFrame index
final_answer_df.to_csv(path_or_buf="./data/housing_preds.csv", index=False)