# Import Dependencies

In [19]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error

pd.set_option('display.max_columns', None)

import houseprice_package as hp # This is my own package!

## Load Pickle Objects

The objects loaded below were pickled in the data analysis and training notebook and are restored here. A note on the format:
- Pickle: A Python module utilized for serializing and deserializing Python objects. This process involves converting Python objects into a byte stream, enabling their storage in a file. Pickle facilitates the serialization of various data structures such as lists and dictionaries into a binary format. However, it's essential to exercise caution when unpickling data from unknown or untrusted sources, as this action may pose a security risk by potentially executing arbitrary code.

In [20]:
# Directory that holds the artifacts pickled by the data analysis and training notebook.
# Kept in a single constant so the notebook can be pointed at another location by
# editing one line instead of ten.
PICKLE_DIR = "C:/Users/nene0/OneDrive/바탕 화면/Python Learning/DataScienceMod2_LFZ/HousePrice_cleaned/train_pickles"


def _load_pickle(file_name):
    """Load one pickled artifact from PICKLE_DIR through hp.load_objects."""
    return hp.load_objects(f"{PICKLE_DIR}/{file_name}")


# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
avgdB = _load_pickle("avg_dB_dict.pickle")
en = _load_pickle("elastic_net.pickle")
gbr = _load_pickle("gradient.pickle")
lf_dict = _load_pickle("lf_dict.pickle")
# The variable is named mas_dict although the file on disk is mm_dict.pickle.
mas_dict = _load_pickle("mm_dict.pickle")
oneHot = _load_pickle("oneHot.pickle")
features = _load_pickle("selected_ft.pickle")
scaler = _load_pickle("std_scaler.pickle")
xgbr = _load_pickle("xg_regressor.pickle")
features_drop = _load_pickle("features_drop.pickle")

# Validation Data

In [21]:
# Read the validation CSV into a DataFrame.
validation_csv_path = r"C:\Users\nene0\OneDrive\바탕 화면\Python Learning\DataScienceMod2_LFZ\HousePrice_cleaned\house_price_validation.csv"
validation = pd.read_csv(validation_csv_path)

In [22]:
# Remove the row identifier and five further features in a single call.
unused_columns = ['Id', 'MasVnrType', 'Fence', 'Alley', 'MiscFeature', 'PoolQC']
validation.drop(columns=unused_columns, inplace=True)

In [23]:
# Checked the unique values using data dictionary, 'N/A' means no basement.
# A basement descriptor is only relabelled when it is missing AND the house has
# zero basement area; other gaps are left untouched for the steps below.
no_basement = validation['TotalBsmtSF'] == 0
for bsmt_col in ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']:
    validation[bsmt_col] = np.where(no_basement & validation[bsmt_col].isna(), 'N/A', validation[bsmt_col])

# Fill in missing values by mode for the index 534.
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: that form is chained assignment, which warns in pandas 2.x and does not
# modify the DataFrame at all once Copy-on-Write is enabled.
validation['BsmtFinType2'] = validation['BsmtFinType2'].fillna('Unf')

# Same rule for the garage descriptors, keyed on zero garage area.
# GarageYrBlt receives 0 as its placeholder instead of the 'N/A' label.
no_garage = validation['GarageArea'] == 0
garage_placeholders = {
    'GarageType': 'N/A',
    'GarageYrBlt': 0,
    'GarageFinish': 'N/A',
    'GarageQual': 'N/A',
    'GarageCond': 'N/A',
}
for garage_col, placeholder in garage_placeholders.items():
    validation[garage_col] = np.where(no_garage & validation[garage_col].isna(), placeholder, validation[garage_col])

In [24]:
# Assign the filled column back rather than using fillna(inplace=True) on the
# selected column; the chained in-place form warns in pandas 2.x and is a silent
# no-op under Copy-on-Write.
validation['Electrical'] = validation['Electrical'].fillna('SBrkr')
validation['FireplaceQu'] = validation['FireplaceQu'].fillna('N/A')

# Missing MasVnrArea / LotFrontage values are looked up by the row's Neighborhood in
# the dictionaries restored from the training pickles. A neighborhood without an
# entry leaves the value missing, exactly as the row-wise .get() fallback did.
# NOTE(review): assumes mas_dict and lf_dict are plain Neighborhood -> value
# mappings -- confirm against the training notebook.
validation['MasVnrArea'] = validation['MasVnrArea'].fillna(validation['Neighborhood'].map(mas_dict))
validation['LotFrontage'] = validation['LotFrontage'].fillna(validation['Neighborhood'].map(lf_dict))

In [25]:
# MSSubClass type is integer but it is actually nominal categorical data type.
validation['MSSubClass'] = validation['MSSubClass'].astype('object')

# Change some features into binary: 1 when the column equals the listed value, else 0
# (missing values compare unequal and therefore become 0).
binary_positive_value = {
    'GarageFinish': 'Fin',
    'CentralAir': 'Y',
    'Functional': 'Typ',
    'PavedDrive': 'Y',
    'Street': 'Pave',
    'Utilities': 'AllPub',
}
for flag_col, positive_value in binary_positive_value.items():
    validation[flag_col] = (validation[flag_col] == positive_value).astype('int64')
# Fireplaces is a count; the flag only records whether at least one exists.
validation['Fireplaces'] = (validation['Fireplaces'] > 0).astype('int64')

qc_dictionary = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'N/A':0}
height_dictionary = {'Ex':100, 'Gd':90, 'TA':80, 'Fa':70, 'Po':60, 'N/A':0}

# Series.map is used instead of Series.replace: replacing strings with numbers on an
# object column depends on implicit downcasting, which is deprecated in pandas 2.2.
# Without that downcast the columns stay object dtype, and the averaged features
# built from them below would then be selected as categorical by select_dtypes('object').
# Side effect of map: a label that is not in the dictionary becomes NaN instead of
# being left as a string.
quality_columns = ['ExterQual', 'ExterCond', 'BsmtCond', 'HeatingQC',
                   'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']
for quality_col in quality_columns:
    validation[quality_col] = validation[quality_col].map(qc_dictionary)

#Make a feature that is Basement Height instead of quality to prevent confusion.
validation['BsmtHeight'] = validation['BsmtQual'].map(height_dictionary)
validation.drop(columns='BsmtQual', inplace=True)

# Each combined score is the plain average of a quality/condition pair.
validation['OverallQC'] = (validation['OverallQual'] + validation['OverallCond'])/2
validation['ExteriorQC'] = (validation['ExterQual'] + validation['ExterCond'])/2
validation['GarageQC'] = (validation['GarageQual'] + validation['GarageCond'])/2

validation.drop(columns=['OverallQual','OverallCond','ExterQual','ExterCond','GarageQual','GarageCond'], inplace=True)

# A basement finish type counts as 1 only when it is 'GLQ' or 'ALQ'; BsmtFinish is
# the number of the two finish-type columns that qualify (0, 1 or 2).
finished_types = ['GLQ', 'ALQ']
validation['BsmtFinType1'] = validation['BsmtFinType1'].isin(finished_types).astype('int64')
validation['BsmtFinType2'] = validation['BsmtFinType2'].isin(finished_types).astype('int64')

validation['BsmtFinish'] = validation['BsmtFinType1'] + validation['BsmtFinType2']

validation.drop(columns=['BsmtExposure','BsmtFinType1','BsmtFinType2'], inplace=True)

In [26]:
# Total Porch Area
validation['TotalPorchSF'] = (
    validation['OpenPorchSF']
    + validation['EnclosedPorch']
    + validation['3SsnPorch']
    + validation['ScreenPorch']
)

# Total Living Area
validation['TotalLivSF'] = (
    validation['MasVnrArea']
    + validation['TotalBsmtSF']
    + validation['1stFlrSF']
    + validation['2ndFlrSF']
    + validation['GarageArea']
    + validation['PoolArea']
    + validation['TotalPorchSF']
    + validation['WoodDeckSF']
)

# New Built House
validation['NewHouse'] = (validation['SaleType'] == 'New').astype('int64')

# Expensive Neighborhood
expensive_neighborhoods = ['NoRidge', 'NridgHt', 'StoneBr']
validation['ExpNeighborhood'] = validation['Neighborhood'].isin(expensive_neighborhoods).astype('int64')

# Basement and Ground Bathrooms
# A half-bath count of 1 is worth 0.5 and a count of 2 is worth 1; anything else maps to 0.
half_bath_value = {1: 0.5, 2: 1}
validation['BsmtHalfBath'] = validation['BsmtHalfBath'].map(lambda count: half_bath_value.get(count, 0))
validation['HalfBath'] = validation['HalfBath'].map(lambda count: half_bath_value.get(count, 0))

validation['BsmtBaths'] = validation['BsmtFullBath'] + validation['BsmtHalfBath']
validation['GrBaths'] = validation['FullBath'] + validation['HalfBath']

# Age of the House
validation['HouseAge'] = validation['YrSold'] - validation['YearBuilt']

#Neighbor dB
# First collapse the raw proximity codes into four groups (string -> string, so
# replace is safe here and leaves unknown codes untouched).
condition = {'Norm':'Normal', 'Feedr':'Road', 'PosN':'Good', 'Artery':'Road', 'RRAe':'Railroad', 'RRNn':'Railroad', 'RRAn':'Railroad',
             'PosA':'Good', 'RRNe':'Railroad'}
validation['Condition1'] = validation['Condition1'].replace(condition)
validation['Condition2'] = validation['Condition2'].replace(condition)
validation['NeighborCondition'] = validation['Condition1'] + validation['Condition2']

# String -> number lookups use Series.map rather than Series.replace: replace relies
# on implicit object downcasting (deprecated in pandas 2.2) and would otherwise leave
# the numeric result in an object column. Unmapped keys become NaN.
condition2 = {'NormalNormal':60, 'RoadNormal':75, 'GoodNormal':55, 'RoadRoad':80,'RailroadNormal':85,
 'RoadRailroad':100, 'RailroadRoad':100, 'GoodGood':50,'RoadGood':70}
# This per-house value is removed again by the drop below; only the neighborhood
# average restored from the training pickles is kept as a feature.
validation['NeighborNoise(dB)'] = validation['NeighborCondition'].map(condition2)

validation['NeighborAvg_dB'] = validation['Neighborhood'].map(avgdB).round(2)

validation.drop(columns=['MoSold','SaleType','SaleCondition','Condition1','Condition2','NeighborNoise(dB)','NeighborCondition','BsmtFullBath','BsmtHalfBath','YrSold','LotConfig','HalfBath'], inplace=True)

In [27]:
# Encode every object-dtype column with the encoder restored from the training
# pickles, then keep only the columns listed in `features`.
val_cat_ft = validation.select_dtypes(include='object')
encoded_val = oneHot.transform(val_cat_ft)
encoded_val_df = encoded_val[features]

In [28]:
# The target is log-transformed here; the RMSE cell undoes this with np.exp.
val_target = np.log(validation['SalePrice'])
validation.drop(columns=['Street', 'Utilities','SalePrice'], inplace=True)

# Columns that bypass the scaler and are joined to the final frame unchanged.
not_scaled_columns = [
    'YearBuilt', 'YearRemodAdd', 'BsmtCond', 'HeatingQC', 'CentralAir', 'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'Functional',
    'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'PavedDrive', 'OverallQC', 'ExteriorQC', 'GarageQC',
    'BsmtFinish', 'NewHouse', 'ExpNeighborhood', 'BsmtBaths', 'GrBaths',
]
ns_val = validation[not_scaled_columns]

# Every remaining float/int column goes through the scaler from the training pickles.
n_val = validation.select_dtypes(['float', 'int']).drop(columns=not_scaled_columns)

# Carry n_val's index over to the scaled frame. The frames are combined afterwards
# with an index-based join, so rebuilding the frame on a fresh 0..n-1 index would
# silently misalign rows whenever `validation` does not have a default index.
scaled_val = pd.DataFrame(scaler.transform(n_val), columns=n_val.columns, index=n_val.index)

In [29]:
# Column order of the model input: encoded categoricals, scaled numerics, unscaled numerics.
fin_val = encoded_val_df.join(scaled_val).join(ns_val)

In [30]:
# Run each of the three restored models on the same validation matrix.
en_pred, gbr_pred, xgbr_pred = [model.predict(fin_val) for model in (en, gbr, xgbr)]

In [31]:
# Blend weights for the three models' predictions.
w_en, w_gbr, w_xgbr = 0.5, 0.2, 0.3
weighted_pred = en_pred * w_en + gbr_pred * w_gbr + xgbr_pred * w_xgbr

# Both target and prediction are exponentiated before the error is computed.
weighted_mse = mean_squared_error(np.exp(val_target), np.exp(weighted_pred))
print('Weighted RMSE:', weighted_mse ** .5)

Weighted RMSE: 23904.490439255325


The RMSE of the weighted prediction on the validation dataset, which was held out from the original train dataset, is approximately 24K. This differs by approximately 5K from the RMSE observed on the train dataset. Ideally, we aim to minimize the gap between these two scores for better model generalization. Nevertheless, for the present moment, we will employ this model to make predictions on the actual test dataset.

# Test Data

In [3]:
# Absolute path of the test CSV on the author's machine; edit before running elsewhere.
test_path = r"C:\Users\nene0\Documents\Datasets\house-prices-advanced-regression-techniques\test.csv"

In [4]:
# Run the test CSV through the houseprice_package (hp) helpers, passing in the
# artifacts restored from the training pickles.
# NOTE(review): the helpers' internals are not visible in this notebook; they
# presumably mirror the validation preprocessing cells above -- confirm in hp.

# Load the file, then drop the id and the features listed in features_drop.
df = hp.read_data(test_path)
test = hp.drop_id(df)
test = hp.drop_features_ts(test, features_drop)
# Imputation and clean-up; mas_dict / lf_dict are the same lookups used for validation.
test = hp.data_imputation_ts(test, mas_dict, lf_dict)
test = hp.data_org(test)
test = hp.test_fill_values(test)
# Feature engineering takes the avgdB lookup.
test = hp.feature_eng_ts(test, avgdB)
# Encode with the fitted encoder and keep the columns named in `features`.
encoded_test = hp.onehot_encode_ts(test, oneHot)
encoded_test = hp.selected_features_ts(encoded_test, features)
# Scaling returns two results: the scaled part and the part left unscaled.
scaled_test, noscale_test = hp.scale_data_ts(test, scaler)
# Combine the three parts into the final model input.
test_final = hp.final_test(encoded_test, scaled_test, noscale_test)
# One prediction per model, then the weighted combination.
test_en_pred, test_gbr_pred, test_xgbr_pred = hp.price_prediction(test_final, en, gbr, xgbr)
final_pred = hp.weighted_pred_ts(test_en_pred, test_gbr_pred, test_xgbr_pred)

In [5]:
# Display the weighted test predictions (a single SalePrice column).
final_pred

Unnamed: 0,SalePrice
0,128250.879389
1,163414.757031
2,190749.234289
3,203481.241532
4,196541.320088
...,...
1454,86164.553925
1455,86349.368901
1456,176765.974424
1457,121650.124802


In [6]:
# Take the sample submission file, discard its SalePrice column, attach our
# predictions by index, and write the result out as a CSV.
sample_submission_path = "C://Users//nene0//OneDrive//바탕 화면//Datasets//house-prices-advanced-regression-techniques//sample_submission.csv"
submission = pd.read_csv(sample_submission_path).drop(columns='SalePrice')
result_df = submission.join(final_pred)
result_df.to_csv("newly_cleaned_result.csv", index=False)
result_df

Unnamed: 0,Id,SalePrice
0,1461,128250.879389
1,1462,163414.757031
2,1463,190749.234289
3,1464,203481.241532
4,1465,196541.320088
...,...,...
1454,2915,86164.553925
1455,2916,86349.368901
1456,2917,176765.974424
1457,2918,121650.124802


In [7]:
# Display the submission frame (Id and SalePrice); same output as the previous cell.
result_df

Unnamed: 0,Id,SalePrice
0,1461,128250.879389
1,1462,163414.757031
2,1463,190749.234289
3,1464,203481.241532
4,1465,196541.320088
...,...,...
1454,2915,86164.553925
1455,2916,86349.368901
1456,2917,176765.974424
1457,2918,121650.124802


# Summary

Consequently, this model has shown improvement over the base model and the model that did not account for data leakage prevention in the Kaggle competition. The score of this model has increased by approximately 0.23 in the Kaggle competition. As we discussed in the training notebook, while this approach may not yield the highest possible score in Kaggle competitions, it results in a more generalized model that closely simulates the real-world data modeling process.

In the real world, data leakage is less prevalent than in Kaggle competitions, where test data information should not be incorporated into the prediction model. However, practicing and implementing a more realistic approach to model building can significantly enhance one's skills as a real-world data scientist.