In [199]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from pandas.api.types import is_numeric_dtype

In [185]:
SMALL_SIZE = 14
MEDIUM_SIZE = 16
BIGGER_SIZE = 18

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [186]:
dataset = pd.read_csv('G:/data/HousePrices/train.csv').drop(['Id'], axis=1)
train_dataset = dataset.copy()
test_dataset = pd.read_csv('G:/data/HousePrices/test.csv')
submission = pd.read_csv('G:/data/HousePrices/sample_submission.csv')

In [187]:
dataset

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [188]:
train_dataset

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


# Feature engineering

In [189]:
def check_missing_values_num(dataset):
    features_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 1 and is_numeric_dtype(dataset[feature])]
    for feature in features_nan:
        print(f"{feature}: {np.round(dataset[feature].isnull().mean(), 4)}% missing values")
    return features_nan

def replace_missing_values_num(dataset):
    features_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 1 and is_numeric_dtype(dataset[feature])]
    for feature in features_nan:
        dataset[feature] = dataset[feature].fillna(dataset[feature].median())

In [190]:
num_nan = check_missing_values_num(train_dataset)

LotFrontage: 0.1774% missing values
MasVnrArea: 0.0055% missing values
GarageYrBlt: 0.0555% missing values


In [191]:
replace_missing_values_num(train_dataset)

In [192]:
train_dataset

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [193]:
features_replacement = {}

In [194]:
features_cat = [feature for feature in dataset.columns if not is_numeric_dtype(dataset[feature])]
for feature in features_cat:
    unique_numbers = train_dataset[feature].unique()
    features_replacement[feature] = (unique_numbers, range(len(unique_numbers)))
    train_dataset[feature].replace(features_replacement[feature][0],
                                    features_replacement[feature][1], inplace=True)

In [195]:
features_year = [feature for feature in dataset.columns if 'Year' in feature]
features_year.append('YrSold')
for feature in features_year:
    unique_numbers = train_dataset[feature].unique()
    features_replacement[feature] = (np.sort(unique_numbers), range(len(unique_numbers)))
    train_dataset[feature].replace(features_replacement[feature][0],
                                    features_replacement[feature][1], inplace=True)

In [197]:
features_replacement

{'MSZoning': (array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object),
  range(0, 5)),
 'Street': (array(['Pave', 'Grvl'], dtype=object), range(0, 2)),
 'Alley': (array([nan, 'Grvl', 'Pave'], dtype=object), range(0, 3)),
 'LotShape': (array(['Reg', 'IR1', 'IR2', 'IR3'], dtype=object), range(0, 4)),
 'LandContour': (array(['Lvl', 'Bnk', 'Low', 'HLS'], dtype=object),
  range(0, 4)),
 'Utilities': (array(['AllPub', 'NoSeWa'], dtype=object), range(0, 2)),
 'LotConfig': (array(['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3'], dtype=object),
  range(0, 5)),
 'LandSlope': (array(['Gtl', 'Mod', 'Sev'], dtype=object), range(0, 3)),
 'Neighborhood': (array(['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst',
         'NWAmes', 'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes',
         'SawyerW', 'IDOTRR', 'MeadowV', 'Edwards', 'Timber', 'Gilbert',
         'StoneBr', 'ClearCr', 'NPkVill', 'Blmngtn', 'BrDale', 'SWISU',
         'Blueste'], dtype=object),
  range(0, 25)),
 'Condition

In [202]:
features_num = [feature for feature in train_dataset.columns if is_numeric_dtype(train_dataset[feature])]
train_dataset[features_num] = normalize(train_dataset[features_num], norm='l1', axis=0)

In [203]:
train_dataset

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0.000722,0.0,0.000637,0.000550,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000217,0.000754,0.0,0.000000,0.000789
1,0.000241,0.0,0.000784,0.000625,0.0,0.0,0.000000,0.0,0.0,0.001148,...,0.0,0.0,0.000000,0.000000,0.000000,0.000542,0.000377,0.0,0.000000,0.000687
2,0.000722,0.0,0.000667,0.000733,0.0,0.0,0.001678,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000975,0.000754,0.0,0.000000,0.000846
3,0.000843,0.0,0.000588,0.000622,0.0,0.0,0.001678,0.0,0.0,0.002296,...,0.0,0.0,0.000000,0.000000,0.000000,0.000217,0.000000,0.0,0.001957,0.000530
4,0.000722,0.0,0.000824,0.000929,0.0,0.0,0.001678,0.0,0.0,0.001148,...,0.0,0.0,0.000000,0.000000,0.000000,0.001300,0.000754,0.0,0.000000,0.000946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,0.000722,0.0,0.000608,0.000516,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000867,0.000377,0.0,0.000000,0.000663
1456,0.000241,0.0,0.000833,0.000858,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.002058,0.000000,0.000000,0.000217,0.001509,0.0,0.000000,0.000795
1457,0.000843,0.0,0.000647,0.000589,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.006173,0.015873,0.039374,0.000542,0.001509,0.0,0.000000,0.001009
1458,0.000241,0.0,0.000667,0.000633,0.0,0.0,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000433,0.001509,0.0,0.000000,0.000538


In [6]:
test_dataset

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [7]:
submission

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.683570
3,1464,179317.477511
4,1465,150730.079977
...,...,...
1454,2915,167081.220949
1455,2916,164788.778231
1456,2917,219222.423400
1457,2918,184924.279659


# Model training