# Set-Up

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from collections import OrderedDict
L = LinearRegression(n_jobs=-1)
import pandas_profiling
from functools import reduce

# Extract Data

In [2]:
test = pd.read_csv('testhouse.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
Id               1459 non-null int64
MSSubClass       1459 non-null int64
MSZoning         1455 non-null object
LotFrontage      1232 non-null float64
LotArea          1459 non-null int64
Street           1459 non-null object
Alley            107 non-null object
LotShape         1459 non-null object
LandContour      1459 non-null object
Utilities        1457 non-null object
LotConfig        1459 non-null object
LandSlope        1459 non-null object
Neighborhood     1459 non-null object
Condition1       1459 non-null object
Condition2       1459 non-null object
BldgType         1459 non-null object
HouseStyle       1459 non-null object
OverallQual      1459 non-null int64
OverallCond      1459 non-null int64
YearBuilt        1459 non-null int64
YearRemodAdd     1459 non-null int64
RoofStyle        1459 non-null object
RoofMatl         1459 non-null object
Exterior1st      1458 non-

In [3]:
test.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
train = pd.read_csv('trainhouse.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [5]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [6]:
train.shape, test.shape

((1460, 81), (1459, 80))

In [7]:
datasets = [train, test]
combine = pd.concat(datasets, axis=0, sort=False)
combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 81 columns):
Id               2919 non-null int64
MSSubClass       2919 non-null int64
MSZoning         2915 non-null object
LotFrontage      2433 non-null float64
LotArea          2919 non-null int64
Street           2919 non-null object
Alley            198 non-null object
LotShape         2919 non-null object
LandContour      2919 non-null object
Utilities        2917 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
OverallQual      2919 non-null int64
OverallCond      2919 non-null int64
YearBuilt        2919 non-null int64
YearRemodAdd     2919 non-null int64
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2918 non-

In [23]:
y = train['SalePrice']
len(y)

1460

In [9]:
combine.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


# Data Research

In [10]:
combine.isnull().sum().sort_values(ascending=False)

PoolQC           2909
MiscFeature      2814
Alley            2721
Fence            2348
SalePrice        1459
FireplaceQu      1420
LotFrontage       486
GarageQual        159
GarageYrBlt       159
GarageFinish      159
GarageCond        159
GarageType        157
BsmtExposure       82
BsmtCond           82
BsmtQual           81
BsmtFinType2       80
BsmtFinType1       79
MasVnrType         24
MasVnrArea         23
MSZoning            4
Utilities           2
Functional          2
BsmtFullBath        2
BsmtHalfBath        2
GarageArea          1
BsmtFinSF2          1
Exterior1st         1
TotalBsmtSF         1
GarageCars          1
BsmtUnfSF           1
                 ... 
LotConfig           0
Neighborhood        0
HeatingQC           0
ExterQual           0
TotRmsAbvGrd        0
YrSold              0
MoSold              0
MiscVal             0
PoolArea            0
ScreenPorch         0
3SsnPorch           0
EnclosedPorch       0
OpenPorchSF         0
WoodDeckSF          0
PavedDrive

In [None]:
#combine[combine['Electrical'].isnull()]

In [None]:
#combine = combine.drop([1380], axis=0)

In [11]:
pandas_profiling.ProfileReport(combine)

0,1
Number of variables,82
Number of observations,2919
Total Missing (%),6.4%
Total size in memory,1.8 MiB
Average record size in memory,656.0 B

0,1
Numeric,39
Categorical,43
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,1083
Unique (%),37.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1159.6
Minimum,334
Maximum,5095
Zeros (%),0.0%

0,1
Minimum,334.0
5-th percentile,665.9
Q1,876.0
Median,1082.0
Q3,1387.5
95-th percentile,1830.1
Maximum,5095.0
Range,4761.0
Interquartile range,511.5

0,1
Standard deviation,392.36
Coef of variation,0.33837
Kurtosis,6.9565
Mean,1159.6
MAD,303.44
Skewness,1.4704
Sum,3384819
Variance,153950
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
864,46,1.6%,
1040,28,1.0%,
912,19,0.7%,
848,18,0.6%,
960,18,0.6%,
816,18,0.6%,
894,17,0.6%,
936,17,0.6%,
672,17,0.6%,
546,15,0.5%,

Value,Count,Frequency (%),Unnamed: 3
334,1,0.0%,
372,1,0.0%,
407,1,0.0%,
432,1,0.0%,
438,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
3138,1,0.0%,
3228,1,0.0%,
3820,1,0.0%,
4692,1,0.0%,
5095,1,0.0%,

0,1
Distinct count,635
Unique (%),21.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,336.48
Minimum,0
Maximum,2065
Zeros (%),57.1%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.0
Q3,704.0
95-th percentile,1131.2
Maximum,2065.0
Range,2065.0
Interquartile range,704.0

0,1
Standard deviation,428.7
Coef of variation,1.2741
Kurtosis,-0.42226
Mean,336.48
MAD,386.75
Skewness,0.86212
Sum,982196
Variance,183780
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,1668,57.1%,
546,23,0.8%,
728,18,0.6%,
504,17,0.6%,
672,13,0.4%,
600,13,0.4%,
720,13,0.4%,
896,11,0.4%,
886,10,0.3%,
756,9,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0,1668,57.1%,
110,1,0.0%,
125,1,0.0%,
144,1,0.0%,
167,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1818,1,0.0%,
1836,1,0.0%,
1862,1,0.0%,
1872,1,0.0%,
2065,1,0.0%,

0,1
Distinct count,31
Unique (%),1.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2.6023
Minimum,0
Maximum,508
Zeros (%),98.7%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,0
Maximum,508
Range,508
Interquartile range,0

0,1
Standard deviation,25.188
Coef of variation,9.6793
Kurtosis,149.41
Mean,2.6023
MAD,5.1386
Skewness,11.382
Sum,7596
Variance,634.44
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2882,98.7%,
153,3,0.1%,
168,3,0.1%,
144,2,0.1%,
180,2,0.1%,
216,2,0.1%,
219,1,0.0%,
176,1,0.0%,
86,1,0.0%,
96,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,2882,98.7%,
23,1,0.0%,
86,1,0.0%,
96,1,0.0%,
120,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
320,1,0.0%,
323,1,0.0%,
360,1,0.0%,
407,1,0.0%,
508,1,0.0%,

0,1
Distinct count,3
Unique (%),0.1%
Missing (%),93.2%
Missing (n),2721

0,1
Grvl,120
Pave,78
(Missing),2721

Value,Count,Frequency (%),Unnamed: 3
Grvl,120,4.1%,
Pave,78,2.7%,
(Missing),2721,93.2%,

0,1
Distinct count,8
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2.8602
Minimum,0
Maximum,8
Zeros (%),0.3%

0,1
Minimum,0
5-th percentile,2
Q1,2
Median,3
Q3,3
95-th percentile,4
Maximum,8
Range,8
Interquartile range,1

0,1
Standard deviation,0.82269
Coef of variation,0.28763
Kurtosis,1.9414
Mean,2.8602
MAD,0.58429
Skewness,0.32649
Sum,8349
Variance,0.67682
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
3,1596,54.7%,
2,742,25.4%,
4,400,13.7%,
1,103,3.5%,
5,48,1.6%,
6,21,0.7%,
0,8,0.3%,
8,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,8,0.3%,
1,103,3.5%,
2,742,25.4%,
3,1596,54.7%,
4,400,13.7%,

Value,Count,Frequency (%),Unnamed: 3
3,1596,54.7%,
4,400,13.7%,
5,48,1.6%,
6,21,0.7%,
8,1,0.0%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
1Fam,2425
TwnhsE,227
Duplex,109
Other values (2),158

Value,Count,Frequency (%),Unnamed: 3
1Fam,2425,83.1%,
TwnhsE,227,7.8%,
Duplex,109,3.7%,
Twnhs,96,3.3%,
2fmCon,62,2.1%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),2.8%
Missing (n),82

0,1
TA,2606
Gd,122
Fa,104
(Missing),82

Value,Count,Frequency (%),Unnamed: 3
TA,2606,89.3%,
Gd,122,4.2%,
Fa,104,3.6%,
Po,5,0.2%,
(Missing),82,2.8%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),2.8%
Missing (n),82

0,1
No,1904
Av,418
Gd,276

Value,Count,Frequency (%),Unnamed: 3
No,1904,65.2%,
Av,418,14.3%,
Gd,276,9.5%,
Mn,239,8.2%,
(Missing),82,2.8%,

0,1
Distinct count,992
Unique (%),34.0%
Missing (%),0.0%
Missing (n),1
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,441.42
Minimum,0
Maximum,5644
Zeros (%),31.8%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,368.5
Q3,733.0
95-th percentile,1274.0
Maximum,5644.0
Range,5644.0
Interquartile range,733.0

0,1
Standard deviation,455.61
Coef of variation,1.0321
Kurtosis,6.9048
Mean,441.42
MAD,370.89
Skewness,1.4257
Sum,1288100
Variance,207580
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,929,31.8%,
24.0,27,0.9%,
16.0,14,0.5%,
300.0,9,0.3%,
288.0,8,0.3%,
384.0,8,0.3%,
600.0,8,0.3%,
20.0,8,0.3%,
602.0,7,0.2%,
500.0,7,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0.0,929,31.8%,
2.0,1,0.0%,
16.0,14,0.5%,
20.0,8,0.3%,
24.0,27,0.9%,

Value,Count,Frequency (%),Unnamed: 3
2257.0,1,0.0%,
2260.0,1,0.0%,
2288.0,1,0.0%,
4010.0,1,0.0%,
5644.0,1,0.0%,

0,1
Distinct count,273
Unique (%),9.4%
Missing (%),0.0%
Missing (n),1
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,49.582
Minimum,0
Maximum,1526
Zeros (%),88.1%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,435
Maximum,1526
Range,1526
Interquartile range,0

0,1
Standard deviation,169.21
Coef of variation,3.4126
Kurtosis,18.837
Mean,49.582
MAD,87.511
Skewness,4.1475
Sum,144680
Variance,28631
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,2571,88.1%,
180.0,5,0.2%,
294.0,5,0.2%,
435.0,3,0.1%,
147.0,3,0.1%,
483.0,3,0.1%,
168.0,3,0.1%,
144.0,3,0.1%,
539.0,3,0.1%,
374.0,3,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2571,88.1%,
6.0,1,0.0%,
12.0,1,0.0%,
28.0,1,0.0%,
32.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1127.0,1,0.0%,
1164.0,1,0.0%,
1393.0,1,0.0%,
1474.0,1,0.0%,
1526.0,1,0.0%,

0,1
Distinct count,7
Unique (%),0.2%
Missing (%),2.7%
Missing (n),79

0,1
Unf,851
GLQ,849
ALQ,429
Other values (3),711

Value,Count,Frequency (%),Unnamed: 3
Unf,851,29.2%,
GLQ,849,29.1%,
ALQ,429,14.7%,
Rec,288,9.9%,
BLQ,269,9.2%,
LwQ,154,5.3%,
(Missing),79,2.7%,

0,1
Distinct count,7
Unique (%),0.2%
Missing (%),2.7%
Missing (n),80

0,1
Unf,2493
Rec,105
LwQ,87
Other values (3),154
(Missing),80

Value,Count,Frequency (%),Unnamed: 3
Unf,2493,85.4%,
Rec,105,3.6%,
LwQ,87,3.0%,
BLQ,68,2.3%,
ALQ,52,1.8%,
GLQ,34,1.2%,
(Missing),80,2.7%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.1%
Missing (n),2
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.42989
Minimum,0
Maximum,3
Zeros (%),58.4%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,3
Range,3
Interquartile range,1

0,1
Standard deviation,0.52474
Coef of variation,1.2206
Kurtosis,-0.73569
Mean,0.42989
MAD,0.50255
Skewness,0.62406
Sum,1254
Variance,0.27535
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,1705,58.4%,
1.0,1172,40.2%,
2.0,38,1.3%,
3.0,2,0.1%,
(Missing),2,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0,1705,58.4%,
1.0,1172,40.2%,
2.0,38,1.3%,
3.0,2,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0,1705,58.4%,
1.0,1172,40.2%,
2.0,38,1.3%,
3.0,2,0.1%,

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),0.1%
Missing (n),2
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.061364
Minimum,0
Maximum,2
Zeros (%),93.9%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,1
Maximum,2
Range,2
Interquartile range,0

0,1
Standard deviation,0.24569
Coef of variation,4.0037
Kurtosis,14.848
Mean,0.061364
MAD,0.11537
Skewness,3.932
Sum,179
Variance,0.060362
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,2742,93.9%,
1.0,171,5.9%,
2.0,4,0.1%,
(Missing),2,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2742,93.9%,
1.0,171,5.9%,
2.0,4,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0.0,2742,93.9%,
1.0,171,5.9%,
2.0,4,0.1%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),2.8%
Missing (n),81

0,1
TA,1283
Gd,1209
Ex,258

Value,Count,Frequency (%),Unnamed: 3
TA,1283,44.0%,
Gd,1209,41.4%,
Ex,258,8.8%,
Fa,88,3.0%,
(Missing),81,2.8%,

0,1
Distinct count,1136
Unique (%),38.9%
Missing (%),0.0%
Missing (n),1
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,560.77
Minimum,0
Maximum,2336
Zeros (%),8.3%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,220.0
Median,467.0
Q3,805.5
95-th percentile,1474.9
Maximum,2336.0
Range,2336.0
Interquartile range,585.5

0,1
Standard deviation,439.54
Coef of variation,0.78382
Kurtosis,0.40362
Mean,560.77
MAD,351.61
Skewness,0.91982
Sum,1636300
Variance,193200
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,241,8.3%,
384.0,19,0.7%,
728.0,14,0.5%,
672.0,13,0.4%,
600.0,12,0.4%,
216.0,11,0.4%,
100.0,11,0.4%,
572.0,11,0.4%,
816.0,11,0.4%,
624.0,10,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0.0,241,8.3%,
14.0,1,0.0%,
15.0,1,0.0%,
17.0,1,0.0%,
20.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2062.0,1,0.0%,
2121.0,1,0.0%,
2140.0,1,0.0%,
2153.0,1,0.0%,
2336.0,1,0.0%,

0,1
Distinct count,2
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
Y,2723
N,196

Value,Count,Frequency (%),Unnamed: 3
Y,2723,93.3%,
N,196,6.7%,

0,1
Distinct count,9
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0

0,1
Norm,2511
Feedr,164
Artery,92
Other values (6),152

Value,Count,Frequency (%),Unnamed: 3
Norm,2511,86.0%,
Feedr,164,5.6%,
Artery,92,3.2%,
RRAn,50,1.7%,
PosN,39,1.3%,
RRAe,28,1.0%,
PosA,20,0.7%,
RRNn,9,0.3%,
RRNe,6,0.2%,

0,1
Distinct count,8
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0

0,1
Norm,2889
Feedr,13
Artery,5
Other values (5),12

Value,Count,Frequency (%),Unnamed: 3
Norm,2889,99.0%,
Feedr,13,0.4%,
Artery,5,0.2%,
PosN,4,0.1%,
PosA,4,0.1%,
RRNn,2,0.1%,
RRAe,1,0.0%,
RRAn,1,0.0%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),0.0%
Missing (n),1

0,1
SBrkr,2671
FuseA,188
FuseF,50
Other values (2),9

Value,Count,Frequency (%),Unnamed: 3
SBrkr,2671,91.5%,
FuseA,188,6.4%,
FuseF,50,1.7%,
FuseP,8,0.3%,
Mix,1,0.0%,
(Missing),1,0.0%,

0,1
Distinct count,183
Unique (%),6.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,23.098
Minimum,0
Maximum,1012
Zeros (%),84.3%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,176
Maximum,1012
Range,1012
Interquartile range,0

0,1
Standard deviation,64.244
Coef of variation,2.7813
Kurtosis,28.378
Mean,23.098
MAD,38.948
Skewness,4.006
Sum,67424
Variance,4127.3
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2460,84.3%,
112,22,0.8%,
96,13,0.4%,
144,11,0.4%,
192,10,0.3%,
120,9,0.3%,
168,9,0.3%,
84,8,0.3%,
40,8,0.3%,
116,8,0.3%,

Value,Count,Frequency (%),Unnamed: 3
0,2460,84.3%,
16,1,0.0%,
18,1,0.0%,
19,1,0.0%,
20,2,0.1%,

Value,Count,Frequency (%),Unnamed: 3
429,1,0.0%,
432,1,0.0%,
552,1,0.0%,
584,1,0.0%,
1012,1,0.0%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
TA,2538
Gd,299
Fa,67
Other values (2),15

Value,Count,Frequency (%),Unnamed: 3
TA,2538,86.9%,
Gd,299,10.2%,
Fa,67,2.3%,
Ex,12,0.4%,
Po,3,0.1%,

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
TA,1798
Gd,979
Ex,107

Value,Count,Frequency (%),Unnamed: 3
TA,1798,61.6%,
Gd,979,33.5%,
Ex,107,3.7%,
Fa,35,1.2%,

0,1
Distinct count,16
Unique (%),0.5%
Missing (%),0.0%
Missing (n),1

0,1
VinylSd,1025
MetalSd,450
HdBoard,442
Other values (12),1001

Value,Count,Frequency (%),Unnamed: 3
VinylSd,1025,35.1%,
MetalSd,450,15.4%,
HdBoard,442,15.1%,
Wd Sdng,411,14.1%,
Plywood,221,7.6%,
CemntBd,126,4.3%,
BrkFace,87,3.0%,
WdShing,56,1.9%,
AsbShng,44,1.5%,
Stucco,43,1.5%,

0,1
Distinct count,17
Unique (%),0.6%
Missing (%),0.0%
Missing (n),1

0,1
VinylSd,1014
MetalSd,447
HdBoard,406
Other values (13),1051

Value,Count,Frequency (%),Unnamed: 3
VinylSd,1014,34.7%,
MetalSd,447,15.3%,
HdBoard,406,13.9%,
Wd Sdng,391,13.4%,
Plywood,270,9.2%,
CmentBd,126,4.3%,
Wd Shng,81,2.8%,
BrkFace,47,1.6%,
Stucco,47,1.6%,
AsbShng,38,1.3%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),80.4%
Missing (n),2348

0,1
MnPrv,329
GdPrv,118
GdWo,112
(Missing),2348

Value,Count,Frequency (%),Unnamed: 3
MnPrv,329,11.3%,
GdPrv,118,4.0%,
GdWo,112,3.8%,
MnWw,12,0.4%,
(Missing),2348,80.4%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),48.6%
Missing (n),1420

0,1
Gd,744
TA,592
Fa,74
Other values (2),89
(Missing),1420

Value,Count,Frequency (%),Unnamed: 3
Gd,744,25.5%,
TA,592,20.3%,
Fa,74,2.5%,
Po,46,1.6%,
Ex,43,1.5%,
(Missing),1420,48.6%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.59712
Minimum,0
Maximum,4
Zeros (%),48.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,1
Q3,1
95-th percentile,2
Maximum,4
Range,4
Interquartile range,1

0,1
Standard deviation,0.64613
Coef of variation,1.0821
Kurtosis,0.076424
Mean,0.59712
MAD,0.58096
Skewness,0.73387
Sum,1743
Variance,0.41748
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,1420,48.6%,
1,1268,43.4%,
2,219,7.5%,
3,11,0.4%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1420,48.6%,
1,1268,43.4%,
2,219,7.5%,
3,11,0.4%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1420,48.6%,
1,1268,43.4%,
2,219,7.5%,
3,11,0.4%,
4,1,0.0%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
PConc,1308
CBlock,1235
BrkTil,311
Other values (3),65

Value,Count,Frequency (%),Unnamed: 3
PConc,1308,44.8%,
CBlock,1235,42.3%,
BrkTil,311,10.7%,
Slab,49,1.7%,
Stone,11,0.4%,
Wood,5,0.2%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.568
Minimum,0
Maximum,4
Zeros (%),0.4%

0,1
Minimum,0
5-th percentile,1
Q1,1
Median,2
Q3,2
95-th percentile,2
Maximum,4
Range,4
Interquartile range,1

0,1
Standard deviation,0.55297
Coef of variation,0.35266
Kurtosis,-0.53813
Mean,1.568
MAD,0.52232
Skewness,0.16769
Sum,4577
Variance,0.30578
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
2,1530,52.4%,
1,1309,44.8%,
3,64,2.2%,
0,12,0.4%,
4,4,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0,12,0.4%,
1,1309,44.8%,
2,1530,52.4%,
3,64,2.2%,
4,4,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0,12,0.4%,
1,1309,44.8%,
2,1530,52.4%,
3,64,2.2%,
4,4,0.1%,

0,1
Distinct count,8
Unique (%),0.3%
Missing (%),0.1%
Missing (n),2

0,1
Typ,2717
Min2,70
Min1,65
Other values (4),65

Value,Count,Frequency (%),Unnamed: 3
Typ,2717,93.1%,
Min2,70,2.4%,
Min1,65,2.2%,
Mod,35,1.2%,
Maj1,19,0.7%,
Maj2,9,0.3%,
Sev,2,0.1%,
(Missing),2,0.1%,

0,1
Distinct count,604
Unique (%),20.7%
Missing (%),0.0%
Missing (n),1
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,472.87
Minimum,0
Maximum,1488
Zeros (%),5.4%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,320.0
Median,480.0
Q3,576.0
95-th percentile,856.15
Maximum,1488.0
Range,1488.0
Interquartile range,256.0

0,1
Standard deviation,215.39
Coef of variation,0.4555
Kurtosis,0.93978
Mean,472.87
MAD,161.35
Skewness,0.2413
Sum,1379800
Variance,46395
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,157,5.4%,
576.0,97,3.3%,
440.0,96,3.3%,
240.0,69,2.4%,
484.0,68,2.3%,
528.0,65,2.2%,
400.0,58,2.0%,
480.0,54,1.8%,
264.0,51,1.7%,
288.0,50,1.7%,

Value,Count,Frequency (%),Unnamed: 3
0.0,157,5.4%,
100.0,1,0.0%,
160.0,3,0.1%,
162.0,2,0.1%,
164.0,2,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1348.0,1,0.0%,
1356.0,1,0.0%,
1390.0,1,0.0%,
1418.0,1,0.0%,
1488.0,1,0.0%,

0,1
Distinct count,7
Unique (%),0.2%
Missing (%),0.0%
Missing (n),1
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.7666
Minimum,0
Maximum,5
Zeros (%),5.4%

0,1
Minimum,0
5-th percentile,0
Q1,1
Median,2
Q3,2
95-th percentile,3
Maximum,5
Range,5
Interquartile range,1

0,1
Standard deviation,0.76162
Coef of variation,0.43112
Kurtosis,0.2382
Mean,1.7666
MAD,0.59785
Skewness,-0.21837
Sum,5155
Variance,0.58007
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
2.0,1594,54.6%,
1.0,776,26.6%,
3.0,374,12.8%,
0.0,157,5.4%,
4.0,16,0.5%,
5.0,1,0.0%,
(Missing),1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0.0,157,5.4%,
1.0,776,26.6%,
2.0,1594,54.6%,
3.0,374,12.8%,
4.0,16,0.5%,

Value,Count,Frequency (%),Unnamed: 3
1.0,776,26.6%,
2.0,1594,54.6%,
3.0,374,12.8%,
4.0,16,0.5%,
5.0,1,0.0%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),5.4%
Missing (n),159

0,1
TA,2654
Fa,74
Gd,15
Other values (2),17
(Missing),159

Value,Count,Frequency (%),Unnamed: 3
TA,2654,90.9%,
Fa,74,2.5%,
Gd,15,0.5%,
Po,14,0.5%,
Ex,3,0.1%,
(Missing),159,5.4%,

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),5.4%
Missing (n),159

0,1
Unf,1230
RFn,811
Fin,719
(Missing),159

Value,Count,Frequency (%),Unnamed: 3
Unf,1230,42.1%,
RFn,811,27.8%,
Fin,719,24.6%,
(Missing),159,5.4%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),5.4%
Missing (n),159

0,1
TA,2604
Fa,124
Gd,24
Other values (2),8
(Missing),159

Value,Count,Frequency (%),Unnamed: 3
TA,2604,89.2%,
Fa,124,4.2%,
Gd,24,0.8%,
Po,5,0.2%,
Ex,3,0.1%,
(Missing),159,5.4%,

0,1
Distinct count,7
Unique (%),0.2%
Missing (%),5.4%
Missing (n),157

0,1
Attchd,1723
Detchd,779
BuiltIn,186
Other values (3),74
(Missing),157

Value,Count,Frequency (%),Unnamed: 3
Attchd,1723,59.0%,
Detchd,779,26.7%,
BuiltIn,186,6.4%,
Basment,36,1.2%,
2Types,23,0.8%,
CarPort,15,0.5%,
(Missing),157,5.4%,

0,1
Distinct count,104
Unique (%),3.6%
Missing (%),5.4%
Missing (n),159
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1978.1
Minimum,1895
Maximum,2207
Zeros (%),0.0%

0,1
Minimum,1895
5-th percentile,1928
Q1,1960
Median,1979
Q3,2002
95-th percentile,2007
Maximum,2207
Range,312
Interquartile range,42

0,1
Standard deviation,25.574
Coef of variation,0.012929
Kurtosis,1.8098
Mean,1978.1
MAD,21.379
Skewness,-0.38215
Sum,5459600
Variance,654.04
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
2005.0,142,4.9%,
2006.0,115,3.9%,
2007.0,115,3.9%,
2004.0,99,3.4%,
2003.0,92,3.2%,
1977.0,66,2.3%,
2008.0,61,2.1%,
1998.0,58,2.0%,
2000.0,55,1.9%,
1999.0,54,1.8%,

Value,Count,Frequency (%),Unnamed: 3
1895.0,1,0.0%,
1896.0,1,0.0%,
1900.0,6,0.2%,
1906.0,1,0.0%,
1908.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2007.0,115,3.9%,
2008.0,61,2.1%,
2009.0,29,1.0%,
2010.0,5,0.2%,
2207.0,1,0.0%,

0,1
Distinct count,1292
Unique (%),44.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1500.8
Minimum,334
Maximum,5642
Zeros (%),0.0%

0,1
Minimum,334.0
5-th percentile,861.0
Q1,1126.0
Median,1444.0
Q3,1743.5
95-th percentile,2464.2
Maximum,5642.0
Range,5308.0
Interquartile range,617.5

0,1
Standard deviation,506.05
Coef of variation,0.3372
Kurtosis,4.1216
Mean,1500.8
MAD,384.44
Skewness,1.27
Sum,4380718
Variance,256090
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
864,41,1.4%,
1092,26,0.9%,
1040,25,0.9%,
1456,20,0.7%,
1200,18,0.6%,
894,15,0.5%,
912,14,0.5%,
816,14,0.5%,
1728,13,0.4%,
848,13,0.4%,

Value,Count,Frequency (%),Unnamed: 3
334,1,0.0%,
407,1,0.0%,
438,1,0.0%,
480,1,0.0%,
492,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
4316,1,0.0%,
4476,1,0.0%,
4676,1,0.0%,
5095,1,0.0%,
5642,1,0.0%,

0,1
Distinct count,3
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,0.38027
Minimum,0
Maximum,2
Zeros (%),62.8%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,1
95-th percentile,1
Maximum,2
Range,2
Interquartile range,1

0,1
Standard deviation,0.50287
Coef of variation,1.3224
Kurtosis,-1.0334
Mean,0.38027
MAD,0.47784
Skewness,0.69492
Sum,1110
Variance,0.25288
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,1834,62.8%,
1,1060,36.3%,
2,25,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,1834,62.8%,
1,1060,36.3%,
2,25,0.9%,

Value,Count,Frequency (%),Unnamed: 3
0,1834,62.8%,
1,1060,36.3%,
2,25,0.9%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
GasA,2874
GasW,27
Grav,9
Other values (3),9

Value,Count,Frequency (%),Unnamed: 3
GasA,2874,98.5%,
GasW,27,0.9%,
Grav,9,0.3%,
Wall,6,0.2%,
OthW,2,0.1%,
Floor,1,0.0%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Ex,1493
TA,857
Gd,474
Other values (2),95

Value,Count,Frequency (%),Unnamed: 3
Ex,1493,51.1%,
TA,857,29.4%,
Gd,474,16.2%,
Fa,92,3.2%,
Po,3,0.1%,

0,1
Distinct count,8
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0

0,1
1Story,1471
2Story,872
1.5Fin,314
Other values (5),262

Value,Count,Frequency (%),Unnamed: 3
1Story,1471,50.4%,
2Story,872,29.9%,
1.5Fin,314,10.8%,
SLvl,128,4.4%,
SFoyer,83,2.8%,
2.5Unf,24,0.8%,
1.5Unf,19,0.7%,
2.5Fin,8,0.3%,

0,1
Distinct count,2919
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1460
Minimum,1
Maximum,2919
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,146.9
Q1,730.5
Median,1460.0
Q3,2189.5
95-th percentile,2773.1
Maximum,2919.0
Range,2918.0
Interquartile range,1459.0

0,1
Standard deviation,842.79
Coef of variation,0.57725
Kurtosis,-1.2
Mean,1460
MAD,729.75
Skewness,0
Sum,4261740
Variance,710290
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
1088,1,0.0%,
1084,1,0.0%,
1082,1,0.0%,
1080,1,0.0%,
1078,1,0.0%,
1076,1,0.0%,
1074,1,0.0%,
1072,1,0.0%,
1070,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,
5,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2915,1,0.0%,
2916,1,0.0%,
2917,1,0.0%,
2918,1,0.0%,
2919,1,0.0%,

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1.0445
Minimum,0
Maximum,3
Zeros (%),0.1%

0,1
Minimum,0
5-th percentile,1
Q1,1
Median,1
Q3,1
95-th percentile,1
Maximum,3
Range,3
Interquartile range,0

0,1
Standard deviation,0.21446
Coef of variation,0.20532
Kurtosis,19.778
Mean,1.0445
MAD,0.08713
Skewness,4.3045
Sum,3049
Variance,0.045994
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
1,2785,95.4%,
2,129,4.4%,
0,3,0.1%,
3,2,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3,0.1%,
1,2785,95.4%,
2,129,4.4%,
3,2,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0,3,0.1%,
1,2785,95.4%,
2,129,4.4%,
3,2,0.1%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.0%
Missing (n),1

0,1
TA,1492
Gd,1151
Ex,205

Value,Count,Frequency (%),Unnamed: 3
TA,1492,51.1%,
Gd,1151,39.4%,
Ex,205,7.0%,
Fa,70,2.4%,
(Missing),1,0.0%,

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
Lvl,2622
HLS,120
Bnk,117

Value,Count,Frequency (%),Unnamed: 3
Lvl,2622,89.8%,
HLS,120,4.1%,
Bnk,117,4.0%,
Low,60,2.1%,

0,1
Distinct count,3
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
Gtl,2778
Mod,125
Sev,16

Value,Count,Frequency (%),Unnamed: 3
Gtl,2778,95.2%,
Mod,125,4.3%,
Sev,16,0.5%,

0,1
Distinct count,1951
Unique (%),66.8%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,10168
Minimum,1300
Maximum,215245
Zeros (%),0.0%

0,1
Minimum,1300
5-th percentile,3182
Q1,7478
Median,9453
Q3,11570
95-th percentile,17143
Maximum,215245
Range,213945
Interquartile range,4092

0,1
Standard deviation,7887
Coef of variation,0.77566
Kurtosis,264.95
Mean,10168
MAD,3424.2
Skewness,12.829
Sum,29680725
Variance,62205000
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
9600,44,1.5%,
7200,43,1.5%,
6000,34,1.2%,
9000,29,1.0%,
10800,25,0.9%,
8400,21,0.7%,
7500,21,0.7%,
1680,18,0.6%,
6240,18,0.6%,
6120,17,0.6%,

Value,Count,Frequency (%),Unnamed: 3
1300,1,0.0%,
1470,1,0.0%,
1476,1,0.0%,
1477,2,0.1%,
1484,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
70761,1,0.0%,
115149,1,0.0%,
159000,1,0.0%,
164660,1,0.0%,
215245,1,0.0%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Inside,2133
Corner,511
CulDSac,176
Other values (2),99

Value,Count,Frequency (%),Unnamed: 3
Inside,2133,73.1%,
Corner,511,17.5%,
CulDSac,176,6.0%,
FR2,85,2.9%,
FR3,14,0.5%,

0,1
Distinct count,129
Unique (%),4.4%
Missing (%),16.6%
Missing (n),486
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,69.306
Minimum,21
Maximum,313
Zeros (%),0.0%

0,1
Minimum,21
5-th percentile,32
Q1,59
Median,68
Q3,80
95-th percentile,107
Maximum,313
Range,292
Interquartile range,21

0,1
Standard deviation,23.345
Coef of variation,0.33684
Kurtosis,11.296
Mean,69.306
MAD,16.634
Skewness,1.5033
Sum,168620
Variance,544.98
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
60.0,276,9.5%,
80.0,137,4.7%,
70.0,133,4.6%,
50.0,117,4.0%,
75.0,105,3.6%,
65.0,93,3.2%,
85.0,76,2.6%,
21.0,50,1.7%,
24.0,49,1.7%,
63.0,47,1.6%,

Value,Count,Frequency (%),Unnamed: 3
21.0,50,1.7%,
22.0,1,0.0%,
24.0,49,1.7%,
25.0,1,0.0%,
26.0,3,0.1%,

Value,Count,Frequency (%),Unnamed: 3
174.0,2,0.1%,
182.0,1,0.0%,
195.0,1,0.0%,
200.0,1,0.0%,
313.0,2,0.1%,

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
Reg,1859
IR1,968
IR2,76

Value,Count,Frequency (%),Unnamed: 3
Reg,1859,63.7%,
IR1,968,33.2%,
IR2,76,2.6%,
IR3,16,0.5%,

0,1
Distinct count,36
Unique (%),1.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,4.6944
Minimum,0
Maximum,1064
Zeros (%),98.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,0
Maximum,1064
Range,1064
Interquartile range,0

0,1
Standard deviation,46.397
Coef of variation,9.8834
Kurtosis,174.93
Mean,4.6944
MAD,9.2602
Skewness,12.095
Sum,13703
Variance,2152.7
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2879,98.6%,
80,4,0.1%,
205,2,0.1%,
360,2,0.1%,
390,1,0.0%,
384,1,0.0%,
362,1,0.0%,
312,1,0.0%,
234,1,0.0%,
232,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,2879,98.6%,
53,1,0.0%,
80,4,0.1%,
108,1,0.0%,
114,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
515,1,0.0%,
528,1,0.0%,
572,1,0.0%,
697,1,0.0%,
1064,1,0.0%,

0,1
Distinct count,16
Unique (%),0.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,57.138
Minimum,20
Maximum,190
Zeros (%),0.0%

0,1
Minimum,20
5-th percentile,20
Q1,20
Median,50
Q3,70
95-th percentile,160
Maximum,190
Range,170
Interquartile range,50

0,1
Standard deviation,42.518
Coef of variation,0.74413
Kurtosis,1.4578
Mean,57.138
MAD,31.664
Skewness,1.3762
Sum,166785
Variance,1807.7
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
20,1079,37.0%,
60,575,19.7%,
50,287,9.8%,
120,182,6.2%,
30,139,4.8%,
160,128,4.4%,
70,128,4.4%,
80,118,4.0%,
90,109,3.7%,
190,61,2.1%,

Value,Count,Frequency (%),Unnamed: 3
20,1079,37.0%,
30,139,4.8%,
40,6,0.2%,
45,18,0.6%,
50,287,9.8%,

Value,Count,Frequency (%),Unnamed: 3
120,182,6.2%,
150,1,0.0%,
160,128,4.4%,
180,17,0.6%,
190,61,2.1%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),0.1%
Missing (n),4

0,1
RL,2265
RM,460
FV,139
Other values (2),51

Value,Count,Frequency (%),Unnamed: 3
RL,2265,77.6%,
RM,460,15.8%,
FV,139,4.8%,
RH,26,0.9%,
C (all),25,0.9%,
(Missing),4,0.1%,

0,1
Distinct count,445
Unique (%),15.2%
Missing (%),0.8%
Missing (n),23
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,102.2
Minimum,0
Maximum,1600
Zeros (%),59.5%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,0.0
Q3,164.0
95-th percentile,466.5
Maximum,1600.0
Range,1600.0
Interquartile range,164.0

0,1
Standard deviation,179.33
Coef of variation,1.7547
Kurtosis,9.2543
Mean,102.2
MAD,129.32
Skewness,2.6026
Sum,295980
Variance,32161
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,1738,59.5%,
120.0,15,0.5%,
200.0,13,0.4%,
176.0,13,0.4%,
180.0,12,0.4%,
216.0,12,0.4%,
144.0,11,0.4%,
72.0,11,0.4%,
108.0,11,0.4%,
16.0,11,0.4%,

Value,Count,Frequency (%),Unnamed: 3
0.0,1738,59.5%,
1.0,3,0.1%,
3.0,1,0.0%,
11.0,1,0.0%,
14.0,4,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1170.0,1,0.0%,
1224.0,2,0.1%,
1290.0,1,0.0%,
1378.0,1,0.0%,
1600.0,1,0.0%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.8%
Missing (n),24

0,1
,1742
BrkFace,879
Stone,249

Value,Count,Frequency (%),Unnamed: 3
,1742,59.7%,
BrkFace,879,30.1%,
Stone,249,8.5%,
BrkCmn,25,0.9%,
(Missing),24,0.8%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),96.4%
Missing (n),2814

0,1
Shed,95
Gar2,5
Othr,4
(Missing),2814

Value,Count,Frequency (%),Unnamed: 3
Shed,95,3.3%,
Gar2,5,0.2%,
Othr,4,0.1%,
TenC,1,0.0%,
(Missing),2814,96.4%,

0,1
Distinct count,38
Unique (%),1.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,50.826
Minimum,0
Maximum,17000
Zeros (%),96.5%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,0
Maximum,17000
Range,17000
Interquartile range,0

0,1
Standard deviation,567.4
Coef of variation,11.164
Kurtosis,564.07
Mean,50.826
MAD,98.065
Skewness,21.958
Sum,148361
Variance,321950
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2816,96.5%,
400,18,0.6%,
500,13,0.4%,
450,9,0.3%,
600,8,0.3%,
700,7,0.2%,
2000,7,0.2%,
650,3,0.1%,
1200,3,0.1%,
1500,3,0.1%,

Value,Count,Frequency (%),Unnamed: 3
0,2816,96.5%,
54,1,0.0%,
80,1,0.0%,
300,1,0.0%,
350,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
6500,1,0.0%,
8300,1,0.0%,
12500,1,0.0%,
15500,1,0.0%,
17000,1,0.0%,

0,1
Distinct count,12
Unique (%),0.4%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6.2131
Minimum,1
Maximum,12
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,2
Q1,4
Median,6
Q3,8
95-th percentile,11
Maximum,12
Range,11
Interquartile range,4

0,1
Standard deviation,2.7148
Coef of variation,0.43694
Kurtosis,-0.45434
Mean,6.2131
MAD,2.1544
Skewness,0.19598
Sum,18136
Variance,7.3699
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
6,503,17.2%,
7,446,15.3%,
5,394,13.5%,
4,279,9.6%,
8,233,8.0%,
3,232,7.9%,
10,173,5.9%,
9,158,5.4%,
11,142,4.9%,
2,133,4.6%,

Value,Count,Frequency (%),Unnamed: 3
1,122,4.2%,
2,133,4.6%,
3,232,7.9%,
4,279,9.6%,
5,394,13.5%,

Value,Count,Frequency (%),Unnamed: 3
8,233,8.0%,
9,158,5.4%,
10,173,5.9%,
11,142,4.9%,
12,104,3.6%,

0,1
Distinct count,25
Unique (%),0.9%
Missing (%),0.0%
Missing (n),0

0,1
NAmes,443
CollgCr,267
OldTown,239
Other values (22),1970

Value,Count,Frequency (%),Unnamed: 3
NAmes,443,15.2%,
CollgCr,267,9.1%,
OldTown,239,8.2%,
Edwards,194,6.6%,
Somerst,182,6.2%,
NridgHt,166,5.7%,
Gilbert,165,5.7%,
Sawyer,151,5.2%,
NWAmes,131,4.5%,
SawyerW,125,4.3%,

0,1
Distinct count,252
Unique (%),8.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,47.487
Minimum,0
Maximum,742
Zeros (%),44.5%

0,1
Minimum,0.0
5-th percentile,0.0
Q1,0.0
Median,26.0
Q3,70.0
95-th percentile,183.1
Maximum,742.0
Range,742.0
Interquartile range,70.0

0,1
Standard deviation,67.575
Coef of variation,1.423
Kurtosis,10.937
Mean,47.487
MAD,48.249
Skewness,2.5364
Sum,138614
Variance,4566.4
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,1298,44.5%,
36,52,1.8%,
48,51,1.7%,
40,44,1.5%,
32,38,1.3%,
24,36,1.2%,
28,35,1.2%,
20,33,1.1%,
30,31,1.1%,
60,29,1.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1298,44.5%,
4,1,0.0%,
6,1,0.0%,
8,1,0.0%,
10,2,0.1%,

Value,Count,Frequency (%),Unnamed: 3
502,1,0.0%,
523,1,0.0%,
547,1,0.0%,
570,1,0.0%,
742,1,0.0%,

0,1
Distinct count,9
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5.5646
Minimum,1
Maximum,9
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,4
Q1,5
Median,5
Q3,6
95-th percentile,8
Maximum,9
Range,8
Interquartile range,1

0,1
Standard deviation,1.1131
Coef of variation,0.20004
Kurtosis,1.4794
Mean,5.5646
MAD,0.87878
Skewness,0.57061
Sum,16243
Variance,1.2391
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
5,1645,56.4%,
6,531,18.2%,
7,390,13.4%,
8,144,4.9%,
4,101,3.5%,
3,50,1.7%,
9,41,1.4%,
2,10,0.3%,
1,7,0.2%,

Value,Count,Frequency (%),Unnamed: 3
1,7,0.2%,
2,10,0.3%,
3,50,1.7%,
4,101,3.5%,
5,1645,56.4%,

Value,Count,Frequency (%),Unnamed: 3
5,1645,56.4%,
6,531,18.2%,
7,390,13.4%,
8,144,4.9%,
9,41,1.4%,

0,1
Distinct count,10
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6.0891
Minimum,1
Maximum,10
Zeros (%),0.0%

0,1
Minimum,1
5-th percentile,4
Q1,5
Median,6
Q3,7
95-th percentile,8
Maximum,10
Range,9
Interquartile range,2

0,1
Standard deviation,1.4099
Coef of variation,0.23155
Kurtosis,0.067219
Mean,6.0891
MAD,1.1187
Skewness,0.19721
Sum,17774
Variance,1.988
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
5,825,28.3%,
6,731,25.0%,
7,600,20.6%,
8,342,11.7%,
4,226,7.7%,
9,107,3.7%,
3,40,1.4%,
10,31,1.1%,
2,13,0.4%,
1,4,0.1%,

Value,Count,Frequency (%),Unnamed: 3
1,4,0.1%,
2,13,0.4%,
3,40,1.4%,
4,226,7.7%,
5,825,28.3%,

Value,Count,Frequency (%),Unnamed: 3
6,731,25.0%,
7,600,20.6%,
8,342,11.7%,
9,107,3.7%,
10,31,1.1%,

0,1
Distinct count,3
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
Y,2641
N,216
P,62

Value,Count,Frequency (%),Unnamed: 3
Y,2641,90.5%,
N,216,7.4%,
P,62,2.1%,

0,1
Distinct count,14
Unique (%),0.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2.2518
Minimum,0
Maximum,800
Zeros (%),99.6%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,0
Maximum,800
Range,800
Interquartile range,0

0,1
Standard deviation,35.664
Coef of variation,15.838
Kurtosis,298.63
Mean,2.2518
MAD,4.4835
Skewness,16.907
Sum,6573
Variance,1271.9
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2906,99.6%,
561,1,0.0%,
555,1,0.0%,
519,1,0.0%,
800,1,0.0%,
738,1,0.0%,
648,1,0.0%,
576,1,0.0%,
512,1,0.0%,
480,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,2906,99.6%,
144,1,0.0%,
228,1,0.0%,
368,1,0.0%,
444,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
561,1,0.0%,
576,1,0.0%,
648,1,0.0%,
738,1,0.0%,
800,1,0.0%,

0,1
Distinct count,4
Unique (%),0.1%
Missing (%),99.7%
Missing (n),2909

0,1
Ex,4
Gd,4
Fa,2
(Missing),2909

Value,Count,Frequency (%),Unnamed: 3
Ex,4,0.1%,
Gd,4,0.1%,
Fa,2,0.1%,
(Missing),2909,99.7%,

0,1
Distinct count,8
Unique (%),0.3%
Missing (%),0.0%
Missing (n),0

0,1
CompShg,2876
Tar&Grv,23
WdShake,9
Other values (5),11

Value,Count,Frequency (%),Unnamed: 3
CompShg,2876,98.5%,
Tar&Grv,23,0.8%,
WdShake,9,0.3%,
WdShngl,7,0.2%,
ClyTile,1,0.0%,
Membran,1,0.0%,
Roll,1,0.0%,
Metal,1,0.0%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Gable,2310
Hip,551
Gambrel,22
Other values (3),36

Value,Count,Frequency (%),Unnamed: 3
Gable,2310,79.1%,
Hip,551,18.9%,
Gambrel,22,0.8%,
Flat,20,0.7%,
Mansard,11,0.4%,
Shed,5,0.2%,

0,1
Distinct count,6
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0

0,1
Normal,2402
Partial,245
Abnorml,190
Other values (3),82

Value,Count,Frequency (%),Unnamed: 3
Normal,2402,82.3%,
Partial,245,8.4%,
Abnorml,190,6.5%,
Family,46,1.6%,
Alloca,24,0.8%,
AdjLand,12,0.4%,

0,1
Distinct count,664
Unique (%),22.7%
Missing (%),50.0%
Missing (n),1459
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,180920
Minimum,34900
Maximum,755000
Zeros (%),0.0%

0,1
Minimum,34900
5-th percentile,88000
Q1,129980
Median,163000
Q3,214000
95-th percentile,326100
Maximum,755000
Range,720100
Interquartile range,84025

0,1
Standard deviation,79443
Coef of variation,0.4391
Kurtosis,6.5363
Mean,180920
MAD,57435
Skewness,1.8829
Sum,264140000
Variance,6311100000
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
140000.0,20,0.7%,
135000.0,17,0.6%,
145000.0,14,0.5%,
155000.0,14,0.5%,
190000.0,13,0.4%,
110000.0,13,0.4%,
160000.0,12,0.4%,
115000.0,12,0.4%,
130000.0,11,0.4%,
139000.0,11,0.4%,

Value,Count,Frequency (%),Unnamed: 3
34900.0,1,0.0%,
35311.0,1,0.0%,
37900.0,1,0.0%,
39300.0,1,0.0%,
40000.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
582933.0,1,0.0%,
611657.0,1,0.0%,
625000.0,1,0.0%,
745000.0,1,0.0%,
755000.0,1,0.0%,

0,1
Distinct count,10
Unique (%),0.3%
Missing (%),0.0%
Missing (n),1

0,1
WD,2525
New,239
COD,87
Other values (6),67

Value,Count,Frequency (%),Unnamed: 3
WD,2525,86.5%,
New,239,8.2%,
COD,87,3.0%,
ConLD,26,0.9%,
CWD,12,0.4%,
ConLI,9,0.3%,
ConLw,8,0.3%,
Oth,7,0.2%,
Con,5,0.2%,
(Missing),1,0.0%,

0,1
Distinct count,121
Unique (%),4.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,16.062
Minimum,0
Maximum,576
Zeros (%),91.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,0
95-th percentile,161
Maximum,576
Range,576
Interquartile range,0

0,1
Standard deviation,56.184
Coef of variation,3.4979
Kurtosis,17.777
Mean,16.062
MAD,29.307
Skewness,3.9487
Sum,46886
Variance,3156.7
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,2663,91.2%,
144,13,0.4%,
192,11,0.4%,
168,10,0.3%,
120,9,0.3%,
216,8,0.3%,
180,7,0.2%,
200,7,0.2%,
160,6,0.2%,
224,6,0.2%,

Value,Count,Frequency (%),Unnamed: 3
0,2663,91.2%,
40,1,0.0%,
53,1,0.0%,
60,1,0.0%,
63,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
410,1,0.0%,
440,1,0.0%,
480,1,0.0%,
490,1,0.0%,
576,1,0.0%,

0,1
Distinct count,2
Unique (%),0.1%
Missing (%),0.0%
Missing (n),0

0,1
Pave,2907
Grvl,12

Value,Count,Frequency (%),Unnamed: 3
Pave,2907,99.6%,
Grvl,12,0.4%,

0,1
Distinct count,14
Unique (%),0.5%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,6.4515
Minimum,2
Maximum,15
Zeros (%),0.0%

0,1
Minimum,2
5-th percentile,4
Q1,5
Median,6
Q3,7
95-th percentile,9
Maximum,15
Range,13
Interquartile range,2

0,1
Standard deviation,1.5694
Coef of variation,0.24326
Kurtosis,1.1691
Mean,6.4515
MAD,1.2323
Skewness,0.75876
Sum,18832
Variance,2.463
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
6,844,28.9%,
7,649,22.2%,
5,583,20.0%,
8,347,11.9%,
4,196,6.7%,
9,143,4.9%,
10,80,2.7%,
11,32,1.1%,
3,25,0.9%,
12,16,0.5%,

Value,Count,Frequency (%),Unnamed: 3
2,1,0.0%,
3,25,0.9%,
4,196,6.7%,
5,583,20.0%,
6,844,28.9%,

Value,Count,Frequency (%),Unnamed: 3
11,32,1.1%,
12,16,0.5%,
13,1,0.0%,
14,1,0.0%,
15,1,0.0%,

0,1
Distinct count,1059
Unique (%),36.3%
Missing (%),0.0%
Missing (n),1
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1051.8
Minimum,0
Maximum,6110
Zeros (%),2.7%

0,1
Minimum,0.0
5-th percentile,455.25
Q1,793.0
Median,989.5
Q3,1302.0
95-th percentile,1776.2
Maximum,6110.0
Range,6110.0
Interquartile range,509.0

0,1
Standard deviation,440.77
Coef of variation,0.41907
Kurtosis,9.1511
Mean,1051.8
MAD,327.04
Skewness,1.1629
Sum,3069100
Variance,194270
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0.0,78,2.7%,
864.0,74,2.5%,
672.0,29,1.0%,
912.0,26,0.9%,
1040.0,25,0.9%,
768.0,24,0.8%,
816.0,23,0.8%,
728.0,20,0.7%,
384.0,19,0.7%,
1008.0,19,0.7%,

Value,Count,Frequency (%),Unnamed: 3
0.0,78,2.7%,
105.0,1,0.0%,
160.0,1,0.0%,
173.0,1,0.0%,
190.0,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
3138.0,1,0.0%,
3200.0,1,0.0%,
3206.0,1,0.0%,
5095.0,1,0.0%,
6110.0,1,0.0%,

0,1
Distinct count,3
Unique (%),0.1%
Missing (%),0.1%
Missing (n),2

0,1
AllPub,2916
NoSeWa,1
(Missing),2

Value,Count,Frequency (%),Unnamed: 3
AllPub,2916,99.9%,
NoSeWa,1,0.0%,
(Missing),2,0.1%,

0,1
Distinct count,379
Unique (%),13.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,93.71
Minimum,0
Maximum,1424
Zeros (%),52.2%

0,1
Minimum,0
5-th percentile,0
Q1,0
Median,0
Q3,168
95-th percentile,328
Maximum,1424
Range,1424
Interquartile range,168

0,1
Standard deviation,126.53
Coef of variation,1.3502
Kurtosis,6.7416
Mean,93.71
MAD,101.75
Skewness,1.8434
Sum,273539
Variance,16009
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
0,1523,52.2%,
100,74,2.5%,
192,70,2.4%,
144,61,2.1%,
168,56,1.9%,
120,53,1.8%,
140,29,1.0%,
240,20,0.7%,
224,19,0.7%,
160,17,0.6%,

Value,Count,Frequency (%),Unnamed: 3
0,1523,52.2%,
4,1,0.0%,
12,2,0.1%,
14,1,0.0%,
16,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
728,1,0.0%,
736,1,0.0%,
857,1,0.0%,
870,1,0.0%,
1424,1,0.0%,

0,1
Distinct count,118
Unique (%),4.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1971.3
Minimum,1872
Maximum,2010
Zeros (%),0.0%

0,1
Minimum,1872.0
5-th percentile,1915.0
Q1,1953.5
Median,1973.0
Q3,2001.0
95-th percentile,2007.0
Maximum,2010.0
Range,138.0
Interquartile range,47.5

0,1
Standard deviation,30.291
Coef of variation,0.015366
Kurtosis,-0.51132
Mean,1971.3
MAD,25.246
Skewness,-0.60011
Sum,5754262
Variance,917.57
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
2005,142,4.9%,
2006,138,4.7%,
2007,109,3.7%,
2004,99,3.4%,
2003,88,3.0%,
1977,57,2.0%,
1920,57,2.0%,
1976,54,1.8%,
1999,52,1.8%,
2008,49,1.7%,

Value,Count,Frequency (%),Unnamed: 3
1872,1,0.0%,
1875,1,0.0%,
1879,1,0.0%,
1880,5,0.2%,
1882,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2006,138,4.7%,
2007,109,3.7%,
2008,49,1.7%,
2009,25,0.9%,
2010,3,0.1%,

0,1
Distinct count,61
Unique (%),2.1%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1984.3
Minimum,1950
Maximum,2010
Zeros (%),0.0%

0,1
Minimum,1950
5-th percentile,1950
Q1,1965
Median,1993
Q3,2004
95-th percentile,2007
Maximum,2010
Range,60
Interquartile range,39

0,1
Standard deviation,20.894
Coef of variation,0.01053
Kurtosis,-1.3464
Mean,1984.3
MAD,18.961
Skewness,-0.45125
Sum,5792068
Variance,436.57
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
1950,361,12.4%,
2006,202,6.9%,
2007,164,5.6%,
2005,141,4.8%,
2004,111,3.8%,
2000,104,3.6%,
2003,99,3.4%,
2002,82,2.8%,
2008,81,2.8%,
1998,77,2.6%,

Value,Count,Frequency (%),Unnamed: 3
1950,361,12.4%,
1951,14,0.5%,
1952,15,0.5%,
1953,20,0.7%,
1954,28,1.0%,

Value,Count,Frequency (%),Unnamed: 3
2006,202,6.9%,
2007,164,5.6%,
2008,81,2.8%,
2009,34,1.2%,
2010,13,0.4%,

0,1
Distinct count,5
Unique (%),0.2%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,2007.8
Minimum,2006
Maximum,2010
Zeros (%),0.0%

0,1
Minimum,2006
5-th percentile,2006
Q1,2007
Median,2008
Q3,2009
95-th percentile,2010
Maximum,2010
Range,4
Interquartile range,2

0,1
Standard deviation,1.315
Coef of variation,0.00065493
Kurtosis,-1.1551
Mean,2007.8
MAD,1.1362
Skewness,0.13247
Sum,5860747
Variance,1.7291
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
2007,692,23.7%,
2009,647,22.2%,
2008,622,21.3%,
2006,619,21.2%,
2010,339,11.6%,

Value,Count,Frequency (%),Unnamed: 3
2006,619,21.2%,
2007,692,23.7%,
2008,622,21.3%,
2009,647,22.2%,
2010,339,11.6%,

Value,Count,Frequency (%),Unnamed: 3
2006,619,21.2%,
2007,692,23.7%,
2008,622,21.3%,
2009,647,22.2%,
2010,339,11.6%,

0,1
Distinct count,1460
Unique (%),50.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,729.25
Minimum,0
Maximum,1459
Zeros (%),0.1%

0,1
Minimum,0.0
5-th percentile,72.9
Q1,364.5
Median,729.0
Q3,1094.0
95-th percentile,1386.0
Maximum,1459.0
Range,1459.0
Interquartile range,729.5

0,1
Standard deviation,421.39
Coef of variation,0.57785
Kurtosis,-1.2
Mean,729.25
MAD,364.88
Skewness,1.2203e-06
Sum,2128681
Variance,177570
Memory size,22.9 KiB

Value,Count,Frequency (%),Unnamed: 3
2,1458,49.9%,
2,956,32.8%,
2,960,32.9%,
2,962,33.0%,
2,964,33.0%,
2,966,33.1%,
2,968,33.2%,
2,970,33.2%,
2,972,33.3%,
2,974,33.4%,

Value,Count,Frequency (%),Unnamed: 3
1,1459,50.0%,
2,1451,49.7%,
2,1449,49.6%,
2,1447,49.6%,
2,1445,49.5%,

Value,Count,Frequency (%),Unnamed: 3
2,40,1.4%,
2,176,6.0%,
2,42,1.4%,
2,84,2.9%,
2,1456,49.9%,

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706.0,Unf,0.0,150.0,856.0,GasA,Ex,Y,SBrkr,856,854,0,1710,1.0,0.0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2.0,548.0,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978.0,Unf,0.0,284.0,1262.0,GasA,Ex,Y,SBrkr,1262,0,0,1262,0.0,1.0,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2.0,460.0,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486.0,Unf,0.0,434.0,920.0,GasA,Ex,Y,SBrkr,920,866,0,1786,1.0,0.0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2.0,608.0,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216.0,Unf,0.0,540.0,756.0,GasA,Gd,Y,SBrkr,961,756,0,1717,1.0,0.0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3.0,642.0,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655.0,Unf,0.0,490.0,1145.0,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1.0,0.0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3.0,836.0,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000.0


In [12]:
combine.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

## Numeric Columns

In [13]:
del combine['LotFrontage']
del combine['MasVnrArea']
combine['GarageYrBlt'] = combine.GarageYrBlt.fillna('No')

In [14]:
combine.select_dtypes(include='number').isnull().sum().sort_values(ascending=False)

SalePrice        1459
BsmtFullBath        2
BsmtHalfBath        2
BsmtFinSF1          1
GarageArea          1
GarageCars          1
BsmtUnfSF           1
TotalBsmtSF         1
BsmtFinSF2          1
LotArea             0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
MSSubClass          0
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea           0
FullBath            0
YrSold              0
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
TotRmsAbvGrd        0
Fireplaces          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
Id                  0
dtype: int64

## String Columns

In [15]:
Fill_None = ['Alley','MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Electrical',
            'FireplaceQu','GarageType','GarageFinish','GarageQual','GarageCond','Fence','MiscFeature']
for Fill_None in combine:
    combine[Fill_None] = combine[Fill_None].fillna('None')

In [16]:
def feat_eng(tr):
    tr.LotShape[tr['LotShape']=='IR3'] = 1
    tr.LotShape[tr['LotShape']=='IR2'] = 2
    tr.LotShape[tr['LotShape']=='IR1'] = 3
    tr.LotShape[tr['LotShape']=='Reg'] = 4
    tr.LotShape = tr.LotShape.astype('float32')
    tr.ExterQual[tr['ExterQual']=='None'] = 0
    tr.ExterQual[tr['ExterQual']=='Po'] = 1
    tr.ExterQual[tr['ExterQual']=='Fa'] = 2
    tr.ExterQual[tr['ExterQual']=='TA'] = 3
    tr.ExterQual[tr['ExterQual']=='Gd'] = 4
    tr.ExterQual[tr['ExterQual']=='Ex'] = 5
    tr.ExterQual = tr.ExterQual.astype('float32')
    tr.ExterCond[tr['ExterCond']=='None'] = 0
    tr.ExterCond[tr['ExterCond']=='Po'] = 1
    tr.ExterCond[tr['ExterCond']=='Fa'] = 2
    tr.ExterCond[tr['ExterCond']=='TA'] = 3
    tr.ExterCond[tr['ExterCond']=='Gd'] = 4
    tr.ExterCond[tr['ExterCond']=='Ex'] = 5
    tr.ExterCond = tr.ExterCond.astype('float32')
    tr.BsmtQual[tr['BsmtQual']=='None'] = 0
    tr.BsmtQual[tr['BsmtQual']=='Po'] = 1
    tr.BsmtQual[tr['BsmtQual']=='Fa'] = 2
    tr.BsmtQual[tr['BsmtQual']=='TA'] = 3
    tr.BsmtQual[tr['BsmtQual']=='Gd'] = 4
    tr.BsmtQual[tr['BsmtQual']=='Ex'] = 5
    tr.BsmtQual = tr.BsmtQual.astype('float32')
    tr.BsmtCond[tr['BsmtCond']=='None'] = 0
    tr.BsmtCond[tr['BsmtCond']=='Po'] = 1
    tr.BsmtCond[tr['BsmtCond']=='Fa'] = 2
    tr.BsmtCond[tr['BsmtCond']=='TA'] = 3
    tr.BsmtCond[tr['BsmtCond']=='Gd'] = 4
    tr.BsmtCond[tr['BsmtCond']=='Ex'] = 5
    tr.BsmtCond = tr.BsmtCond.astype('float32')
    tr.BsmtExposure[tr['BsmtExposure']=='None'] = 0
    tr.BsmtExposure[tr['BsmtExposure']=='No'] = 1
    tr.BsmtExposure[tr['BsmtExposure']=='Mn'] = 2
    tr.BsmtExposure[tr['BsmtExposure']=='Av'] = 3
    tr.BsmtExposure[tr['BsmtExposure']=='Gd'] = 4
    tr.BsmtExposure = tr.BsmtExposure.astype('float32')
    tr.BsmtFinType1[tr['BsmtFinType1']=='None'] = 0
    tr.BsmtFinType1[tr['BsmtFinType1']=='Unf'] = 1
    tr.BsmtFinType1[tr['BsmtFinType1']=='LwQ'] = 2
    tr.BsmtFinType1[tr['BsmtFinType1']=='Rec'] = 3
    tr.BsmtFinType1[tr['BsmtFinType1']=='BLQ'] = 4
    tr.BsmtFinType1[tr['BsmtFinType1']=='ALQ'] = 5
    tr.BsmtFinType1[tr['BsmtFinType1']=='GLQ'] = 6
    tr.BsmtFinType1 = tr.BsmtFinType1.astype('float32')
    tr.BsmtFinType2[tr['BsmtFinType2']=='None'] = 0
    tr.BsmtFinType2[tr['BsmtFinType2']=='Unf'] = 1
    tr.BsmtFinType2[tr['BsmtFinType2']=='LwQ'] = 2
    tr.BsmtFinType2[tr['BsmtFinType2']=='Rec'] = 3
    tr.BsmtFinType2[tr['BsmtFinType2']=='BLQ'] = 4
    tr.BsmtFinType2[tr['BsmtFinType2']=='ALQ'] = 5
    tr.BsmtFinType2[tr['BsmtFinType2']=='GLQ'] = 6
    tr.BsmtFinType2 = tr.BsmtFinType2.astype('float32')
    tr.HeatingQC[tr['HeatingQC']=='None'] = 0
    tr.HeatingQC[tr['HeatingQC']=='Po'] = 1
    tr.HeatingQC[tr['HeatingQC']=='Fa'] = 2
    tr.HeatingQC[tr['HeatingQC']=='TA'] = 3
    tr.HeatingQC[tr['HeatingQC']=='Gd'] = 4
    tr.HeatingQC[tr['HeatingQC']=='Ex'] = 5
    tr.HeatingQC = tr.HeatingQC.astype('float32')
    tr.KitchenQual[tr['KitchenQual']=='None'] = 0
    tr.KitchenQual[tr['KitchenQual']=='Po'] = 1
    tr.KitchenQual[tr['KitchenQual']=='Fa'] = 2
    tr.KitchenQual[tr['KitchenQual']=='TA'] = 3
    tr.KitchenQual[tr['KitchenQual']=='Gd'] = 4
    tr.KitchenQual[tr['KitchenQual']=='Ex'] = 5
    tr.KitchenQual = tr.KitchenQual.astype('float32')
    tr.Functional[tr['Functional']=='None'] = 0
    tr.Functional[tr['Functional']=='Sal'] = 1
    tr.Functional[tr['Functional']=='Sev'] = 2
    tr.Functional[tr['Functional']=='Maj2'] = 3
    tr.Functional[tr['Functional']=='Maj1'] = 4
    tr.Functional[tr['Functional']=='Mod'] = 5
    tr.Functional[tr['Functional']=='Min2'] = 6
    tr.Functional[tr['Functional']=='Min1'] = 7
    tr.Functional[tr['Functional']=='Typ'] = 8
    tr.Functional = tr.Functional.astype('float32')
    tr.FireplaceQu[tr['FireplaceQu']=='None'] = 0
    tr.FireplaceQu[tr['FireplaceQu']=='Po'] = 1
    tr.FireplaceQu[tr['FireplaceQu']=='Fa'] = 2
    tr.FireplaceQu[tr['FireplaceQu']=='TA'] = 3
    tr.FireplaceQu[tr['FireplaceQu']=='Gd'] = 4
    tr.FireplaceQu[tr['FireplaceQu']=='Ex'] = 5
    tr.FireplaceQu = tr.FireplaceQu.astype('float32')
    tr.GarageFinish[tr['GarageFinish']=='None'] = 0
    tr.GarageFinish[tr['GarageFinish']=='Unf'] = 1
    tr.GarageFinish[tr['GarageFinish']=='RFn'] = 2
    tr.GarageFinish[tr['GarageFinish']=='Fin'] = 3
    tr.GarageFinish = tr.GarageFinish.astype('float32')
    tr.GarageQual[tr['GarageQual']=='None'] = 0
    tr.GarageQual[tr['GarageQual']=='Po'] = 1
    tr.GarageQual[tr['GarageQual']=='Fa'] = 2
    tr.GarageQual[tr['GarageQual']=='TA'] = 3
    tr.GarageQual[tr['GarageQual']=='Gd'] = 4
    tr.GarageQual[tr['GarageQual']=='Ex'] = 5
    tr.GarageQual = tr.GarageQual.astype('float32')
    tr.GarageCond[tr['GarageCond']=='None'] = 0
    tr.GarageCond[tr['GarageCond']=='Po'] = 1
    tr.GarageCond[tr['GarageCond']=='Fa'] = 2
    tr.GarageCond[tr['GarageCond']=='TA'] = 3
    tr.GarageCond[tr['GarageCond']=='Gd'] = 4
    tr.GarageCond[tr['GarageCond']=='Ex'] = 5
    tr.GarageCond = tr.GarageCond.astype('float32')
    tr.PoolQC[tr['PoolQC']=='None'] = 0
    tr.PoolQC[tr['PoolQC']=='Po'] = 1
    tr.PoolQC[tr['PoolQC']=='Fa'] = 2
    tr.PoolQC[tr['PoolQC']=='TA'] = 3
    tr.PoolQC[tr['PoolQC']=='Gd'] = 4
    tr.PoolQC[tr['PoolQC']=='Ex'] = 5
    tr.PoolQC = tr.PoolQC.astype('float32')
    return tr

newtrain = feat_eng(combine)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://p

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

In [17]:
combine.BsmtFullBath[combine['BsmtFullBath']=='None'] = 0
combine.BsmtHalfBath[combine['BsmtHalfBath']=='None'] = 0
combine.GarageCars[combine['GarageCars']=='None'] = 0
combine.BsmtUnfSF[combine['BsmtUnfSF']=='None'] = 0
combine.TotalBsmtSF[combine['TotalBsmtSF']=='None'] = 0
combine.GarageArea[combine['GarageArea']=='None'] = 0
combine.BsmtFinSF1[combine['BsmtFinSF1']=='None'] = 0
combine.BsmtFinSF2[combine['BsmtFinSF2']=='None'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [18]:
combine.select_dtypes(exclude='number').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Data columns (total 37 columns):
MSZoning         2919 non-null object
Street           2919 non-null object
Alley            2919 non-null object
LandContour      2919 non-null object
Utilities        2919 non-null object
LotConfig        2919 non-null object
LandSlope        2919 non-null object
Neighborhood     2919 non-null object
Condition1       2919 non-null object
Condition2       2919 non-null object
BldgType         2919 non-null object
HouseStyle       2919 non-null object
RoofStyle        2919 non-null object
RoofMatl         2919 non-null object
Exterior1st      2919 non-null object
Exterior2nd      2919 non-null object
MasVnrType       2919 non-null object
Foundation       2919 non-null object
BsmtFinSF1       2919 non-null object
BsmtFinSF2       2919 non-null object
BsmtUnfSF        2919 non-null object
TotalBsmtSF      2919 non-null object
Heating          2919 non-null object
CentralAir       29

In [19]:
del combine['Alley']
del combine['Fence']
del combine['MiscFeature']

In [20]:
keys = combine.columns.tolist()
values = []
for column in combine:
    if column not in values:
        values.append(combine[column].value_counts(dropna=False).index.tolist())
    else:
        pass

d = OrderedDict(zip(keys, values))
for i, j in d.items():
    print(i)
    print(j)

Id
[2047, 1088, 1084, 1082, 1080, 1078, 1076, 1074, 1072, 1070, 1068, 1066, 1064, 1062, 1060, 1058, 1056, 1054, 1052, 1050, 1048, 1086, 1090, 1044, 1092, 1130, 1128, 1126, 1124, 1122, 1120, 1118, 1116, 1114, 1112, 1110, 1108, 1106, 1104, 1102, 1100, 1098, 1096, 1094, 1046, 1042, 1316, 996, 992, 990, 988, 986, 984, 982, 980, 978, 976, 974, 972, 970, 968, 966, 964, 962, 960, 958, 956, 994, 998, 1040, 1000, 1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024, 1022, 1020, 1018, 1016, 1014, 1012, 1010, 1008, 1006, 1004, 1002, 1132, 1134, 1136, 1270, 1266, 1264, 1262, 1260, 1258, 1256, 1254, 1252, 1250, 1248, 1246, 1244, 1242, 1240, 1238, 1236, 1234, 1232, 1230, 1268, 1272, 1138, 1274, 1312, 1310, 1308, 1306, 1304, 1302, 1300, 1298, 1296, 1294, 1292, 1290, 1288, 1286, 1284, 1282, 1280, 1278, 1276, 1228, 1226, 1224, 1222, 1176, 1174, 1172, 1170, 1168, 1166, 1164, 1162, 1160, 1158, 1156, 1154, 1152, 1150, 1148, 1146, 1144, 1142, 1140, 1178, 1180, 1182, 1204, 1220, 1218, 1216, 1214, 1212, 1210, 1208

# Data Preparation Plan

In [21]:
df = pd.get_dummies(combine.drop(['SalePrice'], axis=1), drop_first=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Columns: 4360 entries, Id to SaleCondition_Partial
dtypes: float32(16), int64(26), uint8(4318)
memory usage: 12.8 MB


In [26]:
newtrain = df[:1460]
newtest = df[1460:]

In [27]:
newtest.shape, newtrain.shape, y.shape

((1459, 4360), (1460, 4360), (1460,))

# Split Training Data

In [29]:
X_train,X_test,y_train,y_test = train_test_split(newtrain, y, test_size = .50, random_state= 42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((730, 4360), (730, 4360), (730,), (730,))

# Algorithm Selection

In [30]:
from sklearn.linear_model import Lasso
La = Lasso(alpha=100)

In [31]:
La.fit(X_train, y_train)

Lasso(alpha=100, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [32]:
r2_score(y_train, La.predict(X_train))

0.9415149382633057

In [33]:
from sklearn.ensemble import GradientBoostingRegressor
gbc = GradientBoostingRegressor()

In [34]:
gbc.fit(X_train,y_train)
r2_score(y_train, gbc.predict(X_train))

0.9715385441773555

In [35]:
gbc.fit(X_test,y_test)
r2_score(y_test, gbc.predict(X_test))

0.9739689558131546

## Gradient Boosting Regressor - Higher accuracy score

# Predict

In [36]:
predictions = gbc.predict(newtest)

In [37]:
newtest.loc[:,'SalePrice'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [38]:
newtest.head()

Unnamed: 0,Id,MSSubClass,LotArea,LotShape,OverallQual,OverallCond,YearBuilt,YearRemodAdd,ExterQual,ExterCond,...,SaleType_New,SaleType_None,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,1461,20,11622,4.0,5,6,1961,1961,3.0,3.0,...,0,0,0,1,0,0,0,1,0,121581.35303
1,1462,20,14267,3.0,6,6,1958,1958,3.0,3.0,...,0,0,0,1,0,0,0,1,0,161150.09597
2,1463,60,13830,3.0,5,5,1997,1998,3.0,3.0,...,0,0,0,1,0,0,0,1,0,172905.682376
3,1464,60,9978,3.0,6,6,1998,1998,3.0,3.0,...,0,0,0,1,0,0,0,1,0,187915.265583
4,1465,120,5005,3.0,8,5,1992,1992,4.0,3.0,...,0,0,0,1,0,0,0,1,0,198646.333689


In [39]:
PriceFinal = newtest[['Id','SalePrice']]
len(PriceFinal)

1459

In [40]:
PriceFinal.to_csv('HousePriceFinal.csv',index=False)