## House Price Prediction

In [153]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
%matplotlib inline

## Initially

In [154]:
home_data = df

y = home_data.SalePrice

features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

X = home_data[features]


train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)


iowa_model = DecisionTreeRegressor(random_state=1)

iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE when not specifying max_leaf_nodes: {:,.0f}".format(val_mae))

# Using best value for max_leaf_nodes
iowa_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)
iowa_model.fit(train_X, train_y)
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE for best value of max_leaf_nodes: {:,.0f}".format(val_mae))

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

Validation MAE when not specifying max_leaf_nodes: 29,653
Validation MAE for best value of max_leaf_nodes: 27,283
Validation MAE for Random Forest Model: 22,762




In [155]:
# To improve accuracy, create a new Random Forest model which you will train on all training data
rf_model_on_full_data = RandomForestRegressor(random_state=1)

# fit rf_model_on_full_data on all data from the training data
rf_model_on_full_data.fit(X,y)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False)

In [156]:
# read test data file using pandas
test_data = pd.read_csv("test.csv")

# create test_X which comes from test_data but includes only the columns you used for prediction.
# The list of columns is stored in a variable called features
test_X = test_data[features]

# make predictions which we will submit. 
test_preds = rf_model_on_full_data.predict(test_X)
        
# The lines below shows how to save predictions in format used for competition scoring
# Just uncomment them.

output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})
# output.to_csv('submission.csv', index=False)
output

Unnamed: 0,Id,SalePrice
0,1461,112945.0
1,1462,149770.0
2,1463,178100.0
3,1464,177950.0
4,1465,189000.0
...,...,...
1454,2915,82300.0
1455,2916,91350.0
1456,2917,141674.1
1457,2918,130350.0


## Changing Some Factors and trying again

In [157]:
df = pd.read_csv("train.csv")

In [158]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [159]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

##### Finding corelation 

In [160]:
df[df.columns].corr()['SalePrice'][:]
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'TotRmsAbvGrd']

In [161]:


new_features = ['OverallQual','YearBuilt','YearRemodAdd','TotalBsmtSF','1stFlrSF','2ndFlrSF',
               'GrLivArea','FullBath','TotRmsAbvGrd','GarageCars','GarageArea']

# new_features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'TotRmsAbvGrd']
X1 = df[new_features]

y= df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X1, y, random_state=1)



dtree = DecisionTreeRegressor(random_state=1)
dtree.fit(X_train,y_train)
dtree_pred = dtree.predict(X_test)
dtree_mae = mean_absolute_error(dtree_pred, y_test)
print("Validation MAE using Decision Tress: {:,.0f}".format(dtree_mae))


rfr = RandomForestRegressor(random_state=1)
rfr.fit(X_train,y_train)
rfr_pred= rfr.predict(X_test)
rfr_mae = mean_absolute_error(rfr_pred, y_test)
print("Validation MAE using Random Forests: {:,.0f}".format(rfr_mae))

Validation MAE using Decision Tress: 25,044
Validation MAE using Random Forests: 20,976




## Now training on whole data

In [162]:
test_data = pd.read_csv("test.csv")

rfr_full = RandomForestRegressor(random_state=1)
rfr_full.fit(X1,y)

test_X = test_data[new_features]

test_X.info()

# test_preds = rfr_full.predict(test_X)
        
# output = pd.DataFrame({'Id': test_data.Id,
#                        'SalePrice': test_preds})
# # output.to_csv('submission2.csv', index=False)
# output



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 11 columns):
OverallQual     1459 non-null int64
YearBuilt       1459 non-null int64
YearRemodAdd    1459 non-null int64
TotalBsmtSF     1458 non-null float64
1stFlrSF        1459 non-null int64
2ndFlrSF        1459 non-null int64
GrLivArea       1459 non-null int64
FullBath        1459 non-null int64
TotRmsAbvGrd    1459 non-null int64
GarageCars      1458 non-null float64
GarageArea      1458 non-null float64
dtypes: float64(3), int64(8)
memory usage: 125.5 KB


In [163]:
test_X['TotalBsmtSF'].fillna(test_X['TotalBsmtSF'].mean(),inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [164]:
test_X['GarageCars'].fillna(test_X['GarageCars'].mean(),inplace=True)

In [165]:
test_X['GarageArea'].fillna(test_X['GarageArea'].mean(),inplace=True)

In [166]:
test_X.info()

test_preds = rfr_full.predict(test_X)
        
output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})
output.to_csv('submission2.csv', index=False)
output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 11 columns):
OverallQual     1459 non-null int64
YearBuilt       1459 non-null int64
YearRemodAdd    1459 non-null int64
TotalBsmtSF     1459 non-null float64
1stFlrSF        1459 non-null int64
2ndFlrSF        1459 non-null int64
GrLivArea       1459 non-null int64
FullBath        1459 non-null int64
TotRmsAbvGrd    1459 non-null int64
GarageCars      1459 non-null float64
GarageArea      1459 non-null float64
dtypes: float64(3), int64(8)
memory usage: 125.5 KB


Unnamed: 0,Id,SalePrice
0,1461,124340.000000
1,1462,152575.000000
2,1463,172350.000000
3,1464,179100.000000
4,1465,220940.000000
...,...,...
1454,2915,80100.000000
1455,2916,81916.666667
1456,2917,152898.700000
1457,2918,109610.000000


## Using More features Increased the efficiency

In [167]:
third_try_features = ['OpenPorchSF','LotArea','BsmtFinSF1','BsmtUnfSF','BsmtFullBath','HalfBath','BedroomAbvGr','Fireplaces','WoodDeckSF','OpenPorchSF','ScreenPorch',
                     'OverallQual','YearBuilt','YearRemodAdd','TotalBsmtSF','1stFlrSF','2ndFlrSF',
               'GrLivArea','FullBath','TotRmsAbvGrd','GarageCars','GarageArea','GarageYrBlt']


In [168]:
for i in third_try_features:
    print(i+" == "+str(sum(df[i].isna())))

OpenPorchSF == 0
LotArea == 0
BsmtFinSF1 == 0
BsmtUnfSF == 0
BsmtFullBath == 0
HalfBath == 0
BedroomAbvGr == 0
Fireplaces == 0
WoodDeckSF == 0
OpenPorchSF == 0
ScreenPorch == 0
OverallQual == 0
YearBuilt == 0
YearRemodAdd == 0
TotalBsmtSF == 0
1stFlrSF == 0
2ndFlrSF == 0
GrLivArea == 0
FullBath == 0
TotRmsAbvGrd == 0
GarageCars == 0
GarageArea == 0
GarageYrBlt == 81


In [169]:
df[['GarageYrBlt','YearBuilt','GarageCars','GarageArea']]

Unnamed: 0,GarageYrBlt,YearBuilt,GarageCars,GarageArea
0,2003.0,2003,2,548
1,1976.0,1976,2,460
2,2001.0,2001,2,608
3,1998.0,1915,3,642
4,2000.0,2000,3,836
...,...,...,...,...
1455,1999.0,1999,2,460
1456,1978.0,1978,2,500
1457,1941.0,1941,1,252
1458,1950.0,1950,1,240


In [170]:
df[(df['YearBuilt']!=df['GarageYrBlt']) & (df['GarageYrBlt'].isna()!=True)][['YearBuilt','GarageYrBlt']]

Unnamed: 0,YearBuilt,GarageYrBlt
3,1915,1998.0
15,1929,1991.0
26,1951,2005.0
27,2007,2008.0
29,1927,1920.0
...,...,...
1432,1927,1928.0
1436,1971,1974.0
1440,1922,1993.0
1445,1966,1990.0


## From this we can see that GarageYrBlt is same as YarBuilt for most of the houses but not for 290 houses.

So we can fill the values same as yearBuilt

In [171]:
df['GarageYrBlt'].fillna(df['YearBuilt'],inplace=True)

In [172]:
sum(df['GarageYrBlt'].isna())

0

In [173]:

X2 = df[third_try_features]

y= df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X2, y, random_state=1)



dtree = DecisionTreeRegressor(random_state=1)
dtree.fit(X_train,y_train)
dtree_pred = dtree.predict(X_test)
dtree_mae = mean_absolute_error(dtree_pred, y_test)
print("Validation MAE using Decision Tress: {:,.0f}".format(dtree_mae))


rfr = RandomForestRegressor(random_state=1)
rfr.fit(X_train,y_train)
rfr_pred= rfr.predict(X_test)
rfr_mae = mean_absolute_error(rfr_pred, y_test)
print("Validation MAE using Random Forests: {:,.0f}".format(rfr_mae))

Validation MAE using Decision Tress: 26,798
Validation MAE using Random Forests: 18,587




## Without considering GarageYrBlt

In [174]:
third_try_features = ['OpenPorchSF','LotArea','BsmtFinSF1','BsmtUnfSF','BsmtFullBath','HalfBath','BedroomAbvGr','Fireplaces','WoodDeckSF','OpenPorchSF','ScreenPorch',
                     'OverallQual','YearBuilt','YearRemodAdd','TotalBsmtSF','1stFlrSF','2ndFlrSF',
               'GrLivArea','FullBath','TotRmsAbvGrd','GarageCars','GarageArea']



X2 = df[third_try_features]

y= df['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X2, y, random_state=1)



dtree = DecisionTreeRegressor(random_state=1)
dtree.fit(X_train,y_train)
dtree_pred = dtree.predict(X_test)
dtree_mae = mean_absolute_error(dtree_pred, y_test)
print("Validation MAE using Decision Tress: {:,.0f}".format(dtree_mae))


rfr = RandomForestRegressor(random_state=1)
rfr.fit(X_train,y_train)
rfr_pred= rfr.predict(X_test)
rfr_mae = mean_absolute_error(rfr_pred, y_test)
print("Validation MAE using Random Forests: {:,.0f}".format(rfr_mae))

Validation MAE using Decision Tress: 27,093




Validation MAE using Random Forests: 18,425


## Since the MAE is less when we ignore the GarageYrBlt so its better to drop that.

In [187]:
test_data = pd.read_csv("test.csv")

rfr_full = RandomForestRegressor(random_state=1)
rfr_full.fit(X2,y)

test_X = test_data[third_try_features]

test_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 22 columns):
OpenPorchSF     1459 non-null int64
LotArea         1459 non-null int64
BsmtFinSF1      1458 non-null float64
BsmtUnfSF       1458 non-null float64
BsmtFullBath    1457 non-null float64
HalfBath        1459 non-null int64
BedroomAbvGr    1459 non-null int64
Fireplaces      1459 non-null int64
WoodDeckSF      1459 non-null int64
OpenPorchSF     1459 non-null int64
ScreenPorch     1459 non-null int64
OverallQual     1459 non-null int64
YearBuilt       1459 non-null int64
YearRemodAdd    1459 non-null int64
TotalBsmtSF     1458 non-null float64
1stFlrSF        1459 non-null int64
2ndFlrSF        1459 non-null int64
GrLivArea       1459 non-null int64
FullBath        1459 non-null int64
TotRmsAbvGrd    1459 non-null int64
GarageCars      1458 non-null float64
GarageArea      1458 non-null float64
dtypes: float64(6), int64(16)
memory usage: 250.9 KB




In [190]:
df['BsmtFinSF1'] = df['BsmtFinSF1'].replace(df[(df['BsmtFinSF1'].isnull())]['BsmtFinSF1'], df['BsmtFinSF1'].mean())

In [191]:
test_X.info()
# test_preds = rfr_full.predict(test_X)
        
# output = pd.DataFrame({'Id': test_data.Id,
#                        'SalePrice': test_preds})
# output.to_csv('submission3.csv', index=False)
# output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 22 columns):
OpenPorchSF     1459 non-null int64
LotArea         1459 non-null int64
BsmtFinSF1      1458 non-null float64
BsmtUnfSF       1458 non-null float64
BsmtFullBath    1457 non-null float64
HalfBath        1459 non-null int64
BedroomAbvGr    1459 non-null int64
Fireplaces      1459 non-null int64
WoodDeckSF      1459 non-null int64
OpenPorchSF     1459 non-null int64
ScreenPorch     1459 non-null int64
OverallQual     1459 non-null int64
YearBuilt       1459 non-null int64
YearRemodAdd    1459 non-null int64
TotalBsmtSF     1458 non-null float64
1stFlrSF        1459 non-null int64
2ndFlrSF        1459 non-null int64
GrLivArea       1459 non-null int64
FullBath        1459 non-null int64
TotRmsAbvGrd    1459 non-null int64
GarageCars      1458 non-null float64
GarageArea      1458 non-null float64
dtypes: float64(6), int64(16)
memory usage: 250.9 KB
