# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error,accuracy_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# Reading the data

In [2]:
train=pd.read_csv('../input/home-data-for-ml-course/train.csv')
test=pd.read_csv('../input/home-data-for-ml-course/test.csv')

In [3]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


**Test data is similar to train data. The only difference is in that train contains the column SalePrice**

**We will be using train data for training and validation and final testing on test data**

# Understanding the data

# Filling Missing Values

In [4]:
print("Missing values in train data")
missing=train.isnull().sum()
missing[missing>0]

Missing values in train data


LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
print("Missing values in test data")
missing=test.isnull().sum()
missing[missing>0]

Missing values in test data


MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        16
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
SaleType           1
dtype: int64

**return_missing_col is a helper function to find the missing columns of a dataset easily**

In [6]:
def return_missing_col(data):
    columns=[col for col in data.columns if data[col].isnull().any()]
    return columns

In [7]:
return_missing_col(train)

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [8]:
return_missing_col(test)

['MSZoning',
 'LotFrontage',
 'Alley',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType']

**Inorder to fill the missing values effectively, you have to understand the data in the dataset. Go through the data description 
    and find which columns should be dropped and which columns to be fillled**

In [9]:
train['LotFrontage'].mean()

70.04995836802665

In [10]:
train['LotFrontage']=train['LotFrontage'].fillna(train['LotFrontage'].mean())
test['LotFrontage']=test['LotFrontage'].fillna(train['LotFrontage']).mean()

In [11]:
train.drop('Alley',axis=1,inplace=True)
test.drop('Alley',axis=1,inplace=True)

In [12]:
train.drop(columns=['MasVnrType','MasVnrArea',],axis=1,inplace=True)
test.drop(columns=['MasVnrType','MasVnrArea',],axis=1,inplace=True)

**check_object is another helper function to help us in creating data for our model**

In [13]:
# the function takes in a type of Series
def check_object(obj):
    objct=[]
    for x in obj:
        if x=='Na':
            objct.append(0)
        else:
            objct.append(1)
    return objct

In [14]:
# we need to fill the misssing values before calling the funnction
train['BsmtQual']=train['BsmtQual'].fillna('Na')
test['BsmtQual']=test['BsmtQual'].fillna('Na')

train['Basement']=pd.DataFrame(check_object(train['BsmtQual']))

test['Basement']=pd.DataFrame(check_object(test['BsmtQual']))

In [15]:
train.drop(columns=['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtFinSF1','BsmtFinSF2','BsmtFinSF1','BsmtUnfSF',
                    'TotalBsmtSF'],axis=1,inplace=True)

test.drop(columns=['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtFinSF1','BsmtFinSF2','BsmtFinSF1','BsmtUnfSF',
                   'TotalBsmtSF'],axis=1,inplace=True)

In [16]:
train['Electrical'].value_counts()

SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64

In [17]:
# filling missing values with 'Mix'
train['Electrical']=train['Electrical'].fillna('Mix')

In [18]:
train.drop(columns=['FireplaceQu'],axis=1,inplace=True)
test.drop(columns=['FireplaceQu'],axis=1,inplace=True)

In [19]:
train['GarageQual']=train['GarageQual'].fillna('Na')
test['GarageQual']=test['GarageQual'].fillna('Na')

train['garage']=pd.DataFrame(check_object(train['GarageQual']))
test['garage']=pd.DataFrame(check_object(test['GarageQual']))

In [20]:
train.drop(columns=['GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageArea','GarageQual','GarageCond'],axis=1,inplace=True)
test.drop(columns=['GarageType','GarageYrBlt','GarageFinish','GarageCars','GarageArea','GarageQual','GarageCond'],axis=1,inplace=True)

In [21]:
train['PoolQC']=train['PoolQC'].fillna('Na')
test['PoolQC']=test['PoolQC'].fillna('Na')

train['pool']=pd.DataFrame(check_object(train['PoolQC']))
test['pool']=pd.DataFrame(check_object(test['PoolQC']))

In [22]:
train.drop(columns=['PoolQC','PoolArea'],axis=1,inplace=True)
test.drop(columns=['PoolQC','PoolArea'],axis=1,inplace=True)

In [23]:
train['Fence']=train['Fence'].fillna('Na')
test['Fence']=test['Fence'].fillna('Na')

train['fence']=pd.DataFrame(check_object(train['Fence']))
test['fence']=pd.DataFrame(check_object(test['Fence']))

In [24]:
train.drop('Fence',axis=1,inplace=True)
test.drop('Fence',axis=1,inplace=True)

In [25]:
train.drop(columns=['MiscFeature','MiscVal','MoSold','YrSold','SaleType','SaleCondition'],axis=1,inplace=True)
test.drop(columns=['MiscFeature','MiscVal','MoSold','YrSold','SaleType','SaleCondition'],axis=1,inplace=True)

In [26]:
print("Columns in the train data with missing values are:",len(return_missing_col(train)))

Columns in the train data with missing values are: 0


In [27]:
print("Columns with missing values are:",return_missing_col(test))

Columns with missing values are: ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional']


In [28]:
train.drop(columns=['MSSubClass'],inplace=True,axis=1)
test.drop(columns=['MSSubClass'],inplace=True,axis=1)

In [29]:
test['MSZoning'].isnull().sum()

4

In [30]:
test['MSZoning']=test['MSZoning'].fillna('RL')

In [31]:
print(" Number of missing values in Utilities are :",test['Utilities'].isnull().sum())
test['Utilities'].value_counts()

 Number of missing values in Utilities are : 2


AllPub    1457
Name: Utilities, dtype: int64

In [32]:
test['Utilities']=test['Utilities'].fillna('AllPub')

In [33]:
test.drop(columns=['Neighborhood','LotConfig','LandSlope','Condition1','Condition2','BldgType','HouseStyle','YearRemodAdd','RoofStyle',
                   'RoofMatl','Exterior1st','Exterior2nd','BsmtFullBath','BsmtHalfBath','KitchenQual','Functional'],axis=1,inplace=True)
train.drop(columns=['Neighborhood','LotConfig','LandSlope','Condition1','Condition2','BldgType','HouseStyle','YearRemodAdd','RoofStyle',
                   'RoofMatl','Exterior1st','Exterior2nd','BsmtFullBath','BsmtHalfBath','KitchenQual','Functional'],axis=1,inplace=True)

In [34]:
print("Columns with missing values are:",len(return_missing_col(test)))

Columns with missing values are: 0


In [35]:
train.columns

Index(['Id', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'OverallQual', 'OverallCond', 'YearBuilt',
       'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Fireplaces', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'SalePrice', 'Basement',
       'garage', 'pool', 'fence'],
      dtype='object')

In [36]:
test.columns

Index(['Id', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'LotShape',
       'LandContour', 'Utilities', 'OverallQual', 'OverallCond', 'YearBuilt',
       'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Fireplaces', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'Basement', 'garage',
       'pool', 'fence'],
      dtype='object')

# Creating features

In [37]:
features=['MSZoning','LotFrontage','LotArea','YearBuilt', 'Street','Utilities', 'OverallQual', 'OverallCond','ExterQual', 'ExterCond',
          'Foundation', 'Heating','FullBath', 'HeatingQC','CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF','GrLivArea','BedroomAbvGr', 'KitchenAbvGr',
       'TotRmsAbvGrd', 'Fireplaces','PavedDrive', 'Basement', 'garage','pool', 'fence']

In [38]:
X=train[features]
y=train.SalePrice

In [39]:
X.head()

Unnamed: 0,MSZoning,LotFrontage,LotArea,YearBuilt,Street,Utilities,OverallQual,OverallCond,ExterQual,ExterCond,...,GrLivArea,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,PavedDrive,Basement,garage,pool,fence
0,RL,65.0,8450,2003,Pave,AllPub,7,5,Gd,TA,...,1710,3,1,8,0,Y,1,1,0,0
1,RL,80.0,9600,1976,Pave,AllPub,6,8,TA,TA,...,1262,3,1,6,1,Y,1,1,0,0
2,RL,68.0,11250,2001,Pave,AllPub,7,5,Gd,TA,...,1786,3,1,6,1,Y,1,1,0,0
3,RL,60.0,9550,1915,Pave,AllPub,7,5,TA,TA,...,1717,3,1,7,1,Y,1,1,0,0
4,RL,84.0,14260,2000,Pave,AllPub,8,5,Gd,TA,...,2198,4,1,9,1,Y,1,1,0,0


In [40]:
y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [41]:
# creates a list of columns with categorical data
s = (X.dtypes == 'object')
categorical_cols = list(s[s].index)

In [42]:
categorical_cols

['MSZoning',
 'Street',
 'Utilities',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'PavedDrive']

# Converting Categorical values to Numerical Values

In [43]:
Encoder_x=LabelEncoder()
for col in categorical_cols:
    X[col]=Encoder_x.fit_transform(X[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [44]:
X.head()

Unnamed: 0,MSZoning,LotFrontage,LotArea,YearBuilt,Street,Utilities,OverallQual,OverallCond,ExterQual,ExterCond,...,GrLivArea,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,PavedDrive,Basement,garage,pool,fence
0,3,65.0,8450,2003,1,0,7,5,2,4,...,1710,3,1,8,0,2,1,1,0,0
1,3,80.0,9600,1976,1,0,6,8,3,4,...,1262,3,1,6,1,2,1,1,0,0
2,3,68.0,11250,2001,1,0,7,5,2,4,...,1786,3,1,6,1,2,1,1,0,0
3,3,60.0,9550,1915,1,0,7,5,3,4,...,1717,3,1,7,1,2,1,1,0,0
4,3,84.0,14260,2000,1,0,8,5,2,4,...,2198,4,1,9,1,2,1,1,0,0


# Building the model for training and evaluation

In [45]:
train_x,val_x,train_y,val_y=train_test_split(X,y,train_size=0.8,test_size=0.2)

In [46]:
test2=test[features]

In [47]:
Encoder=LabelEncoder()
for col in categorical_cols:
    test2[col]=Encoder.fit_transform(test2[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


# Random Forest Regreesor

In [48]:
model=RandomForestRegressor()
model.fit(train_x,train_y)
pred=model.predict(val_x)

In [49]:
print("Mean absolute error:",mean_absolute_error(pred,val_y))
print("Model score",model.score(val_x,val_y))

Mean absolute error: 18006.543938356164
Model score 0.8796866782713438


In [50]:
model=RandomForestRegressor()
model.fit(X,y)
pred=model.predict(test2)

In [51]:
output=pd.DataFrame({'Id':test.Id,'SalePrice':pred})
output.to_csv('submission_rf.csv',index=False)

In [52]:
output.head()

Unnamed: 0,Id,SalePrice
0,1461,125391.0
1,1462,148883.0
2,1463,178224.59
3,1464,177838.9
4,1465,197211.12


# XGB Regressor

In [53]:
model=XGBRegressor(n_estimators=500,learning_rate=0.05)
model.fit(train_x,train_y,early_stopping_rounds=5,eval_set=[(val_x,val_y)],verbose=False)
pred=model.predict(val_x)
print("Mean absolute error",mean_absolute_error(pred,val_y))
print("Root mean square error",mean_squared_error(pred,val_y,squared=False))

Mean absolute error 18053.44290453767
Root mean square error 27978.69722656122


In [54]:
model.fit(X,y,early_stopping_rounds=5,eval_set=[(val_x,val_y)],verbose=False)
pred=model.predict(test2)

In [55]:
output=pd.DataFrame({'Id':test.Id,'SalePrice':pred})
output.to_csv('submission_xgb.csv',index=False)

In [56]:
output.head()

Unnamed: 0,Id,SalePrice
0,1461,129556.054688
1,1462,150097.296875
2,1463,175792.640625
3,1464,199926.0625
4,1465,190243.453125
