# Importing the necessary libraries and dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


In [124]:
df = pd.read_csv('train.csv')

df.sample(6)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
538,539,20,RL,,11553,Pave,,IR1,Lvl,AllPub,...,0,,,,0,7,2006,WD,Normal,158000
1384,1385,50,RL,60.0,9060,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,10,2009,WD,Normal,105000
1164,1165,80,RL,,16157,Pave,,IR1,Lvl,AllPub,...,0,,,,0,6,2007,WD,Normal,194000
918,919,60,RL,103.0,13125,Pave,,IR1,Lvl,AllPub,...,0,,GdPrv,,0,11,2007,WD,Normal,238000
200,201,20,RM,80.0,8546,Pave,,Reg,Lvl,AllPub,...,0,,,,0,3,2010,WD,Normal,140000
164,165,40,RM,40.0,5400,Pave,Pave,Reg,Lvl,AllPub,...,0,,,,0,10,2007,WD,Normal,152000


In [125]:
df = df.drop(['Id'],axis=1)

df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## 1. Data Preprocessing

## Getting rid of unnecessary columns and missing values

In [126]:
#Dropping columns that didn't add more information to the dataset
# The choice of columns to drop comes from studying data description file

to_drop_col = ['Alley','Condition2','OverallQual','Exterior2nd','ExterQual','BsmtQual','BsmtExposure','BsmtFinSF1',
              'BsmtFinType2','BsmtFinSF2','BsmtUnfSF','Heating','GrLivArea','BsmtFullBath','BsmtHalfBath','FullBath',
              'HalfBath','GarageCars','GarageQual']

df = df.drop(to_drop_col,axis=1)

In [127]:
#Get number of missing values per feature

for feature in df.columns:
    if df[feature].isna().sum() > 0:
        print(f'{feature} has {df[feature].isnull().sum()} missing values')

LotFrontage has 259 missing values
MasVnrType has 8 missing values
MasVnrArea has 8 missing values
BsmtCond has 37 missing values
BsmtFinType1 has 37 missing values
Electrical has 1 missing values
FireplaceQu has 690 missing values
GarageType has 81 missing values
GarageYrBlt has 81 missing values
GarageFinish has 81 missing values
GarageCond has 81 missing values
PoolQC has 1453 missing values
Fence has 1179 missing values
MiscFeature has 1406 missing values


In [128]:
missing_garage = df[(df['GarageType'].isna()) & (df['GarageYrBlt'].isna()) & (df['GarageCond'].isna())]

print(missing_garage.shape[0])
#As we can see the features ['GarageYrBlt','GarageFinish','GarageCond'] has missing values in all colums
#So by removing all the rows that contain missing values we can potentially lose a lot of data

81


In [150]:
def fill_na(df,cols):
    # Function to fill a string value for missing data
    for col in cols:
        df[col].fillna('other',inplace=True)
        
def fill_median(df,cols):
    #Changing missing value to median number
    for col in cols:
        median = df[col].median()
        df[col].fillna(median,inplace=True)
        
def date_built(dataframe):
    #Assigning Garage Year Built to the House/Building Year Built
    dataframe['GarageYrBlt'].fillna(dataframe.loc[missing_garage.index,'YearBuilt'],inplace=True)

df['MiscFeature'] = [0 if x == None else 1 for x in df['MiscFeature']]
df['PoolQC'] = [1 if x == 'Ex' else 0 for x in df['PoolQC']]
df['Fence'] = [0 if x == None else 1 for x in df['Fence']]
        
fill_na(df,['MasVnrType','BsmtCond','BsmtFinType1','Electrical','FireplaceQu','GarageFinish','GarageCond',
        'GarageType'])
fill_median(df,['LotFrontage','MasVnrArea',])
date_built(df)



In [130]:
df.isnull().sum().sort_values(ascending=False).head()

SalePrice       0
Electrical      0
HeatingQC       0
TotalBsmtSF     0
BsmtFinType1    0
dtype: int64

## Handling categorical data

In [131]:
#Get number of unique categories per feature (X)

for column in df.columns:
    if df[column].dtype == 'object':
        unique_cat = df[column].nunique()
        print(f'{column} has {unique_cat} number of features {df[column].unique()}.')

MSZoning has 5 number of features ['RL' 'RM' 'C (all)' 'FV' 'RH'].
Street has 2 number of features ['Pave' 'Grvl'].
LotShape has 4 number of features ['Reg' 'IR1' 'IR2' 'IR3'].
LandContour has 4 number of features ['Lvl' 'Bnk' 'Low' 'HLS'].
Utilities has 2 number of features ['AllPub' 'NoSeWa'].
LotConfig has 5 number of features ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3'].
LandSlope has 3 number of features ['Gtl' 'Mod' 'Sev'].
Neighborhood has 25 number of features ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste'].
Condition1 has 9 number of features ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe'].
BldgType has 5 number of features ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs'].
HouseStyle has 8 number of features ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '

In [132]:
df.shape

(1460, 61)

In [118]:
dummy_list = []

for x in df.columns:
    if df[x].dtype == 'object':
        dummy_list.append(x)
        
def reduce_categories(df,dummy_list):
    for i in dummy_list:
        df[i] = [1 if x == df[i].value_counts().sort_values(ascending = False).idxmax() else 0 for x in df[i]]
        


In [133]:
reduce_categories(df,dummy_list)


print(df.head(), df.shape)

   MSSubClass  MSZoning  LotFrontage  LotArea  Street  LotShape  LandContour  \
0          60         1         65.0     8450       1         1            1   
1          20         1         80.0     9600       1         1            1   
2          60         1         68.0    11250       1         0            1   
3          70         1         60.0     9550       1         0            1   
4          60         1         84.0    14260       1         0            1   

   Utilities  LotConfig  LandSlope  ...  PoolArea  PoolQC  Fence  MiscFeature  \
0          1          1          1  ...         0       0      1            1   
1          1          0          1  ...         0       0      1            1   
2          1          1          1  ...         0       0      1            1   
3          1          0          1  ...         0       0      1            1   
4          1          0          1  ...         0       0      1            1   

   MiscVal  MoSold  YrSold  Sale

### Applying the model

In [139]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE

In [140]:
X,y = df.iloc[:,:-1], df.iloc[:,-1]
  
train_X, val_X, train_y, val_y = train_test_split(X, y, 
                      test_size = 0.2, random_state = 123) 
  
 
xgb_r = xgb.XGBRegressor(objective ='reg:linear', 
                  n_estimators = 10, seed = 123) 
  
# Fitting the model 
xgb_r.fit(train_X, train_y) 
  
# Predict the model 
pred = xgb_r.predict(val_X) 
  
# RMSE Computation 
rmse = np.sqrt(MSE(val_y, pred)) 
print("RMSE : % f" %(rmse)) 

RMSE :  34749.008366


In [141]:
final_model = xgb_r.fit(X, y) 



### So as we can see the root mean squared error is 34,750.