In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Loading Data
# NOTE(review): this is an absolute, machine-specific directory; the notebook
# will only run unchanged on a machine that has the file at this location.
path = '/Users/krishanubanerjee/Documents/kaggle/'
# The directory string already ends with a separator, so the file name is appended directly.
df_train = pd.read_csv(f'{path}train.csv')

In [3]:
# Display the (rows, columns) dimensions of the freshly loaded training frame.
df_train.shape

(1460, 81)

In [4]:
# List every column label of the raw training frame.
df_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

#### Missing value imputation

##### Rule 1: Drop columns with more than 25% missing values

In [5]:
# Collect the names of all columns in which more than 25% of the rows are missing.
columns = list(df_train)
# Fraction of missing cells per column (0.0 - 1.0), computed once for the whole frame
# instead of re-scanning each column inside the loop.
missing_share = df_train.isnull().mean()
columns_to_drop = []
for column in columns:
    # Rounded value is for display only.
    missing_prcntg = round(missing_share[column], 2)
    # Compare the *unrounded* fraction: a column with e.g. 25.3% missing rounds
    # to 0.25 and would otherwise slip past the "> 25%" rule.
    if missing_share[column] > .25:
        print(column + ' - ' + str(missing_prcntg))
        columns_to_drop.append(str(column))

Alley - 0.94
FireplaceQu - 0.47
PoolQC - 1.0
Fence - 0.81
MiscFeature - 0.96


In [6]:
# Remove every column named in columns_to_drop; drop() returns a new frame,
# which is rebound to df_train.
df_train = df_train.drop(columns=columns_to_drop)
# Show the dimensions after the drop.
df_train.shape

(1460, 76)

##### Rule 2: One-hot encode categorical columns, then fill remaining missing values with the column mean

In [7]:
# Step 1: expand the non-numeric columns into indicator columns.
# Step 2: replace each remaining NaN with the mean of its column.
# NOTE(review): the means are taken over all rows, i.e. before the train/test
# split performed later in the notebook.
df_train = pd.get_dummies(df_train).pipe(
    lambda encoded: encoded.fillna(encoded.mean())
)

In [8]:
# Sanity check: print every column that still contains missing values after imputation.
# Nothing is printed when the frame is complete.
columns = list(df_train)
# Number of NaN cells per column, computed once for the whole frame.
missing_count = df_train.isnull().sum()
for column in columns:
    # Rounded fraction is for display only.
    missing_prcntg = round(missing_count[column] / len(df_train), 2)
    # Test the raw count rather than the rounded fraction: with two-decimal
    # rounding, a column missing fewer than 0.5% of its rows would show as 0.0
    # and go unreported.
    if missing_count[column] > 0:
        print(column + ' - ' + str(missing_prcntg))

### Building a Random Forest model

#### Removing outliers and test train split

In [9]:
""" Removing outlier as mentioned in the document"""
# Keep only the rows whose GrLivArea is below 4500, then renumber the index
# from 0 so no gaps remain where rows were removed.
df_train = df_train.loc[df_train['GrLivArea'] < 4500].reset_index(drop=True)

In [10]:
# Target vector: the SalePrice column as a standalone NumPy array.
labels = df_train['SalePrice'].to_numpy(copy=True)
# Predictors: every remaining column.
# NOTE(review): the 'Id' column is never removed, so it is part of the feature
# matrix here; consider whether an identifier should be used as a predictor.
features = df_train.drop(columns='SalePrice')
# Remember the column order so importances can be mapped back to names later.
feature_list = features.columns.tolist()
# From here on `features` is a plain array rather than a DataFrame.
features = features.to_numpy(copy=True)

In [11]:
# Display the (rows, columns) dimensions of the feature array.
features.shape

(1458, 271)

In [12]:
"""Split data into training/test sets"""
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [13]:
""" Random forest with 1000 Decision Trees"""
# 1000 trees, with a fixed seed so repeated runs build the same forest.
rf = RandomForestRegressor(random_state=42, n_estimators=1000)
# Train the model on the training split; fit() is the last expression of the
# cell, so the notebook displays the fitted estimator.
rf.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1000,
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False)

In [14]:
"""Prediction and calculate Mean Absolute Percentage Error (MAPE) and accuracy"""
# Predict the held-out rows.
predictions = rf.predict(test_features)
# Absolute error per sample.
errors = np.abs(predictions - test_labels)
# Per-sample absolute percentage error; despite the name, the averaging into a
# single MAPE figure only happens on the next line.
mape = (errors / test_labels) * 100
# "Accuracy" as used in this notebook is defined as 100 minus the mean percentage error.
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 89.51 %.


In [18]:
"""Feature Importance """
# One row per feature, labelled by column name and ordered from most to least important.
feature_importances = (
    pd.DataFrame({'importance': rf.feature_importances_}, index=feature_list)
    .sort_values(by='importance', ascending=False)
)
# Show the 15 highest-ranked features.
feature_importances.head(15)

Unnamed: 0,importance
OverallQual,0.551109
GrLivArea,0.127149
TotalBsmtSF,0.060908
1stFlrSF,0.02847
BsmtFinSF1,0.026645
GarageArea,0.020399
2ndFlrSF,0.017175
TotRmsAbvGrd,0.015826
LotArea,0.012137
YearBuilt,0.012128


#### Conclusion
<br> - We can see that the test accuracy (100 minus the mean absolute percentage error) is reasonable. <br> - We can also see that the top 
ten features account for most of the total importance, so the next step is to build a model 
using only these ten features.<br> - For simplicity and quick delivery, I have not
done any EDA, cross-validation, or model tuning, and I have not tried any other model.<br>

#### Repeat model building with top 10 features

In [17]:
# feature_importances is indexed by feature name and already sorted in
# descending order, so the first ten index labels are the top ten features.
top_10_feature = feature_importances.index[:10].tolist()
# The target column is appended so the subset frame still carries the label.
top_10_feature.append('SalePrice')
top_10_feature

['OverallQual',
 'GrLivArea',
 'TotalBsmtSF',
 '1stFlrSF',
 'BsmtFinSF1',
 'GarageArea',
 '2ndFlrSF',
 'TotRmsAbvGrd',
 'LotArea',
 'YearBuilt',
 'SalePrice']

In [19]:
# Restrict the training frame to the columns listed in top_10_feature
# (all rows are kept).
df_train_sub = df_train.loc[:, top_10_feature]

In [20]:
# Display the (rows, columns) dimensions of the reduced frame.
df_train_sub.shape

(1458, 11)

In [21]:
# Rebuild the target vector and feature matrix from the reduced frame.
# These assignments overwrite the variables of the full-feature model.
labels = df_train_sub['SalePrice'].to_numpy(copy=True)
features = df_train_sub.drop(columns='SalePrice')
feature_list = features.columns.tolist()
features = features.to_numpy(copy=True)

# Same split settings as the full-feature run: 20% hold-out, seed 42.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Same model configuration as before: 1000 trees, fixed seed.
rf = RandomForestRegressor(random_state=42, n_estimators=1000)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)

# Absolute error per sample.
errors = np.abs(predictions - test_labels)
# Per-sample absolute percentage error; the mean is taken on the next line.
mape = (errors / test_labels) * 100
# "Accuracy" is defined here as 100 minus the mean percentage error.
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 88.72 %.


#### As the accuracy has not dropped much and is still reasonable, we will use this code when dockerizing the model.