# Kaggle competition : Housing Price prediction 

<b>Objective </b>: Predict house price 
    
<b>Problem type</b> : Regression 

<b>Data </b>: described in Data/data_description.txt 
    
<b>Modelling </b>: random forest regressors, compared by mean absolute error (MAE) on a held-out validation set
    
    

## Import libraries 

In [111]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline 

## Import data

In [103]:
from sklearn.model_selection import train_test_split

# Load the competition data, using the 'Id' column as the index
X_full = pd.read_csv('Data/train.csv', sep=',', index_col='Id')
X_full_test = pd.read_csv('Data/test.csv', sep=',', index_col='Id')

# Remove rows with missing target
X_full = X_full.dropna(axis=0, subset=['SalePrice'])

# Separate target from predictors.
# NOTE: drop(..., inplace=True) returns None, so its result must never be
# assigned to a variable; the frame is rebound explicitly instead.
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# To keep things simple, we'll use only numerical predictors
X = X_full.select_dtypes(exclude=['object'])
X_test = X_full_test.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                  random_state=0)


## Experiment 1: Testing different models on a selection of features (columns without missing values)



In [118]:
from sklearn.model_selection import train_test_split

# Reload the raw train / test tables, indexed by house Id
X_full = pd.read_csv('Data/train.csv', sep=',', index_col='Id')
X_full_test = pd.read_csv('Data/test.csv', sep=',', index_col='Id')

# Hand-picked predictor columns used in this experiment
features = [
    'LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
]

# Target
y = X_full['SalePrice']

# Independent copies of the selected columns for the train and test tables
X = X_full.loc[:, features].copy()
X_test = X_full_test.loc[:, features].copy()

# 80% / 20% train / validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [119]:
from sklearn.metrics import mean_absolute_error

# Getting mae performance for testing different models
def get_mae(X_train, y_train, X_val, y_val, model):
    '''Function that computes the mae perfomance for a given model given in input
    ----------
    Returns : 
        error (float) : MAE value given the input model and X, y datasets
    -----------
    Agurments :
        X_train, X_test (dataframes) : Input dataframes for test and train sets
        y_train, y_test (Series)     : Series of labels for test and train sets
    
    '''
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    error = mean_absolute_error(predictions, y_val)
    return error


In [120]:
# Testing different models 

from sklearn.ensemble import RandomForestRegressor

# Define the candidate models (fixed random_state so the comparison is reproducible)
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
# 'absolute_error' is the current name of the MAE split criterion. The former
# alias 'mae' was deprecated in scikit-learn 1.0 and removed in 1.2
# (on scikit-learn < 1.0, use criterion='mae' instead).
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

# Order matters: the scoring loop labels the models by their position in this list
models = [model_1, model_2, model_3, model_4, model_5]

In [121]:
# Score every candidate on the same train / validation split
for model_number, model in enumerate(models, start=1):
    score = get_mae(X_train, y_train, X_val, y_val, model=model)
    print('Score model_{} : '.format(model_number), score)

Score model_1 :  24015.492818003917
Score model_2 :  23740.979228636657
Score model_3 :  23528.78421232877
Score model_4 :  23996.676789668687
Score model_5 :  23706.672864217904


Model 3 has the lowest validation MAE in this first experiment, so it is the one we'll use.

In [123]:
## Submitting results for model 3 and selected features

# Train model 3 on the training split
model_3.fit(X_train, y_train)

# Predict sale prices for the competition test set
preds_test = model_3.predict(X_test)

# Write the predictions as two columns (Id, SalePrice) for competition scoring
submission_columns = {'Id': X_test.index, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)


## Experiment 2: Handling missing data

### Import data 

In [138]:
from sklearn.model_selection import train_test_split

# Load the competition data, using the 'Id' column as the index
X_full = pd.read_csv('Data/train.csv', sep=',', index_col='Id')
X_full_test = pd.read_csv('Data/test.csv', sep=',', index_col='Id')

# Remove rows with missing target
X_full = X_full.dropna(axis=0, subset=['SalePrice'])

# Separate target from predictors.
# NOTE: drop(..., inplace=True) returns None, so its result must never be
# assigned to a variable; the frame is rebound explicitly instead.
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# To keep things simple, we'll use only numerical predictors
X = X_full.select_dtypes(exclude=['object'])
X_test = X_full_test.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                  random_state=0)

### Handling missing data 

In [133]:
# Handling missing data 

# Number of lines 
print('total size', len(X_full))

# Number of missing values per column, over the full training table
S_missing_values = X_full.isnull().sum()

# Columns with missing values. Defined here so the print below does not rely
# on a name left over from another kernel session (NameError on a fresh run).
cols_missing_values = S_missing_values[S_missing_values > 0].index.tolist()
print('Columns with missing values : ', cols_missing_values)

# Columns of the training split that contain missing values; this list is what
# the "removing columns" strategy drops from X_train and X_val.
cols_to_drop = [col for col in X_train.columns if X_train[col].isnull().any()]

# Per-column counts (only columns with at least one missing entry) and grand total
print('Missing values by columns', S_missing_values[S_missing_values > 0], 'Total number of missing entries', S_missing_values.sum() )

total size 1460
Columns with missing values :  ['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
Missing values by columns LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64 Total number of missing entries 6965


### Strategy 1 : Removing columns with missing data 



In [126]:
# Strategy 1: discard every column listed in cols_to_drop from both splits

reduced_X_train = X_train.drop(columns=cols_to_drop)
reduced_X_val = X_val.drop(columns=cols_to_drop)

# Validation MAE of model 3 on the reduced feature set
print(get_mae(reduced_X_train, y_train, reduced_X_val, y_val, model_3))

17895.090633561642


### Strategy 2 : Imputing missing data 

In [139]:


from sklearn.impute import SimpleImputer

# Strategy 2: fill missing entries with the per-column median of the training split
my_imputer = SimpleImputer(strategy='median')

# The imputer output carries no column labels, so they are re-attached
# while the DataFrames are rebuilt.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train), columns=X_train.columns)
imputed_X_val = pd.DataFrame(my_imputer.transform(X_val), columns=X_val.columns)

# Validation MAE of model 3 on the imputed data
print(get_mae(imputed_X_train, y_train, imputed_X_val, y_val, model_3))

18093.391643835614


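The two strategies give almost the same results (MAE ≈ 17,895 when dropping columns vs ≈ 18,093 when imputing). We choose to apply the imputation method. Compared with the first experiment (MAE ≈ 23,529 for model 3), the error has decreased noticeably, i.e. the model has improved.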

## Submitting results

In [None]:
## Preparing submission

# Fit the model to the imputed training data
model_3.fit(imputed_X_train, y_train)

# Impute missing values in X_test with the medians already learned on the
# training split: `transform` (not `fit_transform`) keeps the test set
# preprocessed exactly like the data the model was trained on.
# The original index is passed through so the house Ids are not replaced by
# a 0..n-1 range when the DataFrame is rebuilt.
imputed_X_test = pd.DataFrame(my_imputer.transform(X_test),
                              columns=X_test.columns,
                              index=X_test.index)

# Generate test predictions
preds_test = model_3.predict(imputed_X_test)

# Save predictions in format used for competition scoring
output = pd.DataFrame({'Id': imputed_X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)

In [137]:
# Number of missing entries in each column of X_test
X_test.isna().sum()

MSSubClass         0
LotFrontage      227
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea        15
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       2
BsmtHalfBath       2
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       78
GarageCars         1
GarageArea         1
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64

In [58]:
# Sanity check: the submission is expected to contain exactly 1459 rows
assert len(output) == 1459