# Kaggle competition : Housing Price prediction 

<b>Objective </b>: Predict house price 
    
<b>Problem type</b> : Regression 

<b>Data </b>: described in Data/data_description.txt 
    
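<b>Modeling</b>: random forest regressors with several hyperparameter settings, compared by mean absolute error (MAE) on a 20% hold-out validation split; the selected model is then refit on the full training set to produce the submission.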
    
    

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline 

In [2]:
from sklearn.model_selection import train_test_split

# Load the competition files, using the 'Id' column as the row index
X_full = pd.read_csv('Data/train.csv', sep=',', index_col='Id')
X_full_test = pd.read_csv('Data/test.csv', sep=',', index_col='Id')

# Separate the target from the predictors; only this subset of columns is used
y = X_full['SalePrice']
features = [
    'LotArea',
    'YearBuilt',
    '1stFlrSF',
    '2ndFlrSF',
    'FullBath',
    'BedroomAbvGr',
    'TotRmsAbvGrd',
]
X = X_full[features].copy()
X_test = X_full_test[features].copy()

# Hold out 20% of the training rows for validation (fixed seed for reproducibility)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [3]:
# Testing different models 

from sklearn.ensemble import RandomForestRegressor

# Define the candidate models: random forests that differ in number of trees,
# split criterion, minimum split size and maximum depth. The shared
# random_state keeps the comparison reproducible.
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
# 'absolute_error' is the current name of the former 'mae' criterion, which was
# deprecated in scikit-learn 1.0 and removed in 1.2 (requires scikit-learn >= 1.0).
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

In [4]:
from sklearn.metrics import mean_absolute_error

# Getting mae performance for testing different models
def get_mae(X_train, y_train, X_val, y_val, model):
    '''Fit ``model`` on the training split and return its MAE on the validation split.

    The model is fitted in place, so after the call it remains trained on
    ``X_train`` / ``y_train``.
    ----------
    Arguments :
        X_train, X_val (dataframes) : Feature tables for the train and validation sets
        y_train, y_val (Series)     : Labels for the train and validation sets
        model                       : Estimator exposing ``fit(X, y)`` and ``predict(X)``
    -----------
    Returns :
        error (float) : Mean absolute error of the model's predictions on X_val

    '''
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    # mean_absolute_error expects (y_true, y_pred); MAE is symmetric, so the value
    # is the same either way, but the documented order avoids confusion.
    error = mean_absolute_error(y_val, predictions)
    return error


In [5]:
# Print the validation MAE of each candidate, numbered from 1 to match model_1..model_5
for model_number, model in enumerate(models, start=1):
    score = get_mae(X_train, y_train, X_val, y_val, model)
    print('score MAE model_{}'.format(model_number), score)

score MAE model_1 24015.492818003917
score MAE model_2 23740.979228636657
score MAE model_3 23528.78421232877
score MAE model_4 23996.676789668687
score MAE model_5 23706.672864217904


In [7]:
## Preparing submission

# Refit model_3 on all labelled rows (train + validation) before predicting
model_3.fit(X, y)

# Predict sale prices for the competition test set
preds_test = model_3.predict(X_test)

# Write the predictions as two columns, Id and SalePrice, without the
# DataFrame's own index, which is the format used for competition scoring
output = pd.DataFrame(
    {
        'Id': X_test.index,
        'SalePrice': preds_test,
    }
)
output.to_csv('submission.csv', index=False)

In [18]:
# Sanity check: the submission must contain exactly 1459 rows
assert output.shape[0] == 1459
