# Real estate price prediction project.
Using "Housing Prices Competition for Kaggle Learn Users" data.


In [9]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# handeling warnings
pd.options.mode.chained_assignment = None  
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load and preprocess data

In [10]:
X = pd.read_csv('train.csv')

# Remove rows with missing data
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# Drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [11]:
# Chose categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Chose columns for label encoding
good_label_cols = [col for col in object_cols if 
                   set(X_train[col]) == set(X_valid[col])]

numerical_columns = [col for col in X_train if 
                    X_train[col].dtype in ['int64', 'float64']]
        
# Drop columns which were not chosen for label encoding
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        


## Imputation of missing values 

In [12]:
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train[numerical_columns]))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid[numerical_columns]))
imputed_X_train.columns = numerical_columns
imputed_X_valid.columns = numerical_columns

## Label encoding categorical variables

In [13]:
# Drop columns which were not chosen for label encoding
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

my_encoder = LabelEncoder()
for col in set(good_label_cols):
    label_X_train[col] = my_encoder.fit_transform(X_train[col])
    label_X_valid[col] = my_encoder.transform(X_valid[col])


## Prediction models:

###  Basic random forest reggresor

In [14]:
def check_randomforest(X_train, X_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=1)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [15]:
# Label encoded
print(check_randomforest(label_X_train,label_X_valid)) 

17506.210273972603


In [7]:
# Imputated
print(check_randomforest(imputed_X_train,imputed_X_valid))

17954.36284246575


### XGBoost

#### Find the most accurate XGBRegressor model

In [15]:
def check_xdgb(X_train_data, X_valid_data):
    all_scores = []
    for early_stopping_rounds in [3, 5, 7]:
        for max_depth in [7, 9, 11]:
            for n_estimators in [200, 500, 1000]:
                for learning_rate in [0.02, 0.05, 0.1]:
                    for colsample_bytree in [0.5, 0.7, 0.9]:
                        for reg_alpha in [0.1, 1, 10, 100]:
                                XGBmodel = XGBRegressor(early_stopping_rounds=early_stopping_rounds,
                                                    max_depth=max_depth,
                                                   learning_rate=learning_rate,
                                                   colsample_bytree=colsample_bytree,
                                                   reg_alpha=reg_alpha,
                                                   reg_lambda=1,
                                                   objective ='reg:squarederror',
                                                   random_state=1,

                                                   )
                                XGBmodel.fit(X_train_data, y_train)
                                prediction = XGBmodel.predict(X_valid_data)
                                mae = mean_absolute_error(prediction, y_valid)
                                all_scores.append([early_stopping_rounds, max_depth, n_estimators,
                                                   learning_rate,colsample_bytree, reg_alpha, 
                                                mae])
                                print(all_scores[-1])
    return(all_scores)

#### Best models used with label encoded data

In [None]:
check_xdgb(label_X_train, label_X_valid)
max_scores_label = sorted(all_scores, key = lambda x: x[-1], reverse = False)[:5]

[3, 7, 200, 0.02, 0.5, 0.1, 29472.200382598458]
[3, 7, 200, 0.02, 0.5, 1, 29472.21899079623]
[3, 7, 200, 0.02, 0.5, 10, 29471.120612157534]
[3, 7, 200, 0.02, 0.5, 100, 29454.29890839041]
[3, 7, 200, 0.02, 0.7, 0.1, 28922.174550513697]
[3, 7, 200, 0.02, 0.7, 1, 28922.19885755565]
[3, 7, 200, 0.02, 0.7, 10, 28907.299015410958]
[3, 7, 200, 0.02, 0.7, 100, 28934.396685038526]
[3, 7, 200, 0.02, 0.9, 0.1, 28501.49973244863]
[3, 7, 200, 0.02, 0.9, 1, 28491.63025738442]
[3, 7, 200, 0.02, 0.9, 10, 28464.312928082192]
[3, 7, 200, 0.02, 0.9, 100, 28663.677386558218]
[3, 7, 200, 0.05, 0.5, 0.1, 16822.443667059077]
[3, 7, 200, 0.05, 0.5, 1, 16822.441513270547]
[3, 7, 200, 0.05, 0.5, 10, 16785.50667540668]
[3, 7, 200, 0.05, 0.5, 100, 16970.171393407534]
[3, 7, 200, 0.05, 0.7, 0.1, 16282.436630458047]
[3, 7, 200, 0.05, 0.7, 1, 16282.439680543665]
[3, 7, 200, 0.05, 0.7, 10, 16593.645374036816]
[3, 7, 200, 0.05, 0.7, 100, 16802.386250535103]
[3, 7, 200, 0.05, 0.9, 0.1, 17072.76181239298]
[3, 7, 200, 0.

[3, 9, 500, 0.1, 0.9, 0.1, 16939.568158711474]
[3, 9, 500, 0.1, 0.9, 1, 16926.68622913099]
[3, 9, 500, 0.1, 0.9, 10, 17216.106191138697]
[3, 9, 500, 0.1, 0.9, 100, 17290.656209867295]
[3, 9, 1000, 0.02, 0.5, 0.1, 29650.663875214042]
[3, 9, 1000, 0.02, 0.5, 1, 29657.839362157534]
[3, 9, 1000, 0.02, 0.5, 10, 29663.08790400257]
[3, 9, 1000, 0.02, 0.5, 100, 29699.444764019692]
[3, 9, 1000, 0.02, 0.7, 0.1, 29127.094378745718]
[3, 9, 1000, 0.02, 0.7, 1, 29127.12711365582]
[3, 9, 1000, 0.02, 0.7, 10, 29131.289289918663]
[3, 9, 1000, 0.02, 0.7, 100, 29211.123969927226]
[3, 9, 1000, 0.02, 0.9, 0.1, 28845.227565817637]
[3, 9, 1000, 0.02, 0.9, 1, 28845.256902825342]
[3, 9, 1000, 0.02, 0.9, 10, 28803.212101348458]
[3, 9, 1000, 0.02, 0.9, 100, 28894.656383775684]
[3, 9, 1000, 0.05, 0.5, 0.1, 17245.43277771832]
[3, 9, 1000, 0.05, 0.5, 1, 17245.430998501713]
[3, 9, 1000, 0.05, 0.5, 10, 17215.90204944349]
[3, 9, 1000, 0.05, 0.5, 100, 17316.382264019692]
[3, 9, 1000, 0.05, 0.7, 0.1, 16668.932416523974]

[5, 7, 200, 0.1, 0.5, 1, 17024.53533015839]
[5, 7, 200, 0.1, 0.5, 10, 17018.73330479452]
[5, 7, 200, 0.1, 0.5, 100, 17205.877742401542]
[5, 7, 200, 0.1, 0.7, 0.1, 15845.20222870291]
[5, 7, 200, 0.1, 0.7, 1, 15845.980375107021]
[5, 7, 200, 0.1, 0.7, 10, 15809.89070526541]
[5, 7, 200, 0.1, 0.7, 100, 15911.21022848887]
[5, 7, 200, 0.1, 0.9, 0.1, 16663.810399721748]
[5, 7, 200, 0.1, 0.9, 1, 16616.18004869435]
[5, 7, 200, 0.1, 0.9, 10, 16713.045349957192]
[5, 7, 200, 0.1, 0.9, 100, 16852.18751337757]
[5, 7, 500, 0.02, 0.5, 0.1, 29472.200382598458]
[5, 7, 500, 0.02, 0.5, 1, 29472.21899079623]
[5, 7, 500, 0.02, 0.5, 10, 29471.120612157534]
[5, 7, 500, 0.02, 0.5, 100, 29454.29890839041]
[5, 7, 500, 0.02, 0.7, 0.1, 28922.174550513697]
[5, 7, 500, 0.02, 0.7, 1, 28922.19885755565]
[5, 7, 500, 0.02, 0.7, 10, 28907.299015410958]
[5, 7, 500, 0.02, 0.7, 100, 28934.396685038526]
[5, 7, 500, 0.02, 0.9, 0.1, 28501.49973244863]
[5, 7, 500, 0.02, 0.9, 1, 28491.63025738442]
[5, 7, 500, 0.02, 0.9, 10, 28464

[5, 9, 1000, 0.05, 0.9, 0.1, 17568.12598994007]
[5, 9, 1000, 0.05, 0.9, 1, 17609.716248394692]
[5, 9, 1000, 0.05, 0.9, 10, 17605.892685145547]
[5, 9, 1000, 0.05, 0.9, 100, 17546.030741652397]
[5, 9, 1000, 0.1, 0.5, 0.1, 17059.305864726026]
[5, 9, 1000, 0.1, 0.5, 1, 17053.117843000855]
[5, 9, 1000, 0.1, 0.5, 10, 17040.826426048803]
[5, 9, 1000, 0.1, 0.5, 100, 17224.27185894692]
[5, 9, 1000, 0.1, 0.7, 0.1, 15953.90073844178]
[5, 9, 1000, 0.1, 0.7, 1, 15953.943038313357]
[5, 9, 1000, 0.1, 0.7, 10, 15946.830104880137]
[5, 9, 1000, 0.1, 0.7, 100, 16176.89711312072]
[5, 9, 1000, 0.1, 0.9, 0.1, 16939.568158711474]
[5, 9, 1000, 0.1, 0.9, 1, 16926.68622913099]
[5, 9, 1000, 0.1, 0.9, 10, 17216.106191138697]
[5, 9, 1000, 0.1, 0.9, 100, 17290.656209867295]
[5, 11, 200, 0.02, 0.5, 0.1, 29728.04731645976]
[5, 11, 200, 0.02, 0.5, 1, 29728.079609910103]
[5, 11, 200, 0.02, 0.5, 10, 29779.350398651542]
[5, 11, 200, 0.02, 0.5, 100, 29751.50373234161]
[5, 11, 200, 0.02, 0.7, 0.1, 29293.952656785103]
[5, 1

#### Best models used with imputed data

In [17]:
check_xdgb(imputed_X_train, imputed_X_valid)
max_scores_label = sorted(all_scores, key = lambda x: x[-1], reverse = False)[:5]

NameError: name 'check_xdgb' is not defined

#### Chose the best model 

In [40]:
best_model = XGBRegressor(early_stopping_rounds=3,
                                              n_estimators=200,
                                                max_depth=7,
                                               learning_rate=0.1,
                                               colsample_bytree=0.7,
                                               reg_alpha=10,
                                               reg_lambda=1,
                                               objective ='reg:squarederror',
                                               random_state=1,)

In [41]:
best_model.fit(label_X_train, y_train)
prediction = best_model.predict(label_X_valid)
mae = mean_absolute_error(prediction, y_valid)

In [42]:
print(mae)

15854.045390089897
