# House Prices - Other Algorithms - Israel Chaparrro

## 1. Get Data

In [3]:
# import libraries
import pandas as pd
import seaborn as sns
from scipy import stats
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold

# get data
train = pd.read_csv('https://raw.githubusercontent.com/ichaparroc/house-prices/master/train.csv', index_col=0)
test = pd.read_csv('https://raw.githubusercontent.com/ichaparroc/house-prices/master/test.csv', index_col=0)

## 2. Prepare Data

In [5]:
# searching for missing data
total = train.isnull().sum().sort_values(ascending=False)
percent = (train.isnull().sum()/train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
PoolQC,1453,0.995205
MiscFeature,1406,0.963014
Alley,1369,0.937671
Fence,1179,0.807534
FireplaceQu,690,0.472603
LotFrontage,259,0.177397
GarageYrBlt,81,0.055479
GarageType,81,0.055479
GarageQual,81,0.055479
GarageCond,81,0.055479


We choose delete all columns on percent is bigger than 0.15, after, we delete redundant information (Garage, Bsmt, MasVnr) and delete the row of Electrical missing value).

In [6]:
# deleting missing columns and missing rows
train = train.drop((missing_data[missing_data['Total'] > 1]).index,1)
train = train.drop(train.loc[train['Electrical'].isnull()].index)
train.isnull().sum().max() # checking for no missing data missing

0

In [7]:
# deleting missing columns and missing rows in test and set to 0 to all NaN/NULL
test = test.drop((missing_data[missing_data['Total'] > 1]).index,1)
test.fillna(0,inplace=True)
test.isnull().sum().max() # checking for no missing data missing

0

In [8]:
# convert categorial variables to discrete values, number of new cols = number of categories
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [9]:
# match cols in train and test (because is possible that some category does not exist in the train/test and is not discretized), after set 0 this discrete categories
missing_cols = set(train.drop(columns="SalePrice").columns) - set(test.columns)
for cols in missing_cols:
    test[cols] = 0
test = test[train.drop(columns="SalePrice").columns]
test.fillna(0,inplace=True)

In [10]:
# for a transformation of target value before the prediction with a normalized value (y-min)/(max-min)
min_saleprice=train['SalePrice'].min()
max_saleprice=train['SalePrice'].max()

In [11]:
# normalization
for var in train:
    train[var]=(train[var]-train[var].min())/(train[var].max()-train[var].min())
for var in test:
    test[var]=(test[var]-test[var].min())/(test[var].max()-test[var].min())

## 4. Splitting Data

In [12]:
# move SalePrice to y, we need reshape for a good matrix operations
X = train.drop(columns="SalePrice").values
X = np.column_stack((np.ones(X.shape[0]),X)) #Adding space for bias
y = train["SalePrice"].values
y = np.reshape(y,(y.shape[0],1))

In [13]:
# split data to train dataset and test dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

# trying use sklearn to pre-processing the data
#from sklearn import preprocessing
#X_train = preprocessing.scale(X_train)
#X_test = preprocessing.scale(X_test)
#y_train = preprocessing.scale(y_train)
#y_test = preprocessing.scale(y_test)

## 5. Sklearn Linear Regression

In [14]:
value=0.0
for train, test in kf.split(X):

    x_train=X[train]
    x_test=X[test ]
    y_train=y[train]
    y_test=y[test]
    
    regression=LinearRegression()
    regression.fit(x_train,y_train)
    
    prediction = regr.predict(x_test)
    value+=np.sqrt(metrics.mean_squared_error(y_test,prediction))
print value/8

0.030811569602733773


## 6. Random Forest Regressor

In [15]:
value=0.0
for train, test in kf.split(X):

    x_train=X[train]
    x_test=X[test]
    y_train=y[train]
    y_test=y[test]
    
    regression=RandomForestRegressor(random_state=0,n_estimators=100)
    regression.fit(x_train,y_train)
    
    prediction = regr.predict(x_test)
    value+=np.sqrt(metrics.mean_squared_error(y_test,prediction))
print value/8

0.02975164651167615643


## 7. Gaussian Process Regressor

In [16]:
from sklearn.gaussian_process import GaussianProcessRegressor
value=0.0
for train, test in kf.split(X):

    x_train=X[train]
    x_test=X[test]
    y_train=y[train]
    y_test=y[test]
    
    regression=GaussianProcessRegressor(random_state=0)
    regression.fit(x_train,y_train)
    
    prediction = regr.predict(x_test)
    value+=np.sqrt(metrics.mean_squared_error(y_test,prediction))
print value/8

0.030811569602733773
