## Gradient boosting on house prices

In [1]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
#get data out
house_train = pd.read_csv('../datasets/house_prices/train.csv')
house_test  = pd.read_csv('../datasets/house_prices/test.csv')
print(house_train.shape, house_test.shape)

(1460, 81) (1459, 80)


In [3]:
#simple preprocessing
dummies_train = pd.get_dummies(house_train)
y_train = np.log1p(dummies_train['SalePrice'])
dummies_test = pd.get_dummies(house_test)
dummies_train.dropna(inplace = True, axis=1)
dummies_test.dropna(inplace = True, axis=1)
dummies_train = dummies_train[dummies_test.columns]
dummies_train.shape, dummies_test.shape


((1460, 260), (1459, 260))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(dummies_train, 
                                                    y_train,
                                                    test_size=0.3, random_state=42)

In [5]:
model = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
        max_depth=1, random_state=0, loss='ls')
model.fit(X_train, y_train)
pred = np.expm1(model.predict(X_test))
model.score(X_test, y_test), mean_squared_error(y_test, pred)

(0.88377455794633375, 35874579296.99369)

### Trying a submission

In [6]:
def submit(df, title='Solution.csv'):
    df.to_csv(title, index=False)

pred = np.expm1(model.predict(dummies_test))
to_submit = pd.DataFrame({"Id":house_test.Id, "SalePrice":pred})
submit(to_submit)