In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
import xgboost as xgb
import pickle
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_squared_error
import gc
%matplotlib inline

In [2]:
dataprocesseddir = '../data/processed'
datarawdir = '../data/raw'
data = pd.read_csv(os.path.join(dataprocesseddir, 'all.csv'))
train_raw = pd.read_csv(os.path.join(datarawdir, 'train.csv'))
test_raw = pd.read_csv(os.path.join(datarawdir, 'test.csv'))

In [3]:
def downcast_type(df):
    float_cols = [c for c in df if df[c].dtype == 'float64']
    int_cols = [c for c in df if df[c].dtype == 'int64']
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols] = df[int_cols].astype(np.int16)
    
    return df

In [4]:
data = downcast_type(data)
gc.collect()

7

In [5]:
data.isnull().sum().sort_values(ascending=False).head(7)

LotFrontage     1687
GarageYrBlt     1538
MasVnrArea        15
BsmtFullBath       2
BsmtHalfBath       2
GarageArea         1
BsmtFinSF1         1
dtype: int64

In [6]:
data['GarageYrBlt'].dtype

dtype('float32')

In [7]:
data = data.fillna(data.mean())

In [8]:
data.shape

(2919, 285)

data: (2919, 285)  
train: (1460, 65)  
test: (1459, 80)  

In [9]:
target = train_raw['SalePrice']
train = data[:1460]
test = data[1460:]

In [10]:
print(train.shape)
print(test.shape)
print(target.shape)

(1460, 285)
(1459, 285)
(1460,)


In [12]:
gbr = GradientBoostingRegressor(random_state=0)
param_grid = {
    'n_estimators': [500],
    'max_features': [10, 15],
    'max_depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.15], 
    'subsample': [0.8]
}
model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1, cv=10)
model.fit(train, target)
print('Gradient boosted tree regression...')
print('Best params:')
print(model.best_params_)
print('Best CV score:')
print(-model.best_score_)

Gradient boosted tree regression...
Best params:
{'learning_rate': 0.05, 'max_depth': 6, 'max_features': 10, 'n_estimators': 500, 'subsample': 0.8}
Best CV score:
-0.8927131440515963


In [13]:
# save model
file = open('../data/processed/rgb_trained_model', 'wb')
s = pickle.dump(model, file)
file.close()

In [11]:
# load model
file = open('../data/processed/rgb_trained_model','rb')
model = pickle.load(file)
file.close()

In [12]:
predictions = model.predict(test)

In [19]:
predictions

array([120713.43723803, 170889.78959987, 181733.92886423, ...,
       160022.8169257 , 128308.89101863, 210027.61954121])

In [21]:
prediction_id = test_raw['Id']
submission_df = {'Id': prediction_id,
                'SalePrice': predictions}
submission = pd.DataFrame(submission_df)
submission.to_csv('../data/processed/submission.csv', index=False)