In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn import cross_validation, metrics
from sklearn.cross_validation import KFold
from xgboost.sklearn import XGBClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
import timeit
import datetime
import random
import copy
import sys

In [None]:
# Load the training and test tables from CSV files next to the notebook directory.
train_df = pd.read_csv('../housing-prices/train.csv')
test_df = pd.read_csv('../housing-prices/test.csv')

In [None]:
# (rows, columns) of each table.
print(train_df.shape)
print(test_df.shape)

In [None]:
# Summary statistics of the training table, shown as the cell output.
train_df.describe()

In [None]:
# Column labels of the training table.
train_df.columns

In [None]:
# Distinct dtypes that occur among the training columns.
train_df.dtypes.unique()

In [None]:
# Keep the target separately: 'SalePrice' is dropped from train_df in a later cell.
prices = train_df['SalePrice']

In [None]:
# Row counts; n_train is used later to split the combined frame back into its two parts.
n_train = train_df.shape[0]
n_test = test_df.shape[0]

In [None]:
# Natural log of the sale prices; the models below are trained with this as the label.
log_prices = np.log(prices)

In [None]:
hist = sns.distplot(np.log(prices))

In [None]:
correlations = train_df.select_dtypes(include=['float64','int64']).corr()
print correlations

In [None]:
plt.figure(figsize=(22,22))
sns.heatmap(correlations, annot=True)

In [None]:
train_df.drop(['Id', 'SalePrice'], axis=1, inplace=True)
test_df.drop(['Id'], axis=1, inplace=True)

In [None]:
comb_ds = pd.concat((train_df, test_df)).reset_index(drop=True)
comb_ds = pd.get_dummies(comb_ds)
comb_ds = comb_ds.fillna(comb_ds.mean())


'''
object_cols = train_df.select_dtypes(include=['O']).columns

comb_ds = pd.concat((train_df, test_df)).reset_index(drop=True)
try:
    for cat_col in object_cols:
        comb_ds[cat_col] = pd.factorize(comb_ds[cat_col])[0]
except ValueError,e:
    print ValueError, e, cat_col
'''

In [None]:
# Drop two columns from the combined frame before it is split again.
# NOTE(review): presumably removed as redundant with other correlated columns - confirm.
comb_ds.drop(['1stFlrSF','GarageCars'], axis=1, inplace=True)


In [None]:
# train_df was concatenated first, so the first n_train rows of comb_ds are the training rows.
train_df = comb_ds[:n_train]
test_df = comb_ds[n_train:]
# Per-column flag: does the training part still contain any missing value?
train_df.isnull().any()

In [None]:
# (rows, columns) of the combined, encoded frame.
comb_ds.shape

In [None]:
def log_it(params, num_kf, kfolds, n_rounds, score, note=' no note'):
    """Append one validation-run record to ``kf_output.txt``.

    The record starts with a ``**NEW**`` header line followed by a blank line,
    then one line each for ``num_kf``, ``kfolds``, ``n_rounds``, the parameter
    dict, the score and the note.

    Args:
        params: Parameter set that was evaluated; written via ``str()``.
        num_kf: Number of k-fold repetitions.
        kfolds: Number of folds per repetition.
        n_rounds: Number of boosting rounds.
        score: Validation score obtained with ``params``.
        note: Free-form remark; converted with ``str()`` so non-string
            values do not raise.
    """
    record = [
        '**NEW**',  # header; the original wrote a literal '/n' here by mistake
        '',
        'num_kf: ' + str(num_kf),
        'kfolds: ' + str(kfolds),
        'n_rounds: ' + str(n_rounds),
        str(params),
        'SCORE: ' + str(score),
        'note: ' + str(note),
    ]
    # The context manager guarantees the handle is flushed and closed on every call.
    with open('kf_output.txt', 'a') as log_file:
        log_file.write('\n'.join(record) + '\n')

## Grid Search

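I've implemented my own grid search below to find the best parameters: every combination of the candidate values is cross-validated with `xgb.cv`, and the combination with the lowest score is reported.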

In [None]:
def test_it(params, max_rounds, n_fold, early_stopping_rounds = 25):
    """Cross-validate one xgboost parameter set and return its final CV score.

    Runs ``xgb.cv`` on the notebook-level ``train_df`` / ``log_prices`` with a
    fixed seed, prints the score, appends a record via ``log_it`` and returns
    the score.

    Args:
        params: xgboost parameter dict passed to ``xgb.cv``.
        max_rounds: Upper bound on the number of boosting rounds.
        n_fold: Number of cross-validation folds.
        early_stopping_rounds: Passed through to ``xgb.cv``.

    Returns:
        The held-out mean metric from the last row of the cv result.
    """
    dtrain = xgb.DMatrix(train_df, label=log_prices)

    res = xgb.cv(params, dtrain, num_boost_round=max_rounds, nfold=n_fold, seed=21, stratified=False,
                 early_stopping_rounds=early_stopping_rounds, verbose_eval=500, show_stdv=True)

    # Select the held-out metric by name rather than by position, so the score
    # never silently becomes the training metric if the column order differs.
    test_mean_cols = [col for col in res.columns
                      if str(col).startswith('test-') and str(col).endswith('-mean')]
    # Fall back to the first column (the previous behaviour) if no such column exists.
    score_col = test_mean_cols[-1] if test_mean_cols else res.columns[0]
    score = res[score_col].iloc[-1]

    # Single-argument print keeps the output identical on Python 2 and 3.
    print('Num Rounds for score:  {}  =  {}'.format(score, len(res) - 1))
    # Log the fold count that was actually used (it was hard-coded to 6 before).
    log_it(params, 0, n_fold, max_rounds, score)

    return score

def grid_search(params_dict, params_orig, n_folds, max_rounds):
    """Exhaustively score every combination of the candidate values.

    Each key of ``params_dict`` maps to a list of candidate values. Every
    combination is merged over a copy of ``params_orig`` and scored with
    ``test_it``; the lowest score and its parameter set are printed and
    returned. ``params_orig`` itself is not modified.

    Args:
        params_dict: Mapping of parameter name -> non-empty list of candidates.
        params_orig: Base parameter dict supplying all non-searched settings.
        n_folds: Number of cross-validation folds passed to ``test_it``.
        max_rounds: Maximum number of boosting rounds passed to ``test_it``.

    Returns:
        Tuple ``(min_score, min_params)``. ``min_params`` is ``''`` if no
        candidate scored below infinity.

    Raises:
        ValueError: If a parameter has an empty candidate list.
    """
    import itertools  # local import: not part of the notebook's import cell

    for name in params_dict:
        if len(params_dict[name]) == 0:
            raise ValueError('no candidate values given for parameter: ' + str(name))

    # Private copy so the caller's dict is left untouched.
    base_params = copy.deepcopy(params_orig)

    # itertools.product varies its last argument fastest; reversing the key order
    # keeps the evaluation order of the previous hand-rolled expansion
    # (first key fastest), so ties resolve to the same candidate.
    names = list(params_dict)[::-1]
    candidates = []
    for combo in itertools.product(*[params_dict[name] for name in names]):
        candidate = copy.deepcopy(base_params)
        candidate.update(zip(names, combo))
        candidates.append(candidate)

    print('Length of parameter dictionaries to test: {}'.format(len(candidates)))

    # float('inf') works on Python 2 and 3, unlike sys.maxint.
    min_score = float('inf')
    min_params = ''
    for candidate in candidates:
        score = test_it(candidate, max_rounds, n_folds)
        if score < min_score:
            print('new low score: {}'.format(score))
            min_score = score
            min_params = copy.deepcopy(candidate)
    print('{} {}'.format(min_score, str(min_params)))
    return min_score, min_params

In [None]:
# Base booster settings; any key that also appears in params_dict is overridden during the search.
params_orig = {'max_depth':7,
               'min_child_weight': 5.3,
               'eta':.025,
               'silent':1,
               'objective':'reg:linear',
               'eval_metric': 'rmse',
               'subsample': .9,
               'colsample_bytree': .6,
               'gamma': 0,
               'reg_alpha': 0
         }



# GRID SEARCH - Change the params_dict to test different parameters

# Each key lists the candidate values to try; every key appears exactly once
# (the literal previously repeated 'colsample_bytree').
params_dict = {
    'colsample_bytree': [.6,.8,1],
    'max_depth': [5,7,9],
    'subsample': [.8,.9,1],
    # Single value: eta is pinned for the whole search and replaces the .025 above.
    'eta': [.13]
}

n_folds = 10
max_rounds = 1500

grid_search(params_dict, params_orig, n_folds, max_rounds)

In [None]:
def kf_train(num_kf, t_ds, targets, n_folds, num_rounds, params):
    """Train one booster per fold over ``num_kf`` shuffled k-fold splits.

    Used to blend booster predictions: each of the ``num_kf`` repetitions gets
    its own randomly seeded shuffled ``KFold`` split, and one booster is
    trained on the training part of every fold. Each booster is scored on its
    held-out fold with mean absolute error; the per-fold scores and their
    overall mean are printed.

    Args:
        num_kf: Number of k-fold repetitions.
        t_ds: Feature DataFrame; rows are selected with ``.iloc``.
        targets: Labels aligned with the rows of ``t_ds``.
        n_folds: Number of folds per repetition.
        num_rounds: Boosting rounds per booster.
        params: xgboost parameter dict passed to ``xgb.train``.

    Returns:
        List of all trained boosters (``num_kf * n_folds`` entries).
    """
    targets = np.array(targets)

    # One independently seeded split per repetition. The seeds come from the
    # global `random` state, so runs differ unless that state is seeded by the caller.
    splitters = [
        KFold(len(t_ds), n_folds=n_folds, shuffle=True,
              random_state=int(random.random() * 100000))
        for _ in range(num_kf)
    ]

    bst_arr = []
    fold_scores = []
    for splitter in splitters:
        for i, (train_idx, test_idx) in enumerate(splitter):
            dtrain = xgb.DMatrix(t_ds.iloc[train_idx], label=targets[train_idx])
            booster = xgb.train(params, dtrain, num_rounds)
            dtest = xgb.DMatrix(t_ds.iloc[test_idx])
            ypred = booster.predict(dtest)
            score = mean_absolute_error(targets[test_idx], ypred)
            print("Predictions received score: {}".format(score))
            fold_scores.append(score)
            bst_arr.append(booster)
            print('finished ' + str(i+1) + 'th iteration')

    # Mean held-out score over all folds; skipped when nothing was trained
    # (previously a ZeroDivisionError for num_kf == 0).
    if fold_scores:
        print(sum(fold_scores) / len(fold_scores))
    return bst_arr

def test_bsts(saved_bsts, targets):
    """Return the row-wise mean of the predictions of all ``saved_bsts``.

    ``targets`` is the table handed to ``xgb.DMatrix`` and predicted on; each
    booster fills one column of a (rows x boosters) array, which is then
    averaged across boosters.
    """
    features = xgb.DMatrix(targets)
    per_model = np.empty((len(targets), len(saved_bsts)))
    for col, model in enumerate(saved_bsts):
        per_model[:, col] = model.predict(features)
    return per_model.mean(axis=1)

In [None]:
# Booster settings used for the final models.
params_orig = {
    'max_depth': 7,
    'min_child_weight': 5.3,
    'eta': .025,
    'silent': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'subsample': .9,
    'colsample_bytree': .6,
    'gamma': 0,
    'reg_alpha': 0,
}

# One 10-fold repetition with 691 rounds per booster.
# NOTE(review): 691 is presumably the round count found by cross-validation - confirm.
num_kf, kfolds, n_rounds = 1, 10, 691
bst_arr3 = kf_train(num_kf, train_df, log_prices,
                    n_folds=kfolds, num_rounds=n_rounds, params=params_orig)

In [None]:

dtrain = xgb.DMatrix(train_df, label = log_prices)
bst = xgb.train(params_orig, dtrain, 691)

In [None]:
pd.DataFrame(bst.get_fscore().items(), columns=['feature','importance']).sort_values('importance', ascending=False)

In [None]:
#dtest = xgb.DMatrix(test_df)
#ypred = bst.predict(dtest)

In [None]:
fin_arrs = bst_arr3
ypred = test_bsts(fin_arrs, test_df)

In [None]:
# Use the sample submission as a template so ids and column names match the expected format.
sub_file = pd.read_csv('../housing-prices/sample_submission.csv')
# Single-argument print: same output on Python 2 and 3.
print('{} {}'.format(ypred.shape, sub_file.shape))
# Fail with a clear message if the prediction count does not match the template.
assert len(ypred) == len(sub_file), 'number of predictions does not match submission rows'
# The models were trained on log prices, so exponentiate to get back to prices.
sub_file.iloc[:,1] = np.exp(ypred)
# Timestamp without ':' so the file name is also valid on Windows.
now = datetime.datetime.now().strftime("%d-%m-%y--%H-%M")
sub_file.to_csv('../housing-prices/submissions/my_submission' + now + '.csv', index=False)