In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#  Read Data

In [2]:
# Load the training and test tables from the local data/ directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Stack the training rows on top of the test rows so that the cleaning steps
# below are applied to both at once. Row labels are kept as-is (not reset).
df_all = pd.concat([df_train, df_test], axis=0)

In [4]:
# Model the natural log of the sale price rather than the raw value.
# NOTE(review): this cell is not idempotent -- running it twice takes the log twice.
df_all['SalePrice'] = np.log(df_all['SalePrice'])

#  Fill Missing Data

In [5]:
# Impute missing values in every column of df_all except the row identifier
# ('Id') and the target ('SalePrice'):
#   * object columns  -> most frequent value
#   * all other dtypes -> column mean
# NOTE(review): df_all holds both train and test rows, so the fill statistics
# are computed over both sets together.
for col in df_all.columns:
    if col in ('Id', 'SalePrice'):
        continue

    if df_all.dtypes[col] == 'object':
        # value_counts() sorts by descending frequency, so index[0] is the mode.
        fill_value = df_all[col].value_counts().index[0]
    else:
        fill_value = df_all[col].mean()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df_all[col]: the in-place call acts on a selected column object that may
    # be a copy, and under pandas Copy-on-Write it never updates df_all.
    df_all[col] = df_all[col].fillna(fill_value)

# Factorize Objects

In [6]:
# Add an integer-coded companion column '<name>_fact' for every object column.
# The list of object columns is computed once, before any '_fact' column is
# added, so the new columns are not themselves re-encoded.
for col in df_all.columns[df_all.dtypes == 'object']:
    fact_col = col + '_fact'
    df_all[fact_col] = pd.factorize(df_all[col])[0]
    # pd.factorize codes missing values as -1; turn those back into NaN.
    # Assign the result back rather than using replace(inplace=True) on the
    # selected column, which may act on a copy and leave df_all unchanged.
    df_all[fact_col] = df_all[fact_col].replace(-1, np.nan)

# Cross Validation

In [7]:
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [8]:
# Model inputs: every non-object column, minus the row identifier and the target.
non_object_mask = df_all.dtypes != 'object'
features = df_all.columns[non_object_mask].tolist()
for excluded in ('Id', 'SalePrice'):
    features.remove(excluded)

In [9]:
# Display how many columns were selected as model inputs.
len(features)

79

In [19]:
# 10-fold cross-validation of a random forest on the training rows.
# For each fold this records the fitted model plus the train and validation
# RMSE of the (log-transformed) SalePrice target, then prints the fold means.
train_scores = []
val_scores = []
models = []

# Materialise the feature matrix and target once instead of once per fold.
X_all = df_all[features].values
y_all = df_all['SalePrice'].values

# Fold indices range over len(df_train) and are applied positionally to
# df_all; this relies on the training rows being the first len(df_train) rows
# of df_all (train is concatenated before test).
# NOTE(review): KFold(n, n_folds=...) is the legacy sklearn.cross_validation
# signature, matching the import used by this notebook; that module was
# removed in scikit-learn 0.20.
for i, (train_ind, val_ind) in enumerate(KFold(len(df_train), n_folds=10, shuffle=True, random_state=36)):
    X_train, X_val = X_all[train_ind], X_all[val_ind]
    y_train, y_val = y_all[train_ind], y_all[val_ind]

    # Seed the forest as well as the splitter so the scores are reproducible
    # from one run to the next.
    rfr = RandomForestRegressor(n_estimators=500, min_samples_leaf=5, random_state=36).fit(X_train, y_train)
    models.append(rfr)

    # RMSE is the square root of the mean squared error.
    train_scores.append(mean_squared_error(y_train, rfr.predict(X_train))**0.5)
    val_scores.append(mean_squared_error(y_val, rfr.predict(X_val))**0.5)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('\t%d\t%.4f\t%.4f' % (i, train_scores[-1], val_scores[-1]))

print('%.4f\t%.4f' % (np.mean(train_scores), np.mean(val_scores)))

	0	0.0928	0.1634
	1	0.0920	0.1717
	2	0.0936	0.1383
	3	0.0950	0.1130
	4	0.0920	0.1479
	5	0.0929	0.1497
	6	0.0956	0.1053
	7	0.0922	0.1615
	8	0.0941	0.1274
	9	0.0916	0.1704
0.0932	0.1449


# Predict On Test 