In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#  Read Data

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

In [3]:
df_all = pd.concat((df_train, df_test))

In [4]:
df_all['SalePrice'] = df_all['SalePrice'].apply(np.log)

# Add NB Locations

In [5]:
df_nb_locations = pd.read_csv('data/nb_locations.csv')
df_nb_locations.drop('name', axis=1, inplace=True)
df_nb_locations.rename(columns={'short': 'Neighborhood'}, inplace=True)

In [6]:
df_all = df_all.merge(df_nb_locations, how='left', on='Neighborhood')

#  Fill Missing Data

In [7]:
for col in df_all.columns:
    if col in ['Id', 'SalePrice']:
        continue
    
    if df_all.dtypes[col] == 'object':
        df_all[col].fillna(df_all[col].value_counts().index[0], inplace=True)
    else:
        df_all[col].fillna(df_all[col].mean(), inplace=True)

# Factorize & Dummies Objects

In [12]:
for col in df_all.columns[df_all.dtypes == 'object']:
    df_all[col + '_fact'] = pd.factorize(df_all[col])[0]
    df_all[col + '_fact'].replace(-1, np.nan, inplace=True)
    
    if df_all[col].nunique() > 2:
        df_all = pd.concat((df_all, pd.get_dummies(df_all[col])), axis=1)

# Cross Validation

In [14]:
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [15]:
features = list(df_all.columns[df_all.dtypes != 'object'])
features.remove('Id')
features.remove('SalePrice')

In [16]:
len(features)

326

In [17]:
train_scores = []
val_scores = []
models = []

for i, (train_ind, val_ind) in enumerate(KFold(len(df_train), n_folds=10, 
                                               shuffle=True, random_state=36)):   
    X_train = df_all[features].values[train_ind]
    X_val = df_all[features].values[val_ind]
    
    y_train = df_all['SalePrice'].values[train_ind]
    y_val = df_all['SalePrice'].values[val_ind]
    
    rfr = RandomForestRegressor(n_estimators=500, min_samples_leaf=5).fit(X_train, y_train)
    models.append(rfr)
    train_scores.append(mean_squared_error(y_train, rfr.predict(X_train))**0.5)                                                  
    val_scores.append(mean_squared_error(y_val, rfr.predict(X_val))**0.5)
    
    print '\t%d\t%.4f\t%.4f' % (i, train_scores[-1], val_scores[-1])

print '%.4f\t%.4f' % (np.mean(train_scores), np.mean(val_scores))

	0	0.0898	0.1622
	1	0.0894	0.1695
	2	0.0908	0.1369
	3	0.0916	0.1104
	4	0.0901	0.1443
	5	0.0902	0.1501
	6	0.0923	0.1072
	7	0.0886	0.1550
	8	0.0910	0.1300
	9	0.0889	0.1634
0.0903	0.1429


Above we have two columns, left one is the RMSE (root mean squared error) for the 90% we trained on, whereas the right one is the RMSE for the other 10% we left out. Obviously, the 90% is closer to the sales price since the model learned from that input. 

# View Features Importance

In [20]:
for i in np.argsort(models[6].feature_importances_)[::-1]:
    print '%.4f' % models[6].feature_importances_[i], features[i]

 0.5762 OverallQual
0.1103 GrLivArea
0.0411 GarageCars
0.0399 TotalBsmtSF
0.0262 GarageArea
0.0233 1stFlrSF
0.0184 BsmtFinSF1
0.0154 YearBuilt
0.0130 Min1
0.0115 LotArea
0.0096 YearRemodAdd
0.0085 OverallCond
0.0085 Fireplaces
0.0059 nb_alt
0.0056 nb_lon
0.0046 LotFrontage
0.0045 VinylSd
0.0043

IndexError: list index out of range

# Predict On Test 

## Predict only on the best model

In [21]:
df_sub = pd.DataFrame({
        'Id': df_all[df_all['SalePrice'].isnull()]['Id'].values,
        'SalePrice': np.exp(models[6].predict(df_all[df_all['SalePrice'].isnull()][features].values))
    })




In [22]:
df_sub.to_csv('sub_nblocs_dummies_model6.csv', index=False)

## Predict all models and mean them

In [29]:
df_sub = pd.DataFrame({
        'Id': df_all[df_all['SalePrice'].isnull()]['Id'].values,
        'SalePrice': np.mean([np.exp(models[i].predict(df_all[df_all['SalePrice'].isnull()][features].values)) 
                              for i in xrange(len(models))], axis=0)
    })

In [30]:
df_sub.to_csv('sub_models_mean.csv', index=False)