In [2]:
# load data
import pandas as pd
import numpy as np
from pandas import DataFrame
# Directory that holds the cleaned CSV files (hard-coded local path).
base_path='D:/kaggle/regression/'
# DataFrame.from_csv is deprecated (removed in pandas 1.0); pd.read_csv with an
# explicit index column reads the file with 'Id' as the row index.
all_data=pd.read_csv(base_path+'cleaned_train.csv',index_col='Id')
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 263 entries, LotFrontage to SaleCondition__Partial
dtypes: float64(231), int64(32)
memory usage: 2.9 MB


In [3]:
from sklearn.model_selection import train_test_split

# Keep the row Ids, then separate the target column from the feature columns.
all_id = all_data.index
all_y = all_data.SalePrice
all_x = all_data.drop('SalePrice', axis=1)
all_x.info()

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    all_x,
    all_y,
    test_size=0.2,
    random_state=42,
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 262 entries, LotFrontage to SaleCondition__Partial
dtypes: float64(231), int64(31)
memory usage: 2.9 MB


In [4]:
from sklearn.metrics import mean_squared_error

def rmse_log_error(predict,reality):
    """Return the root-mean-squared error between log(predict) and log(reality).

    Both inputs go through ``np.log``; the squared differences are averaged by
    ``mean_squared_error`` and the square root of that mean is returned.
    """
    logged = [np.log(values) for values in (predict, reality)]
    # mean_squared_error is called with the log of `reality` first, then `predict`.
    return np.sqrt(mean_squared_error(logged[1], logged[0]))

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor

def _fit_and_score(model, label, fit_message, train_x, train_y, test_x, test_y):
    """Fit ``model`` on the training fold and print its log-RMSE on the test fold."""
    print(fit_message)
    model.fit(train_x, train_y)
    print('Predicting on test set')
    predicted = model.predict(test_x)
    # rmse_log_error is declared as (predict, reality); pass them in that order.
    print('{label} RMSE {score}'.format(label=label, score=rmse_log_error(predicted, test_y)))


# A function to train several models for the train/test pair for one fold
# How to tune parameters is out of scope of this script, check out sklearn's GridSearchCV or RandomizedSearchCV
def one_fold(train_x,train_y,test_x,test_y):
    """Train six tree-ensemble regressors on one train/test fold.

    Each model is fitted on (train_x, train_y), used to predict test_x, and its
    log-RMSE against test_y is printed.

    Args:
        train_x: training features; must expose ``.shape`` (the notebook passes
            pandas DataFrames).
        train_y: training target values.
        test_x: held-out features; must expose ``.shape``.
        test_y: held-out target values.

    Returns:
        dict mapping 'rf0', 'rf1', 'et', 'ad0', 'ad1', 'ad2' to the fitted
        estimators.
    """
    # Report the shape of the fold that was actually passed in, rather than
    # whatever X_train / X_test happen to be bound to in the notebook.
    print('Training set has shape:  {0}'.format(train_x.shape))
    print('Test set has shape:  {0}'.format(test_x.shape))

    # One row per model: (result key, announcement, fit message, score label, estimator)
    specs = [
        # Random forest with mse, auto-tuned
        ('rf0', 'Random forest with mse', 'Fitting random forest with mse', 'RF Auto',
         RandomForestRegressor(criterion='mse', n_estimators=126,max_features=0.5,max_depth=10,random_state=42)),
        # Random forest with mse, hand-tuned
        ('rf1', 'Random forest with mse', 'Fitting random forest with mse', 'RF Manual',
         RandomForestRegressor(criterion='mse', n_estimators=250,max_features=0.21,max_depth=31,random_state=42)),
        # Extra tree regressor
        ('et', 'ExtraTreesRegressor', 'Fitting extra trees regressor with mse', 'ET',
         ExtraTreesRegressor(criterion='mse', n_estimators=180,max_features=0.2,max_depth=28,n_jobs=-1,random_state=42)),
        # Adaboost linear
        ('ad0', 'Adaboost linear', 'Fitting adaboost linear', 'Adaboost Linear',
         AdaBoostRegressor(loss='linear',learning_rate=0.2,n_estimators=210,random_state=42)),
        # Adaboost square (fit message previously said "linear" by copy-paste)
        ('ad1', 'Adaboost square', 'Fitting adaboost square', 'Adaboost Square',
         AdaBoostRegressor(loss='square',learning_rate=0.526,n_estimators=410,random_state=42)),
        # Adaboost exp (fit message previously said "linear" by copy-paste)
        ('ad2', 'Adaboost exponential', 'Fitting adaboost exponential', 'Adaboost Exponential',
         AdaBoostRegressor(loss='exponential',learning_rate=0.342,n_estimators=158,random_state=42)),
    ]

    regressors = {}
    for key, announcement, fit_message, label, model in specs:
        print(announcement)
        _fit_and_score(model, label, fit_message, train_x, train_y, test_x, test_y)
        regressors[key] = model

    return regressors

In [8]:
from sklearn.model_selection import train_test_split

# Maps each key returned by one_fold to the name prefix used in `regressors`.
batch_key_to_prefix={
    'rf0':'rf_auto_',
    'rf1':'rf_manual_',
    'ad0':'ad_linear_',
    'ad1':'ad_sqr_',
    'ad2':'ad_exp_',
    'et':'et_',
}

regressors={}
for i in range(10):
    # Single-argument print(...) produces the same text on Python 2 and Python 3.
    print('Iteration  {0}'.format(i))
    # A different seed per round gives each batch of models its own 80/20 split.
    # These split variables are notebook globals: after the loop they hold the
    # split of the last round.
    X_train, X_test, y_train, y_test=train_test_split(all_x, all_y, test_size=0.2, random_state=i)
    batch=one_fold(X_train, y_train, X_test, y_test)
    # Store every fitted model under "<prefix><round>", e.g. 'rf_auto_0'.
    for batch_key, prefix in batch_key_to_prefix.items():
        regressors[prefix+str(i)]=batch[batch_key]

print('{0}  regressors trained'.format(len(regressors)))

Iteration  0
Training set has shape:  (1168, 262)
Test set has shape:  (292, 262)
Random forest with mse
Fitting random forest with mse
Predicting on test set
RF Auto RMSE 0.135227049228
Random forest with mse
Fitting random forest with mse
Predicting on test set
RF Manual RMSE 0.13444192827
ExtraTreesRegressor
Fitting extra trees regressor with mse
Predicting on test set
ET RMSE 0.137454863367
Adaboost linear
Fitting adaboost linear
Predicting on test set
Adaboost Linear RMSE 0.189990058727
Adaboost square
Fitting adaboost linear
Predicting on test set
Adaboost Square RMSE 0.21154859265
Adaboost exponential
Fitting adaboost linear
Predicting on test set
Adaboost Exponential RMSE 0.188599456703
Iteration  1
Training set has shape:  (1168, 262)
Test set has shape:  (292, 262)
Random forest with mse
Fitting random forest with mse
Predicting on test set
RF Auto RMSE 0.142306752728
Random forest with mse
Fitting random forest with mse
Predicting on test set
RF Manual RMSE 0.14237668758
Ext

In [9]:
# The predictions then form the training samples for an L2 model as an ensemble.
# NOTE(review): X_train / X_test below are the notebook globals left behind by the
# most recently executed split, while each round's models were fitted on that
# round's own split. Models from other rounds may therefore already have seen
# rows of X_test during training -- confirm before trusting the L2 test score.
l2_train=pd.DataFrame()
l2_test=pd.DataFrame()
for i in range(10):
    clf_names=[prefix+str(i) for prefix in ('rf_auto_','rf_manual_','ad_linear_','ad_sqr_','ad_exp_','et_')] # find regressors by name
    # Single-argument print(...) produces the same text on Python 2 and Python 3.
    print('This batch contains:  {0}'.format(clf_names))
    for clf_name in clf_names:
        clf=regressors[clf_name]

        # Train: one L2 feature column per base model
        this_y=clf.predict(X_train)
        l2_train[clf_name]=this_y

        # Test: same column, computed on the held-out rows
        this_y_cv=clf.predict(X_test)
        l2_test[clf_name]=this_y_cv

    print('End of iteration  {0}'.format(i))

# Append the target
# l2_train['SalePrice']=y_train
# l2_test['SalePrice']=y_test

# Only the last expression of a cell is displayed, so just the l2_test summary shows.
l2_train.describe()
l2_test.describe()

This batch contains:  ['rf_auto_0', 'rf_manual_0', 'ad_linear_0', 'ad_sqr_0', 'ad_exp_0', 'et_0']
End of iteration  0
This batch contains:  ['rf_auto_1', 'rf_manual_1', 'ad_linear_1', 'ad_sqr_1', 'ad_exp_1', 'et_1']
End of iteration  1
This batch contains:  ['rf_auto_2', 'rf_manual_2', 'ad_linear_2', 'ad_sqr_2', 'ad_exp_2', 'et_2']
End of iteration  2
This batch contains:  ['rf_auto_3', 'rf_manual_3', 'ad_linear_3', 'ad_sqr_3', 'ad_exp_3', 'et_3']
End of iteration  3
This batch contains:  ['rf_auto_4', 'rf_manual_4', 'ad_linear_4', 'ad_sqr_4', 'ad_exp_4', 'et_4']
End of iteration  4
This batch contains:  ['rf_auto_5', 'rf_manual_5', 'ad_linear_5', 'ad_sqr_5', 'ad_exp_5', 'et_5']
End of iteration  5
This batch contains:  ['rf_auto_6', 'rf_manual_6', 'ad_linear_6', 'ad_sqr_6', 'ad_exp_6', 'et_6']
End of iteration  6
This batch contains:  ['rf_auto_7', 'rf_manual_7', 'ad_linear_7', 'ad_sqr_7', 'ad_exp_7', 'et_7']
End of iteration  7
This batch contains:  ['rf_auto_8', 'rf_manual_8', 'ad_l

Unnamed: 0,rf_auto_0,rf_manual_0,ad_linear_0,ad_sqr_0,ad_exp_0,et_0,rf_auto_1,rf_manual_1,ad_linear_1,ad_sqr_1,...,ad_linear_8,ad_sqr_8,ad_exp_8,et_8,rf_auto_9,rf_manual_9,ad_linear_9,ad_sqr_9,ad_exp_9,et_9
count,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,...,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
mean,188002.585453,187791.331384,195266.722363,201233.099852,196788.682808,188936.059554,187273.952684,187491.113644,193822.065755,200361.402547,...,193804.506873,199424.947256,194502.868138,189802.831058,185139.883758,185022.188726,190843.354134,196316.120544,192367.778812,185116.864469
std,76706.648014,76188.954908,72908.549084,72957.973637,73181.767297,82261.125305,73669.677542,73287.74116,72136.948184,74870.767073,...,75490.369424,75967.740887,75032.670615,82550.470979,67043.677536,65908.578581,67351.982059,65160.258217,68607.886735,64789.911952
min,60959.015295,62591.344,114596.085443,125644.113475,110208.456349,60000.0,67167.337609,64581.128,117515.840708,124986.055172,...,114372.103053,124821.07989,114787.08982,60000.0,64129.129177,67693.236,113042.623239,123830.006897,112580.648438,70175.361111
25%,134132.633951,135204.709,134325.30228,149490.239264,139015.174771,134206.808712,134980.873707,135451.285,133961.657459,141367.55122,...,135453.09442,138202.860081,131767.183633,136645.676136,134647.950877,135123.249,133053.419056,145071.227116,132600.791611,136282.020139
50%,174746.859796,174209.828,178815.720286,181857.842105,179178.678965,172898.291667,175844.409418,174355.454,176692.94604,185418.410658,...,176029.036036,182434.074836,178471.810695,173450.0,173662.215906,174792.988,175836.159938,180657.395869,177661.001697,175159.233333
75%,217726.730015,219500.746,221293.19403,228820.571154,223441.432665,220626.041667,215140.43824,218815.529,227461.507463,230828.850668,...,220857.370559,235911.010001,221433.558613,223225.0,213624.623929,213798.715,223628.511401,229565.041336,224038.515464,213897.0
max,666643.883929,654331.692,657205.785047,755000.0,655096.118012,755000.0,631410.715552,620335.392,660454.348315,755000.0,...,723421.052632,751190.47619,705000.0,755000.0,461935.073091,480568.748,481307.733096,556776.66951,481882.334728,429208.138889


In [11]:
# Ensemble method 1: linear regression over the L2 features
# (the model is sklearn's LinearRegression, not logistic regression)
from sklearn.linear_model import LinearRegression

l2_lr=LinearRegression()
l2_lr.fit(l2_train,y_train)
l2_lr_pred=l2_lr.predict(l2_test)
# rmse_log_error is declared as (predict, reality); pass them in that order.
print('LR RMSE score: {score}'.format(score=rmse_log_error(l2_lr_pred, y_test)))

LR RMSE score: 0.118165207204


In [12]:
# Load target data to predict
base_path='D:/kaggle/regression/'
# DataFrame.from_csv is deprecated (removed in pandas 1.0); pd.read_csv with an
# explicit index column reads the file with 'Id' as the row index.
target_data=pd.read_csv(base_path+'cleaned_test.csv',index_col='Id')
# The base models were fitted on features without SalePrice, so drop it here too.
target_data=target_data.drop(['SalePrice'],axis=1)

In [13]:
# Generate L2 for test
# Each base model contributes one column of predictions on the target data;
# columns are added in the same order as when l2_train was built.
l2_target=pd.DataFrame()
for i in range(10):
    clf_names=[prefix+str(i) for prefix in ('rf_auto_','rf_manual_','ad_linear_','ad_sqr_','ad_exp_','et_')] # find regressors by name
    # Single-argument print(...) produces the same text on Python 2 and Python 3.
    print('This batch contains:  {0}'.format(clf_names))
    for clf_name in clf_names:
        clf=regressors[clf_name]

        this_y=clf.predict(target_data)
        l2_target[clf_name]=this_y

    print('End of iteration  {0}'.format(i))

This batch contains:  ['rf_auto_0', 'rf_manual_0', 'ad_linear_0', 'ad_sqr_0', 'ad_exp_0', 'et_0']
End of iteration  0
This batch contains:  ['rf_auto_1', 'rf_manual_1', 'ad_linear_1', 'ad_sqr_1', 'ad_exp_1', 'et_1']
End of iteration  1
This batch contains:  ['rf_auto_2', 'rf_manual_2', 'ad_linear_2', 'ad_sqr_2', 'ad_exp_2', 'et_2']
End of iteration  2
This batch contains:  ['rf_auto_3', 'rf_manual_3', 'ad_linear_3', 'ad_sqr_3', 'ad_exp_3', 'et_3']
End of iteration  3
This batch contains:  ['rf_auto_4', 'rf_manual_4', 'ad_linear_4', 'ad_sqr_4', 'ad_exp_4', 'et_4']
End of iteration  4
This batch contains:  ['rf_auto_5', 'rf_manual_5', 'ad_linear_5', 'ad_sqr_5', 'ad_exp_5', 'et_5']
End of iteration  5
This batch contains:  ['rf_auto_6', 'rf_manual_6', 'ad_linear_6', 'ad_sqr_6', 'ad_exp_6', 'et_6']
End of iteration  6
This batch contains:  ['rf_auto_7', 'rf_manual_7', 'ad_linear_7', 'ad_sqr_7', 'ad_exp_7', 'et_7']
End of iteration  7
This batch contains:  ['rf_auto_8', 'rf_manual_8', 'ad_l

In [14]:
# Run the fitted L2 linear model over the stacked base-model predictions.
target_y = l2_lr.predict(l2_target)

# Wrap the predictions in a Series and show its summary statistics.
y_df = pd.Series(data=target_y)
y_df.describe()

count      1459.000000
mean     179012.990105
std       69995.144681
min       51274.558471
25%      130919.936175
50%      160126.587229
75%      208582.195514
max      497846.869298
dtype: float64

In [15]:
# Display the index of the prediction inputs; the next cell assigns it to y_df.
target_data.index

Int64Index([1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470,
            ...
            2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919],
           dtype='int64', name=u'Id', length=1459)

In [16]:
# Output
# Label each prediction with the index of target_data, then write the Series to CSV.
# NOTE(review): whether Series.to_csv writes a header row by default depends on the
# pandas version -- confirm the written file has the layout its consumer expects.
y_df.index=target_data.index
y_df.to_csv(base_path+'target_handtuned_10rounds.csv')

In [None]:
# Tune LR not used
# NOTE(review): despite the "LR" / "Linear regression" labels, this cell grid-searches
# RandomForestRegressor hyper-parameters on X_train / y_train.

print('Linear regression L2 tuning')
params={
    'n_estimators':[42,43,44],
    'max_features':[0.0002,0.0003,0.0004],
    'max_depth':[1]
        }
# sklearn.grid_search is the deprecated location of GridSearchCV; the notebook
# already imports from sklearn.model_selection elsewhere.
from sklearn.model_selection import GridSearchCV
rfc = RandomForestRegressor(criterion='mse', n_jobs=-1)
gs = GridSearchCV(rfc, params,cv=5,verbose=2)
gs.fit(X_train, y_train)
print('Report scores')
# cv_results_ replaces the removed grid_scores_ attribute: one entry per candidate.
for candidate, mean_score in zip(gs.cv_results_['params'], gs.cv_results_['mean_test_score']):
    print('{0}: {1}'.format(candidate, mean_score))
print("Report best params for random forest")
# GridSearchCV scores are greater-is-better, so taking the minimum selected the
# worst candidate; best_params_ / best_score_ give the highest-scoring one.
best_parameters, score = gs.best_params_, gs.best_score_
# NOTE(review): reportParams is not defined in the visible cells -- confirm it exists.
reportParams(best_parameters, score)