In [1]:
# load data
import pandas as pd
import numpy as np
from pandas import DataFrame
# Directory holding the cleaned CSV files (hard-coded local path; adjust per machine).
base_path='D:/kaggle/regression/'
# DataFrame.from_csv is deprecated and absent from newer pandas releases;
# read_csv with an explicit index column loads the frame keyed by 'Id'.
all_data=pd.read_csv(base_path+'cleaned_train.csv',index_col='Id')
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 263 entries, LotFrontage to SaleCondition__Partial
dtypes: float64(231), int64(32)
memory usage: 2.9 MB


In [2]:
# Keep the Id index at hand, then separate the SalePrice target from the features.
all_id = all_data.index
all_y = all_data['SalePrice']
all_x = all_data.drop('SalePrice', axis=1)
all_x.info()

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    all_x,
    all_y,
    test_size=0.2,
    random_state=42
)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Columns: 262 entries, LotFrontage to SaleCondition__Partial
dtypes: float64(231), int64(31)
memory usage: 2.9 MB


In [3]:
from sklearn.metrics import mean_squared_error

def rmse_log_error(predict,reality):
    """Root-mean-squared error between the natural logs of two sets of values.

    The metric is symmetric in its two arguments.

    Parameters:
        predict: predicted values (sequence, numpy array or pandas Series).
        reality: observed values, with the same number of elements as predict.

    Returns:
        The square root of the mean squared difference of the logs, as a numpy float.

    Raises:
        ValueError: if the inputs are empty, differ in size, or contain values
            that are not finite and strictly positive (their log is undefined).
    """
    # Flatten both sides so a (n,) vector and an (n, 1) column are compared
    # element by element instead of being broadcast against each other.
    predict_arr=np.asarray(predict,dtype=float).ravel()
    reality_arr=np.asarray(reality,dtype=float).ravel()

    if predict_arr.size!=reality_arr.size:
        raise ValueError('predict and reality differ in size: {p} vs {r}'.format(
            p=predict_arr.size, r=reality_arr.size))
    if predict_arr.size==0:
        raise ValueError('rmse_log_error needs at least one value')

    # Fail early with a clear message rather than letting np.log produce NaN/-inf.
    for label,values in (('predict',predict_arr),('reality',reality_arr)):
        if not np.all(np.isfinite(values)) or np.any(values<=0):
            raise ValueError(
                '{label} must contain only finite, strictly positive values'.format(label=label))

    log_diff=np.log(predict_arr)-np.log(reality_arr)
    return np.sqrt(np.mean(log_diff**2))

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor

# A function to train several models for the train/test pair for one fold
# How to tune parameters is out of scope of this script, check out sklearn's GridSearchCV or RandomizedSearchCV
def one_fold(train_x,train_y,test_x,test_y):
    """Fit three ensemble regressors on one train/test split and report their scores.

    A random forest, an extra-trees regressor and an AdaBoost regressor are each
    fitted on the training pair, used to predict the test features, and scored
    with rmse_log_error against the test targets. Progress and scores are printed.

    Parameters:
        train_x: feature matrix the models are fitted on.
        train_y: targets matching train_x.
        test_x: feature matrix the fitted models predict on.
        test_y: targets matching test_x; used only for scoring.

    Returns:
        dict mapping 'rf', 'et' and 'ad' to the fitted regressors.
    """
    # Report the shapes of the data actually passed in (not notebook globals).
    print('Training set has shape:  {shape}'.format(shape=train_x.shape))
    print('Test set has shape:  {shape}'.format(shape=test_x.shape))

    # One row per model: (result key, banner, fit message, score label, estimator).
    # The tree criterion is left at its default, which is mean squared error; the
    # explicit spelling 'mse' was renamed to 'squared_error' in newer scikit-learn.
    candidates=[
        ('rf',
         'Random forest with mse',
         'Fitting random forest with mse',
         'RF RMSE',
         RandomForestRegressor(n_estimators=126,max_features=0.5,max_depth=10,random_state=42)),
        ('et',
         'ExtraTreesRegressor',
         'Fitting extra trees regressor with mse',
         'ET RMSE',
         ExtraTreesRegressor(n_estimators=180,max_features=0.2,max_depth=28,n_jobs=-1,random_state=42)),
        ('ad',
         'Adaboost linear',
         'Fitting adaboost linear',
         'Adaboost RMSE',  # was labelled "MSE" although the value is the same RMSE metric
         AdaBoostRegressor(loss='linear',learning_rate=0.2,n_estimators=210,random_state=42)),
    ]

    regressors={}
    for key,banner,fit_message,score_label,model in candidates:
        print(banner)
        print(fit_message)
        model.fit(train_x,train_y)
        print('Predicting on test set')
        predictions=model.predict(test_x)
        # rmse_log_error is declared as (predict, reality).
        print('{label} {score}'.format(label=score_label,score=rmse_log_error(predictions,test_y)))
        regressors[key]=model

    return regressors

In [5]:
# Fit the first-level models on the training split and score them on the held-out split.
regressors = one_fold(
    train_x=X_train,
    train_y=y_train,
    test_x=X_test,
    test_y=y_test
)

Training set has shape:  (1168, 262)
Test set has shape:  (292, 262)
Random forest with mse
Fitting random forest with mse
Predicting on test set
RF RMSE 0.151704956487
ExtraTreesRegressor
Fitting extra trees regressor with mse
Predicting on test set
ET RMSE 0.159554597793
Adaboost linear
Fitting adaboost linear
Predicting on test set
Adaboost MSE 0.21884931864


In [6]:
# The predictions then form the training samples for an L2 model as an ensemble.
# NOTE(review): the train-side features are predictions on the very rows the
# first-level models were fitted on, so they look better than they would on
# unseen data; out-of-fold predictions would avoid that - confirm intent.
clf_names=['rf','ad','et'] # find regressors by name; this list also fixes the column order
print('This batch contains:  {names}'.format(names=clf_names))

# One column per first-level model. The frames keep the Id index of the rows
# they were predicted from, so they stay aligned with y_train / y_test.
l2_train=pd.DataFrame(
    {clf_name: regressors[clf_name].predict(X_train) for clf_name in clf_names},
    index=X_train.index,
    columns=clf_names)
l2_test=pd.DataFrame(
    {clf_name: regressors[clf_name].predict(X_test) for clf_name in clf_names},
    index=X_test.index,
    columns=clf_names)

# Append the target
# l2_train['SalePrice']=y_train
# l2_test['SalePrice']=y_test

# Only a cell's last expression is displayed, so the train summary is printed explicitly.
print(l2_train.describe())
l2_test.describe()

This batch contains:  ['rf', 'ad', 'et']
End of iteration  0


Unnamed: 0,rf,ad,et
count,292.0,292.0,292.0
mean,178431.200438,186703.920696,179022.143654
std,73248.84637,69423.81899,71460.784868
min,63847.903728,119165.646259,69520.833333
25%,126765.907777,131431.501485,127829.240694
50%,154424.859306,165716.841685,156062.633333
75%,209801.394072,218838.713041,207618.598611
max,562652.378571,557264.052265,480795.877778


In [7]:
# Ensemble method 1: linear regression fitted on the first-level predictions
from sklearn.linear_model import LinearRegression

l2_lr=LinearRegression()
l2_lr.fit(l2_train,y_train)
l2_lr_pred=l2_lr.predict(l2_test)
# rmse_log_error is declared as (predict, reality); pass the arguments in that order.
print('LR RMSE score: {score}'.format(score=rmse_log_error(l2_lr_pred, y_test)))

LR RMSE score: 0.159557296817


In [10]:
# Load target data to predict
base_path='D:/kaggle/regression/'
# DataFrame.from_csv is deprecated and absent from newer pandas releases;
# read_csv with an explicit index column loads the frame keyed by 'Id'.
# The SalePrice column is dropped so only feature columns remain.
target_data=pd.read_csv(base_path+'cleaned_test.csv',index_col='Id').drop(['SalePrice'],axis=1)

In [11]:
# Generate L2 for test
# The column order must be the same one used for the L2 training features,
# because the L2 model was fitted on columns in the order rf, ad, et.
clf_names=['rf','ad','et'] # find regressors by name
print('This batch contains:  {names}'.format(names=clf_names))

# One column of first-level predictions per model, keyed by the target rows' Id.
l2_target=pd.DataFrame(
    {clf_name: regressors[clf_name].predict(target_data) for clf_name in clf_names},
    index=target_data.index,
    columns=clf_names)

This batch contains:  ['rf', 'ad', 'et']
End of iteration  0


In [13]:
# Predict with L2 LR
# l2_lr is the second-level linear model; it combines the rf/ad/et prediction
# columns of l2_target into a single predicted value per target row.
target_y=l2_lr.predict(l2_target)

# Inspect
# Wrap the raw prediction array in a Series so describe() can summarise it.
# describe() is the cell's last expression, so its result is what gets displayed.
y_df=pd.Series(target_y)
y_df.describe()

count      1459.000000
mean     179256.079999
std       68934.180289
min       55025.858421
25%      131492.258922
50%      159540.840510
75%      208405.733719
max      458751.979009
dtype: float64

In [17]:
# Display the Id index of the target rows; the output cell assigns this index
# to the prediction Series before writing it to CSV.
target_data.index

Int64Index([1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469, 1470,
            ...
            2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918, 2919],
           dtype='int64', name=u'Id', length=1459)

In [19]:
# Output
# Key the predictions by the target rows' Id and name the value column, then
# request the header explicitly: the default for Series.to_csv differs between
# pandas versions, which would otherwise give either no header or a column
# named "0". NOTE(review): 'SalePrice' is taken from the training target's
# column name - confirm it is the header the consumer of target.csv expects.
y_df.index=target_data.index
y_df.name='SalePrice'
y_df.to_csv(base_path+'target.csv',header=True,index_label='Id')

In [None]:
# Tune LR not used
# NOTE(review): despite the heading and the first message, this cell grid-searches
# a RandomForestRegressor on X_train / y_train, not a linear L2 model - confirm intent.

print("Linear regression L2 tuning")
params={
    'n_estimators':[42,43,44],
    'max_features':[0.0002,0.0003,0.0004],
    'max_depth':[1]
        }
# sklearn.grid_search was removed from scikit-learn; model_selection (already
# used in this notebook for train_test_split) provides GridSearchCV.
from sklearn.model_selection import GridSearchCV
# The criterion is left at its default (mean squared error), whose explicit
# spelling changed between scikit-learn versions.
rfc = RandomForestRegressor(n_jobs=-1)
gs = GridSearchCV(rfc, params,cv=5,verbose=2)
gs.fit(X_train, y_train)
print('Report scores')
# cv_results_ replaces the removed grid_scores_ attribute.
for candidate, mean_score in zip(gs.cv_results_['params'], gs.cv_results_['mean_test_score']):
    print('{score} {candidate}'.format(score=mean_score, candidate=candidate))
print("Report best params for random forest")
# GridSearchCV scores are higher-is-better, so taking the minimum picked the
# worst candidate; best_params_ / best_score_ hold the winning one.
best_parameters, score = gs.best_params_, gs.best_score_
# reportParams is not defined anywhere in the visible notebook, so print directly.
print('{params} scored {score}'.format(params=best_parameters, score=score))