In [1]:
import numpy as np
import pandas as pd
def transform_features(df):

    ##### MSSubClass #####
    # Convert to string, since it's a code
    df['MSSubClass'] = df['MSSubClass'].astype(str)

    ##### OverallCond #####
    # Convert to string, since it doesn't look linearly related to price
    df['OverallCond'] = df['OverallCond'].astype(str)

    ##### LotFrontage #####
    # Impute missing value with mean
    df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].mean())
    # Add squared term to account for non-linearity
    df['sqLotFrontage'] = df['LotFrontage'] ** 2

    ##### LotArea #####
    # Add square-root term based on shape of date wrt response
    df['sqrtLotArea'] = np.sqrt(df['LotArea'])

    ##### YearBuilt #####
    # Add square term
    df['sqYearBuilt'] = df['YearBuilt'] ** 2

    ##### YearBuilt #####
    # Add square term
    df['sqYearBuilt'] = df['YearBuilt'] ** 2

    ##### MasVnrArea #####
    # Impute missing values for MasVnrArea
    df['MasVnrArea'] = df['MasVnrArea'].fillna(df['MasVnrArea'].mean())

    ##### BsmtFinSF1 #####
    # Add squared term to account for no basement case non-linearity
    df['sqBsmtFinSF1'] = df['BsmtFinSF1'] ** 2

    ##### BsmtUnfSF #####
    # Add squared term to account for no basement case non-linearity
    df['sqBsmtUnfSF'] = df['BsmtUnfSF'] ** 2

    ##### GrLivArea #####
    # Add squared term for non-linearity.
    df['GrLivArea_sq'] = df['GrLivArea'] ** 2

    ##### FullBath #####
    # Impute 0 values with mean
    df['FullBath'] = np.where((df['FullBath'] == 0), df['FullBath'].mean(), df['FullBath'])

    ##### BedroomAbvGr #####
    # Impute 0 values with mean
    df['BedroomAbvGr'] = np.where((df['BedroomAbvGr']==0), df['BedroomAbvGr'].mean(), df['BedroomAbvGr'])

    ##### add square term for garage year built
    df['sqGarageYrBlt'] = df['GarageYrBlt'] ** 2

    ##### add square term of GarageCars #####
    df['sqGarageCars'] = df['GarageCars'] ** 2

    ##### add square term of Garage Area #####
    df['sqGarageArea'] = df['GarageArea'] ** 2

    ##### add square term for WoodDeckSF #####
    df['sqWoodDeckSF'] = df['WoodDeckSF'] ** 2

    ##### add square term for OpenPorchSF #####
    df['sqOpenPorchSF'] = df['OpenPorchSF'] ** 2

    ##### add square term for EnclosedPorch #####
    df['sqEnclosedPorch'] = df['EnclosedPorch'] ** 2

    ##### add square term for ScreenPorch #####
    df['sqScreenPorch'] = df['ScreenPorch'] ** 2

    ##### add square term for 3SsnPorch #####
    df['sq3SsnPorch'] = df['3SsnPorch'] ** 2

    ##### add square term for PoolArea #####
    df['sqPoolArea'] = df['PoolArea'] ** 2

    ##### add square term of MoSold #####
    df['sqMoSold'] = df['MoSold'] ** 2

    ##### add square term of YrSold #####
    df['sqYrSold'] = df['YrSold'] ** 2

    ##### PoolQC #####
    df['HasPool'] = pd.notnull(df['PoolQC']).astype('int')
    # Drop this column, there's not enough data in any category
    df= df.drop(['PoolQC'], 1)

    ##### Id #####
    # Drop this column, it's just an identifier
    df = df.drop(['Id'], 1)

    return df



def transform_target(df):
    ##### Response = SalePrice #####
    df['logSalePrice'] = np.log(df['SalePrice'])
    df = df.drop('SalePrice', 1)
    return df

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Remove huge basement outlier from training data
df_train = df_train.drop(df_train[df_train['TotalBsmtSF'] > 4000].index)

# Split train into x and y
df_train_y = df_train.loc[:, ['SalePrice']]
df_train_x = df_train.drop(['SalePrice'], 1)


# Feature Engineering
df_train_y = transform_target(df_train_y) # train y
df_train_x = transform_features(df_train_x) # train x
df_test_x = transform_features(df_test) # test x


In [3]:
# For categorical features, remove any categories with less than 3 values
categorical_cols = list(df_train_x.select_dtypes(include=['object']).columns.values)
for col in categorical_cols:
    category_to_remove = list(df_train_x[col].value_counts()[df_train_x[col].value_counts() <= 2].index)
    if category_to_remove:
        df_train_x[col] = df_train_x[col].replace(category_to_remove, np.nan)
        df_test_x[col] = df_test_x[col].replace(category_to_remove, np.nan)

In [4]:
# For numeric feature, log transform skewed distributions
from scipy.stats import skew
numeric_cols = list(df_train_x.select_dtypes(exclude=['object']).columns.values)
for col in numeric_cols:
    if skew(df_train_x[col]) > 0.75 or skew(df_train_x[col]) < -0.75:
        df_train_x[col] = np.log1p(df_train_x[col])
        df_test_x[col] = np.log1p(df_test_x[col])

# Encode Categorical Variables and put into sci-kit friendly format
df_train_x = pd.get_dummies(df_train_x, drop_first=True, dummy_na=False)
df_train_x = df_train_x.fillna(0)
df_test_x = pd.get_dummies(df_test_x, drop_first=True, dummy_na=False)
df_test_x = df_test_x.fillna(0)

In [5]:
# Ensure test and training have the same variables
test_cols = set(df_test_x.columns.values)
train_cols = set(df_train_x.columns.values)
# Remove cols from train that are not in test
df_train_x = df_train_x.drop(list(train_cols-test_cols),1)
# Remove cols from train that are not in test
df_test_x = df_test_x.drop(list(test_cols-train_cols),1)

In [6]:
# Double check that all columns are the same in test and train
for test_col, train_col in zip(list(df_test_x.columns.values),list(df_train_x.columns.values)):
    if test_col != train_col:
        print test_col, train_col, 'not the same'

In [7]:
# Convert to matrix
train_x = df_train_x.as_matrix()
train_y = df_train_y.as_matrix().ravel()
test_x = df_test_x.as_matrix()

In [8]:
########### XGBoosting Regression #############

print "XGBoosting Regression"
from xgboost import XGBRegressor
from utils import optimize_and_evaluate_model
from sklearn.pipeline import Pipeline
import operator

pipeline_xg = Pipeline(steps=[('xg',XGBRegressor(n_estimators=50,learning_rate=1))])
parameters_xg = {'xg__n_estimators': (40, 50), 'xg__learning_rate': (0.1, 1)}

# fit the model, find the index of the top 10 features and their scores.
pipeline_xg.fit(train_x,train_y)
score = pipeline_xg.named_steps['xg'].booster().get_fscore()
sorted_score_top10 = sorted(score.items(), key=operator.itemgetter(1), reverse=True)[0:10]
index = [int(pair[0][1:]) for pair in sorted_score_top10]
feature_score = [int(pair[1]) for pair in sorted_score_top10]

# Print features by importance in model
print"The top ten most significant coefficients for random xg boost regression are:"
for i in range(len(index)):
    print df_train_x.columns[i], feature_score[i]

# Search for best parameters
optimize_and_evaluate_model(pipeline_xg, parameters_xg, "XGBoost Regression", train_x, train_y)

XGBoosting Regression
The top ten most significant coefficients for random xg boost regression are:
LotFrontage 28
LotArea 19
OverallQual 19
YearBuilt 16
YearRemodAdd 15
MasVnrArea 15
BsmtFinSF1 14
BsmtFinSF2 12
BsmtUnfSF 12
TotalBsmtSF 10
Pipeline(steps=[('xg', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=50, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))])

Method is XGBoost Regression
Root Mean Square Error 0.157798122244

#################################################



In [11]:
########### Lasso Regression #############

# Define model pipeline
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
pipeline_lasso = Pipeline(steps=[('lasso',Lasso(alpha=0.0001, normalize=True))])
parameters_lasso = {'lasso__alpha': (0.00005, 0.0001)}

# Print largest coefficients in model
pipeline_lasso.fit(train_x, train_y)
coef = pd.Series(pipeline_lasso.named_steps['lasso'].coef_, index=df_train_x.columns).sort_values()
print"The top ten most significant coefficients for lasso regression are:"
imp_coef = abs(coef).nlargest(10)
print coef.loc[imp_coef.index]

# Search for best parameters
optimize_and_evaluate_model(pipeline_lasso, parameters_lasso, "Lasso Regression", train_x, train_y)

The top ten most significant coefficients for lasso regression are:
GrLivArea               0.340011
OverallCond_3          -0.156932
KitchenAbvGr           -0.153132
Functional_Maj2        -0.148665
Neighborhood_StoneBr    0.110758
Neighborhood_Crawfor    0.108176
RoofMatl_WdShngl        0.093379
SaleType_New            0.091178
Heating_Grav           -0.086204
Neighborhood_NridgHt    0.085754
dtype: float64
Pipeline(steps=[('lasso', Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

Method is Lasso Regression
Root Mean Square Error 0.120335683338

#################################################



In [13]:
print "Random Forest Feature Importance Analysis"
from sklearn.ensemble import RandomForestRegressor

pipeline_forest = Pipeline(steps=[('randomforest',RandomForestRegressor(n_estimators=80, max_features=20, random_state=0))])
parameters_forest = {'randomforest__n_estimators': (60, 80),
                     'randomforest__max_features': (20, 40)}

# Print features by importance in model
pipeline_forest.fit(train_x,train_y)
importances = pd.Series(pipeline_forest.named_steps['randomforest'].feature_importances_, index=df_train_x.columns).sort_values()
print"The top ten most significant coefficients for random forest regression are:"
top_importances = importances.nlargest(10)
print top_importances.cumsum()

# Search for best parameters
optimize_and_evaluate_model(pipeline_forest, parameters_forest, "Random Forest Regression", train_x, train_y)


Random Forest Feature Importance Analysis
The top ten most significant coefficients for random forest regression are:
OverallQual     0.079252
GrLivArea       0.152157
GrLivArea_sq    0.218719
GarageYrBlt     0.262324
sqYearBuilt     0.304416
TotalBsmtSF     0.344887
YearBuilt       0.385255
sqGarageCars    0.424084
GarageCars      0.461972
1stFlrSF        0.496030
dtype: float64
Pipeline(steps=[('randomforest', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=40, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=80, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False))])

Method is Random Forest Regression
Root Mean Square Error 0.142368909872

#################################################



In [14]:
########### AdaBoosting Regression #############

print "Ada Boosting Regression"
from sklearn.ensemble import AdaBoostRegressor

pipeline_ada = Pipeline(steps=[('ada',AdaBoostRegressor(n_estimators=50,learning_rate=1.0))])
parameters_ada = {'ada__n_estimators': (40, 50),
                  'ada__learning_rate': (0.1, 1)}

# Print features by importance in model
pipeline_ada.fit(train_x,train_y)
importances = pd.Series(pipeline_ada.named_steps['ada'].feature_importances_, index=df_train_x.columns).sort_values()
print"The top ten most significant coefficients for random ada boost regression are:"
top_importances = importances.nlargest(10)
print top_importances.cumsum()

# Search for best parameters
optimize_and_evaluate_model(pipeline_ada, parameters_ada, "AdaBoost Regression", train_x, train_y)

Ada Boosting Regression
The top ten most significant coefficients for random ada boost regression are:
OverallQual      0.320269
GrLivArea        0.492357
GrLivArea_sq     0.620169
1stFlrSF         0.663917
TotalBsmtSF      0.702550
YearRemodAdd     0.733894
CentralAir_Y     0.758380
sqGarageYrBlt    0.778045
GarageCars       0.793958
BsmtFinSF1       0.808965
dtype: float64
Pipeline(steps=[('ada', AdaBoostRegressor(base_estimator=None, learning_rate=1, loss='linear',
         n_estimators=40, random_state=None))])

Method is AdaBoost Regression
Root Mean Square Error 0.174674706755

#################################################



In [15]:
########### PCA Regression #############
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression

pca = PCA(n_components=20)
selection = SelectKBest(k=5)
combined_features = FeatureUnion([("pca", pca), ("kbest", selection)])

pipeline_pca = Pipeline(steps=[("features", combined_features), ("lr", LinearRegression())])
parameters_pca = {'features__pca__n_components': (20,40,80)}

# Search for best parameters
optimize_and_evaluate_model(pipeline_pca, parameters_pca, "PCA plus Linear Regression", train_x, train_y)



Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('pca', PCA(copy=True, iterated_power='auto', n_components=80, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('kbest', SelectKBest(k=5, score_func=<function f_classif at 0x1182fe578>))],
       transformer_weights=None)), ('lr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

Method is PCA plus Linear Regression
Root Mean Square Error 0.128993631282

#################################################



In [12]:
print df_train_x.shape

(1459, 245)
