This notebook produces a cross validation comparison of the linear models Ridge, Lasso and ElasticNet with $\alpha$ tuning. At the time of publication this script performs in the top ~15% of submissions. Using a RMSE for evaluation it is found that ElasticNet appears slightly better than the Lasso and Ridge models. Some basic preprocessing and feature creation is also included but it should be emphasised that the median imputation used on missing values is very crude. For example, Area features with missing values may be this way because the property does not have that feature (e.g. a pool) so it would make more sense to set this to zero. Feature creation is done by taking the square root of all numerical area features.

In [40]:
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler

train = pd.read_csv("train.csv",index_col="Id")
test = pd.read_csv("test.csv",index_col="Id")

def print_full(x):
    """
    Full printing of dataframes for error checking
    """
    pd.set_option('display.max_columns', 999)
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_columns')
    pd.reset_option('display.max_rows')

def clean(df):
    """
    Cleans NaNs and creates new features
    """
    
    # List of new features to be created: (new_feature, original_feature, transform_function)
    transform = [("sqLotArea","LotArea",np.sqrt),
                 ("sqGrLivArea","GrLivArea",np.sqrt),
                 ("sqBsmtFinSF1","BsmtFinSF1",np.sqrt),
                 ("sqBsmtFinSF2","BsmtFinSF2",np.sqrt),
                 ("sqBsmtUnfSF","BsmtUnfSF",np.sqrt),
                 ("sqTotalBsmtSF","TotalBsmtSF",np.sqrt),
                 ("sq1stFlrSF","1stFlrSF",np.sqrt),
                 ("sq2ndFlrSF","2ndFlrSF",np.sqrt),
                 ("sqLotFrontage","LotFrontage",np.sqrt),
                 ("sqMasVnrArea","MasVnrArea",np.sqrt),
                 ("sqPoolArea","PoolArea",np.sqrt),
                 ("sqGarageArea","GarageArea",np.sqrt),
                 ("sqWoodDeckSF","WoodDeckSF",np.sqrt),
                 ("sqOpenPorchSF","OpenPorchSF",np.sqrt),
                 ("sqEnclosedPorch","EnclosedPorch",np.sqrt),
                ]
 
    transform += [("lnLotArea","LotArea",lambda x: np.log(x+1)),
                 ("lnGrLivArea","GrLivArea",lambda x: np.log(x+1)),
                 ("lnBsmtFinSF1","BsmtFinSF1",lambda x: np.log(x+1)),
                 ("lnBsmtFinSF2","BsmtFinSF2",lambda x: np.log(x+1)),
                 ("lnBsmtUnfSF","BsmtUnfSF",lambda x: np.log(x+1)),
                 ("lnTotalBsmtSF","TotalBsmtSF",lambda x: np.log(x+1)),
                 ("ln1stFlrSF","1stFlrSF",lambda x: np.log(x+1)),
                 ("ln2ndFlrSF","2ndFlrSF",lambda x: np.log(x+1)),
                 ("lnLotFrontage","LotFrontage",lambda x: np.log(x+1)),
                 ("lnMasVnrArea","MasVnrArea",lambda x: np.log(x+1)),
                 ("lnPoolArea","PoolArea",lambda x: np.log(x+1)),
                 ("lnGarageArea","GarageArea",lambda x: np.log(x+1)),
                 ("lnWoodDeckSF","WoodDeckSF",lambda x: np.log(x+1)),
                 ("lnOpenPorchSF","OpenPorchSF",lambda x: np.log(x+1)),
                 ("lnEnclosedPorch","EnclosedPorch",lambda x: np.log(x+1)),
                ]

    # Find categorical and numerical features
    categoricals = df.select_dtypes(include=["object"]).columns.values
    numericals = [feat for feat in df.select_dtypes(include=["int","float"]).columns.values]

    cat_nan_nonestr = ["PoolQC","Fence","MiscFeature","FireplaceQu","GarageType",
                    "GarageFinish","GarageQual","GarageCond","BsmtQual","BsmtCond",
                    "BsmtExposure","BsmtFinType1","BsmtFinType2","MasVnrType","Alley"]
    
    num_nan_zero = ["LotFrontage","MasVnrArea","GarageYrBlt"]
    
    # Transform to create new features, scale using MinMaxScaler
    for (new_feature,original_feature,f) in transform: 
        df[new_feature] = df[original_feature].fillna(df[original_feature].median(), inplace = False)
        df[new_feature] = MinMaxScaler().fit_transform(f(df[new_feature].apply(float)).reshape(-1,1))
    # Scale and remove NaNs for numerical features by imputing median value
    for feature in numericals: 
        if feature in num_nan_zero:
            df[feature].fillna(0, inplace = True)
        else:
            df[feature].fillna(df[feature].median(), inplace = True)
        df[feature] = MinMaxScaler().fit_transform(df[feature].apply(float).reshape(-1,1))
    # Impute NaNs for categorical features
    for feature in categoricals: 
        if feature in cat_nan_nonestr:
            df[feature].fillna("None", inplace = True)
        else:
            df[feature].fillna(df[feature].value_counts().idxmax(), inplace = True)
            
    # Perform one hot encoding on the categorical features
    for cat in categoricals:
        dummies = pd.get_dummies(df[cat])
        dummies.columns = [col_name + cat for col_name in dummies.columns.values]            
        df = df.drop(cat,axis=1)
        df = df.join(dummies)
    
    return df

target = train["SalePrice"] # Note that we will take the Log of this when fitting - check the histogram of this feature
train = train.drop("SalePrice",axis=1)

dd = clean(pd.concat([train,test]))

train = dd[:len(train)]
test = dd[len(train):]

In [42]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LassoLarsCV, ElasticNetCV
from sklearn import cross_validation
from sklearn.cross_validation import cross_val_score

def rmse_cv(model): # Cross val using the competition scoring metric
    return(np.sqrt(-cross_val_score(model, train, np.log(target), scoring="mean_squared_error", cv = 10)))

print("=====Lasso Regression=====")
for a in np.arange(1.0e-4,1.0e-3,1.0e-4):
    print(a, rmse_cv(Lasso(alpha = a,max_iter=100000)).mean())
    
print("=====Ridge Regression=====")
for a in np.arange(1,10,1):
    print(a, rmse_cv(Ridge(alpha = a)).mean())
    
print("=====ElasticNet=====")

encv = ElasticNetCV(l1_ratio=[0.25, 0.5, 0.75, 0.9],
             eps=0.0000001,
             n_alphas=100,
             max_iter=100000,
             cv=10,
             verbose=False,
             precompute=True,
             random_state=1, n_jobs=-1).fit(train, np.log(target))
print(" Best alpha value: %f" % encv.alpha_ )
print(" Best l1_ratio value: %f" % encv.l1_ratio_ )
print(rmse_cv(ElasticNet(alpha = encv.alpha_,
                            l1_ratio=encv.l1_ratio_,
                            max_iter=100000,
                            random_state=1).fit(train, np.log(target))
                ).mean())

best_model = ElasticNet(alpha = encv.alpha_,
                            l1_ratio=encv.l1_ratio_,
                            max_iter=100000,
                            random_state=1).fit(train, np.log(target))

# Output to CSV
test["SalePrice"] = np.exp(best_model.predict(test))
test[["SalePrice"]].to_csv("submit.txt")

=====Lasso Regression=====
0.0001 0.129152055831
0.0002 0.126566053252
0.0003 0.126499332894
0.0004 0.127162438599
0.0005 0.127937700184
0.0006 0.128494979297
0.0007 0.129068253425
0.0008 0.129874977002
0.0009 0.130792734432
=====Ridge Regression=====
1 0.131146319871
2 0.130913804663
3 0.130994640069
4 0.131131428147
5 0.131274562701
6 0.131412255129
7 0.131542155299
8 0.131664527782
9 0.131780299091
=====ElasticNet=====




 Best alpha value: 0.000393
 Best l1_ratio value: 0.500000
0.125544566389


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
