In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('../input/train_houseprices.csv')
test= pd.read_csv('../input/test_houseprices.csv')

In [3]:
print(train.shape)
print(test.shape)

(1460, 81)
(1459, 80)


In [4]:
#append the two files together
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))

In [5]:
#have a look and see which are the features with nulls
numeric_feats_ix = all_data.dtypes[all_data.dtypes != "object"].index
numeric_feats = all_data[numeric_feats_ix]
numeric_feats.isnull().sum().sort_values(ascending = False).head(10)

LotFrontage     486
GarageYrBlt     159
MasVnrArea       23
BsmtHalfBath      2
BsmtFullBath      2
GarageArea        1
BsmtFinSF1        1
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
dtype: int64

In [6]:
#could use a predictive model for LotFrontage but that is way too complicated!
#just replace with mean
#get dummies for the discrete features
#look more at feature selection later

all_data = all_data.fillna(all_data.mean())
all_data = pd.get_dummies(all_data)

In [7]:
#log the target
train['SalePrice'] = np.log1p(train['SalePrice'])

In [8]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import LinearSVR, SVR
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold

In [9]:
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice

In [10]:
def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

In [11]:
linear = LinearRegression()

print(rmse_cv(linear))
np.mean(rmse_cv(linear))

[ 0.13884313  0.16644603  0.16937877  0.11088197  0.20005448]


0.15712087765357144

In [12]:
svmlinear = LinearSVR()

print(rmse_cv(svmlinear))
np.mean(rmse_cv(svmlinear))

[ 0.40439225  0.51818198  0.16379671  0.1872586   0.3519267 ]


0.22870010872278615

In [13]:
ridge = Ridge()

print(rmse_cv(ridge))
np.mean(rmse_cv(ridge))

[ 0.12049647  0.15443951  0.14303145  0.11682595  0.18935375]


0.14482942756188194

In [14]:
lasso = Lasso()

print(rmse_cv(lasso))
np.mean(rmse_cv(lasso))

[ 0.16077516  0.20437131  0.17676701  0.1812766   0.2651208 ]


0.19766217639629496

In [15]:
ridge.fit(X_train,y)
y_pred = np.expm1(ridge.predict(X_test))
my_solution = pd.DataFrame(y_pred, test['Id'], columns = ["SalePrice"])
my_solution.to_csv("my_solution.csv", index_label = ["Id"])

In [16]:
from scipy.stats import kurtosis, skew, norm, kurtosistest
from scipy import stats

skewed_feats = train[numeric_feats_ix].apply(lambda x: skew(x.dropna())).sort_values(ascending = False)
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [17]:
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice

In [18]:
ridge = Ridge()

print(rmse_cv(ridge))
np.mean(rmse_cv(ridge))

[ 0.11379378  0.13875197  0.1369379   0.1118998   0.15534677]


0.13134604442827524

In [19]:
ridge.fit(X_train,y)
y_pred = np.expm1(ridge.predict(X_test))
my_solution = pd.DataFrame(y_pred, test['Id'], columns = ["SalePrice"])
my_solution.to_csv("my_solution.csv", index_label = ["Id"])

In [20]:
from sklearn.linear_model import RidgeCV, LassoCV
ridge_cv = RidgeCV(alphas = [0.1,0.5,1,5,10]).fit(X_train,y)
rmse_cv(ridge_cv).mean()

0.12774263096062427

In [21]:
y_pred = np.expm1(ridge_cv.predict(X_test))
my_solution = pd.DataFrame(y_pred, test['Id'], columns = ["SalePrice"])
my_solution.to_csv("my_solution.csv", index_label = ["Id"])

In [22]:
lasso_cv = LassoCV(alphas = [0.001,0.01,0.1,0.5,1,5,10]).fit(X_train,y)
rmse_cv(lasso_cv).mean()

0.12419212473784114

In [23]:
y_pred = np.expm1(lasso_cv.predict(X_test))
my_solution = pd.DataFrame(y_pred, test['Id'], columns = ["SalePrice"])
my_solution.to_csv("my_solution.csv", index_label = ["Id"])

In [24]:
#need to work a bit more on understanding the difference between ridge and cv
#for the purposes of improving this score we could do a bit more work on feature engineering
#or try using elasticnet

In [43]:
print(pd.Series(ridge_cv.coef_,index = X_train.columns).sort_values(ascending = False).head(10))
print(pd.Series(ridge_cv.coef_,index = X_train.columns).sort_values(ascending = True).head(10))

GrLivArea               0.206554
1stFlrSF                0.122074
Neighborhood_StoneBr    0.091053
Neighborhood_Crawfor    0.088960
LotArea                 0.075260
Neighborhood_NoRidge    0.073211
Functional_Typ          0.068281
RoofMatl_WdShngl        0.067558
Exterior1st_BrkFace     0.064793
Neighborhood_NridgHt    0.064021
dtype: float64
MSZoning_C (all)       -0.146395
RoofMatl_ClyTile       -0.105275
Condition2_PosN        -0.080140
Neighborhood_Edwards   -0.077574
Functional_Maj2        -0.060640
Neighborhood_IDOTRR    -0.052025
PoolQC_Gd              -0.050552
Heating_Grav           -0.047282
Neighborhood_MeadowV   -0.047117
Neighborhood_Mitchel   -0.046543
dtype: float64


In [46]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

en = ElasticNet().fit(X_train,y)
rmse_cv(en).mean()

0.26438260271981762

In [47]:
en_cv = ElasticNetCV(alphas = [0.001,0.01,0.1,0.5,1,5,10]).fit(X_train,y)
rmse_cv(en_cv).mean()

0.12290679609954605

In [50]:
y_pred = np.expm1(en_cv.predict(X_test))
my_solution = pd.DataFrame(y_pred, test['Id'], columns = ["SalePrice"])
my_solution.to_csv("my_solution.csv", index_label = ["Id"])

In [49]:
print(pd.Series(en_cv.coef_,index = X_train.columns).sort_values(ascending = False).head(10))
print(pd.Series(en_cv.coef_,index = X_train.columns).sort_values(ascending = True).head(10))

GrLivArea               0.360786
Neighborhood_StoneBr    0.110711
Neighborhood_Crawfor    0.102794
Neighborhood_NoRidge    0.086011
Neighborhood_NridgHt    0.073382
LotArea                 0.070376
Exterior1st_BrkFace     0.070358
Functional_Typ          0.069802
KitchenQual_Ex          0.059783
OverallQual             0.054062
dtype: float64
RoofMatl_ClyTile        -0.366751
MSZoning_C (all)        -0.266619
Condition2_PosN         -0.122939
Neighborhood_Edwards    -0.056610
SaleCondition_Abnorml   -0.048185
LandContour_Bnk         -0.037324
MSZoning_RM             -0.036164
GarageCond_Fa           -0.030799
SaleType_WD             -0.030207
Functional_Maj2         -0.024449
dtype: float64


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Correlation map to see how features are correlated with SalePrice
corrmat = all_data[numeric_feats_ix].corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)

In [None]:
a = 'GarageArea'

print(skew(all_data[a]))
print(kurtosistest(all_data[a]))

In [None]:
sns.distplot(all_data[a] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(all_data[a])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
print(skew(all_data[a]))
print(kurtosis(all_data[a]))

#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title(a + ' distribution')

#Get also the QQ-plot
#fig = plt.figure()
#res = stats.probplot(train['MSSubClass'], plot=plt)
plt.show()