In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr

%config InlineBackend.figure_format = 'png'
%matplotlib inline

In [None]:
# Regularized Linear Regression
# based on https://www.kaggle.com/apapiu/regularized-linear-models

In [None]:
# Read the train and test CSVs from the house-prices-advanced-regression-techniques input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Stack the columns MSSubClass through SaleCondition of train and test into one frame,
# so the preprocessing below is applied to both splits at once
all_data = pd.concat([frame.loc[:, 'MSSubClass':'SaleCondition'] for frame in (train, test)])
all_data

In [None]:
# data preprocessing

In [None]:
# transform the skewed numeric features by taking log(feature + 1) to make them more normally distributed

In [None]:
# Distribution of the SalePrice column of the training frame
sns.histplot(data=train, x='SalePrice')

In [None]:
# Distribution of log(1 + SalePrice), for comparison with the previous histogram
sns.histplot(train['SalePrice'].pipe(np.log1p))

In [None]:
# log transform the target
# Guarded so that re-running this cell does not apply log1p a second time.
# The flag lives in train.attrs, so a freshly re-loaded `train` (which starts
# with empty attrs) is transformed again as expected.
if not train.attrs.get('SalePrice_log1p', False):
    train['SalePrice'] = np.log1p(train['SalePrice'])
    train.attrs['SalePrice_log1p'] = True

In [None]:
# locate the numeric features
# Select on the numeric dtype family rather than "anything that is not object",
# so string / category / bool / datetime columns can never reach skew().
numeric_features = all_data.select_dtypes(include='number').columns
# compute the skewness of the numeric features on the training rows only,
# dropping missing values column by column
skewness = train[numeric_features].apply(lambda x: skew(x.dropna()))
# features whose skewness exceeds 0.75 are treated as skewed
skewed_features = skewness[skewness > 0.75].index
# log transform the skewed features in the combined train+test frame
# NOTE: this overwrites all_data in place, so re-running this cell applies log1p twice
all_data[skewed_features] = np.log1p(all_data[skewed_features])

In [None]:
# create dummy variables for categorical features

In [None]:
# One-hot encode the categorical columns of the combined frame
all_data = all_data.pipe(pd.get_dummies)

In [None]:
# replace the missing values with the mean of their respective columns

In [None]:
# Impute each column's missing values with that column's mean
# (the mean is taken over the combined train+test rows held in all_data)
all_data = all_data.fillna(value=all_data.mean())

In [None]:
# Split the combined frame back into the inputs for sklearn:
# the first len(train) rows are the training rows, the remainder are the test rows
X_train = all_data.iloc[:len(train)]
X_test = all_data.iloc[len(train):]
# target column as transformed in the earlier cells
y = train['SalePrice']

In [None]:
# Try regularized linear regression models (L1 and L2 penalties, i.e. lasso and ridge)

In [None]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

In [None]:
# Ridge regressor built with its default arguments (not referenced again in the cells shown here)
model_ridge = Ridge()

In [None]:
def rmse_cv(model):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    The model is scored on the notebook-level ``X_train`` / ``y`` with
    ``cross_val_score`` (``cv=5``, ``scoring='neg_mean_squared_error'``).
    The scores are negated MSE values, so they are sign-flipped before the
    square root is taken. Callers average the result to compare models and
    tuning parameters.
    """
    neg_mse_per_fold = cross_val_score(model, X_train, y, scoring='neg_mean_squared_error', cv=5)
    return np.sqrt(-neg_mse_per_fold)

In [None]:
# The main tuning parameter of the ridge model is alpha, a regularization
# parameter that controls how flexible the model is.
# The stronger the regularization, the less prone the model is to overfit,
# but it also loses flexibility and may fail to capture all the signal in the data.
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
# mean cross-validated RMSE for each candidate alpha, in the same order as `alphas`
cv_ridge = [
    rmse_cv(Ridge(alpha=penalty)).mean()
    for penalty in alphas
]

In [None]:
# Index the mean CV RMSE values by their alpha and plot the validation curve
cv_ridge = pd.Series(data=cv_ridge, index=alphas)
cv_ridge.plot(title="Validation - Just Do It")
plt.gca().set_xlabel("alpha")
plt.gca().set_ylabel("rmse")

In [None]:
# Lowest mean cross-validated RMSE among the ridge alphas tried
cv_ridge.min()

In [None]:
# LassoCV chooses among the listed alphas by cross-validation while fitting
model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005])
model_lasso = model_lasso.fit(X_train, y)

In [None]:
# Mean cross-validated RMSE of the Lasso model, for comparison with the ridge result
rmse_cv(model_lasso).mean()

In [None]:
# the lasso model performs better
# Lasso also acts as a feature selector, because it sets the coefficients of some features to exactly 0
# Label each fitted coefficient with the name of its feature column
coef = pd.Series(data=model_lasso.coef_, index=X_train.columns)
coef

In [None]:
# Count the features Lasso kept (non-zero coefficient) versus dropped (zero coefficient)
print(f"Lasso picked {(coef != 0).sum()} variables and eliminated the other {(coef == 0).sum()} variables")

In [None]:
# However, the selected features are not necessarily the "correct" ones, especially since this dataset contains many collinear features
# One idea to try here is to run Lasso several times on bootstrapped samples and see how stable the feature selection is

In [None]:
# take a look directly at what the most important coefficients are

In [None]:
# Sort the coefficients once, then keep the 10 smallest followed by the 10 largest
imp_coef = coef.sort_values().pipe(
    lambda ranked: pd.concat([ranked.head(10), ranked.tail(10)])
)
imp_coef

In [None]:
# Horizontal bar chart of the selected coefficients; the trailing ';' suppresses the text repr
imp_coef.plot.barh()
plt.title("Coefficients in the Lasso Model");

In [None]:
# let's look at the residuals as well

In [None]:
# Pair the Lasso predictions on the training matrix with the targets,
# derive the residuals (true - predicted), and plot them against the predictions
preds = pd.DataFrame({"preds": model_lasso.predict(X_train), "true": y}).assign(
    residuals=lambda frame: frame["true"] - frame["preds"]
)
preds.plot.scatter(x='preds', y='residuals')

In [None]:
# xgboost...