# Ridge Regression With Python

## import libraries

In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np

## read data

In [2]:
# Read the training and test CSV files from the input folder in one pass.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

## Store the test-set Ids and drop the Id column

In [3]:
# Set the test Ids aside first: they are needed later for the submission table.
y_id = df_test['Id'].copy()
# Then remove the 'Id' column from both frames.
df_train = df_train.drop(labels='Id', axis='columns')
df_test = df_test.drop(labels='Id', axis='columns')

## Define y_train

In [4]:
# Extract the target as an (n_rows, 1) array, then remove it from the feature frame.
y_train = df_train[['SalePrice']].values
df_train = df_train.drop(labels='SalePrice', axis='columns')

## Log-transform y_train, log(1 + SalePrice), to match the evaluation metric

In [5]:
# Put the target on a log(1 + price) scale. np.log1p computes log(1 + x) in a
# single call and stays accurate when x is small.
# NOTE: this cell reassigns y_train from itself, so running it twice applies the
# transform twice; re-run the cell that defines y_train before re-running this one.
y_train = np.log1p(y_train)

## Concatenate df_train and df_test

In [6]:
# Stack train rows on top of test rows with a fresh 0..n-1 index, so that the
# same column preprocessing is applied to both sets.
df = pd.concat((df_train, df_test), axis='index', ignore_index=True)

## Keep only the columns that have no missing values

In [7]:
# Discard every column that contains at least one missing value in any row of df.
df = df.dropna(axis='columns', how='any')

## Transform categorical variables into dummy variables

In [8]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each category.
df = pd.get_dummies(data=df, drop_first=True)

## create X_train and X_test

In [9]:
# df holds the training rows first, followed by the test rows, so split it
# back at the number of training rows.
X_train = df.iloc[:len(df_train)]
X_test = df.iloc[len(df_train):]

## import libraries

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

## Define the pipeline steps

In [11]:
# Pipeline stages as (name, estimator) pairs: scale the features, then fit ridge.
# The step name 'ridge' is the prefix used for hyperparameter keys such as 'ridge__alpha'.
steps = [
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
]

## Create the pipeline: pipeline

In [12]:
# Chain the stages listed in `steps` into a single estimator.
pipeline = Pipeline(steps=steps)

## Specify the hyperparameter space

In [14]:
# Search grid for the ridge penalty: 50 log-spaced values from 1e-4 to 1.
parameters = {
    'ridge__alpha': np.logspace(start=-4, stop=0, num=50),
}


## Create the GridSearchCV object: cv

In [15]:
# Grid search over `parameters` with 3-fold cross-validation; no `scoring` is
# passed, so the estimator's default score is used to rank candidates.
cv = GridSearchCV(estimator=pipeline, param_grid=parameters, cv=3)

## Fit to the training set

In [16]:
# Run the grid search on the training data; the fitted search object is the cell output.
cv.fit(X=X_train, y=y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'ridge__alpha': array([  1.00000e-04,   1.20679e-04,   1.45635e-04,   1.75751e-04,
         2.12095e-04,   2.55955e-04,   3.08884e-04,   3.72759e-04,
         4.49843e-04,   5.42868e-04,   6.55129e-04,   7.90604e-04,
         9.54095e-04,   1.15140e-03,   1.38950e-03,   1.67683e-03,
    ...    3.90694e-01,   4.71487e-01,   5.68987e-01,   6.86649e-01,
         8.28643e-01,   1.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

## predict on train set

In [17]:
# In-sample predictions, used below for the training-set RMSE.
y_pred_train = cv.predict(X=X_train)

## Predict test set

In [18]:
# Predictions for the test rows, on the same transformed scale as y_train.
y_pred_test = cv.predict(X=X_test)

## RMSE on the training set (on the log-transformed target)

In [19]:
# Training-set error on the transformed target scale. This is an in-sample
# figure, not a held-out estimate.
rmse = np.sqrt(mean_squared_error(y_true=y_train, y_pred=y_pred_train))
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 0.11740579781500343


## Build the submission table (predictions converted back to prices)

In [20]:
# Assemble the submission table: one row per test home, with its Id and predicted price.
# The model is trained on log(1 + SalePrice), so np.expm1 (exp(x) - 1) maps the
# predictions back to prices.
# The two columns are combined positionally from plain arrays rather than by index
# alignment, so a non-default index on y_id cannot introduce NaN rows; a length
# mismatch raises an error instead of being padded silently.
output = pd.DataFrame(
    {
        'Id': np.asarray(y_id),
        # ravel() flattens an (n, 1) prediction array to length n.
        'SalePrice': np.expm1(np.asarray(y_pred_test)).ravel(),
    },
    columns=['Id', 'SalePrice'],
)

## export

In [21]:
# Write the submission as a comma-separated file, without the DataFrame index column.
output.to_csv(path_or_buf='./submission.csv', index=False, sep=',')