In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [2]:
data = pd.read_csv('train.csv', usecols = ['LotArea', 'YearBuilt','GarageCars', 'OverallCond', 'SalePrice'])

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 5 columns):
LotArea        1460 non-null int64
OverallCond    1460 non-null int64
YearBuilt      1460 non-null int64
GarageCars     1460 non-null int64
SalePrice      1460 non-null int64
dtypes: int64(5)
memory usage: 57.2 KB


In [4]:
# y - target value (value that we gonna predict)
X = data[['LotArea', 'YearBuilt','GarageCars', 'OverallCond']].values
y = data.SalePrice

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state= 42)

In [6]:
lr_r = Ridge(alpha=1.0)
lr_r.fit(X_train, y_train)
test_pred = lr_r.predict(X_test)
train_pred = lr_r.predict(X_train)
print('rmse on train', math.sqrt(mean_squared_error(y_train, train_pred)))
print('rmse on test', math.sqrt(mean_squared_error(y_test, test_pred)))

rmse on train 54340.14422715648
rmse on test 62162.40346436413


In [7]:
from sklearn.model_selection import GridSearchCV

param_grid={'alpha': [0.01, 0.1,1,10,100],
            'fit_intercept': [True, False],
            'solver':  ['auto', 'saga']}
lr_r = Ridge()
gs = GridSearchCV(lr_r, param_grid, cv=5)

gs.fit(X_train, y_train)
print("Best: %f using %s" % (gs.best_score_, gs.best_params_))

Best: 0.495083 using {'alpha': 0.01, 'fit_intercept': True, 'solver': 'auto'}


In [8]:
lr_r = Ridge(alpha=0.01)
lr_r.fit(X_train, y_train)
test_pred = lr_r.predict(X_test)
train_pred = lr_r.predict(X_train)
print('rmse on train', math.sqrt(mean_squared_error(y_train, train_pred)))
print('rmse on test', math.sqrt(mean_squared_error(y_test, test_pred)))

rmse on train 54340.10252170124
rmse on test 62160.25671371721


In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_grid_rand={'alpha': uniform(0, 100)}

lr_r = Ridge()
gs = RandomizedSearchCV(lr_r, param_grid_rand, cv=5)

gs.fit(X_train, y_train)
print("Best: %f using %s" % (gs.best_score_, gs.best_params_))

Best: 0.494701 using {'alpha': 10.829343303536564}


In [14]:
lr_r = Ridge(alpha=10.25)
lr_r.fit(X_train, y_train)
test_pred = lr_r.predict(X_test)
train_pred = lr_r.predict(X_train)
print('rmse on train', math.sqrt(mean_squared_error(y_train, train_pred)))
print('rmse on test', math.sqrt(mean_squared_error(y_test, test_pred)))

rmse on train 54344.311330247234
rmse on test 62185.3407808536
