In [7]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import scale 
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston is not available in recent scikit-learn releases —
# confirm the pinned version before re-running this notebook.
boston = load_boston()
# Pass the feature matrix through scale() so the fitted coefficients can be
# compared with one another; the target is left in its original units.
X = scale(boston.data)
y = boston.target

In [8]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the scaled features; fit() returns the estimator
# itself, so construction and fitting can be chained.
regression = LinearRegression().fit(X, y)
# Bare trailing expression so the cell still displays the fitted estimator.
regression

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [9]:
# In-sample R2: the model is scored on the same X, y it was fitted on.
print('R2 %0.3f' % (regression.score(X, y),))

R2 0.741


In [10]:
# Pair every feature name with its fitted coefficient, rounded to one decimal,
# and print the resulting 'NAME:value' labels as a list.
print([
    name + ':' + str(round(weight, 1))
    for name, weight in zip(boston.feature_names, regression.coef_)
])

['CRIM:-0.9', 'ZN:1.1', 'INDUS:0.1', 'CHAS:0.7', 'NOX:-2.1', 'RM:2.7', 'AGE:0.0', 'DIS:-3.1', 'RAD:2.7', 'TAX:-2.1', 'PTRATIO:-2.1', 'B:0.8', 'LSTAT:-3.7']


**Conclusion**



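The concentration of lower-income residents (LSTAT) has the largest (negative) effect on housing prices. Because the features were standardized before fitting, the coefficient magnitudes can be compared directly.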

**Polynomial Expansion**
Given a certain degree, polynomial expansion creates the powers of each feature up to that degree, together with the products (interaction terms) of the features up to the same total degree. As the degree grows, so does the number of derived terms. The expansion captures nonlinear relationships in the data that require a curve, not a line, to predict correctly. By expanding the number of features, you reduce the bias of the predictions at the cost of a higher risk of overfitting.




In [11]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge

# Expand the scaled features into their degree-2 polynomial terms.
pf = PolynomialFeatures(degree=2)
poly_x = pf.fit_transform(X)

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): X was scaled on the full dataset before this split, so the
# held-out rows contributed to the scaling statistics.
X_train, X_test, y_train, y_test = train_test_split(
    poly_x, y, test_size=.33, random_state=42
)

# Ridge regression on the expanded training features.
# NOTE(review): `normalize=` is not accepted by recent scikit-learn releases —
# confirm the pinned version before re-running this notebook.
reg_regression = Ridge(alpha=0.1, normalize=True).fit(X_train, y_train)

# Report R2 on the held-out rows only.
print('R2: %0.3f' % r2_score(y_test, reg_regression.predict(X_test)))


R2: 0.820
