In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

In [43]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the scikit-learn version this notebook is pinned to.
from sklearn.datasets import load_boston
boston_dataset = load_boston()

In [44]:
# Display the description text attached to the dataset (rendered as a raw string repr).
boston_dataset.DESCR   

".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000

In [45]:
boston_dataset.feature_names  # 'MEDV', which DESCR names as the usual target, is not among the feature names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [46]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the dataset's feature names.
boston = pd.DataFrame(
    data=boston_dataset.data,
    columns=boston_dataset.feature_names,
)


In [47]:

boston['MEDV'] = boston_dataset.target  # append the dataset's target values as the 'MEDV' column

In [48]:
from sklearn.model_selection import train_test_split

# Predictors: LSTAT and RM, selected according to the correlation matrix
# computed in the companion linear-regression program.
# Direct column selection replaces the round trip through np.c_; .copy() keeps
# X independent of the `boston` frame.
X = boston[['LSTAT', 'RM']].copy()
# Response: the MEDV column.
Y = boston['MEDV']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

# Sanity check: print the shape of each split.
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(404, 2)
(102, 2)
(404,)
(102,)


In [49]:
from sklearn.preprocessing import PolynomialFeatures

def create_polynomial_regression_model(degree):
    """Fit a polynomial regression of the given degree and print train/test metrics.

    Expands the notebook-level ``X_train`` and ``X_test`` with
    ``PolynomialFeatures(degree=degree)``, fits a ``LinearRegression`` on the
    expanded training features against ``Y_train``, and prints the RMSE and
    R2 score for both the training split and the test split.

    Parameters
    ----------
    degree : int
        Degree passed to ``PolynomialFeatures``.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Notes
    -----
    Reads ``X_train``, ``X_test``, ``Y_train`` and ``Y_test`` from the
    enclosing (notebook) namespace; they must be defined before calling.
    """
    p_features = PolynomialFeatures(degree=degree)

    # Fit the feature expansion on the training split only.
    X_train_p = p_features.fit_transform(X_train)
    # Reuse the already-fitted transformer for the test split: calling
    # fit_transform here would refit the preprocessing step on held-out data.
    X_test_p = p_features.transform(X_test)

    # Ordinary least squares on the expanded training features.
    poly_model = LinearRegression()
    poly_model.fit(X_train_p, Y_train)

    # Predictions for both splits.
    y_train_predicted = poly_model.predict(X_train_p)
    y_test_predict = poly_model.predict(X_test_p)

    # RMSE is the square root of the mean squared error.
    rmse_train = np.sqrt(mean_squared_error(Y_train, y_train_predicted))
    r2_train = r2_score(Y_train, y_train_predicted)

    rmse_test = np.sqrt(mean_squared_error(Y_test, y_test_predict))
    r2_test = r2_score(Y_test, y_test_predict)

    print("The model performance for the train set")
    print("RMSE of train set is {}".format(rmse_train))
    print("R2 score of train set is {}".format(r2_train))

    print("\n")

    print("The model performance for the test set:")
    print("RMSE of test set is {}".format(rmse_test))
    print("R2 score of test set is {}".format(r2_test))

In [50]:
# Run the polynomial regression with degree 2 and print its train/test metrics.
create_polynomial_regression_model(2)

The model performance for the train set
RMSE of train set is 4.703071027847754
R2 score of train set is 0.7425094297364767


The model performance for the test set:
RMSE of test set is 3.784819884545034
R2 score of test set is 0.8170372495892184
