# The Boston Housing Dataset

The Boston Housing Dataset is derived from information collected by the U.S. Census Service concerning housing in the area of Boston, MA. The following describes the dataset columns:

- [CRIM] - per capita crime rate by town
- [ZN] - proportion of residential land zoned for lots over 25,000 sq.ft.
- [INDUS] - proportion of non-retail business acres per town.
- [CHAS] - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
- [NOX] - nitric oxides concentration (parts per 10 million)
- [RM] - average number of rooms per dwelling
- [AGE] - proportion of owner-occupied units built prior to 1940
- [DIS] - weighted distances to five Boston employment centres
- [RAD] - index of accessibility to radial highways
- [TAX] - full-value property-tax rate per 10,000 USD 
- [PTRATIO] - pupil-teacher ratio by town
- [B] - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- [LSTAT] - % lower status of the population
- [MEDV] - Median value of owner-occupied homes in $1000's

In [23]:
from platform import python_version 
print(python_version())
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from scipy import stats


def get_data(path="data/boston_housing.csv", test_size=0.20, random_state=42):
    """Load the housing CSV and split it into training and testing sets.

    Every column except the last is used as a feature; the last column is
    the regression target. The feature names are printed as a side effect.

    Args:
        path: Location of the CSV file to read.
        test_size: Fraction of the rows held out for the test split.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)``. Note the order:
        it differs from the ``(X_train, X_test, y_train, y_test)`` order
        that ``train_test_split`` itself returns.

    Raises:
        ValueError: If the file has fewer than two usable columns, so no
            feature/target separation is possible.
    """
    df = pd.read_csv(path)
    # Discard columns whose header starts with "Unnamed" (presumably an
    # index column written out when the CSV was saved -- confirm with data).
    df = df.loc[:, ~df.columns.str.contains(r'^Unnamed')]

    cols = list(df.columns)
    if len(cols) < 2:
        raise ValueError(
            "expected at least one feature column and one target column, "
            "got columns: {}".format(cols)
        )

    features = cols[:-1]
    print(features)
    label = cols[-1]

    X = df[features].copy().to_numpy()
    y = df[label].to_list()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, y_train, X_test, y_test


if __name__ == "__main__":
    # Load the train/test split and fit a linear regression on the
    # training portion.
    X_train, y_train, X_test, y_test = get_data()
    lin_model = linear_model.LinearRegression()
    lin_model.fit(X_train, y_train)

    # Score the model on the same data it was fitted on.
    y_train_predict = lin_model.predict(X_train)
    rmse = np.sqrt(mean_squared_error(y_train, y_train_predict))
    r2 = r2_score(y_train, y_train_predict)
    print(
        "The model performance for training set",
        "--------------------------------------",
        'RMSE is {}'.format(rmse),
        'R2 score is {}'.format(r2),
        "\n",  # together with the trailing newline this leaves two blank lines
        sep="\n",
    )

    # Score the model on the held-out data.
    y_test_predict = lin_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_predict))
    r2 = r2_score(y_test, y_test_predict)
    print(
        "The model performance for testing set",
        "--------------------------------------",
        'RMSE is {}'.format(rmse),
        'R2 score is {}'.format(r2),
        sep="\n",
    )

3.9.16
['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
The model performance for training set
--------------------------------------
RMSE is 4.6520331848801675
R2 score is 0.7508856358979672


The model performance for testing set
--------------------------------------
RMSE is 4.9286021826653625
R2 score is 0.6687594935356285
