<a href="https://colab.research.google.com/github/ksimhadr/learn/blob/master/01_Project_NNvsLinearReg_for_BostonHousing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


###Project: NN for Boston Housing Dataset
###Name: Karthik Simhadri


In [1]:
#Download Boston housing data
import urllib.request
url = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
urllib.request.urlretrieve(url, 'BostonHousing.csv')

('BostonHousing.csv', <http.client.HTTPMessage at 0x79785e9c9a50>)

In [7]:
#scikit learn
#used for ML, Statistical Modelling, Classification, Regression, Clustering, Dim-red

#Sequential NN/ Feed Forward NN (Data flows from i-->o, no loops/cycles):
# A Sequential model is appropriate for a plain stack of layers
# where each layer has exactly one input tensor and one output tensor.
# Tensor: Tensors are multidimensional matrices. Eg: Image in RGB

#Dense Layer:
#dense layer is a simple Layer of neurons in which each neuron receives
#input from all the neurons of the previous layer

#import necessary libraries
from sklearn import datasets
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import StandardScaler #unit normalization: (x-u)/sigma, scaling data
from sklearn.model_selection import train_test_split

#metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


import numpy as np #Computational power of C & FORTRAN (scientific computing)
import pandas as pd #Data analysis and manipulation

BosData = pd.read_csv('BostonHousing.csv') #Reads csv file
BosData.head()
print(BosData.shape)

X = BosData.iloc[:,0:11]
y = BosData.iloc[:, 13] #MEDV: Median value of owner-occupied homes in $1000s #Response: Median home price, 1 bias
#print(X.shape)
#print(y.shape)

ss = StandardScaler()
#Standardize features by removing the mean and scaling to unit variance.
#Data normalized, lies between -1 to 1
X = ss.fit_transform(X)
#print(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

model = Sequential()
model.add(Dense(11, input_dim = 11, activation = "relu"))
model.add(Dense(1))
#Adam opt: Stochastic/random gradient desc method, based on 1st & 2nd order moments
model.compile(loss = "mean_squared_error", optimizer = "adam", metrics = ["accuracy"])

#Batch size: # of training samples
#Epochs: # of times that the entire dataset is passed through the Feed Fwd NN
history = model.fit(X_train, y_train, epochs = 150, batch_size = 10)
y_pred = model.predict(X_test)
y_pred = y_pred[:,0]

rmse = (np.sqrt(mean_squared_error(y_test, y_pred)))
r2 = r2_score(y_test, y_pred) #r2 = 1, ytest = ypred (best case); r2 = 0, ypred = mean (worst case)

print("Test RMSE = ", rmse)
print("Test R2 Score = ", r2)


(506, 14)
Epoch 1/150


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0000e+00 - loss: 555.3577
Epoch 2/150
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0000e+00 - loss: 577.0214
Epoch 3/150
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 538.2821
Epoch 4/150
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 495.9488
Epoch 5/150
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 484.0898
Epoch 6/150
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0000e+00 - loss: 434.8019
Epoch 7/150
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0000e+00 - loss: 386.5083
Epoch 8/150
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0000e+00 - loss: 392.8058
Epoch 9/150




[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 47ms/step



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Test RMSE =  3.5323762354808617
Test R2 Score =  0.8508519086781274


#Using Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt


BosData = pd.read_csv('BostonHousing.csv') #Reads csv file
BosData.head()
print(BosData.shape)

X = BosData.iloc[:,0:11]
y = BosData.iloc[:, 13] #MEDV: Median value of owner-occupied homes in $1000s #Response: Median home price, 1 bias
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5) #test: 20%, 80% training,
#fix set of random rows choosen for test and train in a particular run
reg = LinearRegression()
reg.fit(X_train, y_train)

y_train_predict = reg.predict(X_train) #use model learnt predict response for training set
rmse = np.sqrt(mean_squared_error(y_train, y_train_predict)) #MSE between training response & prediction from the model
r2 = r2_score(y_train, y_train_predict)

print("--------------------------------------")
print("The model performance for training set")
print("--------------------------------------")
print('Train RMSE is {}'.format(rmse))
print('Train R2 score is {}'.format(r2))
print("--------------------------------------\n")

y_test_predict = reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_test_predict))
r2 = r2_score(y_test, y_test_predict)

print("--------------------------------------")
print("The model performance for testing set")
print("--------------------------------------")
print('Test RMSE is {}'.format(rmse))
print('Test R2 score is {}'.format(r2))
print("--------------------------------------")

(506, 14)
(506, 11)
(506,)
--------------------------------------
The model performance for training set
--------------------------------------
Train RMSE is 5.511467677842388
Train R2 score is 0.646383286658382
--------------------------------------

--------------------------------------
The model performance for testing set
--------------------------------------
Test RMSE is 4.287105260205545
Test R2 score is 0.7652527354155104
--------------------------------------
