The Boston Housing Dataset


The Boston Housing Dataset is a derived from information collected by the U.S. Census Service concerning housing in the area of Boston MA. The following describes the dataset columns:


CRIM - per capita crime rate by town

ZN - proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS - proportion of non-retail business acres per town.

CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)

NOX - nitric oxides concentration (parts per 10 million)

RM - average number of rooms per dwelling 

AGE - proportion of owner-occupied units built prior to 1940

DIS - weighted distances to five Boston employment centres

RAD - index of accessibility to radial highways

TAX - full-value property-tax rate per $10,000

PTRATIO - pupil-teacher ratio by town

B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

LSTAT - % lower status of the population

MEDV - Median value of owner-occupied homes in $1000's


In [467]:
import pandas as pd
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
                 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df=pd.read_csv('BostonHousPrice\housing.csv',sep=r'[ ]+', names=column_names)
df.head()

  df=pd.read_csv('BostonHousPrice\housing.csv',sep=r'[ ]+', names=column_names)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [468]:
df=df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 59.3 KB


In [469]:
Y=pd.Series(df['MEDV'])
del(df['MEDV'])
Y


0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: MEDV, Length: 506, dtype: float64

In [470]:
from sklearn.model_selection import train_test_split
import numpy as np

arr_df=np.array(df)
arr_y=np.array(Y)
X_train, X_test, y_train, y_test = train_test_split(arr_df, arr_y, random_state=1)


In [471]:
import torch
from torch import autograd

In [472]:
X_train = torch.tensor(X_train, dtype=torch.double)
X_test = torch.tensor(X_test, dtype=torch.double)
y_train = torch.tensor(y_train, dtype=torch.double)
y_test = torch.tensor(y_test, dtype=torch.double)


In [473]:
print(type(X_train),
      type(X_test),
      type(y_train),
      type(y_test))


<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>


In [474]:

print(f'X_train.shape={X_train.shape}\n'
      f'X_test.shape= {X_test.shape}\n'
      f'y_train.shape={y_train.shape}\n'
      f'y_test.shape= {y_test.shape}\n')


X_train.shape=torch.Size([379, 13])
X_test.shape= torch.Size([127, 13])
y_train.shape=torch.Size([379])
y_test.shape= torch.Size([127])



In [475]:
import random

def data_iter(batch_size, features, labels):
    num_examples = len(features)
    indices = list(range(num_examples))
    #random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        j = indices[i: min(i + batch_size, num_examples)]
        yield features[j, :], labels[j]

In [476]:
batch_size = 10
for X, y in data_iter(batch_size, X_train, y_train):
    print(X,'\n', y)
    break

tensor([[4.5270e-02, 0.0000e+00, 1.1930e+01, 0.0000e+00, 5.7300e-01, 6.1200e+00,
         7.6700e+01, 2.2875e+00, 1.0000e+00, 2.7300e+02, 2.1000e+01, 3.9690e+02,
         9.0800e+00],
        [1.3914e-01, 0.0000e+00, 4.0500e+00, 0.0000e+00, 5.1000e-01, 5.5720e+00,
         8.8500e+01, 2.5961e+00, 5.0000e+00, 2.9600e+02, 1.6600e+01, 3.9690e+02,
         1.4690e+01],
        [4.1130e-02, 2.5000e+01, 4.8600e+00, 0.0000e+00, 4.2600e-01, 6.7270e+00,
         3.3500e+01, 5.4007e+00, 4.0000e+00, 2.8100e+02, 1.9000e+01, 3.9690e+02,
         5.2900e+00],
        [1.8836e-01, 0.0000e+00, 6.9100e+00, 0.0000e+00, 4.4800e-01, 5.7860e+00,
         3.3300e+01, 5.1004e+00, 3.0000e+00, 2.3300e+02, 1.7900e+01, 3.9690e+02,
         1.4150e+01],
        [4.0202e-01, 0.0000e+00, 9.9000e+00, 0.0000e+00, 5.4400e-01, 6.3820e+00,
         6.7200e+01, 3.5325e+00, 4.0000e+00, 3.0400e+02, 1.8400e+01, 3.9521e+02,
         1.0360e+01],
        [2.8750e-02, 2.8000e+01, 1.5040e+01, 0.0000e+00, 4.6400e-01, 6.2110e+00,

In [477]:
w = torch.randn((len(X_train[0])))
b = torch.zeros((1,))
w.requires_grad_()
b.requires_grad_()

tensor([0.], requires_grad=True)

In [478]:
def linreg(X, w1, b1):
  return torch.mv(X.float(),w1.float())+b1.float()

def squared_loss(y_hat, y):
  return ((y_hat-y.reshape(y_hat.shape)) ** 2).mean()

def sgd(params, lr):
  for param in params:
    param.data[:] = param - lr*param.grad

In [479]:
lr = 1e-6  
num_epochs = 1000
batch_size=3

w = torch.randn((len(X_train[0])))
b = torch.zeros((1,))
w.requires_grad_()
b.requires_grad_()

w,b

(tensor([-0.1940, -1.1125, -0.7683, -1.2179, -0.9495, -0.0887,  2.2234, -0.3470,
          1.1305,  0.4547, -0.3772, -0.1435,  0.4130], requires_grad=True),
 tensor([0.], requires_grad=True))

In [480]:
for epoch in range(num_epochs):
    
    for X, y in data_iter(batch_size, X_train, y_train):   
        
        
        l = squared_loss(linreg(X,w,b), y)
        l.backward()
        

        with torch.no_grad():
            for j in range(len(w)):
                w[j] -= lr * w.grad[j]
            b -= lr * b.grad
            
            # Manually zero the gradients after updating weights
            w.grad=None
            b.grad=None

    train_l = squared_loss(linreg(X_train, w, b), y_train)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
    


epoch 1, loss 3346.071590
epoch 2, loss 2215.242406
epoch 3, loss 1479.702873
epoch 4, loss 1001.442875
epoch 5, loss 690.520052
epoch 6, loss 488.430294
epoch 7, loss 357.118781
epoch 8, loss 271.832332
epoch 9, loss 216.469874
epoch 10, loss 180.557896
epoch 11, loss 157.283971
epoch 12, loss 142.217125
epoch 13, loss 132.475788
epoch 14, loss 126.185918
epoch 15, loss 122.129208
epoch 16, loss 119.513857
epoch 17, loss 117.825303
epoch 18, loss 116.729669
epoch 19, loss 116.010354
epoch 20, loss 115.526924
epoch 21, loss 115.188741
epoch 22, loss 114.937359
epoch 23, loss 114.735279
epoch 24, loss 114.558550
epoch 25, loss 114.392100
epoch 26, loss 114.226615
epoch 27, loss 114.056324
epoch 28, loss 113.878026
epoch 29, loss 113.689876
epoch 30, loss 113.491068
epoch 31, loss 113.281485
epoch 32, loss 113.061279
epoch 33, loss 112.830923
epoch 34, loss 112.590987
epoch 35, loss 112.341910
epoch 36, loss 112.084540
epoch 37, loss 111.819482
epoch 38, loss 111.547382
epoch 39, loss 11

In [481]:
w,b

(tensor([-0.1229,  0.1241,  0.0019, -1.1362, -0.9047,  1.0291,  0.0901, -0.1621,
          0.1455, -0.0123,  0.5093,  0.0288, -0.7253], requires_grad=True),
 tensor([0.1201], requires_grad=True))

In [482]:
squared_loss(linreg(X_test,w,b), y_test)

tensor(55.3358, dtype=torch.float64, grad_fn=<MeanBackward0>)