## Import Statements

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

## Loading Data

In [2]:
#Read the housing CSV into a DataFrame and preview its leading rows
housing_data = pd.read_csv('../Datasets/housing.csv')
print('Data Loaded Successfully')
display(housing_data.head())

#Report the dimensions: rows are samples, columns are features.
#Note the column count covers every column in the file, including
#'cmedv', which a later cell splits off as the target.
num_samples, num_features = housing_data.shape
print('Number of Samples:', num_samples)
print('Number of Features:', num_features)
print('Data Shape:', housing_data.shape) #(Samples, Features)

Data Loaded Successfully


Unnamed: 0,town,tract,longitude,latitude,crime,residential,industrial,river,nox,rooms,older,distance,highway,tax,ptratio,lstat,cmedv
0,Nahant,2011,-70.955002,42.255001,0.00632,18.0,2.31,no,0.538,6.575,65.199997,4.09,1,296,15.3,4.98,24.0
1,Swampscott,2021,-70.949997,42.287498,0.02731,0.0,7.07,no,0.469,6.421,78.900002,4.9671,2,242,17.799999,9.14,21.6
2,Swampscott,2022,-70.935997,42.283001,0.02729,0.0,7.07,no,0.469,7.185,61.099998,4.9671,2,242,17.799999,4.03,34.700001
3,Marblehead,2031,-70.928001,42.292999,0.03237,0.0,2.18,no,0.458,6.998,45.799999,6.0622,3,222,18.700001,2.94,33.400002
4,Marblehead,2032,-70.921997,42.298,0.06905,0.0,2.18,no,0.458,7.147,54.200001,6.0622,3,222,18.700001,5.33,36.200001


Number of Samples: 506
Number of Features: 17
Data Shape: (506, 17)


## Preprocessing Data

In [3]:
#Drop the town and river columns (both hold text values in the preview above).
#errors='ignore' keeps this cell idempotent: because it rebinds housing_data,
#a second run would otherwise raise KeyError for the already-removed columns.
housing_data = housing_data.drop(columns=['town', 'river'], errors='ignore')

## Examining Data

In [4]:
#List the column labels that remain in the dataset
print('Keys in the dataset:')
print(housing_data.keys(), end='\n\n')

#Build the pairwise correlation table and colour it as a heat map.
#vmin/vmax pin the colour scale to the full [-1, 1] range, and axis=None
#applies a single gradient across the whole table.
#corr_mat ends up holding the styled object, not the raw correlation frame.
corr_mat = housing_data.corr().style.background_gradient(
    cmap='coolwarm', vmin=-1, vmax=1, axis=None
)
display(corr_mat)


Keys in the dataset:
Index(['tract', 'longitude', 'latitude', 'crime', 'residential', 'industrial',
       'nox', 'rooms', 'older', 'distance', 'highway', 'tax', 'ptratio',
       'lstat', 'cmedv'],
      dtype='object')



Unnamed: 0,tract,longitude,latitude,crime,residential,industrial,nox,rooms,older,distance,highway,tax,ptratio,lstat,cmedv
tract,1.0,-0.22089,-0.225543,-0.547165,0.367292,-0.575705,-0.569807,0.305207,-0.487465,0.496841,-0.828829,-0.793602,-0.532678,-0.522485,0.428252
longitude,-0.22089,1.0,0.143055,0.065101,-0.21808,0.062702,0.160869,-0.257111,0.204736,-0.011241,0.034065,0.050661,0.312602,0.195629,-0.322947
latitude,-0.225543,0.143055,1.0,-0.084294,-0.129668,-0.041093,-0.0686,-0.069317,0.079035,-0.082981,-0.207013,-0.167718,-0.004527,0.04566,0.006825
crime,-0.547165,0.065101,-0.084294,1.0,-0.200469,0.406583,0.420972,-0.219247,0.352734,-0.37967,0.625505,0.582764,0.289946,0.455621,-0.389582
residential,0.367292,-0.21808,-0.129668,-0.200469,1.0,-0.533828,-0.516604,0.311991,-0.569537,0.664408,-0.311948,-0.314563,-0.391679,-0.412995,0.360386
industrial,-0.575705,0.062702,-0.041093,0.406583,-0.533828,1.0,0.763651,-0.391676,0.644779,-0.708027,0.595129,0.72076,0.383248,0.6038,-0.484754
nox,-0.569807,0.160869,-0.0686,0.420972,-0.516604,0.763651,1.0,-0.302188,0.73147,-0.76923,0.611441,0.668023,0.188933,0.590879,-0.4293
rooms,0.305207,-0.257111,-0.069317,-0.219247,0.311991,-0.391676,-0.302188,1.0,-0.240265,0.205246,-0.209847,-0.292048,-0.355502,-0.613808,0.696304
older,-0.487465,0.204736,0.079035,0.352734,-0.569537,0.644779,0.73147,-0.240265,1.0,-0.747881,0.456022,0.506456,0.261515,0.602339,-0.377999
distance,0.496841,-0.011241,-0.082981,-0.37967,0.664408,-0.708027,-0.76923,0.205246,-0.747881,1.0,-0.494588,-0.534432,-0.232471,-0.496996,0.249315


## Setting Target and Features

In [5]:
#Split the target column off from the feature frame.
#The split is guarded so the cell can be re-run safely: it rebinds
#housing_data without 'cmedv', so an unguarded second run would raise
#KeyError on the column lookup.
if 'cmedv' in housing_data.columns:
    #Set the target variable
    target_data = housing_data['cmedv']
    print('Target variable set')

    #Drop the target variable from the data
    housing_data = housing_data.drop(columns=['cmedv'])
    print('Target variable dropped from data')
elif 'target_data' in globals():
    #The column is already gone and the target was captured by an earlier run
    print('Target variable already separated; nothing to do')
else:
    #No column and no previously extracted target: fail loudly here rather
    #than with a NameError in a later cell
    raise KeyError("'cmedv' column not found and target_data has not been set")

Target variable set
Target variable dropped from data
