# Housing price prediction
## About this file:
Boston Housing dataset contains 509 rows and 14 columns, and includes data on various features of residential homes in the 
Boston area for regression analysis and modeling.

## Attribute Information:
1) CRIM: per capita crime rate by town.
2) ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
3) INDUS: proportion of non-retail business acres per town.
4) CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
5) NOX: nitric oxides concentration (parts per 10 million).
6) RM: average number of rooms per dwelling.
7) AGE: proportion of owner-occupied units built prior to 1940.
8) DIS: weighted distances to five Boston employment centres.
9) RAD: index of accessibility to radial highways.
10) TAX: full-value property-tax rate per $10,000.

11)  PTRATIO: pupil-teacher ratio by town.
12) B: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
13) LSTAT: % lower status of the population.
14) MEDV: Median value of owner-occupied homes in $1000's.


In [42]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [43]:
# Read the dataset
df=pd.read_csv("Housing.csv")
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1.0,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2.0,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2.0,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3.0,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3.0,222,18.7,396.9,5.33,36.2


In [44]:
# summary of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509 entries, 0 to 508
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     509 non-null    float64
 1   ZN       509 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     509 non-null    int64  
 4   NOX      507 non-null    float64
 5   RM       509 non-null    float64
 6   AGE      508 non-null    float64
 7   DIS      509 non-null    float64
 8   RAD      508 non-null    float64
 9   TAX      509 non-null    int64  
 10  PTRATIO  509 non-null    float64
 11  B        509 non-null    float64
 12  LSTAT    508 non-null    float64
 13  MEDV     509 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.8 KB


In [45]:
## Descriptive summary of dataset
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,509.0,509.0,506.0,509.0,507.0,509.0,508.0,509.0,508.0,509.0,509.0,509.0,508.0,509.0
mean,3.707516,11.29666,11.198281,0.068762,0.555216,6.279845,68.579134,3.787705,9.610236,409.21611,18.463851,356.664892,12.705276,22.501572
std,8.732089,23.269781,6.856713,0.253298,0.115633,0.703449,28.114744,2.101852,8.735069,168.814161,2.161553,91.562469,7.131979,9.183497
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.08221,0.0,5.19,0.0,0.449,5.88,45.075,2.1007,4.0,279.0,17.4,375.33,7.0925,17.0
50%,0.26169,0.0,9.69,0.0,0.538,6.202,77.15,3.1827,5.0,330.0,19.1,391.45,11.43,21.2
75%,3.69311,12.5,18.1,0.0,0.624,6.619,94.1,5.118,24.0,666.0,20.2,396.24,16.9925,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [46]:
# shape of dataset
df.shape

(509, 14)

In [47]:
# List down all the columns
df.columns

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [48]:
# check missing value
df.isnull().sum()

CRIM       0
ZN         0
INDUS      3
CHAS       0
NOX        2
RM         0
AGE        1
DIS        0
RAD        1
TAX        0
PTRATIO    0
B          0
LSTAT      1
MEDV       0
dtype: int64

## Observation
Data has missing values

In [49]:
# Handelling missing value
df.fillna(0)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1.0,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2.0,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2.0,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3.0,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3.0,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
504,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1.0,273,21.0,391.99,9.67,22.4
505,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1.0,273,21.0,396.90,9.08,20.6
506,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1.0,273,21.0,396.90,5.64,23.9
507,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1.0,273,21.0,393.45,6.48,22.0


In [50]:
# find unique value
df['TAX'].unique()

array([296, 242, 222, 311, 307, 279, 252, 233, 243, 469, 226, 313, 256,
       284, 216, 337, 345, 305, 398, 281, 247, 270, 276, 384, 432, 188,
       437, 403, 193, 265, 255, 329, 402, 348, 224, 277, 300, 330, 315,
       244, 264, 223, 254, 198, 285, 241, 293, 245, 289, 358, 304, 287,
       430, 422, 370, 352, 351, 280, 335, 411, 187, 334, 666, 711, 391,
       273])

In [51]:
# Conclusion ...imbalanced dataset
df['TAX'].value_counts()

666    134
307     40
403     30
437     15
304     14
      ... 
285      1
198      1
256      1
244      1
313      1
Name: TAX, Length: 66, dtype: int64

## Observation
imbalanced dataset

In [52]:
df.shape

(509, 14)

In [53]:
# Find duplicate
df[df.duplicated()]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
428,37.6619,0.0,18.1,0,0.679,6.202,78.7,1.8629,24.0,666,20.2,18.82,14.52,10.9
470,13.0751,0.0,18.1,0,0.58,5.713,56.7,2.8237,24.0,666,20.2,396.9,14.76,20.1
479,15.0234,0.0,18.1,0,0.614,5.304,97.3,2.1007,24.0,666,20.2,349.48,24.91,12.0
499,0.2896,0.0,9.69,0,0.585,5.39,72.9,2.7986,6.0,391,19.2,396.9,21.14,19.7


In [54]:
# Removing duplicate records
df.drop_duplicates(inplace=True)

In [55]:
df.shape

(505, 14)

In [57]:
df.duplicated().sum()

0

In [58]:
# Check no. of unique value in each column
df.nunique()


CRIM       503
ZN          26
INDUS       76
CHAS         2
NOX         81
RM         446
AGE        354
DIS        411
RAD          9
TAX         66
PTRATIO     46
B          356
LSTAT      453
MEDV       228
dtype: int64

In [59]:
# Check the statistictes od dataset
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,505.0,505.0,502.0,505.0,503.0,505.0,504.0,505.0,504.0,505.0,505.0,505.0,504.0,505.0
mean,3.606091,11.386139,11.16004,0.069307,0.554745,6.284816,68.517063,3.798725,9.531746,407.726733,18.452079,357.188772,12.656647,22.555644
std,8.608447,23.34008,6.862845,0.254227,0.115918,0.703302,28.187932,2.106167,8.697191,168.312294,2.165696,90.64742,7.128445,9.191851
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.08199,0.0,5.19,0.0,0.449,5.885,44.85,2.1007,4.0,279.0,17.4,375.52,6.99,17.1
50%,0.25387,0.0,9.69,0.0,0.538,6.209,77.15,3.2157,5.0,330.0,19.0,391.45,11.36,21.2
75%,3.67367,12.5,18.1,0.0,0.624,6.625,94.1,5.2119,24.0,666.0,20.2,396.23,16.945,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0
