<a href="https://colab.research.google.com/github/karri-ten/boston_housing_model/blob/main/boston_housing_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## This dataset contains detailed information about various attributes related to housing in the Boston area. The dataset includes the following columns:

crim: Per capita crime rate by town.
zn: Proportion of residential land zoned for large lots (over 25,000 sq. ft.).
indus: Proportion of non-retail business acres per town.
chas: Charles River dummy variable (1 if tract bounds river; 0 otherwise).
nox: Nitric oxides concentration (parts per 10 million).
rm: Average number of rooms per dwelling.
age: Proportion of owner-occupied units built prior to 1940.
dis: Weighted distances to five Boston employment centers.
rad: Index of accessibility to radial highways.
tax: Full-value property-tax rate per $10,000.
This dataset provides valuable insights into the housing market dynamics and socio-economic factors influencing property values in Boston.

## Importing Libariry

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error , mean_squared_error  , r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge,Lasso

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestRegressor

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('/content/BostonHousing.csv')
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [None]:
df.shape

(506, 14)

## Data Cleaning

In [None]:
df.duplicated().sum()

0

In [None]:
df.isnull().sum()

Unnamed: 0,0
crim,0
zn,0
indus,0
chas,0
nox,0
rm,0
age,0
dis,0
rad,0
tax,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [None]:
new_column_names = [
    "crime_Rate",
    "zoned_Land",
    "industrial_Acres",
    "river_Proximity",
    "nitric_Oxides",
    "rooms_per_dwelling",
    "age_of_home",
    "employment_Distance",
    "highway_Accessibility",
    "property_Tax_Rate",
    "pupil_teacher_ratio",
    "black_Residents_Rate",
    "lower_Status_Population",
    "median_Home_Value"

]

for i in range(len(df.columns)):
    df.rename(columns={df.columns[i]: new_column_names[i]}, inplace=True)

df.head()

Unnamed: 0,crime_Rate,zoned_Land,industrial_Acres,river_Proximity,nitric_Oxides,rooms_per_dwelling,age_of_home,employment_Distance,highway_Accessibility,property_Tax_Rate,pupil_teacher_ratio,black_Residents_Rate,lower_Status_Population,median_Home_Value
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Preprocessing of data

In [None]:
df1 = df.copy()

In [None]:
def data_preprocessing(df):
# Split data into independent (X) and independent (y) variables
  X_names= df.columns.drop('median_Home_Value')
  y_name = 'median_Home_Value'
  X_data = df[X_names]
  y_data = df[y_name]

  scaler = StandardScaler()
  X_data = scaler.fit_transform(X_data)

  X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=42)

  return (X_train,y_train),(X_test , y_test)

In [None]:
(X_train, y_train), (X_test, y_test) = data_preprocessing(df1)
print(X_train[:1])
print(y_train[:1])
print(X_test[:1])
print(y_test[:1])

[[ 1.32780421 -0.48772236  1.01599907 -0.27259857  0.51229565 -1.39706929
   1.02148094 -0.80543822  1.66124525  1.53092646  0.80657583 -0.07887794
   1.7181012 ]]
477    12.0
Name: median_Home_Value, dtype: float64
[[-0.40983668 -0.48772236 -1.03402724 -0.27259857 -0.38609067  0.18715116
   0.55208139 -0.54607682 -0.52300145 -0.66660821 -0.85792914  0.42570183
  -0.50645674]]
173    23.6
Name: median_Home_Value, dtype: float64


In [77]:
def train_ridge(X_train,y_train):
  ridge_model = Ridge()
  ridge_model.fit(X_train,y_train)
  print("Ridge score for x_train and y_train:",ridge_model.score(X_train,y_train))
  print("Ridge score for x_text and y_text:",ridge_model.score(X_test,y_test))
  return ridge_model

In [78]:
ridge = train_ridge(X_train,y_train)
print(ridge.intercept_)
print(ridge.coef_)

Ridge score for x_train and y_train: 0.7508700636102708
Ridge score for x_text and y_text: 0.6684401592810273
22.484854796146763
[-0.96203397  0.68255872  0.25071604  0.71033762 -1.96157279  3.12232106
 -0.17845861 -3.0100255   2.20195592 -1.71784558 -1.97082455  1.12414013
 -3.61478723]


In [None]:
def predict_ridge(model,x_test):
  predict = model.predict(x_test)
  return predict

In [None]:
predict_ridge(ridge,X_test)

array([28.9738342 , 35.97440065, 14.93696503, 25.02154769, 18.76386372,
       23.26265213, 17.68271125, 14.37965106, 22.96268127, 20.65540957,
       24.87971632, 18.67409794, -6.03944167, 21.78382044, 19.24243232,
       26.16134727, 20.55148626,  5.7938853 , 40.44087067, 17.60963864,
       27.20905252, 30.00547984, 11.39064279, 24.17052829, 17.84543022,
       15.79522884, 22.7894579 , 14.594494  , 22.43919822, 19.22385378,
       22.41253268, 25.21899404, 25.92213161, 17.68209526, 16.73809544,
       16.9955771 , 31.23585621, 20.13174238, 23.7728774 , 24.62788682,
       13.97728083, 32.17038932, 42.57078051, 17.36906875, 27.27979727,
       16.97200298, 14.10330725, 25.89878752, 20.26205127, 29.9805763 ,
       21.31035544, 34.29142081, 16.08793892, 26.25049558, 39.49279802,
       22.55549187, 18.83803147, 32.64403299, 25.07064871, 12.92646066,
       22.69684479, 30.4498829 , 31.48171694, 15.92440567, 20.33503052,
       16.71870705, 20.50657898, 25.95217865, 30.56396947, 11.60

In [None]:
def train_lasso(X_train,y_train):
  lasso_model = Lasso()
  lasso_model.fit(X_train,y_train)
  print("lasso score for x_train and y_train:",lasso_model.score(X_train,y_train))
  print("lasso score for x_text and y_text:",lasso_model.score(X_test,y_test))
  return lasso_model

In [None]:
lasso = train_lasso(X_train,y_train)
print(lasso.intercept_)
print(lasso.coef_)

lasso score for x_train and y_train: 0.6788596102594677
lasso score for x_text and y_text: 0.6242880038311525
22.49255763571744
[-0.04761506  0.         -0.          0.05114134 -0.          3.06589786
 -0.         -0.         -0.         -0.         -1.21080069  0.4443653
 -3.35395653]


In [80]:
def predict_lasso(model,x_test):
  predict = model.predict(x_test)
  return predict

In [81]:
predict_lasso(lasso,X_test)

array([25.99849954, 29.7314965 , 17.87614264, 24.64234536, 19.51569893,
       22.74049926, 18.40593082, 15.22054232, 21.34300545, 20.36291059,
       20.11286088, 21.15013987,  0.81181168, 22.42358064, 19.93484421,
       25.08815012, 18.15984293,  6.93960427, 36.45080237, 18.6562333 ,
       25.26836533, 26.60358992, 13.91170233, 23.99523757, 18.65483156,
       15.13181605, 22.43004402, 18.85305607, 19.20800028, 19.45308732,
       19.66231324, 25.23414423, 25.73115585, 18.74719349, 16.06907229,
       20.13859548, 31.02832161, 21.24373321, 20.83140321, 24.41136486,
       14.37859221, 27.59905302, 37.00169103, 19.2230386 , 25.42327764,
       17.07522505, 15.85043241, 25.58070371, 19.47025995, 29.33681634,
       23.02254183, 31.43774604, 17.78803597, 25.82985601, 35.35418813,
       22.75147962, 19.54315133, 29.17825795, 24.55126256, 16.5963188 ,
       25.55824351, 30.90035729, 28.5511857 , 17.57337129, 27.30736469,
       13.79753956, 20.35970075, 25.25481873, 28.04867154, 15.47