In [93]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [94]:
data = pd.read_csv("boston.csv")
data.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311.0,15.2,386.71,17.1,18.9


CRIM - per capita crime rate by town

ZN - proportion of residential land zoned for lots over 25,000 sq.ft.

INDUS - proportion of non-retail business acres per town.

CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)

NOX - nitric oxides concentration (parts per 10 million)

RM - average number of rooms per dwelling

AGE - proportion of owner-occupied units built prior to 1940

DIS - weighted distances to five Boston employment centres

RAD - index of accessibility to radial highways

TAX - full-value property-tax rate per $10,000

PTRATIO - pupil-teacher ratio by town

B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town

LSTAT - % lower status of the population

MEDV - Median value of owner-occupied homes in $1000's

In [95]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NX       506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [96]:
data.shape

(506, 14)

In [97]:
data_copy =data.copy()

data_copy = data_copy.replace(0,np.NaN)

data_copy.isnull().sum()

CRIM         0
ZN         372
INDUS        0
CHAS       471
NX           0
RM           0
AGE          0
DIS          0
RAD          0
TAX          0
PTRATIO      0
B            0
LSTAT        0
MEDV         0
dtype: int64

In [98]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# fit the model to the data
model = IsolationForest()
model.fit(data)

# use the model to predict which rows are outliers
predictions = model.predict(data)

# select only the rows that are not outliers
data = data[predictions != -1]

data.shape

(421, 14)

In [99]:
#seperating data and labels
X = data.drop(columns=['MEDV','CHAS'])
y = data['MEDV']

In [100]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [101]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Define pipeline steps
steps = [
    ('scalar',StandardScaler()),
    ('regressor', RandomForestRegressor())
]

# Create pipeline
pipeline = Pipeline(steps)

# Fit pipeline to training data
pipeline.fit(X_train, y_train)

# Evaluate pipeline on test data
print("score: {:.3f}".format(pipeline.score(X_test, y_test)))

kfold = KFold(n_splits=5)
scores = cross_val_score(pipeline, X, y, cv=kfold)
print("CV scores:",scores)
print("Average CV score: {:.3f}".format(scores.mean()))

score: 0.822
CV scores: [0.71642737 0.76990938 0.78579564 0.72407695 0.42511036]
Average CV score: 0.684


In [102]:
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(pipeline['regressor'].feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')

importances

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
RM,0.454
LSTAT,0.363
CRIM,0.051
NX,0.028
PTRATIO,0.021
AGE,0.017
DIS,0.017
TAX,0.016
INDUS,0.014
B,0.012
