In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score

In [2]:
data = pd.read_csv("boston.csv")
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NX       506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [4]:
data.shape

(506, 14)

In [5]:
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

# fit the model to the data
model = IsolationForest(contamination=0.16)
model.fit(data)

# use the model to predict which rows are outliers
predictions = model.predict(data)

# select only the rows that are not outliers
data = data[predictions != -1]

data.shape



(425, 14)

In [6]:
#seperating data and labels
X = data.drop(columns=['MEDV','CHAS'])
y = data['MEDV']

In [7]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Define pipeline steps
steps = [
    ('scalar',StandardScaler()),
    ('regressor', RandomForestRegressor())
]

# Create pipeline
pipeline = Pipeline(steps)

# Fit pipeline to training data
pipeline.fit(X_train, y_train)

# Evaluate pipeline on test data
print("score: {:.3f}".format(pipeline.score(X_test, y_test)))

kfold = KFold(n_splits=5)
scores = cross_val_score(pipeline, X, y, cv=kfold)
print("CV scores:",scores)
print("Average CV score: {:.3f}".format(scores.mean()))

score: 0.811
CV scores: [0.76589098 0.76954485 0.75792019 0.73847322 0.45007696]
Average CV score: 0.696


In [19]:
importances = pd.DataFrame({'feature':X_train.columns,'importance':np.round(pipeline['regressor'].feature_importances_,3)})
importances = importances.sort_values('importance',ascending=False).set_index('feature')

importances

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
RM,0.573
LSTAT,0.275
CRIM,0.04
NX,0.018
AGE,0.018
DIS,0.017
TAX,0.017
PTRATIO,0.016
B,0.01
INDUS,0.009
