In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold, RFE, SelectFromModel
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
mel_data = pd.read_csv('/content/melbourne_housing_raw.csv')
mel_data.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,2,h,,SS,Jellis,3/9/16,2.5,3067.0,2.0,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1480000.0,S,Biggin,3/12/16,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,2,h,1035000.0,S,Biggin,4/2/16,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,3,u,,VB,Rounds,4/2/16,2.5,3067.0,3.0,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,3,h,1465000.0,SP,Biggin,4/3/17,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [14]:
missing_p=mel_data.isnull().mean()*100
clmns_to_remove=[col for col in missing_p.index if missing_p[col]> 20 and col!='Price']
redu_data=mel_data.drop(columns=clmns_to_remove)
redu_data=redu_data.dropna(subset=['Price'])

In [15]:
M=redu_data.drop(columns=['Price','Date','Suburb','Type','Method','SellerG','CouncilArea','Regionname'])
n=redu_data['Price']
M.fillna(M.mean(),inplace=True)
M_train,M_test,n_train,n_test=train_test_split(M,n,test_size=0.2,random_state=42)
corr_mat=M_train.corr().abs()
h_corr_pairs=np.where(corr_mat>0.85)
h_corr_features=set([M_train.columns[i] for i in h_corr_pairs[0] if i!=h_corr_pairs[1][i]])
M_train_corr_filter=M_train.drop(columns=h_corr_features)
M_test_corr_filter=M_test.drop(columns=h_corr_features)
var_filter=VarianceThreshold(threshold=0.01)
M_train_l_var=var_filter.fit_transform(M_train_corr_filter)
M_test_l_var=var_filter.transform(M_test_corr_filter)

In [18]:
linear_model=LinearRegression()
RFE_selector=RFE(estimator=linear_model,n_features_to_select=5,step=1)
RFE_selector.fit(M_train_l_var,n_train)
M_train_frd=RFE_selector.transform(M_train_l_var)
M_test_frd=RFE_selector.transform(M_test_l_var)
RF_model=RandomForestRegressor(random_state=42)
RFE_brd=RFE(estimator=RF_model,n_features_to_select=5,step=1)
RFE_brd.fit(M_train_l_var,n_train)
M_train_brd=RFE_brd.transform(M_train_l_var)
M_test_brd=RFE_brd.transform(M_test_l_var)
RF_model=RandomForestRegressor(random_state=42)
RF_model.fit(M_train,n_train)
important_features=SelectFromModel(RF_model,threshold="mean",prefit=True)
M_train_rf_select=important_features.transform(M_train)
M_test_rf_select=important_features.transform(M_test)



In [21]:
def evaluate_model(M_train,M_test,n_train,n_test):
    mdl=RandomForestRegressor(random_state=42)
    mdl.fit(M_train,n_train)
    preds=mdl.predict(M_test)
    MSE=mean_squared_error(n_test,preds)
    return MSE
output = {
    "With No Feature Selection,the Accuracy value is": evaluate_model(M_train,M_test,n_train,n_test),
    "After High Correlation Filter,the Accuracy value is:": evaluate_model(M_train_corr_filter,M_test_corr_filter,n_train,n_test),
    "After Low Variance Filter,the Accuracy value is:": evaluate_model(M_train_l_var,M_test_l_var,n_train,n_test),
    "After Forward Selection,the Accuracy value is:": evaluate_model(M_train_frd,M_test_frd,n_train,n_test),
    "After Backward Elimination,the Accuracy value is:": evaluate_model(M_train_brd,M_test_brd,n_train,n_test),
    "After Random Forest Selection,the Accuracy value is:": evaluate_model(M_train_rf_select,M_test_rf_select,n_train,n_test)
}
for method,result in output.items():
    print(f"{method}:{result}")

With No Feature Selection,the Accuracy value is:143875373039.6255
After High Correlation Filter,the Accuracy value is::143875373039.6255
After Low Variance Filter,the Accuracy value is::143875373039.6255
After Forward Selection,the Accuracy value is::143875373039.6255
After Backward Elimination,the Accuracy value is::143875373039.6255
After Random Forest Selection,the Accuracy value is::147811757361.13766
