In [22]:
from warnings import filterwarnings
# Silence every warning for the rest of the session. NOTE(review): this also hides
# scikit-learn's warnings about failed fits/scores during cross-validation, so NaN
# scores can pass unnoticed — consider narrowing the filter to specific categories.
filterwarnings("ignore")

import pandas as pd

from sklearn.feature_selection import SequentialFeatureSelector, RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, plot_tree


In [2]:
# Read the dataset and discard the "Unnamed: 0" column in a single chained expression
# (presumably an index column written out by an earlier to_csv call — TODO confirm).
outliers_df = pd.read_csv('outliers-data.csv').drop(columns=["Unnamed: 0"])

In [3]:
# Show column names, dtypes and non-null counts of the loaded frame.
outliers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7757 entries, 0 to 7756
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bed             7757 non-null   float64
 1   bath            7757 non-null   float64
 2   acre_lot        7757 non-null   float64
 3   zip_code        7757 non-null   float64
 4   house_size      7757 non-null   float64
 5   prev_sold_date  7757 non-null   object 
 6   price           7757 non-null   float64
dtypes: float64(6), object(1)
memory usage: 424.3+ KB


In [4]:
# Parse the ISO-formatted sale date and keep only its year as a numeric feature.
prev_sold_dates = pd.to_datetime(outliers_df["prev_sold_date"], format='%Y-%m-%d')
outliers_df["prev_sold_year"] = prev_sold_dates.dt.year

# The raw date string is no longer needed once the year has been extracted.
outliers_df = outliers_df.drop(columns=["prev_sold_date"])
outliers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7757 entries, 0 to 7756
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bed             7757 non-null   float64
 1   bath            7757 non-null   float64
 2   acre_lot        7757 non-null   float64
 3   zip_code        7757 non-null   float64
 4   house_size      7757 non-null   float64
 5   price           7757 non-null   float64
 6   prev_sold_year  7757 non-null   int64  
dtypes: float64(6), int64(1)
memory usage: 424.3 KB


In [5]:
# Shared settings: how many features each selector keeps, and the seed for the data split.
num_features_to_select, random_state_value = 5, 42

In [6]:
# Candidate predictors and the regression target.
feature_columns = ["bed", "bath", "acre_lot", "zip_code", "house_size", "prev_sold_year"]
target_column = "price"

X = outliers_df[feature_columns]
y = outliers_df[target_column]

# Hold out 30% of the rows; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state_value,
)

In [7]:
# Recursive feature elimination around a linear regression, fitted on the full X / y.
lgr_rfe = RFE(
    estimator=LinearRegression(),
    n_features_to_select=num_features_to_select,
).fit(X, y)

# get_support() is the boolean mask of retained columns.
lgr_rfe_selected_features = lgr_rfe.get_support()
lgr_rfe_feature_names = X.columns[lgr_rfe_selected_features].tolist()
print('The selected features are:', lgr_rfe_feature_names)

The selected features are: ['bed', 'bath', 'acre_lot', 'zip_code', 'prev_sold_year']


In [8]:
# Sequential feature selection (default settings) around a linear regression.
lgr_sfs = SequentialFeatureSelector(
    estimator=LinearRegression(),
    n_features_to_select=num_features_to_select,
).fit(X, y)

# Boolean mask over X's columns marking the features that were kept.
lgr_sfs_selected_features = lgr_sfs.get_support()
lgr_sfs_feature_names = X.columns[lgr_sfs_selected_features].tolist()
print('The selected features are:', lgr_sfs_feature_names)

The selected features are: ['bed', 'bath', 'zip_code', 'house_size', 'prev_sold_year']


In [10]:
# Sequential feature selection (default settings) around a k-nearest-neighbours regressor.
knn_estimator = KNeighborsRegressor()
knn_sfs = SequentialFeatureSelector(knn_estimator, n_features_to_select=num_features_to_select)
knn_sfs.fit(X, y)

# Translate the boolean support mask back into column names for display.
knn_sfs_selected_features = knn_sfs.get_support()
knn_sfs_feature_names = X.columns[knn_sfs_selected_features].tolist()
print('The selected features are:', knn_sfs_feature_names)

The selected features are: ['bed', 'bath', 'acre_lot', 'zip_code', 'prev_sold_year']


In [12]:
# Recursive feature elimination around a decision tree regressor.
# The tree is seeded with the notebook-wide random_state_value: an unseeded
# DecisionTreeRegressor breaks ties between equally good splits at random, so the
# set of selected features could otherwise change from one run to the next.
dtree_rfe = RFE(
    DecisionTreeRegressor(random_state=random_state_value),
    n_features_to_select=num_features_to_select,
)
dtree_rfe.fit(X, y)

# get_support() is the boolean mask of retained columns.
dtree_rfe_selected_features = dtree_rfe.get_support()
print('The selected features are:', list(X.columns[dtree_rfe_selected_features]))

The selected features are: ['bath', 'acre_lot', 'zip_code', 'house_size', 'prev_sold_year']


In [13]:
# Sequential feature selection (default settings) around a decision tree regressor.
# The tree is seeded with the notebook-wide random_state_value so that repeated runs
# evaluate identical trees and therefore select the same features.
dtree_sfs = SequentialFeatureSelector(
    DecisionTreeRegressor(random_state=random_state_value),
    n_features_to_select=num_features_to_select,
)
dtree_sfs.fit(X, y)

# Boolean mask over X's columns marking the features that were kept.
dtree_sfs_selected_features = dtree_sfs.get_support()
print('The selected features are:', list(X.columns[dtree_sfs_selected_features]))

The selected features are: ['bed', 'bath', 'zip_code', 'house_size', 'prev_sold_year']


In [14]:
# Choose the SVR kernel with the best mean 5-fold cross-validated R^2 on the training split.
#
# The scorer must be a regression metric. 'accuracy' is a classification metric and cannot
# score a continuous target: each fold's scoring fails and is recorded as NaN (the default
# error_score of cross_val_score), and max() over a list of NaNs just returns the first
# entry — so the "best" kernel would always be the first one listed.
cv_scores = []
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for k in kernels:
    svc = SVR(kernel=k)
    scores = cross_val_score(svc, X_train, y_train, cv=5, scoring='r2')
    cv_scores.append(scores.mean())

# NOTE(review): the RFE step that later wraps SVR needs an estimator exposing coef_, which
# SVR only provides for kernel='linear' — revisit that cell if another kernel wins here.
optimal_kernel = kernels[cv_scores.index(max(cv_scores))]

In [26]:
# Hyper-parameter grid for the SVR; the kernel is pinned to the one chosen earlier.
params = dict(
    C=[1],
    gamma=[1],
    kernel=[optimal_kernel],
)

# 5-fold grid search on the full X / y; refit=True retrains the best candidate at the end.
svc_gscv = GridSearchCV(SVR(), params, refit=True, verbose=2, cv=5)
svc_gscv.fit(X, y)
print(f"Best parameters found: {svc_gscv.best_params_}")

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=   1.8s
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=   1.6s
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=   1.5s
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=   1.2s
[CV] END ......................C=1, gamma=0.1, kernel=linear; total time=   1.4s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.4s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.4s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.3s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.2s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.4s
[CV] END .......................C=1, gamma=10, kernel=linear; total time=   1.4s
[CV] END .......................C=1, gamma=10, ke

In [16]:
# Recursive feature elimination around an SVR built from the grid search's best
# parameters, removing one feature per iteration (step=1).
best_svr_params = svc_gscv.best_params_
svc_rfe = RFE(
    SVR(**best_svr_params),
    n_features_to_select=num_features_to_select,
    step=1,
).fit(X, y)

# get_support() is the boolean mask of retained columns.
svc_rfe_selected_features = svc_rfe.get_support()
svc_rfe_feature_names = X.columns[svc_rfe_selected_features].tolist()
print('The selected features are:', svc_rfe_feature_names)

The selected features are: ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size']


In [17]:
# Sequential feature selection (default settings) around an SVR built from the
# grid search's best parameters.
tuned_svr = SVR(**svc_gscv.best_params_)
svc_sfs = SequentialFeatureSelector(tuned_svr, n_features_to_select=num_features_to_select)
svc_sfs.fit(X, y)

# Translate the boolean support mask back into column names for display.
svc_sfs_selected_features = svc_sfs.get_support()
svc_sfs_feature_names = X.columns[svc_sfs_selected_features].tolist()
print('The selected features are:', svc_sfs_feature_names)

The selected features are: ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size']


['bed', 'bath', 'acre_lot', 'zip_code', 'prev_sold_year']
['bed', 'bath', 'zip_code', 'house_size', 'prev_sold_year']
['bed', 'bath', 'acre_lot', 'zip_code', 'prev_sold_year']
['bath', 'acre_lot', 'zip_code', 'house_size', 'prev_sold_year']
['bed', 'bath', 'zip_code', 'house_size', 'prev_sold_year']
['bed', 'bath', 'acre_lot', 'zip_code', 'house_size']
['bed', 'bath', 'acre_lot', 'zip_code', 'house_size']