In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

import pandas as pd

from sklearn.feature_selection import SequentialFeatureSelector, RFE
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, f1_score

In [2]:
# Load the housing data and discard the "Unnamed: 0" column
# (presumably an index written out by an earlier export -- TODO confirm).
outliers_df = pd.read_csv('outliers-data.csv').drop(columns=["Unnamed: 0"])

In [3]:
# Show column names, dtypes and non-null counts of the loaded frame.
outliers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7757 entries, 0 to 7756
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bed             7757 non-null   float64
 1   bath            7757 non-null   float64
 2   acre_lot        7757 non-null   float64
 3   zip_code        7757 non-null   float64
 4   house_size      7757 non-null   float64
 5   prev_sold_date  7757 non-null   object 
 6   price           7757 non-null   float64
dtypes: float64(6), object(1)
memory usage: 424.3+ KB


In [4]:
# Replace the textual sale date with its calendar year so every column is numeric.
sold_dates = pd.to_datetime(outliers_df["prev_sold_date"], format='%Y-%m-%d')
outliers_df = (
    outliers_df
    .assign(prev_sold_year=sold_dates.dt.year)
    .drop(columns=["prev_sold_date"])
)
outliers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7757 entries, 0 to 7756
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bed             7757 non-null   float64
 1   bath            7757 non-null   float64
 2   acre_lot        7757 non-null   float64
 3   zip_code        7757 non-null   float64
 4   house_size      7757 non-null   float64
 5   price           7757 non-null   float64
 6   prev_sold_year  7757 non-null   int64  
dtypes: float64(6), int64(1)
memory usage: 424.3 KB


In [5]:
# Number of features the RFE and sequential feature selectors are asked to keep.
num_features_to_select = 5
# Seed handed to train_test_split so every split in the notebook is repeatable.
random_state_value = 42

In [6]:
# Predictor columns and the regression target (price).
feature_columns = ["bed", "bath", "acre_lot", "zip_code", "house_size", "prev_sold_year"]
X = outliers_df[feature_columns]
y = outliers_df["price"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state_value,
)

In [7]:
# Choose the SVR kernel with the best mean 5-fold cross-validated score on the
# training split.
#
# The scorer must be a regression metric: 'accuracy' is only defined for
# classification targets, so with a continuous `price` every fold yields no usable
# score (NaN or an error, depending on the scikit-learn version) and the first
# kernel in the list wins regardless of fit quality. Negative MSE follows the
# "greater is better" convention, so taking max() below remains correct.
cv_scores = []
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
for k in kernels:
    svc = SVR(kernel=k)
    scores = cross_val_score(svc, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_scores.append(scores.mean())

# Kernel whose mean negative MSE is highest (i.e. whose MSE is lowest).
optimal_kernel = kernels[cv_scores.index(max(cv_scores))]

In [8]:
# Hyperparameter grid for the kernel selected above.
# NOTE(review): each list holds a single value, so the search evaluates exactly one
# candidate; widen the lists if an actual search over C / gamma is intended.
params = {
    'C': [1],
    'gamma': [1],
    'kernel': [optimal_kernel]
}

# Search on the training split only. Fitting on the full X, y lets the held-out
# rows influence the chosen parameters, which biases the test metrics reported
# by the evaluation cells.
svc_gscv = GridSearchCV(SVR(), params, refit=True, verbose=2, cv=5).fit(X_train, y_train)
print(f"Best parameters found: {svc_gscv.best_params_}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.9s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.4s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.3s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.5s
[CV] END ........................C=1, gamma=1, kernel=linear; total time=   1.8s
Best parameters found: {'C': 1, 'gamma': 1, 'kernel': 'linear'}


In [17]:
# Baseline: SVR with the tuned parameters on every predictor column,
# scored by mean squared error on the held-out split.
svc = SVR(**svc_gscv.best_params_)
svc_model = svc.fit(X=X_train, y=y_train)
svc_predict = svc_model.predict(X_test)
svc_mse = mean_squared_error(y_true=y_test, y_pred=svc_predict)
formatted_mse = "{:,.6f}".format(svc_mse)
print(formatted_mse)

206,280,132,144.347687


In [9]:
# Recursive feature elimination: drop one feature per iteration (step=1) until
# `num_features_to_select` remain.
# NOTE: RFE ranks features by the estimator's `coef_` / `feature_importances_`;
# SVR only exposes `coef_` when kernel='linear'.
svc_rfe = RFE(SVR(**svc_gscv.best_params_), n_features_to_select=num_features_to_select, step=1)
# Fit on the training split only so the held-out rows cannot influence which
# features are kept (fitting on the full X, y leaks test information).
svc_rfe.fit(X_train, y_train)
# Boolean mask over X's columns: True for each retained feature.
svc_rfe_selected_features = svc_rfe.get_support()
print('The selected features are:', list(X.columns[svc_rfe_selected_features]))

The selected features are: ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size']


In [10]:
# Restrict the data to the RFE-selected columns and split it with the same
# test fraction and seed as the full-feature split.
rfe_columns = list(X.columns[svc_rfe_selected_features])
X_rfe = outliers_df[rfe_columns]
y_rfe = outliers_df["price"]
X_rfe_train, X_rfe_test, y_rfe_train, y_rfe_test = train_test_split(
    X_rfe,
    y_rfe,
    test_size=0.3,
    random_state=random_state_value,
)

In [16]:
# Refit the tuned SVR on the RFE-selected columns and report its held-out MSE.
# Note: this rebinds `svc_rfe`, which until now held the fitted RFE selector.
svc_rfe = SVR(**svc_gscv.best_params_)
svc_rfe_model = svc_rfe.fit(X=X_rfe_train, y=y_rfe_train)
svc_rfe_predict = svc_rfe_model.predict(X_rfe_test)
svc_rfe_mse = mean_squared_error(y_true=y_rfe_test, y_pred=svc_rfe_predict)
formatted_rfe_mse = "{:,.6f}".format(svc_rfe_mse)
print(formatted_rfe_mse)

206,289,633,183.080200


In [12]:
# Sequential feature selection with the tuned SVR, keeping
# `num_features_to_select` features.
svc_sfs = SequentialFeatureSelector(SVR(**svc_gscv.best_params_), n_features_to_select=num_features_to_select)
# Fit on the training split only so the held-out rows cannot influence which
# features are kept (fitting on the full X, y leaks test information).
svc_sfs.fit(X_train, y_train)
# Boolean mask over X's columns: True for each retained feature.
svc_sfs_selected_features = svc_sfs.get_support()
print('The selected features are:', list(X.columns[svc_sfs_selected_features]))

The selected features are: ['bed', 'bath', 'acre_lot', 'zip_code', 'house_size']


In [13]:
# Restrict the data to the SFS-selected columns and split it with the same
# test fraction and seed as the full-feature split.
sfs_columns = list(X.columns[svc_sfs_selected_features])
X_sfs = outliers_df[sfs_columns]
y_sfs = outliers_df["price"]
X_sfs_train, X_sfs_test, y_sfs_train, y_sfs_test = train_test_split(
    X_sfs,
    y_sfs,
    test_size=0.3,
    random_state=random_state_value,
)

In [15]:
# Evaluate the SFS-selected feature subset: fit the tuned SVR on the SFS training
# split and report its held-out MSE.
# This cell used to be a verbatim copy of the RFE evaluation (it trained and
# tested on X_rfe_* and overwrote svc_rfe_*), so the X_sfs_* split was never used
# and the SFS selection was never actually measured.
# The fitted estimator is stored under `svc_sfs_model` so the `svc_sfs` selector
# object is not overwritten.
svc_sfs_model = SVR(**svc_gscv.best_params_).fit(X_sfs_train, y_sfs_train)
svc_sfs_predict = svc_sfs_model.predict(X_sfs_test)
svc_sfs_mse = mean_squared_error(y_sfs_test, svc_sfs_predict)
print("{:,.6f}".format(svc_sfs_mse))

206,289,633,183.080200
