In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import linear_model
from sklearn.feature_selection import SequentialFeatureSelector, RFE
from sklearn.linear_model import LinearRegression, Ridge

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
outliers_df = pd.read_csv('outliers-data.csv')
outliers_df.drop(columns=["Unnamed: 0"],inplace=True)

In [3]:
outliers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7757 entries, 0 to 7756
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bed             7757 non-null   float64
 1   bath            7757 non-null   float64
 2   acre_lot        7757 non-null   float64
 3   zip_code        7757 non-null   float64
 4   house_size      7757 non-null   float64
 5   prev_sold_date  7757 non-null   object 
 6   price           7757 non-null   float64
dtypes: float64(6), object(1)
memory usage: 424.3+ KB


In [4]:
outliers_df["prev_sold_year"] = pd.to_datetime(outliers_df["prev_sold_date"], format='%Y-%m-%d').dt.year
outliers_df.drop(columns=["prev_sold_date"],inplace=True)
outliers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7757 entries, 0 to 7756
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bed             7757 non-null   float64
 1   bath            7757 non-null   float64
 2   acre_lot        7757 non-null   float64
 3   zip_code        7757 non-null   float64
 4   house_size      7757 non-null   float64
 5   price           7757 non-null   float64
 6   prev_sold_year  7757 non-null   int64  
dtypes: float64(6), int64(1)
memory usage: 424.3 KB


In [5]:
num_features_to_select = 5
random_state_value = 42

In [6]:
X = outliers_df[["bed","bath","acre_lot","zip_code","house_size","prev_sold_year"]]
y = outliers_df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state_value)

In [7]:
def print_stuff(title, mse,mae,r2):
    print(title + " MSE = {:,.6f}".format(lasso_mse))
    print(title + " MAE = {:,.6f}".format(lasso_mae))
    print(title + " R2 = {:,.6f}".format(lasso_r2))

In [8]:
lasso = linear_model.Lasso()
lasso_model = lasso.fit(X_train,y_train)
lasso_predict = lasso_model.predict(X_test)
lasso_mse = mean_squared_error(lasso_predict,y_test)
lasso_mae = mean_absolute_error(lasso_predict,y_test)
lasso_r2 = r2_score(y_test,lasso_predict)

In [9]:
print_stuff("Lasso",lasso_mse,lasso_mae,lasso_r2)

Lasso MSE = 123,732,489,420.844391
Lasso MAE = 226,348.922371
Lasso R2 = 0.379322


In [10]:
lasso_sfs = SequentialFeatureSelector(linear_model.Lasso(), n_features_to_select=num_features_to_select)
lasso_sfs.fit(X, y)
lasso_sfs_selected_features = lasso_sfs.get_support()
print('The selected features are:', list(X.columns[lasso_sfs_selected_features]))

The selected features are: ['bed', 'bath', 'zip_code', 'house_size', 'prev_sold_year']


In [11]:
X_sfs = outliers_df[list(X.columns[lasso_sfs_selected_features])]
y_sfs = outliers_df["price"]
X_sfs_train, X_sfs_test, y_sfs_train, y_sfs_test = train_test_split(X_sfs, y_sfs, test_size=0.3, random_state=random_state_value)

In [12]:
lasso_sfs = linear_model.Lasso()
lasso_sfs_model = lasso_sfs.fit(X_sfs_train,y_sfs_train)
lasso_sfs_predict = lasso_sfs_model.predict(X_sfs_test)

lasso_sfs_mse = mean_squared_error(y_sfs_test,lasso_sfs_predict)
lasso_sfs_mae = mean_absolute_error(y_sfs_test,lasso_sfs_predict)
lasso_sfs_r2 = r2_score(y_sfs_test,lasso_sfs_predict)

In [13]:
print_stuff("Lasso",lasso_sfs_mse,lasso_sfs_mae,lasso_sfs_r2)

Lasso MSE = 123,732,489,420.844391
Lasso MAE = 226,348.922371
Lasso R2 = 0.379322


In [14]:
lasso_rfe = RFE(linear_model.Lasso(), n_features_to_select=num_features_to_select)
lasso_rfe.fit(X, y)
lasso_rfe_selected_features = lasso_rfe.get_support()
print('The selected features are:', list(X.columns[lasso_rfe_selected_features]))

The selected features are: ['bed', 'bath', 'acre_lot', 'zip_code', 'prev_sold_year']


In [15]:
X_rfe = outliers_df[list(X.columns[lasso_rfe_selected_features])]
y_rfe = outliers_df["price"]
X_rfe_train, X_rfe_test, y_rfe_train, y_rfe_test = train_test_split(X_rfe, y_rfe, test_size=0.3, random_state=random_state_value)

In [16]:
lasso_rfe = linear_model.Lasso()
lasso_rfe_model = lasso_rfe.fit(X_rfe_train,y_rfe_train)
lasso_rfe_predict = lasso_rfe_model.predict(X_rfe_test)

lasso_rfe_mse = mean_squared_error(y_rfe_test,lasso_rfe_predict)
lasso_rfe_mae = mean_absolute_error(y_rfe_test,lasso_rfe_predict)
lasso_rfe_r2 = r2_score(y_rfe_test,lasso_rfe_predict)

In [17]:
print_stuff("Lasso",lasso_rfe_mse,lasso_rfe_mae,lasso_rfe_r2)

Lasso MSE = 123,732,489,420.844391
Lasso MAE = 226,348.922371
Lasso R2 = 0.379322
