In [1]:
import pandas as pd
import numpy as np

In [2]:
data_train=pd.read_csv('./Datasets/Seattle House Data/train.csv')
data_test=pd.read_csv('./Datasets/Seattle House Data/test.csv')

In [3]:
data = pd.concat([data_train, data_test], axis=0, ignore_index=True)

In [4]:
data.head()

Unnamed: 0,beds,baths,size,size_units,lot_size,lot_size_units,zip_code,price
0,3,2.5,2590.0,sqft,6000.0,sqft,98144,795000.0
1,4,2.0,2240.0,sqft,0.31,acre,98106,915000.0
2,4,3.0,2040.0,sqft,3783.0,sqft,98107,950000.0
3,4,3.0,3800.0,sqft,5175.0,sqft,98199,1950000.0
4,2,2.0,1042.0,sqft,,,98102,950000.0


In [5]:
data.shape

(2521, 8)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2521 entries, 0 to 2520
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   beds            2521 non-null   int64  
 1   baths           2521 non-null   float64
 2   size            2521 non-null   float64
 3   size_units      2521 non-null   object 
 4   lot_size        2097 non-null   float64
 5   lot_size_units  2097 non-null   object 
 6   zip_code        2521 non-null   int64  
 7   price           2521 non-null   float64
dtypes: float64(4), int64(2), object(2)
memory usage: 157.7+ KB


In [7]:
data['lot_size'] = data['lot_size'].fillna(data['lot_size'].median())
data['lot_size_units'] = data['lot_size_units'].fillna(data['lot_size_units'].mode()[0])

In [8]:
data['lot_size'] = data.apply(
    lambda row: row['lot_size'] * 43560 if row['lot_size_units'] == 'acre' else row['lot_size'], axis=1)

In [9]:
data.drop(['size_units', 'lot_size_units'], axis=1, inplace=True)

In [10]:
data.head()

Unnamed: 0,beds,baths,size,lot_size,zip_code,price
0,3,2.5,2590.0,6000.0,98144,795000.0
1,4,2.0,2240.0,13503.6,98106,915000.0
2,4,3.0,2040.0,3783.0,98107,950000.0
3,4,3.0,3800.0,5175.0,98199,1950000.0
4,2,2.0,1042.0,4001.0,98102,950000.0


saving final dataset to be used

In [11]:
data.to_csv("./Datasets/Seattle House Data/final_dataset.csv")

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [13]:
X = data.drop('price', axis=1)
y = data['price']

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['beds'])
    ],
    remainder='passthrough'
)

In [15]:
def build_pipeline(model):
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('model', model)
    ])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying Linear Regression


In [17]:
lr_pipeline = build_pipeline(LinearRegression())

In [18]:
lr_pipeline.fit(X_train, y_train)

In [19]:
y_pred_lr = lr_pipeline.predict(X_test)

In [20]:
print('Linear Regression R²:', r2_score(y_test, y_pred_lr))

Linear Regression R²: -1.2237946417626388e+25


# using lasso

In [21]:
lasso_pipeline = build_pipeline(Lasso(alpha=0.1))

In [22]:
lasso_pipeline.fit(X_train, y_train)

In [23]:
y_pred_lasso = lasso_pipeline.predict(X_test)

In [24]:
print('Lasso Regression R²:', r2_score(y_test, y_pred_lasso))

Lasso Regression R²: 0.6544176699040548


# using Ridge

In [25]:
ridge_pipeline = build_pipeline(Ridge(alpha=1.0))

In [26]:
ridge_pipeline.fit(X_train, y_train)

In [27]:
y_pred_ridge = ridge_pipeline.predict(X_test)

In [28]:
print('Ridge Regression R²:', r2_score(y_test, y_pred_ridge))

Ridge Regression R²: 0.6532054400889888


# Comparison

In [29]:
print(f'Linear Regression R² Score: {r2_score(y_test, y_pred_lr):.4f}')
print(f'Lasso Regression R² Score: {r2_score(y_test, y_pred_lasso):.4f}')
print(f'Ridge Regression R² Score: {r2_score(y_test, y_pred_ridge):.4f}')

Linear Regression R² Score: -12237946417626388281425920.0000
Lasso Regression R² Score: 0.6544
Ridge Regression R² Score: 0.6532


In [30]:
import pickle

In [32]:
pickle.dump(ridge_pipeline, open('./Datasets/Seattle House Data/RidgeModel.pkl','wb'))