In [17]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr

In [18]:
df = pd.read_csv('housing_price_dataset.csv')
df.head()

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,Rural,1969,215355.283618
1,2459,3,2,Rural,1980,195014.221626
2,1860,2,1,Suburb,1970,306891.012076
3,2294,2,1,Urban,1996,206786.787153
4,2130,5,2,Suburb,2001,272436.239065


Scaling function

In [20]:
df['Neighborhood'] = df['Neighborhood'].map({
    'Rural': 1,
    'Suburb': 2,
    'Urban': 3
})

In [21]:
df

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,1,1969,215355.283618
1,2459,3,2,1,1980,195014.221626
2,1860,2,1,2,1970,306891.012076
3,2294,2,1,3,1996,206786.787153
4,2130,5,2,2,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,1,1975,100080.865895
49996,2854,2,2,2,1988,374507.656727
49997,2979,5,3,2,1962,384110.555590
49998,2596,5,2,1,1984,380512.685957


In [22]:
scalers = [MinMaxScaler(), StandardScaler(), RobustScaler()]
def scale (n, numerical_col):
    scaler = scalers[n]
    df[numerical_col] = scaler.fit_transform(df[numerical_col])
  
    return df

In [23]:
df

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,2126,4,1,1,1969,215355.283618
1,2459,3,2,1,1980,195014.221626
2,1860,2,1,2,1970,306891.012076
3,2294,2,1,3,1996,206786.787153
4,2130,5,2,2,2001,272436.239065
...,...,...,...,...,...,...
49995,1282,5,3,1,1975,100080.865895
49996,2854,2,2,2,1988,374507.656727
49997,2979,5,3,2,1962,384110.555590
49998,2596,5,2,1,1984,380512.685957


Apply polynomial regression

In [24]:
def polynomial_regression(X_train, X_test, y_train, y_test, degree):
    # Generate polynomial features
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    X_train_poly = poly.fit_transform(X_train)
    X_test_poly = poly.fit_transform(X_test)

    # model fiting
    model = LinearRegression()
    model = model.fit(X_train_poly, y_train)

    # Y prediction
    y_pred = model.predict(X_test_poly)

    return y_pred

In [None]:
def evaluate_metrices(y_pred, y_test):
    rmse = root_mean_squared_error(y_pred, y_test)
    mae = mean_absolute_error(y_pred, y_test)
    corr_coef, _ = pearsonr(y_pred, y_test)
    return rmse, mae, corr_coef

In [26]:
num = df.select_dtypes(['number']).columns
df = scale(2, num)
df

Unnamed: 0,SquareFeet,Bedrooms,Bathrooms,Neighborhood,YearBuilt,Price
0,0.119839,1.0,-0.5,-0.5,-0.444444,-0.088622
1,0.455186,0.0,0.0,-0.5,-0.138889,-0.274525
2,-0.148036,-1.0,-0.5,0.0,-0.416667,0.747949
3,0.289023,-1.0,-0.5,0.5,0.305556,-0.166932
4,0.123867,2.0,0.0,0.0,0.444444,0.433057
...,...,...,...,...,...,...
49995,-0.730111,2.0,0.5,-0.5,-0.277778,-1.142148
49996,0.852971,-1.0,0.0,0.0,0.083333,1.365916
49997,0.978852,2.0,0.5,0.0,-0.638889,1.453680
49998,0.593152,2.0,0.0,-0.5,-0.027778,1.420798


In [27]:
from sklearn.model_selection import train_test_split

# X = data_df.drop(columns='Price')
X = df[['SquareFeet', 'Bathrooms', 'Bathrooms', 'Neighborhood',	'YearBuilt']]
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_pred = polynomial_regression(X_train, X_test, y_train, y_test, degree= 1)

In [28]:
y_pred

array([-0.13093212, -0.89119606,  0.25516708, ...,  0.80197869,
       -0.29581523,  0.19417432])

In [29]:
y_test.shape

(10000,)

In [30]:
y_pred

array([-0.13093212, -0.89119606,  0.25516708, ...,  0.80197869,
       -0.29581523,  0.19417432])

In [31]:
rmse, mae, cor = evaluate_metrices (y_pred, y_test)

In [32]:
print(f"Mean Square error: {rmse:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")
print(f"Correlation : {cor:.4f}")

Mean Square error: 0.4534
Mean Absolute Error: 0.3622
Correlation : 0.7558
