In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load dataset
data = pd.read_csv('House Price Data.csv')

# Display initial shape of data
print("Initial shape:", data.shape)

# Remove rows with any null values
data_cleaned = data.dropna()

# Display shape after removing null values
print("Shape after removing null values:", data_cleaned.shape)

# Separate features and target variable
X = data_cleaned.drop('price', axis=1)
y = data_cleaned['price']

# Identify numeric and categorical features
numeric_features = X.select_dtypes(include=[np.number]).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Create transformers for numeric and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)])

# Fit and transform the data
X_processed = preprocessor.fit_transform(X)

# Convert the result to a DataFrame for better readability
X_processed_df = pd.DataFrame(X_processed, columns=preprocessor.get_feature_names_out())

# Display the first few rows of the processed data
print(X_processed_df.head())

Initial shape: (4600, 18)
Shape after removing null values: (4600, 18)
   num__bedrooms  num__bathrooms  num__sqft_living  num__sqft_lot  \
0      -0.441122       -0.843204         -0.829971      -0.193434   
1       1.759705        0.432802          1.568528      -0.161718   
2      -0.441122       -0.205201         -0.217367      -0.080978   
3      -0.441122        0.113800         -0.144686      -0.190145   
4       0.659291        0.432802         -0.206984      -0.121306   

   num__floors  num__waterfront  num__view  num__condition  num__sqft_above  \
0    -0.022416        -0.085004  -0.309194       -0.667112        -0.565224   
1     0.906555        -0.085004   4.830079        2.286416         1.789559   
2    -0.951388        -0.085004  -0.309194        0.809652         0.119171   
3    -0.951388        -0.085004  -0.309194        0.809652        -0.959621   
4    -0.951388        -0.085004  -0.309194        0.809652        -0.797222   

   num__sqft_basement  ...  cat__statez

In [3]:
X2 = X_processed_df
X2.head()

Unnamed: 0,num__bedrooms,num__bathrooms,num__sqft_living,num__sqft_lot,num__floors,num__waterfront,num__view,num__condition,num__sqft_above,num__sqft_basement,...,cat__statezip_WA 98155,cat__statezip_WA 98166,cat__statezip_WA 98168,cat__statezip_WA 98177,cat__statezip_WA 98178,cat__statezip_WA 98188,cat__statezip_WA 98198,cat__statezip_WA 98199,cat__statezip_WA 98288,cat__statezip_WA 98354
0,-0.441122,-0.843204,-0.829971,-0.193434,-0.022416,-0.085004,-0.309194,-0.667112,-0.565224,-0.672464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.759705,0.432802,1.568528,-0.161718,0.906555,-0.085004,4.830079,2.286416,1.789559,-0.069128,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.441122,-0.205201,-0.217367,-0.080978,-0.951388,-0.085004,-0.309194,0.809652,0.119171,-0.672464,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.441122,0.1138,-0.144686,-0.190145,-0.951388,-0.085004,-0.309194,0.809652,-0.959621,1.482306,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.659291,0.432802,-0.206984,-0.121306,-0.951388,-0.085004,-0.309194,0.809652,-0.797222,1.051352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor




# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.2, random_state=42)

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'SVR': SVR(),
    'KNN': KNeighborsRegressor()
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{name}:\n MAE: {mae}\n MSE: {mse}\n RMSE: {rmse}\n R²: {r2}\n")

# Example of hyperparameter tuning with Grid Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print(f"Best Model (Random Forest) after Grid Search:\n MAE: {mean_absolute_error(y_test, y_pred)}\n MSE: {mean_squared_error(y_test, y_pred)}\n RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}\n R²: {r2_score(y_test, y_pred)}\n")

Linear Regression:
 MAE: 305524.259673621
 MSE: 1157231555450.4912
 RMSE: 1075746.975571157
 R²: -0.13471168151843638

Ridge Regression:
 MAE: 158701.50197805333
 MSE: 964978486531.6531
 RMSE: 982333.1850913177
 R²: 0.05380011811448315



  model = cd_fast.enet_coordinate_descent(


Lasso Regression:
 MAE: 189091.24442010652
 MSE: 1035580692895.7814
 RMSE: 1017634.8524376419
 R²: -0.015428160292741078

ElasticNet Regression:
 MAE: 201173.16625216964
 MSE: 977294993133.6837
 RMSE: 988582.3147991692
 R²: 0.041723292304645554

Decision Tree:
 MAE: 198500.25246843373
 MSE: 1033004547722.1337
 RMSE: 1016368.3130254179
 R²: -0.012902147233334382

Random Forest:
 MAE: 157779.83420775319
 MSE: 970617733078.0342
 RMSE: 985199.336722287
 R²: 0.04827061202643823

Gradient Boosting:
 MAE: 163925.06197197145
 MSE: 967404699279.0879
 RMSE: 983567.333373312
 R²: 0.05142111977711805

SVR:
 MAE: 267354.0803092375
 MSE: 1033484601618.5894
 RMSE: 1016604.4469795463
 R²: -0.013372859219627165

KNN:
 MAE: 172544.05016422697
 MSE: 979233843991.0836
 RMSE: 989562.4507786679
 R²: 0.03982217173266356

