# **House Price Prediction**

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

**Step 1: Load the dataset**

In [2]:
data = pd.read_csv('data_House_Price.csv')
data.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


**Step 2: Data Cleaning**

In [3]:
data=data[['price','bedrooms','bathrooms','sqft_living','sqft_lot','floors','waterfront','view','condition']]
print(data.isnull().sum())  # Check for missing values

price          0
bedrooms       0
bathrooms      0
sqft_living    0
sqft_lot       0
floors         0
waterfront     0
view           0
condition      0
dtype: int64


In [4]:
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4


In [5]:
# Drop rows with missing values (if any)
print(f"Shape of data before dropna: {data.shape}")  # Check shape before dropna
data = data.dropna()
print(f"Shape of data after dropna: {data.shape}")  # Check shape after dropna

Shape of data before dropna: (4600, 9)
Shape of data after dropna: (4600, 9)


**Step 3: Data Preprocessing**

In [6]:
# Separate features (X) and target (y)
X = data.drop('price', axis=1)  # Features
y = data['price']  # Target

In [7]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Scale the features (use scaling for better model performance, especially for models like Linear Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# **Step 4: Model Building**

In [9]:
# 1. Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)
y_pred_lin_reg = lin_reg.predict(X_test_scaled)

# Convert regression predictions to binary classification (0/1)
y_pred_lin_reg_class = np.where(y_pred_lin_reg > 0.5, 1, 0)
# Ensure that y_test is also binary (convert it if necessary)
y_test_class = np.where(y_test > 0.5, 1, 0)
print("Linear Regression Accuracy:", accuracy_score(y_test_class, y_pred_lin_reg_class))
print(classification_report(y_test_class, y_pred_lin_reg_class))

Linear Regression Accuracy: 0.9902173913043478
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.99      1.00      1.00       911

    accuracy                           0.99       920
   macro avg       0.50      0.50      0.50       920
weighted avg       0.98      0.99      0.99       920



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# 2. Random Forest Regressor
rfc = RandomForestRegressor(n_estimators=100, random_state=42)
rfc.fit(X_train_scaled, y_train)
y_pred_rfc = rfc.predict(X_test_scaled)
# Convert regression predictions to binary classification (0/1)
y_pred_rfc_class = np.where(y_pred_rfc > 0.5, 1, 0)
# Ensure that y_test is also binary (convert it if necessary)
y_test_class = np.where(y_test > 0.5, 1, 0)
print("Random Forest Regressor Accuracy:", accuracy_score(y_test_class, y_pred_rfc_class))
print(classification_report(y_test_class, y_pred_rfc_class))

Random Forest Regressor Accuracy: 0.9902173913043478
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.99      1.00      1.00       911

    accuracy                           0.99       920
   macro avg       0.50      0.50      0.50       920
weighted avg       0.98      0.99      0.99       920



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# 3. Decision Tree Regressor
dtc = DecisionTreeRegressor(random_state=42)
dtc.fit(X_train_scaled, y_train)
y_pred_dtc = dtc.predict(X_test_scaled)

# Convert regression predictions to binary classification (0/1)
y_pred_dtc_class = np.where(y_pred_dtc > 0.5, 1, 0)
# Ensure that y_test is also binary (convert it if necessary)
y_test_class = np.where(y_test > 0.5, 1, 0)

print("Decision Tree Regressor Accuracy:", accuracy_score(y_test_class, y_pred_dtc_class))
print(classification_report(y_test_class, y_pred_dtc_class))

Decision Tree Regressor Accuracy: 0.9793478260869565
              precision    recall  f1-score   support

           0       0.08      0.11      0.10         9
           1       0.99      0.99      0.99       911

    accuracy                           0.98       920
   macro avg       0.54      0.55      0.54       920
weighted avg       0.98      0.98      0.98       920



In [13]:
# 4. Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr.fit(X_train_scaled, y_train)
y_pred_gbr = gbr.predict(X_test_scaled)

# Convert regression predictions to binary classification (0/1)
y_pred_gbr_class = np.where(y_pred_gbr > 0.5, 1, 0)

# Ensure that y_test is also binary (convert it if necessary)
y_test_class = np.where(y_test > 0.5, 1, 0)
print("Gradient Boosting Regressor Accuracy:", accuracy_score(y_test_class, y_pred_gbr_class))
print(classification_report(y_test_class, y_pred_gbr_class))

Gradient Boosting Regressor Accuracy: 0.9902173913043478
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.99      1.00      1.00       911

    accuracy                           0.99       920
   macro avg       0.50      0.50      0.50       920
weighted avg       0.98      0.99      0.99       920



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Step 5: Model Evaluation**

In [14]:
# Function to evaluate model performance
def evaluate_model(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{model_name} - MAE: {mae}, RMSE: {rmse}")
    return mae, rmse

# Evaluate each model
evaluate_model(y_test, y_pred_lin_reg, "Linear Regression")
evaluate_model(y_test, y_pred_rfc, "Random Forest Regressor")
evaluate_model(y_test, y_pred_dtc, "Decision Tree Regressor")
evaluate_model(y_test, y_pred_gbr, "Gradient Boosting Regressor")

Linear Regression - MAE: 215838.5573996204, RMSE: 993413.0132799632
Random Forest Regressor - MAE: 215796.45980927526, RMSE: 996980.8416133375
Decision Tree Regressor - MAE: 272964.3740022033, RMSE: 1028180.0878834052
Gradient Boosting Regressor - MAE: 207429.91869667333, RMSE: 995927.8705993197


(207429.91869667333, 995927.8705993197)

In [15]:
# List of models
models = ['Random Forest', 'Decision Tree', 'Linear Regression', 'Gradient Boosting Regressor']

# List of accuracies from your predictions
accuracies = [
    accuracy_score(y_test_class, y_pred_rfc_class),
    accuracy_score(y_test_class, y_pred_dtc_class),
    accuracy_score(y_test_class, y_pred_lin_reg_class),
    accuracy_score(y_test_class, y_pred_gbr_class)
]


In [16]:
# Print model accuracies
for model, accuracy in zip(models, accuracies):
    print(f"{model} Accuracy: {accuracy:.2f}")

Random Forest Accuracy: 0.99
Decision Tree Accuracy: 0.98
Linear Regression Accuracy: 0.99
Gradient Boosting Regressor Accuracy: 0.99


**Step 7: Take New User Input for Prediction**

In [17]:
def get_user_input():
    print("Please provide the following details to predict the house price:")
    # Dictionary to collect user input
    features = {
        'bedrooms': float(input("Number of bedrooms: ")),
        'bathrooms': float(input("Number of bathrooms: ")),
        'sqft_living': float(input("Square footage of living area: ")),
        'sqft_lot': float(input("Square footage of the lot: ")),
        'floors': float(input("Number of floors: ")),
        'waterfront': float(input("Waterfront (1 for Yes, 0 for No): ")),
        'view': float(input("View rating (0-4): ")),
        'condition': float(input("Condition rating (1-5): ")),
    }
    return pd.DataFrame([features])

# Get new user input
user_input = get_user_input()
user_input_scaled = scaler.transform(user_input)
user_pred_lin_reg = lin_reg.predict(user_input_scaled)
user_pred_rfc = rfc.predict(user_input_scaled)
user_pred_dtc = dtc.predict(user_input_scaled)
user_pred_gbr = gbr.predict(user_input_scaled)

# Display the predictions for the user
print(f"Predicted house price using Linear Regression: ${user_pred_lin_reg[0]:.2f}")
print(f"Predicted house price using Random Forest Regressor: ${user_pred_rfc[0]:.2f}")
print(f"Predicted house price using Decision Tree Regressor: ${user_pred_dtc[0]:.2f}")
print(f"Predicted house price using Gradient Boosting Regressor: ${user_pred_gbr[0]:.2f}")


Please provide the following details to predict the house price:
Number of bedrooms: 5
Number of bathrooms: 2.5
Square footage of living area: 3650
Square footage of the lot: 9050
Number of floors: 2
Waterfront (1 for Yes, 0 for No): 0
View rating (0-4): 4
Condition rating (1-5): 5
Predicted house price using Linear Regression: $1199453.15
Predicted house price using Random Forest Regressor: $2120234.50
Predicted house price using Decision Tree Regressor: $2384000.00
Predicted house price using Gradient Boosting Regressor: $1354264.39
