# 🏡 House Price Prediction using Random Forest
This notebook shows, step by step, how to predict house prices with a Random Forest model.

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

## Step 1: Load the Dataset

In [7]:
# Load the listings and keep only rows with a usable price:
# strictly positive first, then drop any row whose price is still missing.
data = (
    pd.read_csv("/kaggle/input/housedata/output.csv")
    .loc[lambda frame: frame["price"] > 0]
    .dropna(subset=["price"])
)

# Preview the first rows of the cleaned table.
data.head()


Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


## Step 2: Select Useful Features

In [8]:
# Location columns are text in the loaded table; they are turned into integer
# category codes so the model can consume them.
# NOTE: the codes are arbitrary labels, not a meaningful ordering, and pandas
# assigns -1 to missing values.
location_columns = ["city", "statezip"]

features = ["bedrooms", "bathrooms", "sqft_living", "sqft_lot",
            "floors", "waterfront", "view", "condition",
            "sqft_above", "sqft_basement", "yr_built", "yr_renovated",
            "city", "statezip"]

# Encode into a new frame (`assign` returns a copy) instead of overwriting `data`:
# the loaded table keeps its readable city / zip labels, so codes can still be
# mapped back to names, and re-running this cell always starts from the same input.
data_encoded = data.assign(
    **{
        column: data[column].astype("category").cat.codes
        for column in location_columns
    }
)

# Model inputs (X) and the target to predict (y).
X = data_encoded[features]
y = data_encoded["price"]



## ✂️ Step 3: Split Data into Training and Testing Sets

In [9]:

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 🌳 Step 4: Train the Random Forest Model

In [10]:
# Forest hyperparameters, gathered in one place so they are easy to tune.
forest_params = {
    "n_estimators": 400,
    "max_depth": 30,
    "min_samples_split": 3,
    "min_samples_leaf": 2,
    "random_state": 42,  # fixed seed for repeatable training
    "n_jobs": -1,        # use all available CPU cores
}

model = RandomForestRegressor(**forest_params)

# Fit on the training split; as the cell's last expression, the fitted
# estimator is also displayed.
model.fit(X_train, y_train)


## 📈 Step 5: Test the Model

In [11]:
# Score the held-out rows with the fitted forest.
predictions = model.predict(X_test)

# Three summary figures:
#   mae               - mean absolute error between true and predicted prices
#   avg_price         - mean price over the test set
#   variation_percent - the MAE expressed as a percentage of that mean price
mae = mean_absolute_error(y_test, predictions)
avg_price = y_test.mean()
variation_percent = (mae / avg_price) * 100

# One line per figure.
print(
    f"Mean Absolute Error: ${mae:,.0f}",
    f"Average House Price: ${avg_price:,.0f}",
    f"Average Variation: {variation_percent:.2f}%",
    sep="\n",
)


Mean Absolute Error: $118,252
Average House Price: $556,497
Average Variation: 21.25%


## 🏠 Step 6: Predict Price for a Sample House

In [13]:
# Choose which house to test (you can change the index).
# `index` is a position within the test split, not a row label of the original table.
index = 1

# Series view of the chosen row, used only for the readable printout below.
sample_house = X_test.iloc[index]
real_price = y_test.iloc[index]

print(f"🏡 House details (Index {index}):\n", sample_house)

# Predict from a one-row DataFrame (note the double brackets) rather than a list
# wrapping the Series: this keeps the column names and dtypes the model was
# fitted with, so scikit-learn does not warn about missing feature names.
predicted_price = model.predict(X_test.iloc[[index]])[0]

if real_price > 0:
    difference = abs(predicted_price - real_price)
    # Kept under its own name so it does not overwrite `variation_percent`,
    # the test-set-wide figure computed in Step 5.
    sample_variation_percent = (difference / real_price) * 100
    print(f"\n💰 Real Price: ${real_price:,.0f}")
    print(f"🤖 Predicted Price: ${predicted_price:,.0f}")
    print(f"📉 Variation: {sample_variation_percent:.2f}%")
else:
    # Division-by-zero guard. Not expected to trigger, because rows with a
    # non-positive price are removed when the data is loaded.
    print(f"\n⚠️ Real Price is 0 (invalid entry), predicted: ${predicted_price:,.0f}")


🏡 House details (Index 1):
 bedrooms            2.0
bathrooms           1.0
sqft_living      1980.0
sqft_lot         5000.0
floors              1.0
waterfront          0.0
view                0.0
condition           4.0
sqft_above       1090.0
sqft_basement     890.0
yr_built         1923.0
yr_renovated        0.0
city               35.0
statezip           64.0
Name: 2518, dtype: float64

💰 Real Price: $496,752
🤖 Predicted Price: $489,951
📉 Variation: 1.37%


