In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
file_path = "/mnt/data/Hyderbad_House_price.csv"
df = pd.read_csv("C:/Users/jangi/house price (PP)/Hyderbad_House_price.csv")

# Drop unnecessary index column
df.drop(columns=['Unnamed: 0'], inplace=True)


In [2]:
df

Unnamed: 0,title,location,price(L),rate_persqft,area_insqft,building_status
0,3 BHK Apartment,Nizampet,108.00,6000,1805,Under Construction
1,3 BHK Apartment,Bachupally,85.80,5500,1560,Under Construction
2,2 BHK Apartment,Dundigal,55.64,5200,1070,Under Construction
3,2 BHK Apartment,Pocharam,60.48,4999,1210,Under Construction
4,3 BHK Apartment,Kollur,113.00,5999,1900,Under Construction
...,...,...,...,...,...,...
3655,2 BHK Apartment,Ameerpet,60.00,6000,1000,Ready to move
3656,2 BHK Independent House,Jawahar nagar,65.00,3250,2000,Ready to move
3657,2 BHK Independent Floor,Rasoolpura,65.00,6500,1000,Ready to move
3658,2 BHK Independent Floor,Begum Bazar Chatri,55.00,5500,1000,Ready to move


In [3]:
# Extract BHK count from 'title'
df['BHK'] = df['title'].str.extract(r'(\d+)').astype(float)
df['BHK'].fillna(df['BHK'].median(), inplace=True)  # Fill missing values
df['BHK'] = df['BHK'].astype(int)
df.drop(columns=['title'], inplace=True)

# One-hot encoding for categorical features
df = pd.get_dummies(df, columns=['location', 'building_status'], drop_first=True)

# Log transformation to reduce price skewness
df['log_price'] = np.log1p(df['price(L)'])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['BHK'].fillna(df['BHK'].median(), inplace=True)  # Fill missing values


In [4]:
# Define features (X) and target variable (y)
X = df.drop(columns=['price(L)', 'log_price'])
y = df['log_price']

# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Convert log predictions back to original price scale
y_pred_original = np.expm1(np.clip(y_pred, a_min=None, a_max=20))  # Avoid overflow
y_test_original = np.expm1(y_test)

# Evaluate model performance
mae = mean_absolute_error(y_test_original, y_pred_original)
rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
r2 = r2_score(y_test_original, y_pred_original)

print(f"MAE: {mae:.2f} Lakhs")
print(f"RMSE: {rmse:.2f} Lakhs")
print(f"R² Score: {r2:.4f}")


MAE: 5.73 Lakhs
RMSE: 45.44 Lakhs
R² Score: 0.9456


In [6]:
import joblib

# Save trained model
joblib.dump(rf_model, "house_price_model.pkl")

# Load model for future use
# loaded_model = joblib.load("house_price_model.pkl")


['house_price_model.pkl']

In [7]:
joblib.dump(X_train.columns.tolist(), "features.pkl")


['features.pkl']