In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [None]:
# Load the raw listings from the CSV that sits next to the notebook.
df = pd.read_csv("Hyderabad_House_Data.csv")
# Preview the first rows to see how the raw columns are formatted.
df.head()


Unnamed: 0.1,Unnamed: 0,Bedrooms,Washrooms,Furnishing,Tennants,Area,Price,Locality
0,0,3 BHK Builder Floor,2,Furnished,Bachelors/Family,1800 sqft,34000,"Bhagyalaxmi Nagar, Kavadiguda"
1,1,3 BHK Apartment,2,Semi-Furnished,Family,2500 sqft,45000,"Gachibowli, Outer Ring Road"
2,2,1 BHK Builder Floor,Immediately,Furnished,Bachelors/Family,read more,18000,Gachibowli
3,3,3 BHK Apartment,Immediately,Furnished,Bachelors/Family,2160 sqft,40000,"Moosapet, NH"
4,4,3 BHK Apartment,2,Semi-Furnished,Family,1580 sqft,23000,Raghavendra Colony kondapur


In [None]:
# Column dtypes and non-null counts for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1171 entries, 0 to 1170
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1171 non-null   int64 
 1   Bedrooms    1171 non-null   object
 2   Washrooms   1150 non-null   object
 3   Furnishing  1171 non-null   object
 4   Tennants    1170 non-null   object
 5   Area        1149 non-null   object
 6   Price       1171 non-null   object
 7   Locality    1171 non-null   object
dtypes: int64(1), object(7)
memory usage: 73.3+ KB


In [None]:
# Summary statistics; at this stage only the integer `Unnamed: 0` column shows up
# because every other column is still object dtype.
df.describe()

Unnamed: 0.1,Unnamed: 0
count,1171.0
mean,585.0
std,338.182889
min,0.0
25%,292.5
50%,585.0
75%,877.5
max,1170.0


In [None]:
# Missing-value count per column, before any cleaning.
df.isnull().sum()

Unnamed: 0     0
Bedrooms       0
Washrooms     21
Furnishing     0
Tennants       1
Area          22
Price          0
Locality       0
dtype: int64

In [None]:
# Pull the first run of digits out of labels such as "3 BHK Apartment".
# - The pattern is a raw string so that `\d` reaches the regex engine instead of
#   being parsed as an (invalid) Python string escape.
# - expand=False makes extract() return a Series for the single capture group
#   rather than a one-column DataFrame.
bedroom_digits = df['Bedrooms'].astype(str).str.extract(r'(\d+)', expand=False)

# Labels without any digit yield NaN here; they are imputed in a later cell.
df['Bedrooms'] = pd.to_numeric(bedroom_digits, errors='coerce')


In [None]:
# Convert washroom counts to numbers. Non-numeric entries (the preview above shows
# values like "Immediately" in this column) are coerced to NaN instead of raising.
df['Washrooms'] = pd.to_numeric(df['Washrooms'], errors='coerce')


In [None]:
# Parse the free-text area column into a single numeric unit (square feet).
# Values carry either a "sqft" or a "sqyrd" suffix; stripping both suffixes without
# converting would mix two different units in one feature, so square-yard values
# are scaled to square feet.
SQFT_PER_SQYRD = 9  # 1 square yard = 3 ft x 3 ft

area_text = df['Area'].astype(str)

# Remember which rows were quoted in square yards before the suffix is removed.
in_sqyrd = area_text.str.contains('sqyrd', regex=False)

# Anything that is not a number once the suffix is gone (e.g. "read more" in the
# preview above, or missing values) becomes NaN and is imputed in a later cell.
area_value = pd.to_numeric(
    area_text
    .str.replace('sqft', '', regex=False)
    .str.replace('sqyrd', '', regex=False),
    errors='coerce',
)

df['Area'] = area_value.mask(in_sqyrd, area_value * SQFT_PER_SQYRD)


In [None]:
# Rent arrives as text, possibly with thousands separators: drop the commas and
# parse to a number, turning anything unparseable into NaN.
df['Price'] = pd.to_numeric(
    df['Price'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)


In [None]:
# Impute missing values: median for the numeric columns, most frequent value for
# the categorical ones.
#
# The result is assigned back to `df` instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on an
# intermediate object, is deprecated, and stops updating `df` in pandas 3.0.
#
# NOTE(review): the fill values are computed on the full frame, before the
# train/test split performed further down — consider imputing inside the pipeline.
fill_values = {
    'Bedrooms': df['Bedrooms'].median(),
    'Washrooms': df['Washrooms'].median(),
    'Area': df['Area'].median(),
    'Tennants': df['Tennants'].mode()[0],
    'Furnishing': df['Furnishing'].mode()[0],
    'Locality': df['Locality'].mode()[0],
}
df = df.fillna(value=fill_values)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Bedrooms'].fillna(df['Bedrooms'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Washrooms'].fillna(df['Washrooms'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

In [None]:
# Target: the rent column.
y = df['Price']

# Predictors: three numeric and three categorical columns.
X = df[[
    'Bedrooms',
    'Washrooms',
    'Area',
    'Tennants',
    'Furnishing',
    'Locality',
]]


In [None]:
# Column groups consumed by the preprocessing step.
cat_features = ['Furnishing', 'Tennants', 'Locality']
num_features = ['Bedrooms', 'Washrooms', 'Area']

# Numeric columns are standardised; categorical columns are one-hot encoded, and
# categories not seen during fit are ignored at transform time.
numeric_step = ('num', StandardScaler(), num_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Baseline model: the shared preprocessing step followed by a linear regression.
lr_steps = [
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
lr_model = Pipeline(steps=lr_steps)
lr_model.fit(X_train, y_train)

# Evaluate on the held-out rows.
lr_pred = lr_model.predict(X_test)
lr_mae = mean_absolute_error(y_test, lr_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_pred))
lr_r2 = r2_score(y_test, lr_pred)

print("Linear Regression")
print("MAE:", lr_mae)
print("RMSE:", lr_rmse)
print("R2:", lr_r2)




Linear Regression
MAE: 5198.766271656865
RMSE: 6854.367805165599
R2: 0.6113128429225576


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Second candidate: the same preprocessing step feeding a 200-tree random forest
# with a fixed seed.
forest = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', forest),
])
rf_model.fit(X_train, y_train)

# Evaluate on the held-out rows.
rf_pred = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_r2 = r2_score(y_test, rf_pred)

print("Random Forest")
print("MAE:", rf_mae)
print("RMSE:", rf_rmse)
print("R2:", rf_r2)


Random Forest
MAE: 5328.050035460993
RMSE: 7024.204942054142
R2: 0.591812473127536


In [None]:
# Test-split R2 for each fitted pipeline, keyed by display name.
r2_by_model = {
    "Linear Regression": r2_score(y_test, lr_pred),
    "Random Forest": r2_score(y_test, rf_pred),
}

# One row per model so the two scores can be read side by side.
comparison = pd.DataFrame({
    "Model": list(r2_by_model.keys()),
    "R2 Score": list(r2_by_model.values()),
})

comparison


Unnamed: 0,Model,R2 Score
0,Linear Regression,0.611313
1,Random Forest,0.591812


In [None]:
import pickle
# Persist the fitted random-forest pipeline (preprocessing step + model) to disk.
# NOTE(review): the comparison table above reports a higher R2 for Linear
# Regression than for Random Forest — confirm that rf_model is the one to ship.
with open("house_rent_prediction_model.pkl", "wb") as f:
    pickle.dump(rf_model, f)


In [None]:
# Read the pipeline back from disk to check that the saved file is usable.
# pickle.load executes code embedded in the file, so only load files you trust.
with open("house_rent_prediction_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)


In [None]:
# A single hypothetical listing using the same column names the pipeline was
# trained on.
sample_listing = {
    'Bedrooms': 2,
    'Washrooms': 2,
    'Area': 1000,
    'Furnishing': 'Semi-Furnished',
    'Tennants': 'Family',
    'Locality': 'Gachibowli',
}
new_house = pd.DataFrame([sample_listing])

# Score it with the pipeline that was reloaded from disk.
predicted_price = loaded_model.predict(new_house)
print("Predicted Rent:", predicted_price[0])


Predicted Rent: 14266.190476190479


In [None]:

import pickle

# NOTE(review): CustomScaler is not referenced by any cell visible in this
# notebook — confirm whether this project-local import is still needed.
from custom_scaler import CustomScaler

# Load the model through its own file handle. Calling pickle.load(f) with the `f`
# left over from an earlier cell cannot work: that handle is closed as soon as its
# `with` block exits.
# Assumes the intent was to reload the saved rent model — TODO confirm.
with open("house_rent_prediction_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# The 'num' entry of the ColumnTransformer is the StandardScaler applied to the
# numeric columns. `named_transformers_` only exists once the transformer has been
# fitted, so this cell must run after the pipelines that contain `preprocessor`
# have been fitted.
scalar = preprocessor.named_transformers_['num']

# Save the fitted scaler on its own.
with open("scalar.pkl", "wb") as f:
    pickle.dump(scalar, f)
