In [347]:
# The dataset used for this notebook can be found here: 
# https://www.kaggle.com/datasets/iamsouravbanerjee/house-rent-prediction-dataset

In [348]:
import os

import numpy as np
import pandas as pd

In [349]:
# Location of the Kaggle CSV. The original machine-specific path remains the
# default so existing runs behave the same; set the HOUSE_RENT_CSV environment
# variable to point at the file on any other machine.
datapath = os.environ.get(
    'HOUSE_RENT_CSV',
    'C:/Users/user/Downloads/archive (1)/House_Rent_Dataset.csv',
)

In [350]:
# Load the rent listings from the CSV at `datapath` and preview the first five rows.
dataframe = pd.read_csv(filepath_or_buffer=datapath)
dataframe.head(n=5)

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [351]:
# Dimensions of the loaded frame as (rows, columns).
dataframe.shape

(4746, 12)

In [352]:
# List the column labels so the feature and target names can be picked below.
dataframe.columns

Index(['Posted On', 'BHK', 'Rent', 'Size', 'Floor', 'Area Type',
       'Area Locality', 'City', 'Furnishing Status', 'Tenant Preferred',
       'Bathroom', 'Point of Contact'],
      dtype='object')

In [353]:
# Predictor subset used by the model; any row missing one of these
# fields is discarded before modelling.
important_features = ['BHK', 'Size', 'Area Type', 'City', 'Furnishing Status']
X = dataframe.loc[:, important_features].dropna()
X

Unnamed: 0,BHK,Size,Area Type,City,Furnishing Status
0,2,1100,Super Area,Kolkata,Unfurnished
1,2,800,Super Area,Kolkata,Semi-Furnished
2,2,1000,Super Area,Kolkata,Semi-Furnished
3,2,800,Super Area,Kolkata,Unfurnished
4,2,850,Carpet Area,Kolkata,Unfurnished
...,...,...,...,...,...
4741,2,1000,Carpet Area,Hyderabad,Semi-Furnished
4742,3,2000,Super Area,Hyderabad,Semi-Furnished
4743,3,1750,Carpet Area,Hyderabad,Semi-Furnished
4744,3,1500,Carpet Area,Hyderabad,Semi-Furnished


In [354]:
# Target variable: the Rent column, restricted to the rows that survived the
# dropna() applied to X. Selecting by X.index keeps features and target
# row-aligned, so the later train/test split never receives inputs of
# different lengths if a row with a missing feature is dropped.
y = dataframe.loc[X.index, 'Rent']
y

0       10000
1       20000
2       17000
3       10000
4        7500
        ...  
4741    15000
4742    29000
4743    35000
4744    45000
4745    15000
Name: Rent, Length: 4746, dtype: int64

In [355]:
# seems there are no null values, so we proceed to split the data
from sklearn.model_selection import train_test_split

In [356]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [357]:
# Report the shape of each split, in the order: train features,
# validation features, train target, validation target.
for split_part in (train_X, val_X, train_y, val_y):
    print(split_part.shape)

(3322, 5)
(1424, 5)
(3322,)
(1424,)


In [358]:
# Text-valued predictors that must be one-hot encoded before they can be
# passed to the regression model.
categorical_features = [
    'Area Type',
    'City',
    'Furnishing Status',
]

In [359]:
from sklearn.preprocessing import OneHotEncoder

In [360]:
# Dense output so the encoded matrix can be wrapped directly in a DataFrame.
# handle_unknown='ignore' encodes a category that was absent from the training
# split as an all-zero group instead of raising during transform(), which
# matters for rare categories that may land only in the validation split.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [361]:
# Learn the category vocabulary from the training split only, then apply the
# same fitted encoder to the validation split.
train_categoricals = train_X[categorical_features]
val_categoricals = val_X[categorical_features]
encoded_train_X = encoder.fit_transform(train_categoricals)
encoded_val_X = encoder.transform(val_categoricals)
encoded_train_X.shape

(3322, 12)

In [362]:
# Training design matrix: the numeric columns of the training split placed
# side by side with its one-hot block. The index is reset first so both
# pieces share a 0..n-1 index and line up row for row in the concat.
numeric_train = train_X.drop(columns=categorical_features).reset_index(drop=True)
onehot_train = pd.DataFrame(encoded_train_X)
X_train = pd.concat([numeric_train, onehot_train], axis=1, join='inner')

X_train

Unnamed: 0,BHK,Size,0,1,2,3,4,5,6,7,8,9,10,11
0,1,400,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,800,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2,700,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2,1250,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2,742,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3317,1,450,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3318,1,545,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3319,3,806,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3320,2,1600,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [363]:
# Validation design matrix, assembled the same way as the training one:
# numeric columns (index reset to 0..n-1) joined column-wise with the
# one-hot block produced by the already-fitted encoder.
numeric_val = val_X.drop(columns=categorical_features).reset_index(drop=True)
onehot_val = pd.DataFrame(encoded_val_X)
X_val = pd.concat([numeric_val, onehot_val], axis=1, join='inner')
X_val

Unnamed: 0,BHK,Size,0,1,2,3,4,5,6,7,8,9,10,11
0,1,700,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,2,1050,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,2,800,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1,750,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,2,530,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,3,704,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1420,1,400,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1421,3,1450,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1422,3,950,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [364]:
from sklearn.linear_model import LinearRegression

In [365]:
# Linear regression estimator created with its default settings.
model = LinearRegression()

In [366]:
# The one-hot columns carry integer labels while 'BHK' and 'Size' are strings;
# cast every column label to str so both frames have uniformly typed names.
for feature_frame in (X_train, X_val):
    feature_frame.columns = feature_frame.columns.astype(str)

In [367]:
# Sanity check: rows and columns of the training design matrix (2 numeric + one-hot columns).
X_train.shape

(3322, 14)

In [368]:
# Length of the full target Series (all rows, not only the training split).
# NOTE(review): the model below is fit against train_y, so train_y.shape is
# probably the more relevant check here — confirm intent.
y.shape

(4746,)

In [369]:
# Fit the linear regression on the encoded training features and training target.
# NOTE(review): scikit-learn's fit() is documented to return the estimator
# itself, so lr_model and model should refer to the same object — confirm.
lr_model = model.fit(X_train,train_y)

In [370]:
# Predict rent for every row of the validation design matrix.
prediction = lr_model.predict(X_val)

In [371]:
# Peek at the first five validation predictions.
# NOTE(review): the recorded output contains a negative value; nothing in this
# pipeline constrains predicted rent to be positive.
prediction[:5]

array([ 3734.95392301, 10370.38569963,  9248.41396877, -6619.02973783,
        4544.17550595])

In [372]:
# Actual rents for the first five validation rows, for comparison with the predictions above.
val_y.head(5)

230     18000
4500    10500
3277    16000
4045    13000
1561     9500
Name: Rent, dtype: int64

In [373]:
# obtain the MAE (mean absolute error)
from sklearn.metrics import mean_absolute_error

In [374]:
# Mean absolute error on the validation split. scikit-learn's signature is
# (y_true, y_pred); MAE is symmetric, so the value is unchanged, but passing
# the arguments in the documented order avoids a silent error if an
# asymmetric metric is substituted later.
mae = mean_absolute_error(val_y, prediction)
mae

24726.12605846923