In [19]:
# libraries for data wrangling and computing
import pandas as pd
import numpy as np
import math


# vizualistion libraries
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

# statistical libraries
from scipy import stats
import statsmodels.api as sm



# customize my visuals

sns.set_style('darkgrid')
sns.set_palette('husl')


In [20]:
data = pd.read_csv('House_Rent_Dataset.csv')

In [21]:
data.head(5)

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [22]:
data.shape

(4746, 12)

In [23]:
# taking a list of the continuous variables for modelling

data_df = data[['BHK', 'Size','Bathroom','Floor','Rent']]

In [29]:
data_df.head()

Unnamed: 0,BHK,Size,Bathroom,Floor,Rent,house floor,no floor
0,2,1100,2,Ground out of 2,10000,,
1,2,800,1,1 out of 3,20000,1.0,3.0
2,2,1000,1,1 out of 3,17000,1.0,3.0
3,2,800,1,1 out of 2,10000,1.0,2.0
4,2,850,1,1 out of 2,7500,1.0,2.0


In [None]:

data_df['house floor'] = data_df['Floor'].str.extract(r'(\d+) out of \d+')
data_df['no floor'] = data_df['Floor'].str.extract(r'\d+ out of (\d+)')

In [27]:
data_df[data_df['Floor'] ==  '19 out of 24']

Unnamed: 0,BHK,Size,Bathroom,Floor,Rent,house floor,no floor
550,1,275,1,19 out of 24,20000,19,24


In [31]:
# checking missing values
data_df.isna().sum()

BHK              0
Size             0
Bathroom         0
Floor            0
Rent             0
house floor    964
no floor       964
dtype: int64

In [None]:
data_df.dropna(inplace=True)

data_df.reset_index(inplace= True, drop = True)

In [None]:
data_df.drop(columns= ['index'], inplace= True)

In [40]:
data_df.corr(method='pearson')

  data_df.corr(method='pearson')


Unnamed: 0,BHK,Size,Bathroom,Rent
BHK,1.0,0.736489,0.817497,0.372882
Size,0.736489,1.0,0.745046,0.411222
Bathroom,0.817497,0.745046,1.0,0.441335
Rent,0.372882,0.411222,0.441335,1.0


In [54]:
X = sm.add_constant(data= data_df[['BHK', 'Size', 'Bathroom', 'house floor', 'no floor']])

model = sm.OLS(data_df['Rent'], X).fit()

model.summary()

0,1,2,3
Dep. Variable:,Rent,R-squared:,0.254
Model:,OLS,Adj. R-squared:,0.253
Method:,Least Squares,F-statistic:,256.5
Date:,"Tue, 19 Dec 2023",Prob (F-statistic):,1.3e-236
Time:,13:48:36,Log-Likelihood:,-47769.0
No. Observations:,3782,AIC:,95550.0
Df Residuals:,3776,BIC:,95590.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-4.201e+04,3392.283,-12.384,0.000,-4.87e+04,-3.54e+04
BHK,-2071.0839,2651.604,-0.781,0.435,-7269.799,3127.631
Size,31.3994,3.061,10.257,0.000,25.397,37.401
Bathroom,1.951e+04,2667.914,7.313,0.000,1.43e+04,2.47e+04
house floor,929.7182,391.018,2.378,0.017,163.092,1696.345
no floor,1384.0327,238.875,5.794,0.000,915.696,1852.370

0,1,2,3
Omnibus:,9608.778,Durbin-Watson:,1.898
Prob(Omnibus):,0.0,Jarque-Bera (JB):,226491370.861
Skew:,27.716,Prob(JB):,0.0
Kurtosis:,1200.584,Cond. No.,3630.0


In [55]:
def predict_rent():
    size = float(input('size: '))
    bathroom = float(input('bathroom: '))
    floor = float(input('floor: '))
    no_floor = float(input('no_floor: '))
    
    predicted_rent = -0.0004201 + (31.3994 * size) + (0.00041951 * bathroom) + (929.7182 * floor) + (1384.0327 * no_floor)
    return predicted_rent

In [51]:
data_df.drop(columns= ['Floor'], inplace= True)

data_df = data_df.apply(pd.to_numeric)

In [52]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3782 entries, 0 to 3781
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   BHK          3782 non-null   int64
 1   Size         3782 non-null   int64
 2   Bathroom     3782 non-null   int64
 3   Rent         3782 non-null   int64
 4   house floor  3782 non-null   int64
 5   no floor     3782 non-null   int64
dtypes: int64(6)
memory usage: 177.4 KB


In [None]:
const	-4.201e+04	3392.283	-12.384	0.000	-4.87e+04	-3.54e+04
Size	31.3994	3.061	10.257	0.000	25.397	37.401
Bathroom	1.951e+04	2667.914	7.313	0.000	1.43e+04	2.47e+04
house floor	929.7182	391.018	2.378	0.017	163.092	1696.345
no floor	1384.0327	238.875	5.794	0.000	915.696	1852.370


In [56]:
data_df.head(3)

Unnamed: 0,BHK,Size,Bathroom,Rent,house floor,no floor
0,2,800,1,20000,1,3
1,2,1000,1,17000,1,3
2,2,800,1,10000,1,2


In [57]:
predict_rent()

30201.33629941