In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silences every warning category for the rest of the session, which includes
# deprecation warnings raised by the libraries imported above.
warnings.simplefilter(action='ignore')

In [None]:
# Load the dataset from a CSV expected in the current working directory
# and preview the first rows.
data = pd.read_csv('House_Rent_Dataset.csv')
data.head()

In [None]:
# Column names, dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [3]:
# Drop the columns that are not used in the rest of the notebook.
# errors="ignore" keeps this cell idempotent: `data` is reassigned here, so
# without it a second run of the cell raises KeyError because the columns
# are already gone.
data = data.drop(
    columns=["Posted On", "Floor", "Area Locality", "Point of Contact"],
    errors="ignore",
)
data.head()

Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom
0,2,10000,1100,Super Area,Kolkata,Unfurnished,Bachelors/Family,2
1,2,20000,800,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1
2,2,17000,1000,Super Area,Kolkata,Semi-Furnished,Bachelors/Family,1
3,2,10000,800,Super Area,Kolkata,Unfurnished,Bachelors/Family,1
4,2,7500,850,Carpet Area,Kolkata,Unfurnished,Bachelors,1


In [None]:
# One count plot per categorical feature. A 2x2 grid is used so the four
# panels fill the figure (a 4x2 grid would leave its lower half empty).
count_cols = ["Area Type", "City", "Furnishing Status", "Tenant Preferred"]
fig, axes = plt.subplots(2, 2, figsize=(14, 8))
for ax, col in zip(axes.flat, count_cols):
    sns.countplot(data=data, x=col, ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Rent against number of BHK, coloured by city; the legend is anchored
# outside the plotting area so it does not cover the points.
fig, ax = plt.subplots(figsize=(5, 5))
sns.scatterplot(data=data, x="Rent", y="BHK", hue="City", ax=ax)
ax.legend(bbox_to_anchor=(1.3, 0.8))
ax.set_title("Relation between rent and BHK by city")

In [None]:
# Rent against size, coloured by city. Titled like the rent/BHK scatter
# so the figure can be read on its own.
plt.figure(figsize=(5,5))
sns.scatterplot(data=data,x="Rent",y="Size",hue="City")
plt.title("Relation between rent and size by city")
plt.show()

In [None]:
# Bar height is seaborn's default estimator applied to Rent within each city.
ax = sns.barplot(x="City", y="Rent", data=data)
ax.set_title("Average rent by the city")
plt.show()

In [None]:
# estimator=np.median makes each bar the median rent for that bathroom
# count, so the title says "Median" rather than "Average".
sns.barplot(data=data, x="Bathroom", y="Rent", estimator=np.median)
plt.title("Median rent by number of bathrooms")
plt.show()

In [None]:
# Bar height is seaborn's default estimator applied to Rent within each
# tenant-preference group.
ax = sns.barplot(x="Tenant Preferred", y="Rent", data=data)
ax.set_title("Average rent by Tenant Preferred")
plt.show()

In [19]:
# Distinct values of the "Area Type" column.
data["Area Type"].unique()

array(['Super Area', 'Carpet Area', 'Built Area'], dtype=object)

In [20]:
# Give each distinct "Area Type" value an integer code, numbered from 0 in
# the order the values come back from unique().
unique_values = data["Area Type"].unique()
mappings = {value: code for code, value in enumerate(unique_values)}

mappings

{'Super Area': 0, 'Carpet Area': 1, 'Built Area': 2}

In [4]:
def create_mappings(feature, frame=None):
    """Build a value -> integer-code mapping for one column.

    Codes start at 0 and follow the order in which the column's distinct
    values are returned by ``unique()``.

    Parameters
    ----------
    feature : str
        Name of the column to encode.
    frame : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``data`` frame is used.

    Returns
    -------
    dict
        Maps each distinct value of ``frame[feature]`` to its integer code.
        Empty if the column has no rows.
    """
    # Resolve the global lazily so callers that pass their own frame do not
    # depend on `data` existing.
    source = data if frame is None else frame
    return {value: code for code, value in enumerate(source[feature].unique())}

In [21]:
# Names of the object-dtype columns in `data`.
data.select_dtypes(include="object").columns.to_list()

['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred']

In [22]:
# Print the value -> code table that create_mappings builds for every
# object-dtype column.
cat_cols = list(data.select_dtypes(include="object").columns)

for col in cat_cols:
    mapping = create_mappings(col)
    print(f"{col}   {mapping}")

Area Type   {'Super Area': 0, 'Carpet Area': 1, 'Built Area': 2}
City   {'Kolkata': 0, 'Mumbai': 1, 'Bangalore': 2, 'Delhi': 3, 'Chennai': 4, 'Hyderabad': 5}
Furnishing Status   {'Unfurnished': 0, 'Semi-Furnished': 1, 'Furnished': 2}
Tenant Preferred   {'Bachelors/Family': 0, 'Bachelors': 1, 'Family': 2}


In [23]:
def data_preprocessing(raw_data):
    """Return a copy of ``raw_data`` with object columns integer-encoded.

    Each object-dtype column is replaced by integer codes starting at 0,
    assigned in the order its distinct values are returned by ``unique()``.
    Other columns are left as they are, and the input frame is not modified.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        Encoded copy of ``raw_data``.

    Notes
    -----
    The codes are arbitrary integers, so a linear model reads them as an
    ordered numeric scale. NOTE(review): for nominal columns, consider
    whether one-hot encoding is more appropriate.
    """
    df = raw_data.copy()

    for col in df.select_dtypes(include="object").columns.to_list():
        # Build the codes from the frame being processed, not from the
        # notebook-level `data`, so the function is correct for any input.
        codes = {value: code for code, value in enumerate(df[col].unique())}
        df[col] = df[col].map(codes)

    return df

In [24]:
# Encode the categorical columns of `data` and preview the result.
df = data_preprocessing(data)
df.head()

Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom
0,2,10000,1100,0,0,0,0,2
1,2,20000,800,0,0,1,0,1
2,2,17000,1000,0,0,1,0,1
3,2,10000,800,0,0,0,0,1
4,2,7500,850,1,0,0,1,1


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Separate the regression target ("Rent") from the predictor columns.
y = df["Rent"]
X = df.drop(columns=["Rent"])

In [9]:
# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [10]:
# Check the shapes of the train and test splits.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3796, 7), (950, 7), (3796,), (950,))

In [11]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit a linear regression with default settings on the training split.
lrm = LinearRegression().fit(X_train, y_train) 

In [13]:
# Predicted rent for every row of the held-out test split.
y_hat = lrm.predict(X_test)

In [14]:
# Table of true and predicted rents for the test rows.
predictions = pd.DataFrame({"Actual": y_test, "Predicted": y_hat})
predictions.head()

Unnamed: 0,Actual,Predicted
1566,16000,34664.070238
3159,12000,19378.453484
538,28000,48433.219277
2630,8000,75934.387977
4418,46000,124482.928193


In [15]:
from sklearn.metrics import mean_squared_error, root_mean_squared_error, r2_score

In [16]:
# Mean squared error of the test-set predictions.
mean_squared_error(y_true=y_test, y_pred=y_hat)

2223670741.3871746

In [17]:
# Root mean squared error of the test-set predictions.
root_mean_squared_error(y_true=y_test, y_pred=y_hat)

47155.81344211098

In [18]:
# R^2 score of the test-set predictions.
r2_score(y_true=y_test, y_pred=y_hat)

0.4420431661569556