In [1]:
# Install required packages
!pip install pandas scikit-learn




Load Dataset into Colab

In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("/content/House_Rent_Dataset.csv")

# Display first 5 rows
print(df.head())

# Show dataset shape (rows, columns)
print("Shape of dataset:", df.shape)


    Posted On  BHK   Rent  Size            Floor    Area Type  \
0  2022-05-18    2  10000  1100  Ground out of 2   Super Area   
1  2022-05-13    2  20000   800       1 out of 3   Super Area   
2  2022-05-16    2  17000  1000       1 out of 3   Super Area   
3  2022-07-04    2  10000   800       1 out of 2   Super Area   
4  2022-05-09    2   7500   850       1 out of 2  Carpet Area   

              Area Locality     City Furnishing Status  Tenant Preferred  \
0                    Bandel  Kolkata       Unfurnished  Bachelors/Family   
1  Phool Bagan, Kankurgachi  Kolkata    Semi-Furnished  Bachelors/Family   
2   Salt Lake City Sector 2  Kolkata    Semi-Furnished  Bachelors/Family   
3               Dumdum Park  Kolkata       Unfurnished  Bachelors/Family   
4             South Dum Dum  Kolkata       Unfurnished         Bachelors   

   Bathroom Point of Contact  
0         2    Contact Owner  
1         1    Contact Owner  
2         1    Contact Owner  
3         1    Contact Owner

Explore Data (EDA Basics)

In [3]:
# See column names
print("Columns:", df.columns)

# Data types & missing values
print("\nDataset Info:")
print(df.info())

# Summary statistics for numeric columns
print("\nSummary Statistics:")
print(df.describe())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())


Columns: Index(['Posted On', 'BHK', 'Rent', 'Size', 'Floor', 'Area Type',
       'Area Locality', 'City', 'Furnishing Status', 'Tenant Preferred',
       'Bathroom', 'Point of Contact'],
      dtype='object')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int

#Preprocessing Plan



Drop irrelevant columns:

Posted On (date not useful now)

Point of Contact (not useful for prediction)

Handle “Floor” column:

Example: "Ground out of 2", "3 out of 5".

We’ll split into two numeric features:

Current Floor (e.g., Ground = 0, 3 = 3)

Total Floors (e.g., 2, 5, etc.)

Encode categorical features:

Area Type, City, Furnishing Status, Tenant Preferred → convert to numbers using Label Encoding / OneHotEncoding.

Area Locality has too many unique values → we’ll drop it (too detailed, won’t help model).

In [4]:
# Drop irrelevant columns
df = df.drop(["Posted On", "Point of Contact", "Area Locality"], axis=1)

# Handle Floor column
df[['Current Floor', 'Total Floors']] = df['Floor'].str.split(' out of ', expand=True)

# Replace "Ground" with 0 and convert to numeric
df['Current Floor'] = df['Current Floor'].replace('Ground', 0)
df['Current Floor'] = pd.to_numeric(df['Current Floor'], errors='coerce')
df['Total Floors'] = pd.to_numeric(df['Total Floors'], errors='coerce')

# Drop old Floor column
df = df.drop('Floor', axis=1)

# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred']

le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# Check processed dataset
print(df.head())
print("\nProcessed Shape:", df.shape)


   BHK   Rent  Size  Area Type  City  Furnishing Status  Tenant Preferred  \
0    2  10000  1100          2     4                  2                 1   
1    2  20000   800          2     4                  1                 1   
2    2  17000  1000          2     4                  1                 1   
3    2  10000   800          2     4                  2                 1   
4    2   7500   850          1     4                  2                 0   

   Bathroom  Current Floor  Total Floors  
0         2            0.0           2.0  
1         1            1.0           3.0  
2         1            1.0           3.0  
3         1            1.0           2.0  
4         1            1.0           2.0  

Processed Shape: (4746, 10)


Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Features (X) and Target (y)
X = df.drop("Rent", axis=1)
y = df["Rent"]

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (3796, 9)
Test set shape: (950, 9)


Scaling the data

In [6]:
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Fit on training data, transform both train and test
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled Training Data Shape:", X_train_scaled.shape)
print("Scaled Test Data Shape:", X_test_scaled.shape)


Scaled Training Data Shape: (3796, 9)
Scaled Test Data Shape: (950, 9)


Linear Regression

Check for NaNs

In [8]:
import numpy as np

print("NaNs in training set:", np.isnan(X_train_scaled).sum())
print("NaNs in test set:", np.isnan(X_test_scaled).sum())


NaNs in training set: 32
NaNs in test set: 6


Fill NaNs with median of the column

In [9]:
from sklearn.impute import SimpleImputer

# Initialize imputer to fill NaNs with median
imputer = SimpleImputer(strategy='median')

# Fit on training data and transform both train and test
X_train_scaled = imputer.fit_transform(X_train_scaled)
X_test_scaled = imputer.transform(X_test_scaled)

# Check again for NaNs
import numpy as np
print("NaNs in training set after imputation:", np.isnan(X_train_scaled).sum())
print("NaNs in test set after imputation:", np.isnan(X_test_scaled).sum())



NaNs in training set after imputation: 0
NaNs in test set after imputation: 0


Train Linear Regression Model

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Initialize Linear Regression
lin_reg = LinearRegression()

# Train the model
lin_reg.fit(X_train_scaled, y_train)

# Predictions
y_pred_train = lin_reg.predict(X_train_scaled)
y_pred_test = lin_reg.predict(X_test_scaled)

# Evaluate
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("Linear Regression Results:")
print("Train MSE:", mse_train)
print("Test MSE:", mse_test)
print("Train R²:", r2_train)
print("Test R²:", r2_test)


Linear Regression Results:
Train MSE: 4917224830.29888
Test MSE: 2134057395.8914723
Train R²: 0.2581407149699284
Test R²: 0.46452867967847356


Train Random Forest Regressor

In [11]:
from sklearn.ensemble import RandomForestRegressor

# Initialize model
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)

# Train
rf_reg.fit(X_train_scaled, y_train)

# Predictions
y_pred_train_rf = rf_reg.predict(X_train_scaled)
y_pred_test_rf = rf_reg.predict(X_test_scaled)

# Evaluation
mse_train_rf = mean_squared_error(y_train, y_pred_train_rf)
mse_test_rf = mean_squared_error(y_test, y_pred_test_rf)
r2_train_rf = r2_score(y_train, y_pred_train_rf)
r2_test_rf = r2_score(y_test, y_pred_test_rf)

print("Random Forest Results:")
print("Train MSE:", mse_train_rf)
print("Test MSE:", mse_test_rf)
print("Train R²:", r2_train_rf)
print("Test R²:", r2_test_rf)


Random Forest Results:
Train MSE: 841508732.7666111
Test MSE: 1983606037.7670372
Train R²: 0.8730419925096542
Test R²: 0.5022794859755104


In [12]:
import joblib

# Save Random Forest model
joblib.dump(rf_reg, "house_rent_rf_model.pkl")

# Save the scaler
joblib.dump(scaler, "scaler.pkl")

print("Model and scaler saved successfully!")


Model and scaler saved successfully!


In [14]:
import pandas as pd

# Example new data
new_data = pd.DataFrame({
    'BHK': [2],
    'Size': [950],
    'Area Type': [2],           # Encoded same as training (LabelEncoder)
    'City': [4],                # Encoded same as training
    'Furnishing Status': [1],   # Encoded same as training
    'Tenant Preferred': [1],    # Encoded same as training
    'Bathroom': [2],
    'Current Floor': [1],
    'Total Floors': [3]
})


In [15]:
import joblib

# Load model and scaler
model = joblib.load("house_rent_rf_model.pkl")
scaler = joblib.load("scaler.pkl")


In [16]:
new_data_scaled = scaler.transform(new_data)


In [17]:
predicted_rent = model.predict(new_data_scaled)
print("Predicted Rent:", predicted_rent[0])


Predicted Rent: 16763.333333333336


In [18]:
import pandas as pd

# Example: 3 new houses
new_data_batch = pd.DataFrame({
    'BHK': [2, 3, 1],
    'Size': [950, 1200, 500],
    'Area Type': [2, 2, 1],          # Encoded same as training
    'City': [4, 4, 3],               # Encoded same as training
    'Furnishing Status': [1, 2, 0],  # Encoded same as training
    'Tenant Preferred': [1, 1, 0],   # Encoded same as training
    'Bathroom': [2, 3, 1],
    'Current Floor': [1, 2, 0],
    'Total Floors': [3, 5, 2]
})


In [19]:
new_data_scaled = scaler.transform(new_data_batch)
predicted_rents = model.predict(new_data_scaled)

# Display results
new_data_batch['Predicted Rent'] = predicted_rents
print(new_data_batch)


   BHK  Size  Area Type  City  Furnishing Status  Tenant Preferred  Bathroom  \
0    2   950          2     4                  1                 1         2   
1    3  1200          2     4                  2                 1         3   
2    1   500          1     3                  0                 0         1   

   Current Floor  Total Floors  Predicted Rent  
0              1             3    16763.333333  
1              2             5    15730.000000  
2              0             2     9675.000000  


In [25]:
from google.colab import files

files.download("house_rent_rf_model.pkl")
files.download("scaler.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>