In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


pd.set_option('display.max_columns', None)


df = pd.read_csv('../data/bangalore_rent_data_model_ready.csv')

print(f"Dataset shape: {df.shape}")
df.head()


Dataset shape: (2065, 6)


Unnamed: 0,Title,Location,Rent,Bathrooms,BHK,Area
0,"3 BHK Flat for Rent in L&T Raintree Boulevard,...","L&T Raintree Boulevard, Hebbal",69000,3.0,3.0,986
1,"4 BHK Flat for Rent in Brigade Gateway, Rajaji...","Brigade Gateway, Rajajinagar",100000,4.0,4.0,2000
2,"2 BHK Flat for Rent in Sparkle Marquise, Banne...","Sparkle Marquise, Bannerghatta Main Road",55000,2.0,2.0,834
3,"2 BHK Flat for Rent in Godrej Woodsman Estate,...","Godrej Woodsman Estate, Hebbal",59000,2.0,2.0,1302
4,4 BHK Flat for Rent in Prestige Estates Shanti...,"Prestige Estates Shantiniketan, Whitefield",130000,5.0,4.0,3122


In [2]:
df_model = df.drop(['Title'], axis=1)

print("Columns used for modeling:", df_model.columns.tolist())

X = df_model.drop('Rent', axis=1)
y = df_model['Rent']


Columns used for modeling: ['Location', 'Rent', 'Bathrooms', 'BHK', 'Area']


In [3]:
# Step 3: Handle Categorical Variable - Location

# Count of each location
location_counts = X['Location'].value_counts()

# Replace locations with less than 10 listings as 'Other'
locations_to_replace = location_counts[location_counts < 10].index
X['Location'] = X['Location'].apply(lambda loc: 'Other' if loc in locations_to_replace else loc)

# One-hot encoding for location
X_encoded = pd.get_dummies(X, columns=['Location'], drop_first=True)

# Show the shape after encoding
print("Shape after one-hot encoding:", X_encoded.shape)


Shape after one-hot encoding: (2065, 28)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Step 5: Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    results.append({
        'Model': name,
        'MAE': round(mae, 2),
        'RMSE': round(rmse, 2),
        'R² Score': round(r2, 4)
    })

#DataFrame for comparison
results_df = pd.DataFrame(results).sort_values(by='R² Score', ascending=False)
print("Model Comparison:\n")
print(results_df)


Model Comparison:

               Model       MAE      RMSE  R² Score
0  Linear Regression  13045.74  21084.55    0.7243
3  Gradient Boosting  13156.55  21333.14    0.7178
2      Random Forest  14483.97  24716.60    0.6211
1                SVR  28894.21  41138.91   -0.0495


In [6]:
df['Rent'].mean()


50849.341888619856