In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [3]:
# Load the housing dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('House Price India.csv')

# First look at the frame: its dimensions, then the leading rows.
print("Rows, Columns:", data.shape)
display(data.head())

# Summary statistics of the numeric columns. This is displayed explicitly
# because a bare expression that is not the last line of a cell produces no
# output, so the original mid-cell `data.describe()` was silently discarded.
display(data.describe())


Rows, Columns: (14619, 23)


Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,...,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,6762810635,42491,4,2.5,2920,4000,1.5,0,0,5,...,1909,0,122004,52.8878,-114.47,2470,4000,2,51,1400000
1,6762810998,42491,5,2.75,2910,9480,1.5,0,0,3,...,1939,0,122004,52.8852,-114.468,2940,6600,1,53,1200000
2,6762812605,42491,4,2.5,3310,42998,2.0,0,0,3,...,2001,0,122005,52.9532,-114.321,3350,42847,3,76,838000
3,6762812919,42491,3,2.0,2710,4500,1.5,0,0,4,...,1929,0,122006,52.9047,-114.485,2060,4500,1,51,805000
4,6762813105,42491,3,2.5,2600,4750,1.0,0,0,4,...,1951,0,122007,52.9133,-114.59,2380,4750,1,67,790000


In [4]:
# Per-column count of missing values, shown before any rows are removed.
# Displayed explicitly: a bare expression that is not the final line of a
# cell is evaluated and then thrown away, so the original check showed nothing.
display(data.isnull().sum())

# Remove every row that contains at least one missing value.
data = data.dropna()

# One-hot encode categorical columns; drop_first=True omits the first level
# of each encoded column so the dummies are not perfectly collinear.
data = pd.get_dummies(data, drop_first=True)

In [5]:
# Predictor columns for the baseline model, listed in the order they will
# appear in the design matrix.
baseline_features = [
    'number of bedrooms',
    'number of bathrooms',
    'living area',
    'lot area',
    'number of floors',
    'waterfront present',
    'number of views',
    'grade of the house',
    'condition of the house',
    'Area of the house(excluding basement)',
    'Area of the basement',
    'Built Year',
    'Renovation Year',
    'living_area_renov',
    'lot_area_renov',
    'Number of schools nearby',
    'Distance from the airport',
]

X = data[baseline_features]  # feature matrix
y = data['Price']            # regression target


In [6]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Fit a linear regression on the training split (fit returns the estimator itself).
model = LinearRegression().fit(X_train, y_train)

# One row per feature, holding the coefficient the fitted model assigned to it.
coeff = pd.DataFrame({'Coef': model.coef_}, index=X.columns)
coeff


Unnamed: 0,Coef
number of bedrooms,-36978.655296
number of bathrooms,44738.049442
living area,108.756433
lot area,-0.103841
number of floors,25443.884031
waterfront present,562261.058343
number of views,41558.472013
grade of the house,119420.069186
condition of the house,23548.598814
Area of the house(excluding basement),55.3595


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Reload the CSV from disk and discard rows with any missing value.
data = pd.read_csv('House Price India.csv').dropna()

# Derived columns, appended in this order:
#   house_age             - 2025 minus the build year
#   renovated             - 1 when a renovation year is recorded (> 0), else 0
#   living_area_per_floor - living area divided by the number of floors
data = data.assign(
    house_age=lambda frame: 2025 - frame['Built Year'],
    renovated=lambda frame: np.where(frame['Renovation Year'] > 0, 1, 0),
    living_area_per_floor=lambda frame: frame['living area'] / frame['number of floors'],
)

# Final set of predictors used by this model.
selected_features = [
    'number of bedrooms',
    'number of bathrooms',
    'living area',
    'lot area',
    'number of floors',
    'waterfront present',
    'number of views',
    'grade of the house',
    'condition of the house',
    'Area of the basement',
    'house_age',
    'renovated',
    'living_area_per_floor',
]

X = data[selected_features]
y = data['Price']

# 80/20 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the linear model and predict on the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out metrics: R² and root of the mean squared error.
test_r2 = r2_score(y_test, y_pred)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R² Score:", test_r2)
print("RMSE:", test_rmse)

# Coefficient table, indexed by feature name.
coeff = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)
print("\nFeature Coefficients:\n", coeff)

# The estimator's own score on the test split.
print("Model Score on Test Set:", model.score(X_test, y_test))


R² Score: 0.6722656113516154
RMSE: 220234.87926530032

Feature Coefficients:
                           Coefficient
number of bedrooms      -34838.567123
number of bathrooms      45543.866407
living area                205.034875
lot area                    -0.343246
number of floors        -33327.670297
waterfront present      555766.458230
number of views          43744.167578
grade of the house      125091.945434
condition of the house   24595.484353
Area of the basement        19.022508
house_age                 3341.736512
renovated                33865.187562
living_area_per_floor      -71.575778
Model Score on Test Set: 0.6722656113516154


In [9]:
# Predict the target for the held-out test rows with the fitted model.
y_pred = model.predict(X=X_test)


In [10]:
# Compute the held-out metrics first, then report them.
holdout_r2 = r2_score(y_test, y_pred)
holdout_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("R² Score:", holdout_r2)
print("RMSE:", holdout_rmse)


R² Score: 0.6722656113516154
RMSE: 220234.87926530032


In [11]:
# The estimator's built-in score on the test split, shown as the cell result.
model.score(X=X_test, y=y_test)

0.6722656113516154

In [12]:
from sklearn.linear_model import Ridge, Lasso

# Regularised linear models, evaluated on the same train/test split.
ridge = Ridge(alpha=10)
lasso = Lasso(alpha=0.1)

# Fit, predict and report each model in turn (Ridge first, then Lasso).
regularised_preds = {}
for label, estimator in (("Ridge", ridge), ("Lasso", lasso)):
    estimator.fit(X_train, y_train)
    regularised_preds[label] = estimator.predict(X_test)

    print(f"{label} R²:", r2_score(y_test, regularised_preds[label]))
    print(f"{label} RMSE:", np.sqrt(mean_squared_error(y_test, regularised_preds[label])))

# Keep the per-model prediction arrays available under their original names.
ridge_pred = regularised_preds["Ridge"]
lasso_pred = regularised_preds["Lasso"]


Ridge R²: 0.6716925518959194
Ridge RMSE: 220427.3408477998
Lasso R²: 0.672265466484374
Lasso RMSE: 220234.92794010683
