In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import pickle

## Load Data


In [2]:
# Read the raw housing table from the CSV that sits next to the notebook.
data = pd.read_csv(filepath_or_buffer="Mumbai.csv")

In [None]:
# A notebook cell only renders its *last* expression, so the preview has to be
# shown explicitly. `display` is provided by the IPython kernel without an import.
display(data.head())

# info() prints column dtypes and non-null counts itself.
data.info()

# Summary statistics; kept as the final expression so it is rendered as the cell output.
data.describe()


## Data Visualization and EDA


In [None]:
# Number of missing entries in each column.
data.isnull().sum()

In [None]:
# Histogram of the target column with a kernel-density overlay.
price_ax = sns.histplot(data=data, x="Price", kde=True)
price_ax.set_title("Distribution of House Prices")
plt.show()

In [None]:
# Pairwise correlations between every column that remains once "Location" is removed.
without_location = data.drop(columns="Location")
correlation_matrix = without_location.corr()

# Draw the matrix as a heatmap on its own figure.
plt.figure(figsize=(9, 10))
sns.heatmap(correlation_matrix, cmap="coolwarm")
plt.title("Feature Correlation heatmap")
plt.show()

## Preprocessing

In [36]:
# Work on a copy of the table without the "Location" column.
data_numeric = data.drop(columns="Location")

# Replace missing values with the mean of their column.
# fillna(..., inplace=True) returns None, so assigning its result would wipe out
# data_numeric; use the returned frame instead.
data_numeric = data_numeric.fillna(data_numeric.mean())

In [None]:
# One-hot encode the table and keep the result under its own name instead of
# discarding it; only a short preview is rendered rather than the whole frame.
data_encoded = pd.get_dummies(data)
data_encoded.head()

In [97]:
# Features: every column except the target and the "Location" column.
# A list (not a set) is passed so the dropped labels have a defined order.
feature_frame = data.drop(columns=["Price", "Location"])

# Fill missing feature values with the column mean here, so the frame that is
# actually fed to the models is the imputed one. When nothing is missing this
# is a no-op.
# NOTE(review): the means are computed over all rows, before the train/test split.
X = feature_frame.fillna(feature_frame.mean(numeric_only=True))  # Features

y = data["Price"]  # Target

## Split the data

In [98]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model Training

In [99]:
# Ordinary least-squares regressor with scikit-learn's default settings.
linear_model = LinearRegression()

In [None]:
# Fit the regressor on the training portion of the split.
linear_model.fit(X=X_train, y=y_train)

## Model Evaluation

In [None]:
# Predict on the held-out rows and score the predictions.
y_pred_lr = linear_model.predict(X_test)

lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
lr_r2 = r2_score(y_test, y_pred_lr)

print("Linear Regression RMSE:", lr_rmse)
print("Linear Regression R^2:", lr_r2)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Candidate settings: three forest sizes crossed with three depth limits.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
)

# Search the grid with 5-fold cross-validation on the training data.
forest = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best estimator found by the search (its repr shows the chosen settings)
print(grid_search.best_estimator_)


## Save Model

In [None]:
# Serialize the fitted linear model. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open("linear_model.pkl", "wb") as model_file:
    pickle.dump(linear_model, model_file)

## Test Model

In [None]:
# Reload the pickled model; the context manager closes the file handle afterwards.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open("linear_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# One hand-written sample row to score with the reloaded model.
# NOTE(review): the row has two values; the model expects one value per training
# feature, in training column order — confirm this matches the columns of X.
new_data = [
    [720, 1],
]
predicted_price = model.predict(new_data)
print("Predicted Price:", predicted_price)

In [88]:
# Add log(1 + Area) as a new column on the working table.
log_area = np.log1p(data["Area"])
data["Log_area"] = log_area

In [89]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Learn the scaling statistics from the training rows only; fitting on all of X
# would leak information about the held-out rows into the preprocessing step.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Full feature matrix scaled with the training statistics (name kept for
# any later cell that refers to X_scaled).
X_scaled = scaler.transform(X)

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The L2 penalty depends on feature scale, so standardize inside a pipeline:
# the scaler is fitted on the training rows during fit() and re-applied
# automatically whenever ridge_model.predict() is called.
ridge_model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
ridge_model.fit(X_train, y_train)


In [None]:
# Score the ridge model on the held-out rows.
y_pred_ridge = ridge_model.predict(X_test)

# Labels name the ridge model so these numbers are not mistaken for the
# plain linear-regression metrics printed earlier.
print("Ridge Regression RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_ridge)))
print("Ridge Regression R^2:", r2_score(y_test, y_pred_ridge))