In [None]:
## 1. Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pickle


In [None]:
# Display settings
sns.set(style="darkgrid")
sns.set_palette('coolwarm')


In [None]:
## 2. Loading the Dataset
df = pd.read_csv('../data/Housing.csv')
print('Dataset Loaded Successfully!')
print('Shape of the dataset:', df.shape)
print(df.head())

In [None]:

## 3. Data Exploration and EDA

# Checking for missing values
print('\nMissing Values:')
print(df.isnull().sum())

In [None]:

# Summary statistics
print('\nSummary Statistics:')
print(df.describe())

In [None]:
# Data types
print('\nData Types:')
print(df.dtypes)


In [None]:
# Correlation heatmap
plt.figure(figsize=(12, 8))
encoded_df = pd.get_dummies(df, drop_first=True)
sns.heatmap(encoded_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
#Feature Engineering
# Creating total rooms feature
df['total_rooms'] = df['bedrooms'] + df['bathrooms'] + df['stories'] + df['parking']

# Creating luxury feature count
df['luxury_count'] = (df['airconditioning'] == 'yes').astype(int) + \
                     (df['guestroom'] == 'yes').astype(int) + \
                     (df['hotwaterheating'] == 'yes').astype(int) + \
                     (df['prefarea'] == 'yes').astype(int)

# Encoding furnishing status as an ordinal value
furnishing_mapping = {'unfurnished': 0, 'semi-furnished': 1, 'furnished': 2}
df['furnishingstatus_encoded'] = df['furnishingstatus'].map(furnishing_mapping)

print('\nFeature Engineering Completed:')
print(df[['total_rooms', 'luxury_count', 'furnishingstatus', 'furnishingstatus_encoded']].head())



Feature Engineering Completed:
   total_rooms  luxury_count furnishingstatus  furnishingstatus_encoded
0           11             2        furnished                         2
1           15             1        furnished                         2
2            9             1   semi-furnished                         1
3           11             2        furnished                         2
4            9             2        furnished                         2


In [32]:
# Separate features and target
X = df[['area', 'total_rooms', 'luxury_count', 'furnishingstatus_encoded']]
Y = df['price']

# Encoding categorical features
categorical_cols = X.select_dtypes(include=['object']).columns
column_transformer = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_cols)
    ],
    remainder='passthrough'
)

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [33]:
## 5. Model Training and Evaluation

def evaluate_model(model, X_train, X_test, Y_train, Y_test):
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(Y_test, predictions))
    mae = mean_absolute_error(Y_test, predictions)
    r2 = r2_score(Y_test, predictions)
    print(f'Model: {model.__class__.__name__}')
    print(f'RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.2f}\n')
    return model, rmse, mae, r2

In [34]:
# Linear Regression
lr = Pipeline(steps=[('preprocessor', column_transformer), ('regressor', LinearRegression())])
lr_model, lr_rmse, lr_mae, lr_r2 = evaluate_model(lr, X_train, X_test, Y_train, Y_test)


Model: Pipeline
RMSE: 1345743.53, MAE: 960709.25, R²: 0.64



In [35]:
# Random Forest
rf = Pipeline(steps=[('preprocessor', column_transformer), ('regressor', RandomForestRegressor())])
rf_model, rf_rmse, rf_mae, rf_r2 = evaluate_model(rf, X_train, X_test, Y_train, Y_test)

Model: Pipeline
RMSE: 1391299.92, MAE: 988261.81, R²: 0.62



In [36]:
## 6. Model Optimization (Linear Regression)

param_grid_lr = {
    'regressor__fit_intercept': [True, False]
}

grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='neg_mean_squared_error', error_score='raise')
grid_search_lr.fit(X_train, Y_train)

print('Best Hyperparameters for Linear Regression:', grid_search_lr.best_params_)
print('Best Score (MSE):', -grid_search_lr.best_score_)

# Use the best model from GridSearchCV
best_lr_model = grid_search_lr.best_estimator_


Best Hyperparameters for Linear Regression: {'regressor__fit_intercept': False}
Best Score (MSE): 1164950298392.1943


In [37]:
# Hyperparameters for Random Forest
param_grid_rf = {
    'regressor__n_estimators': [50, 100, 200],      # Number of trees
    'regressor__max_depth': [None, 10, 20, 30],     # Maximum depth of each tree
    'regressor__min_samples_split': [2, 5, 10],     # Minimum samples required to split a node
    'regressor__min_samples_leaf': [1, 2, 4],       # Minimum samples required at a leaf node
    'regressor__bootstrap': [True, False]           # Whether bootstrap samples are used
}

# GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='neg_mean_squared_error')
grid_search_rf.fit(X_train, Y_train)

print('Best Hyperparameters for Random Forest:', grid_search_rf.best_params_)
print('Best Score (MSE):', -grid_search_rf.best_score_)


Best Hyperparameters for Random Forest: {'regressor__bootstrap': True, 'regressor__max_depth': None, 'regressor__min_samples_leaf': 4, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 100}
Best Score (MSE): 1218106464789.3225


In [38]:
## 7. Saving the Best Models

# Saving the best Linear Regression model
best_lr_model = lr if lr_r2 > rf_r2 else rf  # Choose the better model
with open('best_linear_model.pkl', 'wb') as file:
    pickle.dump(lr, file)
print('Best Linear Regression model saved as best_linear_model.pkl')

# Saving the best Random Forest model
best_rf_model = grid_search_rf.best_estimator_
with open('best_random_forest_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)
print('Best Random Forest model saved as best_random_forest_model.pkl')




Best Linear Regression model saved as best_linear_model.pkl
Best Random Forest model saved as best_random_forest_model.pkl


In [39]:
## 8. Loading and Testing Saved Models
def load_and_test_model(filename, X_test, Y_test):
    with open(filename, 'rb') as file:
        model = pickle.load(file)
    predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(Y_test, predictions))
    mae = mean_absolute_error(Y_test, predictions)
    r2 = r2_score(Y_test, predictions)
    print(f'\nLoaded Model: {filename}')
    print(f'RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.2f}\n')

# Test the saved models
load_and_test_model('best_linear_model.pkl', X_test, Y_test)
load_and_test_model('best_random_forest_model.pkl', X_test, Y_test)


Loaded Model: best_linear_model.pkl
RMSE: 1345743.53, MAE: 960709.25, R²: 0.64


Loaded Model: best_random_forest_model.pkl
RMSE: 1425033.27, MAE: 1007792.96, R²: 0.60

