1. Load the Dataset

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
# Colab-only helper: files.upload() opens a browser file picker and blocks until
# a file has been chosen. Presumably this is how data.csv (read in the next cell)
# gets into the working directory — confirm when running outside Colab.
from google.colab import files
uploaded = files.upload()

In [None]:
# Read the housing table from 'data.csv' (path relative to the working directory)
# and preview its first rows.
house_data=pd.read_csv('data.csv')
house_data.head()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) using pandas' defaults.
house_data.describe()

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
house_data.info()

In [None]:
# (number of rows, number of columns) of the loaded frame.
house_data.shape

In [None]:
# Column labels of the loaded frame.
house_data.columns

2.Data Description

i) Handling Missing Data

In [None]:
# Number of missing (NaN) entries in each column. This cell only reports them;
# nothing is imputed or dropped here.
house_data.isna().sum()

ii) Duplicate records

In [None]:
# Number of rows that exactly repeat an earlier row. This cell only counts them;
# no rows are removed here.
house_data.duplicated().sum()

iii) Detect and Treat Outliers

In [None]:
def remove_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``; values equal to a
    fence are kept. Rows whose value is missing fail the range check and are
    therefore dropped as well. ``df`` itself is not modified; the original
    index labels of the surviving rows are preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : label
        Name of the column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` within the fences.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - spread * 1.5
    high_fence = third_quartile + spread * 1.5
    # between() is inclusive on both ends by default.
    return df[values.between(low_fence, high_fence)]

iv)Convert datatypes and Ensure Consistency

In [None]:
# Current dtype of every column (inspection only; nothing is converted in this cell).
house_data.dtypes

v)Encode Categorical Values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Separate the target (price) from the predictors.
X = house_data.drop(columns='price')
y = house_data['price']
categorical_cols = list(X.select_dtypes(include=['object']).columns)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The encoder is fitted on the training rows only; categories that appear only
# in the test rows are encoded as all zeros (handle_unknown='ignore').
encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
train_encoded = encoder.fit_transform(X_train[categorical_cols])
test_encoded = encoder.transform(X_test[categorical_cols])
encoded_names = encoder.get_feature_names_out(categorical_cols)

# Re-attach the original row labels so the encoded frames line up with X_train / X_test.
X_train_cat = pd.DataFrame(train_encoded, columns=encoded_names, index=X_train.index)
X_test_cat = pd.DataFrame(test_encoded, columns=encoded_names, index=X_test.index)

vi)Normalize or Standardize features

In [None]:
from sklearn.model_selection import train_test_split

# Re-create the 80/20 split with the fixed seed so this cell can run on its own.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the non-object columns only. The scaler learns mean/std from the
# training rows and re-uses them for the test rows to avoid leakage.
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train.select_dtypes(exclude=['object']))
x_test = scaler.transform(X_test.select_dtypes(exclude=['object']))

# Kept as the last expression so the notebook actually displays the shapes
# (a bare expression in the middle of a cell is evaluated and discarded).
X_train.shape, X_test.shape

3.Exploratory Data Analysis(EDA)

a) Univariate Analysis

i) Histogram

In [None]:
# One histogram per numeric column of house_data, laid out on a single 20x30 figure.
house_data.hist(figsize=(20,30))
plt.show()

ii) Barplot

In [None]:
# Number of rows per city; value_counts() orders them from most to least frequent.
city_counts = house_data['city'].value_counts()
city_counts.plot(kind='bar', figsize=(12, 5))
plt.show()

b) Bivariate/Multivariate Analysis

i) Correlation Matrix

In [None]:
# Pairwise correlation between the int64/float64 columns, drawn as an annotated heatmap.
numeric_columns = house_data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

ii) Pairplot

In [None]:
# sns.pairplot is a figure-level function: it builds its own figure (a PairGrid).
# A preceding plt.figure(...) therefore does not size the grid; it only leaves an
# extra empty figure behind, so none is created here. Use pairplot's own
# `height` / `aspect` arguments if the panels need resizing.
sns.pairplot(house_data)
plt.show()

iii) Scatterplot

In [None]:
# Scatter of price (x-axis) against number of floors (y-axis).
sns.scatterplot(x='price', y='floors', data=house_data)

c) Analysis of Relationships

In [None]:
# Draw one "Price vs <column>" bar chart for every column except the target
# ('price') and 'country'.
feature_names = house_data.drop(columns=['price','country']).columns.to_numpy()
for col in feature_names:
    # One explicitly sized figure per column. The plotting code used to be
    # pasted twice in this loop, which drew every chart a second time on an
    # implicit default-sized figure.
    plt.figure(figsize=(18, 8))
    plt.bar(house_data[col], house_data['price'], color='skyblue')
    plt.xlabel(col)
    plt.xticks(rotation=90)  # tick labels can be long strings; rotate to keep them legible
    plt.ylabel('Price')
    plt.title(f'Price vs {col}')
    plt.show()

4.Feature Engineering

i) Features based on EDA insights

In [None]:
# Derive the age of each house from its construction year, when that column is present.
if 'yr_built' in house_data.columns:
    reference_year = 2025  # fixed year the age is measured against
    house_data['house_age'] = reference_year - house_data['yr_built']

ii) Split columns

In [None]:
# Split a combined 'state' column into Country / City / State parts.
if 'state' not in house_data.columns:
    print("Column 'state' not found in the DataFrame.")
else:
    house_data.rename(columns={'state': 'State'}, inplace=True)
    # The delimiter is ' , ' (comma with a space on each side). The assignment
    # below expects the split to yield exactly three columns.
    parts = house_data['State'].str.split(' , ', expand=True)
    house_data[['Country', 'City', 'State']] = parts

iii) Polynomial Features

In [None]:
# Degree-2 polynomial expander: squares and pairwise products of the inputs,
# without the constant (bias) column.
# NOTE(review): the transformer is only constructed here; no fit/transform call
# appears in this cell — confirm whether it is applied elsewhere.
from sklearn.preprocessing import PolynomialFeatures
Pf=PolynomialFeatures(degree=2,interaction_only=False, include_bias=False)

iv) Dimensionality Reduction(PCA)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Turn the date strings into integer day ordinals. Guarded so that re-running the
# cell does not feed the already-converted integers back into pd.to_datetime
# (which would read them as nanosecond timestamps and flatten the column).
if not pd.api.types.is_numeric_dtype(X['date']):
    X['date'] = pd.to_datetime(X['date']).apply(lambda date: date.toordinal())

# One-hot encode whatever object columns remain and swap them into X.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
if categorical_cols:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_encoded = pd.DataFrame(
        encoder.fit_transform(X[categorical_cols]),
        columns=encoder.get_feature_names_out(categorical_cols),
        # Re-use X's row labels: concat(axis=1) aligns on the index, so a fresh
        # RangeIndex would misalign (and inject NaNs) whenever rows were filtered.
        index=X.index,
    )
    X = pd.concat([X.drop(columns=categorical_cols), X_encoded], axis=1)
X.columns = X.columns.astype(str)

# Project the fully numeric matrix onto its first two principal components.
# NOTE(review): the features are not standardised before PCA, so columns with
# large raw magnitudes will dominate the components — confirm this is intended.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

5.Model Building

a) Machine Training Models

i) Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Re-split from the current X: the X_train left over from the scaling cell was
# taken before X was date-converted and one-hot encoded in the PCA cell, so it
# still contains raw string columns that an estimator cannot fit on.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# price is a continuous target, so this is a regression problem.
# LogisticRegression is a classifier and rejects continuous labels.
model = LinearRegression()
model.fit(X_train, y_train)

ii) Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Re-split from the current X so the forest is trained on the numeric, encoded
# features rather than on a split taken before X was encoded in the PCA cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seeded so the fitted forest (and everything derived from it) is reproducible.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)

iii) Train the model and evaluate it with MAE, RMSE and R²

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Convert the date BEFORE picking categorical columns. Otherwise a string 'date'
# is one-hot encoded and dropped (so a later X['date'] lookup fails), and an
# already-numeric 'date' would be re-parsed as nanosecond timestamps, collapsing
# the column to a constant. The guard makes the cell safe to re-run.
if 'date' in X.columns and not pd.api.types.is_numeric_dtype(X['date']):
    X['date'] = pd.to_datetime(X['date']).apply(lambda date: date.toordinal())

# One-hot encode any remaining object columns; skipped when X is already numeric.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
if categorical_cols:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    X_encoded = pd.DataFrame(
        encoder.fit_transform(X[categorical_cols]),
        columns=encoder.get_feature_names_out(categorical_cols),
        index=X.index,  # keep row labels so concat(axis=1) aligns row-for-row
    )
    X = pd.concat([X.drop(columns=categorical_cols), X_encoded], axis=1)
X.columns = X.columns.astype(str)

# 80/20 hold-out split with the fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Seeded forest so the reported metrics are reproducible.
reg = RandomForestRegressor(random_state=42)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# Hold-out metrics: MAE and RMSE are in price units, R² is unitless.
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")

9.Visualization of Results and Model Insights

a) Model Behaviour

i) Residual Plots

In [None]:
def plot_residuals(y_true, y_pred, model_name):
    """Show the distribution of the residuals ``y_true - y_pred``.

    Draws a histogram with a KDE overlay on a new 8x5 figure titled
    ``"<model_name> Residuals"`` and displays it.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values; must be subtractable from ``y_true``.
    model_name : str
        Label used in the figure title.

    Returns
    -------
    None
    """
    errors = y_true - y_pred
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(errors, kde=True, color='purple', ax=ax)
    ax.set_title(f"{model_name} Residuals")
    ax.set_xlabel("Residual")
    ax.grid(True)
    plt.show()

ii) Feature Importance Plot

In [None]:
# Horizontal bar chart of the fitted forest's importances, one bar per training column.
features = X_train.columns
importances = reg.feature_importances_
ax = sns.barplot(x=importances, y=features)
ax.set_title("Feature Importance")
plt.show()

iii) Visual Comparison of Model Performance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def _fit_and_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Returns
    -------
    tuple
        ``(y_pred, mae, rmse, r2)`` computed on the test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return y_pred, mae, rmse, r2


# Same 80/20 split and seed for every model so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression()
y_pred_lr, mae_lr, rmse_lr, r2_lr = _fit_and_score(lr, X_train, X_test, y_train, y_test)

# The stochastic models are seeded so the comparison is reproducible.
rf = RandomForestRegressor(random_state=42)
y_pred_rf, mae_rf, rmse_rf, r2_rf = _fit_and_score(rf, X_train, X_test, y_train, y_test)

xgb = XGBRegressor(random_state=42)
y_pred_xgb, mae_xgb, rmse_xgb, r2_xgb = _fit_and_score(xgb, X_train, X_test, y_train, y_test)

results = pd.DataFrame({
    'Model': ['Linear', 'Random Forest', 'XGBoost'],
    'MAE': [mae_lr, mae_rf, mae_xgb],
    'RMSE': [rmse_lr, rmse_rf, rmse_xgb],
    'R²': [r2_lr, r2_rf, r2_xgb]
})

# MAE/RMSE are in price units while R² is a unitless score; sharing one y-axis
# would squash the R² bars to invisibility. Give every metric its own panel
# (each panel is titled with the metric name by pandas).
axes = results.set_index('Model').plot(
    kind='bar', subplots=True, layout=(1, 3), sharey=False,
    legend=False, figsize=(15, 5), rot=0,
)
plt.suptitle("Model Performance Comparison")
plt.tight_layout()
plt.show()