In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# Load the housing dataset from the Excel workbook in the working directory.
df = pd.read_excel("realistic_housing_data(1).xlsx")

# Structural overview: dimensions and column names.
print(f"\nðŸ“Š Dataset Shape: {df.shape}")
print(f"Features: {df.columns.tolist()}")

print("\nðŸ“ˆ Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

print("\nðŸ“‹ First 5 rows:")
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; in the middle of a cell its result is discarded, so print it.
print(df.head())

print("\nðŸ“Š Statistical Summary:")
print(df.describe())
# Separate the regression target ("price") from the predictor columns.
y = df["price"]
X = df.drop(columns=["price"])

# Group predictor columns by dtype so each group can be routed to its own
# preprocessing step: object columns vs. int64/float64 columns.
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include="object").columns

# Show both column groups (categorical first, then numerical).
print(categorical_features, numerical_features, sep="\n")

# Column-wise preprocessing: one-hot encode the categorical columns
# (categories unseen during fit are ignored at transform time) and
# standardize the numerical columns.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "onehot",
            OneHotEncoder(handle_unknown="ignore"),
            categorical_features,
        ),
        (
            "scale",
            StandardScaler(),
            numerical_features,
        ),
    ]
)

# Chain preprocessing and the regressor so both are fitted together on the
# training data; the fixed random_state seeds the forest.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", RandomForestRegressor(random_state=42)),
    ]
)

# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing + model on the training split, then predict the
# held-out rows (Pipeline.fit returns the fitted pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Regression metrics computed on the held-out split.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Emit the four-line evaluation report.
report_lines = [
    "ðŸ“Š Model Evaluation:",
    f"Mean Absolute Error (MAE): â‚¹{mae:.2f}",
    f"Root Mean Squared Error (RMSE): â‚¹{rmse:.2f}",
    f"RÂ² Score: {r2:.2f}",
]
print("\n".join(report_lines))




ðŸ“Š Dataset Shape: (5000, 15)
Features: ['bedrooms', 'bathrooms', 'sqft', 'lot_size', 'age', 'year_built', 'garage', 'location', 'house_type', 'condition', 'has_pool', 'has_fireplace', 'has_basement', 'school_rating', 'price']

ðŸ“ˆ Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   bedrooms       5000 non-null   int64 
 1   bathrooms      5000 non-null   int64 
 2   sqft           5000 non-null   int64 
 3   lot_size       5000 non-null   int64 
 4   age            5000 non-null   int64 
 5   year_built     5000 non-null   int64 
 6   garage         5000 non-null   int64 
 7   location       5000 non-null   object
 8   house_type     5000 non-null   object
 9   condition      5000 non-null   int64 
 10  has_pool       5000 non-null   int64 
 11  has_fireplace  5000 non-null   int64 
 12  has_basement   5000 non-null   int64 