In [8]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Location of the cleaned dataset, relative to the notebook's working directory.
# Built with pathlib so the separators are correct on every OS (the previous
# literal used Windows-only backslashes and could not be opened on Linux/macOS).
DATA_PATH = Path('..') / 'data' / 'cleaned' / 'data.csv'
df = pd.read_csv(DATA_PATH)

# Predictors (three numeric columns plus the categorical `city`) and the target.
X = df[['area', 'rooms', 'floor', 'city']]
y = df['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)


Training set shape: (9363, 4)
Test set shape: (2341, 4)


In [10]:
# Column groups routed to the two preprocessing branches below.
numeric_features = ['area', 'rooms', 'floor']
categorical_features = ['city']

# Numeric columns are standardised (zero mean, unit variance).
numeric_transformer = StandardScaler()

# `city` is one-hot encoded with the first level dropped as the reference
# category. handle_unknown='ignore' keeps transform() from raising when a city
# shows up that was not present in the data the encoder was fitted on (for
# example a rare city that lands only in the test split or in a single
# cross-validation fold); such rows are encoded as all zeros, i.e. they are
# treated like the dropped reference level.
categorical_transformer = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)

# Apply each transformer to its own column group; output columns follow the
# order of this list (numeric first, then the encoded city columns).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [11]:
# Chain preprocessing and ordinary least squares into a single estimator so
# the same transformations are applied at fit and predict time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

# Fit on the training split only; left as the last expression so the cell
# still displays the fitted pipeline.
model.fit(X_train, y_train)

In [None]:
# Predictions for both splits, reused by the evaluation and plotting cells.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)


def _rmse(actual, predicted):
    """Root mean squared error: square root of sklearn's mean_squared_error."""
    return np.sqrt(mean_squared_error(actual, predicted))


# (label, scoring function) pairs, in the order they should be reported.
_scorers = [
    ('R2 Score', r2_score),
    ('RMSE', _rmse),
    ('MAE', mean_absolute_error),
]
# (split name, true values, predicted values) triples.
_splits = [
    ('Train', y_train, y_pred_train),
    ('Test', y_test, y_pred_test),
]

# Every metric evaluated on every split, keyed as "<metric> (<split>)".
metrics = {
    f"{label} ({split})": scorer(actual, predicted)
    for label, scorer in _scorers
    for split, actual, predicted in _splits
}

print("\nModel Performance Metrics:")
for name, score in metrics.items():
    print(f"{name}: {score:,.2f}")


In [15]:
# 5-fold cross-validated R² of the full pipeline on the training split.
cv_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)

# Report the per-fold scores, their mean, and a +/- two-standard-deviation band.
cv_mean = cv_scores.mean()
cv_band = cv_scores.std() * 2
print("\nCross-validation R² scores:", cv_scores)
print(f"Average CV R² score: {cv_mean:.3f} (+/- {cv_band:.3f})")

In [None]:
# Scatter of held-out actual prices against the model's predictions.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_test, alpha=0.5)

# Dashed red y = x reference line spanning the range of actual prices;
# points on it are perfect predictions.
price_span = [y_test.min(), y_test.max()]
ax.plot(price_span, price_span, 'r--', lw=2)

ax.set_xlabel('Actual Price')
ax.set_ylabel('Predicted Price')
ax.set_title('Actual vs Predicted House Prices')
fig.tight_layout()
plt.show()


In [None]:
# Residuals on the test split (actual minus predicted).
errors = y_test - y_pred_test

# Histogram of the residuals.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(errors, bins=50)
ax.set(
    xlabel='Prediction Error',
    ylabel='Count',
    title='Distribution of Prediction Errors',
)
fig.tight_layout()
plt.show()

In [None]:
# Recover the column names of the design matrix seen by the regressor.
# The ColumnTransformer emits the numeric columns first, then the one-hot city
# columns. get_feature_names_out() already returns names of the form
# "city_<level>", so no extra "city_" prefix is added here (doing so produced
# "city_city_<level>" labels).
city_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
feature_names = numeric_features + list(
    city_encoder.get_feature_names_out(categorical_features)
)

# One row per feature holding its fitted linear-regression coefficient.
coefficients = pd.DataFrame(
    model.named_steps['regressor'].coef_,
    index=feature_names,
    columns=['Coefficient']
)

# Rank features by coefficient magnitude, largest first.
coefficients['Abs_Coefficient'] = coefficients['Coefficient'].abs()
coefficients = coefficients.sort_values('Abs_Coefficient', ascending=False)

# Horizontal bar chart of the signed coefficients in ranked order.
plt.figure(figsize=(12, 6))
sns.barplot(x=coefficients['Coefficient'], y=coefficients.index)
plt.title('Feature Importance (Linear Regression Coefficients)')
plt.xlabel('Coefficient Value')
plt.tight_layout()
plt.show()


In [None]:
# First ten rows of the coefficient table (it is already sorted by absolute
# coefficient, largest first).
top_features = coefficients.iloc[:10]
print("\nTop 10 Most Important Features:")
print(top_features)

In [None]:
# Absolute prediction error expressed as a percentage of the actual price,
# one value per test row.
absolute_errors = abs(y_test - y_pred_test)
percentage_errors = (absolute_errors / y_test) * 100

# Summary statistics of the percentage error.
mean_pct = percentage_errors.mean()
median_pct = percentage_errors.median()
p90_pct = np.percentile(percentage_errors, 90)

print("\nPercentage Error Statistics:")
print(f"Mean percentage error: {mean_pct:.2f}%")
print(f"Median percentage error: {median_pct:.2f}%")
print(f"90th percentile of percentage error: {p90_pct:.2f}%")