In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedShuffleSplit

# Location of the raw CSV, relative to the notebook's working directory.
housing_path = os.path.join("..", "data", "housing.csv")

housing = pd.read_csv(housing_path)

# Bucket median_income into five categories; the column is used only to
# stratify the train/test split and is dropped again right after.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

# One stratified 80/20 split, seeded so the partition is reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# split() yields *positional* row numbers, so rows must be selected with
# .iloc. (.loc only picks the same rows while the frame keeps its default
# RangeIndex; it would silently select wrong rows after any re-indexing.)
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.iloc[train_index].drop("income_cat", axis=1)
strat_test_set = housing.iloc[test_index].drop("income_cat", axis=1)

# Work on copies so the stratified sets stay available unchanged.
train_set = strat_train_set.copy()
test_set = strat_test_set.copy()

# Separate the regression target from the features.
train_labels = train_set["median_house_value"].copy()
test_labels = test_set["median_house_value"].copy()

train_set = train_set.drop("median_house_value", axis=1)
test_set = test_set.drop("median_house_value", axis=1)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Column groups: ocean_proximity is the only categorical feature, every
# other remaining column is handled as numeric.
cat_attribs = ["ocean_proximity"]
num_attribs = train_set.drop(columns="ocean_proximity").columns

# Numeric branch: fill gaps with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Fit the preprocessing on the training features only, then reuse the
# fitted transformer on the test features.
X_train = full_pipeline.fit_transform(train_set)
X_test = full_pipeline.transform(test_set)
y_train, y_test = train_labels, test_labels

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# XGBoost search space: 2 x 2 x 2 = 8 candidates.
param_grid_xgb = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.1, 0.05],
)

xgb = XGBRegressor(random_state=42)

# 5-fold cross-validated grid search, scored by negative MSE, using all cores.
grid_search_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)
grid_search_xgb.fit(X_train, y_train)

# Best candidate found by the search.
xgb_best_model = grid_search_xgb.best_estimator_

# Held-out test-set RMSE for the selected model.
xgb_preds = xgb_best_model.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_preds)
xgb_rmse = np.sqrt(xgb_mse)
print("✅ XGBoost RMSE:", xgb_rmse)

from sklearn.ensemble import RandomForestRegressor

# Random Forest search space: 2 x 2 x 3 x 2 x 2 x 2 = 96 candidates.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_features=['sqrt', 'log2'],
    max_depth=[None, 10, 30],
    min_samples_split=[2, 10],
    min_samples_leaf=[1, 4],
    bootstrap=[True, False],
)

rf = RandomForestRegressor(random_state=42)

# 5-fold cross-validated grid search, scored by negative MSE, using all cores.
grid_search_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# Best candidate found by the search.
rf_best_model = grid_search_rf.best_estimator_

# Held-out test-set RMSE for the selected model.
rf_preds = rf_best_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_preds)
rf_rmse = np.sqrt(rf_mse)
print("✅ Random Forest RMSE:", rf_rmse)



5 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Saghar\AppData\Local\Programs\Python\Python313\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Saghar\AppData\Local\Programs\Python\Python313\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\Saghar\AppData\Local\Programs\Python\Python313\Lib\site-packages\xgboost\sklearn.py", line 1222, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
               

✅ XGBoost RMSE: 48237.00609316731


In [None]:
# Target and features taken from the stratified training split.
final_y_train = strat_train_set["median_house_value"]
final_X_train = strat_train_set.drop(columns="median_house_value")

# Re-fit the preprocessing pipeline on these features and transform them.
final_X_train_prepared = full_pipeline.fit_transform(final_X_train)

# Build a fresh XGBoost regressor from the hyper-parameters the grid search selected.
best_xgb_params = grid_search_xgb.best_params_
final_model = XGBRegressor(
    n_estimators=best_xgb_params['n_estimators'],
    max_depth=best_xgb_params['max_depth'],
    learning_rate=best_xgb_params['learning_rate'],
    random_state=42,
)
final_model.fit(final_X_train_prepared, final_y_train)

In [None]:
import joblib
# Persist the fitted preprocessing pipeline and the final model to the
# current working directory.
joblib.dump(value=full_pipeline, filename="full_pipeline.pkl")
joblib.dump(value=final_model, filename="final_model_xgb.pkl")

['final_model_xgb.pkl']

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
# Test-set MAE and R² for both tuned models.
xgb_mae, xgb_r2 = mean_absolute_error(y_test, xgb_preds), r2_score(y_test, xgb_preds)
rf_mae, rf_r2 = mean_absolute_error(y_test, rf_preds), r2_score(y_test, rf_preds)

# XGBoost report
print("📊 XGBoost Evaluation:")
print("MAE:", xgb_mae)
print("R²:", xgb_r2)

# Random Forest report
print("\n📊 Random Forest Evaluation:")
print("MAE:", rf_mae)
print("R²:", rf_r2)


📊 XGBoost Evaluation:
MAE: 32084.112471233042
R²: 0.8261754563121965

📊 Random Forest Evaluation:
MAE: 32450.90597383721
R²: 0.8149719881362222


In [None]:
import matplotlib.pyplot as plt
# Residual = actual - predicted, on the test set.
residuals = y_test - xgb_preds

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(xgb_preds, residuals, alpha=0.3)
# Zero line: points above it are under-predictions, below are over-predictions.
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel("Predicted values")
ax.set_ylabel("Residuals (Actual - Predicted)")
ax.set_title("Residual Plot for XGBoost")
plt.show()


In [None]:
# Overlay the test-set residuals of both models on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(xgb_preds, y_test - xgb_preds, alpha=0.3, label="XGBoost Residuals")
ax.scatter(rf_preds, y_test - rf_preds, alpha=0.3, label="Random Forest Residuals", color="orange")
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xlabel("Predicted values")
ax.set_ylabel("Residuals (Actual - Predicted)")
ax.set_title("Residual Plot Comparison")
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Compute the residuals (difference between actual and predicted values).
xgb_residuals = y_test - xgb_preds
rf_residuals = y_test - rf_preds

fig, (ax_xgb, ax_rf) = plt.subplots(1, 2, figsize=(14, 6))

# Residual plot for XGBoost (left panel).
ax_xgb.scatter(xgb_preds, xgb_residuals, alpha=0.5)
ax_xgb.axhline(y=0, color='r', linestyle='--')
ax_xgb.set_title("XGBoost Residuals")
ax_xgb.set_xlabel("Predicted values")
ax_xgb.set_ylabel("Residuals")

# Residual plot for Random Forest (right panel).
ax_rf.scatter(rf_preds, rf_residuals, alpha=0.5)
ax_rf.axhline(y=0, color='r', linestyle='--')
ax_rf.set_title("Random Forest Residuals")
ax_rf.set_xlabel("Predicted values")
ax_rf.set_ylabel("Residuals")

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Histogram of the training target, to inspect its distribution.
fig, ax = plt.subplots()
ax.hist(train_labels, bins=50, edgecolor='black')
ax.set_title("Distribution of Median House Value")
ax.set_xlabel("Median House Value")
ax.set_ylabel("Count")
plt.show()


In [None]:
import numpy as np

# Number of training labels at or above 500000.
(train_labels >= 500000).sum()


np.int64(787)

In [None]:
import numpy as np

# log(1 + y) transform of both target vectors.
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

In [None]:
# Same XGBoost search space as before, this time fitted on the
# log-transformed target.
param_grid_xgb_log = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.1, 0.05],
)

xgb_log = XGBRegressor(random_state=42)

# 5-fold cross-validated grid search; the negative MSE is measured on the
# log1p scale because the estimator is fitted on y_train_log.
grid_search_xgb_log = GridSearchCV(
    estimator=xgb_log,
    param_grid=param_grid_xgb_log,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)
grid_search_xgb_log.fit(X_train, y_train_log)

xgb_log_best_model = grid_search_xgb_log.best_estimator_


In [None]:
# Show the winning Random Forest hyper-parameters.
# The notebook never defines a bare `grid_search`; the Random Forest search
# object is `grid_search_rf`, so referencing it keeps this cell runnable
# after Restart & Run All.
print("Best parameters found by GridSearchCV:")
print(grid_search_rf.best_params_)

Best parameters found by GridSearchCV:
{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
# Random Forest configured with fixed hyper-parameters. They match the
# best_params_ dictionary captured in this notebook's saved output.
# Note: this rebinds `final_model`, which earlier held the XGBoost model.
final_model = RandomForestRegressor(
    n_estimators=200,
    max_features='sqrt',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=42,
)

final_model.fit(X_train, y_train)


In [None]:
# Test-set RMSE of the re-fitted Random Forest.
final_preds = final_model.predict(X_test)
final_mse = mean_squared_error(y_test, final_preds)
final_rmse = np.sqrt(final_mse)
print("Final Random Forest RMSE:", final_rmse)

Final Random Forest RMSE: 49767.237791707084


In [None]:
import joblib

# Save the final model.
joblib.dump(value=final_model, filename="random_forest_final_model.pkl")

# Save the data-preprocessing pipeline (full_pipeline) that was used to
# transform the data.
joblib.dump(value=full_pipeline, filename="data_preprocessing_pipeline.pkl")

['data_preprocessing_pipeline.pkl']

In [None]:
import matplotlib.pyplot as plt
# X_train and y_train are assumed to be the original (prepared) training data.
price_threshold = 500000

# Boolean masks selecting rows whose price is less than or equal to the
# threshold. The SAME comparison (<=) is applied to both splits so the
# training and test sets are filtered consistently.
indices_filtered = y_train <= price_threshold
test_filter = y_test <= price_threshold

# Build the filtered data. The masks are converted to plain NumPy arrays
# before indexing the prepared feature matrices so the selection is purely
# positional and does not depend on the pandas index of the label Series.
X_train_filtered = X_train[indices_filtered.to_numpy()]
y_train_filtered = y_train[indices_filtered]

X_test_filtered = X_test[test_filter.to_numpy()]
y_test_filtered = y_test[test_filter]

# Predict with the tuned models on the filtered training rows.
# NOTE(review): xgb_best_model / rf_best_model were fitted on the unfiltered
# training data, so this plot shows their residuals on the filtered
# *training* rows rather than the behaviour of models retrained without the
# filtered-out rows.
xgb_final_preds = xgb_best_model.predict(X_train_filtered)
rf_final_preds = rf_best_model.predict(X_train_filtered)

# Residuals (actual - predicted).
xgb_final_residuals = y_train_filtered - xgb_final_preds
rf_final_residuals = y_train_filtered - rf_final_preds

plt.figure(figsize=(14,6))

# Left panel: XGBoost.
plt.subplot(1, 2, 1)
plt.scatter(xgb_final_preds, xgb_final_residuals, alpha=0.3)
plt.axhline(y=0, color='r', linestyle='--')
plt.title("XGBoost Residuals After Outlier Removal")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")

# Right panel: Random Forest.
plt.subplot(1, 2, 2)
plt.scatter(rf_final_preds, rf_final_residuals, alpha=0.3)
plt.axhline(y=0, color='r', linestyle='--')
plt.title("Random Forest Residuals After Outlier Removal")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")

plt.show()


In [None]:
# Assumes the training and test data after outlier removal are defined as:
# X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

# Retrain XGBoost on the filtered training data.
# NOTE(review): this uses XGBRegressor's default hyper-parameters, not the
# ones selected by grid_search_xgb, while the Random Forest below uses tuned
# values. Confirm this asymmetry is intended.
xgb_final = XGBRegressor(random_state=42)
xgb_final.fit(X_train_filtered, y_train_filtered)
xgb_pred_filtered = xgb_final.predict(X_test_filtered)

# Retrain Random Forest with the best parameters.
rf_final = RandomForestRegressor(
    bootstrap=False,
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=200,
    random_state=42
)
rf_final.fit(X_train_filtered, y_train_filtered)
rf_pred_filtered = rf_final.predict(X_test_filtered)

# Re-evaluate the models on the filtered test set.
# root_mean_squared_error already returns the RMSE, so it must NOT be passed
# through np.sqrt again (doing so reported sqrt(RMSE) instead of the RMSE).
print("XGBoost Evaluation After Outlier Removal:")
print("MAE:", mean_absolute_error(y_test_filtered, xgb_pred_filtered))
print("R²:", r2_score(y_test_filtered, xgb_pred_filtered))
rmse = root_mean_squared_error(y_test_filtered, xgb_pred_filtered)
print("RMSE:", rmse)

print("\nRandom Forest Evaluation After Outlier Removal:")
print("MAE:", mean_absolute_error(y_test_filtered, rf_pred_filtered))
print("R²:", r2_score(y_test_filtered, rf_pred_filtered))
rmse_rf = root_mean_squared_error(y_test_filtered, rf_pred_filtered)
print("RMSE:", rmse_rf)


XGBoost Evaluation After Outlier Removal:
MAE: 29764.14152713166
R²: 0.7933500481858213
RMSE: 210.09038864998604

Random Forest Evaluation After Outlier Removal:
MAE: 30189.541130876514
R²: 0.7899343918214787
RMSE: 210.95318962925109


In [None]:
import joblib
import os

# Create the models directory if it does not exist.
os.makedirs(name="models", exist_ok=True)

# Save the models retrained on the filtered data.
joblib.dump(value=xgb_final, filename="models/xgboost_final_model.joblib")
joblib.dump(value=rf_final, filename="models/random_forest_final_model.joblib")

['models/random_forest_final_model.joblib']

In [None]:
# Store the fitted preprocessing pipeline under models/.
joblib.dump(value=full_pipeline, filename="models/data_preprocessor.joblib")

['models/data_preprocessor.joblib']