In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from  sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error, mean_squared_error, r2_score
#from  typing import
from scipy import sparse

In [None]:
# Load the listing data, one-hot encode the categorical features and build the
# train/test split that every model cell below consumes.
df = pd.read_csv("data/nonull_smalldata.csv")

numeric_columns = ["bedroomCount", "toilet_and_bath", "habitableSurface", "facedeCount", "hasTerrace", "totalParkingCount"]
categorical_columns = ["type", "subtype", "province", "locality", "postCode", "buildingCondition", "epcScore"]

# drop="first" omits one dummy column per categorical feature.
# NOTE(review): the encoder is fitted on all rows before the split, so the set
# of dummy columns is derived from test rows as well -- confirm this is intended.
encoder = OneHotEncoder(sparse_output=False, drop="first")
one_hot_encoded = encoder.fit_transform(df[categorical_columns])

# Give the dummy frame df's own index: pd.concat(axis=1) aligns on index labels,
# so the rows stay paired even if df ever stops using a default RangeIndex.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=df.index,
)
df = pd.concat([df, one_hot_df], axis=1)
df = df.drop(categorical_columns, axis=1)
#df.to_csv("data/imputed_encoded_data.csv")

# Target is the listing price; every remaining column is a feature.
X = df.drop(columns="price")
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Optional standardisation (currently disabled):
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled= scaler.transform(X_test)
#X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns)
#X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns)


In [None]:
# Number of feature columns after one-hot encoding.
X_train.shape[1]

In [None]:
# Baseline model: ordinary least squares on the encoded feature matrix.
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Sanity check: features and targets must have matching row counts in each split.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

predictions = regressor.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print("MSE:", mse)

# The values printed as "Accuracy" below are R^2 scores (r2_score), reported
# for the held-out split first and then for the training split.
accu = r2_score(y_test, predictions)
print("Accuracy of test:", accu)

pred = regressor.predict(X_train)
accu = r2_score(y_train, pred)
print("Accuracy of train:", accu)

In [None]:
# Disabled experiment: the whole cell is one triple-quoted string literal, so
# none of the code inside it executes. The text describes a sparse one-hot
# encoding followed by a GridSearchCV over DecisionTreeRegressor.
# NOTE(review): it reads df[categorical_columns], but the data-preparation cell
# has already dropped those columns from df, so it cannot be re-enabled as-is
# without reloading the raw frame first.
# NOTE(review): 'min_samples_split' lists 1; scikit-learn documents integer
# values as >= 2 -- confirm before re-enabling.
'''# One-hot encode categorical columns with sparse matrix output
encoder = OneHotEncoder(sparse_output=True, handle_unknown="ignore")
one_hot_sparse = encoder.fit_transform(df[categorical_columns])

# Drop the categorical columns
df = df.drop(columns=categorical_columns)

# Separate numeric features and target
X_numeric = df.drop(columns="price")
y = df["price"]

X_numeric = X_numeric.apply(pd.to_numeric, errors='coerce')
X_numeric = X_numeric.dropna(axis=1, how='all')

X_numeric_sparse = sparse.csr_matrix(X_numeric.values.astype("float64"))
# Combine numeric and one-hot encoded categorical features
X_full_sparse = sparse.hstack([X_numeric_sparse, one_hot_sparse]).tocsr()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_full_sparse, y, test_size=0.2, random_state=1234)

# Define parameter grid for DecisionTreeRegressor
param_grid = {
    'max_depth': [8,10],
    'min_samples_split': [1,2,3 ],
    'min_samples_leaf': [1,2]
    #'max_features': ['sqrt'] , #'log2'
}

# Run GridSearchCV
dt = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", grid_search.best_params_)
print("Test MSE:", mse)
print("Test R2 Score:", r2)'''

In [None]:
# Decision tree with a fixed depth cap, evaluated on both splits.
dt_regressor = DecisionTreeRegressor(random_state=42, max_depth=10)  # Example: max_depth=10
dt_regressor.fit(X_train, y_train)

# Predict the held-out split once and derive every test metric from it.
dt_predictions = dt_regressor.predict(X_test)
dt_mse = mean_squared_error(y_test, dt_predictions)
dt_r2 = r2_score(y_test, dt_predictions)
print(f"Decision Tree MSE: {dt_mse}")

# The values printed as "Accuracy" are R^2 scores.
predictions = dt_predictions
accu = dt_r2
print("Accuracy of test:", accu)

pred = dt_regressor.predict(X_train)
accu = r2_score(y_train, pred)
print("Accuracy of train:", accu)

# --- Optional: Plotting Actual vs. Predicted for Decision Tree (disabled) ---
# Kept as comments rather than a bare string literal so the cell does not echo
# the text as its output.
#
# plt.figure(figsize=(10, 8))
# plt.scatter(y_test, dt_predictions, alpha=0.6, color='green', label='Decision Tree Predictions')
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect Prediction (y=x)')
#
# plt.xlabel("Actual Prices (y_test)", fontsize=12)
# plt.ylabel("Predicted Prices", fontsize=12)
# plt.title("Decision Tree: Actual vs. Predicted Prices", fontsize=14)
# plt.grid(True, linestyle='--', alpha=0.7)
# plt.legend(fontsize=10)
# plt.show()

# --- Optional: Plotting against a single feature ('habitableSurface') for Decision Tree (disabled) ---
# NOTE(review): X_test_scaled_df is not defined anywhere in the visible
# notebook; it must be created (or swapped for X_test) before enabling this.
#
# if 'habitableSurface' in X_test_scaled_df.columns:
#     habitable_surface_test = X_test_scaled_df['habitableSurface']
#
#     plt.figure(figsize=(12, 8))
#     plt.scatter(habitable_surface_test, y_test, alpha=0.6, color='blue', label='Actual Price')
#     plt.scatter(habitable_surface_test, dt_predictions, alpha=0.6, color='green', marker='x', label='Decision Tree Predicted Price')
#
#     plt.xlabel("Scaled Habitable Surface", fontsize=12)
#     plt.ylabel("Price", fontsize=12)
#     plt.title("Decision Tree: Actual vs. Predicted Prices against Habitable Surface", fontsize=14)
#     plt.grid(True, linestyle='--', alpha=0.7)
#     plt.legend(fontsize=10)
#     plt.tight_layout()
#     plt.show()
# else:
#     print("\nSkipping Decision Tree 'habitableSurface' plot: 'habitableSurface' column not found.")

In [None]:
# Hyper-parameter tuning for the decision tree via exhaustive grid search.
dt = DecisionTreeRegressor(random_state=42)
param_grid = {
    'max_depth': [6, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    #'max_features': ['sqrt', 'log2']
}

# 5-fold cross-validation scored by R^2 ('neg_mean_squared_error' is an
# alternative), using every available core.
grid_search = GridSearchCV(dt, param_grid, scoring='r2', cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score (CV):", grid_search.best_score_)

# Score the refitted best estimator on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Test MSE:", mse)
print("Test R2:", r2)

# Predicted vs. actual prices; the dashed red diagonal marks a perfect prediction.
price_bounds = [y_test.min(), y_test.max()]
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(price_bounds, price_bounds, 'r--')
plt.title("Decision Tree Predictions vs Actual")
plt.xlabel("Actual Prices")
plt.ylabel("Predicted Prices")
plt.show()

In [None]:
# Random forest: 100 trees capped at depth 8, trained on all cores.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=8, n_jobs=-1)
rf_regressor.fit(X_train, y_train)

# Predict the held-out split once and reuse it for both test metrics.
rf_predictions = rf_regressor.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
print(f"Random Forest MSE: {rf_mse}")

# The values printed as "Accuracy" are R^2 scores.
predictions = rf_predictions
accu = r2_score(y_test, predictions)
print("Accuracy of test:", accu)

pred = rf_regressor.predict(X_train)
accu = r2_score(y_train, pred)
print("Accuracy of train:", accu)

In [None]:
# Gradient-boosted trees with hand-picked hyper-parameters.
xgb_regressor = XGBRegressor(
    objective='reg:squarederror', # For regression tasks, specify the objective
    n_estimators=100,             # Number of boosting rounds
    learning_rate=0.1,            # Step size shrinkage
    max_depth=5,                  # Maximum depth of a tree
    subsample=0.8,                # Subsample ratio of the training instance
    colsample_bytree=0.8,         # Subsample ratio of columns
    random_state=42,              # For reproducibility
    n_jobs=-1                     # Use all available CPU cores
)
xgb_regressor.fit(X_train, y_train)

# Predict the held-out split once and derive every test metric from it.
xgb_predictions = xgb_regressor.predict(X_test)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)
print(f"XGBoost MSE: {xgb_mse}")

# The values printed as "Accuracy" are R^2 scores.
predictions = xgb_predictions
accu = xgb_r2
print("Accuracy of test:", accu)

pred = xgb_regressor.predict(X_train)
accu = r2_score(y_train, pred)
print("Accuracy of train:", accu)

In [None]:
# Grid search over XGBoost hyper-parameters (2^6 = 64 combinations x 3 folds).
param_grid = {
    'n_estimators': [100],
    'max_depth': [5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [1, 10]
}

# NOTE: `xgb` is an XGBRegressor *instance*, not the xgboost module.
# NOTE(review): both the estimator and the search request every core
# (n_jobs=-1); consider n_jobs=1 on the estimator if the machine is
# oversubscribed -- confirm on the target hardware.
xgb = XGBRegressor(random_state=42, n_jobs=-1)

# Grid search
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)
grid_search.fit(X_train, y_train)

# Report the outcome of the search so the fit is not silently discarded.
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score (CV):", grid_search.best_score_)

# Best model
best_model = grid_search.best_estimator_

# Held-out evaluation of the tuned model (disabled). Kept as comments rather
# than a bare string literal so the cell does not echo the text as its output.
#
# # Predict and evaluate
# y_pred = best_model.predict(X_test)
# mse = mean_squared_error(y_test, y_pred)
#
# predictions = best_model.predict(X_test)
# accu = r2_score(y_test, predictions)
# print("Accuracy of test:", accu)
# pred = best_model.predict(X_train)
# accu = r2_score(y_train, pred)
# print("Accuracy of train:", accu)


In [None]:
# Reduce the feature matrix with PCA, then fit XGBoost on the components.
# PCA cannot extract more components than min(n_samples, n_features), so the
# requested 80 is capped by the shape of the training matrix. random_state
# keeps the projection reproducible when a randomized SVD solver is selected.
# NOTE(review): the features are not standardised before PCA, so components
# will be dominated by the columns with the largest scale -- confirm intended.
n_components = min(80, X_train.shape[0], X_train.shape[1])
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Use the imported XGBRegressor class directly: the name `xgb` is bound to an
# estimator instance by the grid-search cell and has no XGBRegressor attribute.
xgb_regressor = XGBRegressor(
    objective='reg:squarederror', # For regression tasks, specify the objective
    n_estimators=100,             # Number of boosting rounds
    learning_rate=0.1,            # Step size shrinkage
    max_depth=5,                  # Maximum depth of a tree
    subsample=0.8,                # Subsample ratio of the training instance
    colsample_bytree=0.8,         # Subsample ratio of columns
    random_state=42,              # For reproducibility
    n_jobs=-1                     # Use all available CPU cores
)
xgb_regressor.fit(X_train_pca, y_train)

# Predict the held-out split once and derive every test metric from it.
xgb_predictions = xgb_regressor.predict(X_test_pca)
xgb_mse = mean_squared_error(y_test, xgb_predictions)
xgb_r2 = r2_score(y_test, xgb_predictions)
print(f"XGBoost MSE: {xgb_mse}")

# The values printed as "Accuracy" are R^2 scores.
predictions = xgb_predictions
accu = xgb_r2
print("Accuracy of test:", accu)

pred = xgb_regressor.predict(X_train_pca)
accu = r2_score(y_train, pred)
print("Accuracy of train:", accu)