In [18]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Install XGBoost if not already installed
from xgboost import XGBRegressor
import sklearn
print(sklearn.__version__)


1.1.1


In [15]:
pip install --upgrade scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp39-cp39-win_amd64.whl.metadata (12 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Downloading scikit_learn-1.5.1-cp39-cp39-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB 1.3 MB/s eta 0:00:09
   ---------------------------------------- 0.1/11.0 MB 919.0 kB/s eta 0:00:12
    --------------------------------------- 0.2/11.0 MB 1.2 MB/s eta 0:00:09
   - -------------------------------------- 0.3/11.0 MB 2.0 MB/s eta 0:00:06
   - -------------------------------------- 0.5/11.0 MB 2.4 MB/s eta 0:00:05
   -- ------------------------------------- 0.7/11.0 MB 2.7 MB/s eta 0:00:04
   --- ------------------------------------ 1.0/11.0 MB 3.0 MB/s eta 0:00:04
   ---- ----------------------------------- 1.1/11.0 MB 3.1 MB/s eta 0:00:04
   ---- ----------------------------------

  You can safely remove it manually.


In [2]:
# Load the California housing dataset
data = fetch_california_housing()
X = data.data
y = data.target

In [8]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [9]:
# Linear Regression (OLS)
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Calculate metrics
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mape_lr = np.mean(np.abs((y_test - y_pred_lr) / y_test)) * 100
r2_lr = r2_score(y_test, y_pred_lr)

# Print results
print("Linear Regression (OLS)")
print(f'MSE: {mse_lr}')
print(f'RMSE: {rmse_lr}')
print(f'MAE: {mae_lr}')
print(f'MAPE (%): {mape_lr}')
print(f'R squared: {r2_lr}\n')

Linear Regression (OLS)
MSE: 0.5305677824766755
RMSE: 0.7284008391515454
MAE: 0.527247453830616
MAPE (%): 31.750265760071688
R squared: 0.5957702326061662



In [10]:

# SGD Regression
sgd = SGDRegressor(max_iter=1000, tol=1e-3)
sgd.fit(X_train, y_train)
y_pred_sgd = sgd.predict(X_test)

# Calculate metrics
mse_sgd = mean_squared_error(y_test, y_pred_sgd)
rmse_sgd = np.sqrt(mse_sgd)
mae_sgd = mean_absolute_error(y_test, y_pred_sgd)
mape_sgd = np.mean(np.abs((y_test - y_pred_sgd) / y_test)) * 100
r2_sgd = r2_score(y_test, y_pred_sgd)

# Print results
print("SGD Regression")
print(f'MSE: {mse_sgd}')
print(f'RMSE: {rmse_sgd}')
print(f'MAE: {mae_sgd}')
print(f'MAPE (%): {mape_sgd}')
print(f'R squared: {r2_sgd}\n')


SGD Regression
MSE: 1.9934332314069628e+30
RMSE: 1411889950175637.0
MAE: 1112322575457411.6
MAPE (%): 7.23283167852104e+16
R squared: -1.5187598607765056e+30



In [20]:

# Lasso Regression
lasso = Lasso()
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)

# Calculate metrics
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
mape_lasso = np.mean(np.abs((y_test - y_pred_lasso) / y_test)) * 100
r2_lasso = r2_score(y_test, y_pred_lasso)

# Print results
print("Lasso Regression")
print(f'MSE: {mse_lasso}')
print(f'RMSE: {rmse_lasso}')
print(f'MAE: {mae_lasso}')
print(f'MAPE (%): {mape_lasso}')
print(f'R squared: {r2_lasso}\n')


Lasso Regression
MSE: 0.9345280531749283
RMSE: 0.9667099115944391
MAE: 0.7609780549289142
MAPE (%): 52.01400652865025
R squared: 0.288000383674784



In [21]:
from sklearn.linear_model import ElasticNet

# Elastic Net
elastic_net = ElasticNet()
elastic_net.fit(X_train, y_train)
y_pred_en = elastic_net.predict(X_test)

# Calculate metrics
mse_en = mean_squared_error(y_test, y_pred_en)
rmse_en = np.sqrt(mse_en)
mae_en = mean_absolute_error(y_test, y_pred_en)
mape_en = np.mean(np.abs((y_test - y_pred_en) / y_test)) * 100
r2_en = r2_score(y_test, y_pred_en)

# Print results
print("Elastic Net")
print(f'MSE: {mse_en}')
print(f'RMSE: {rmse_en}')
print(f'MAE: {mae_en}')
print(f'MAPE (%): {mape_en}')
print(f'R squared: {r2_en}\n')


Elastic Net
MSE: 0.7562926012142382
RMSE: 0.8696508501773791
MAE: 0.6738692599693638
MAPE (%): 45.394602074695264
R squared: 0.4237946736165634



In [22]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Polynomial Regression (Degree 2)
poly_reg = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
poly_reg.fit(X_train, y_train)
y_pred_poly = poly_reg.predict(X_test)

# Calculate metrics
mse_poly = mean_squared_error(y_test, y_pred_poly)
rmse_poly = np.sqrt(mse_poly)
mae_poly = mean_absolute_error(y_test, y_pred_poly)
mape_poly = np.mean(np.abs((y_test - y_pred_poly) / y_test)) * 100
r2_poly = r2_score(y_test, y_pred_poly)

# Print results
print("Polynomial Regression (Degree 2)")
print(f'MSE: {mse_poly}')
print(f'RMSE: {rmse_poly}')
print(f'MAE: {mae_poly}')
print(f'MAPE (%): {mape_poly}')
print(f'R squared: {r2_poly}\n')


Polynomial Regression (Degree 2)
MSE: 0.4549723312023182
RMSE: 0.67451636837242
MAE: 0.46333006176513025
MAPE (%): 26.7907883609311
R squared: 0.6533650069100672



In [23]:
from sklearn.neighbors import KNeighborsRegressor

# k-Nearest Neighbors
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Calculate metrics
mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn = np.sqrt(mse_knn)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
mape_knn = np.mean(np.abs((y_test - y_pred_knn) / y_test)) * 100
r2_knn = r2_score(y_test, y_pred_knn)

# Print results
print("k-Nearest Neighbors")
print(f'MSE: {mse_knn}')
print(f'RMSE: {rmse_knn}')
print(f'MAE: {mae_knn}')
print(f'MAPE (%): {mape_knn}')
print(f'R squared: {r2_knn}\n')


k-Nearest Neighbors
MSE: 1.136942049088978
RMSE: 1.066274846880005
MAE: 0.8217871040051679
MAPE (%): 53.857102534079274
R squared: 0.1337849088797427



In [24]:
from sklearn.tree import DecisionTreeRegressor

# Decision Tree
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Calculate metrics
mse_dt = mean_squared_error(y_test, y_pred_dt)
rmse_dt = np.sqrt(mse_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mape_dt = np.mean(np.abs((y_test - y_pred_dt) / y_test)) * 100
r2_dt = r2_score(y_test, y_pred_dt)

# Print results
print("Decision Tree")
print(f'MSE: {mse_dt}')
print(f'RMSE: {rmse_dt}')
print(f'MAE: {mae_dt}')
print(f'MAPE (%): {mape_dt}')
print(f'R squared: {r2_dt}\n')


Decision Tree
MSE: 0.5283933525042798
RMSE: 0.7269067013752726
MAE: 0.47254742409560724
MAPE (%): 26.142704308128305
R squared: 0.5974268905318562



In [27]:

# Random Forest
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Calculate metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mape_rf = np.mean(np.abs((y_test - y_pred_rf) / y_test)) * 100
r2_rf = r2_score(y_test, y_pred_rf)

# Print results
print("Random Forest")
print(f'MSE: {mse_rf}')
print(f'RMSE: {rmse_rf}')
print(f'MAE: {mae_rf}')
print(f'MAPE (%): {mape_rf}')
print(f'R squared: {r2_rf}\n')


Random Forest
MSE: 0.2539805093260179
RMSE: 0.5039647897681125
MAE: 0.3305438337047805
MAPE (%): 19.04475202070329
R squared: 0.8064969536443027



In [3]:
# Define regression models
models = {
    'Linear Regression (OLS)': LinearRegression(),
    'SGD Regression': SGDRegressor(max_iter=1000, tol=1e-3),
    'Lasso Regression': Lasso(),
    'Elastic Net': ElasticNet(),
    'Polynomial Regression (Degree 2)': make_pipeline(PolynomialFeatures(degree=2), LinearRegression()),
    'k-Nearest Neighbors': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100),
    'Support Vector Machine': SVR(),
    'XGBoost': XGBRegressor(use_label_encoder=False, eval_metric='rmse')
}

# Initialize list to store results
results = []

# Evaluation function
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    r2 = r2_score(y_test, y_pred)
    
    return mse, rmse, mae, mape, r2

# Evaluate each model
for name, model in models.items():
    mse, rmse, mae, mape, r2 = evaluate_model(model, X_train, X_test, y_train, y_test)
    
    results.append({
        'Model': name,
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'MAPE (%)': mape,
        'R squared': r2
    })

# Create a DataFrame with the results
results_df = pd.DataFrame(results)

# Display the results
print(results_df)

NameError: name 'LinearRegression' is not defined