In [1]:
from google.colab import drive
import pandas as pd

# Mount Google Drive into the Colab filesystem; this asks the user to authorize access.
drive.mount('/content/drive')

# Location of the dataset CSV inside the mounted Drive -- adjust if the file lives elsewhere.
file_path = '/content/drive/My Drive/Thesis/df-englisch_cat.csv'

# Load the CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Display the column names as the cell's output.
list(df.columns)

Mounted at /content/drive


['Unnamed: 0',
 'ada_embedding_eng',
 'numerical_price',
 'num_bedrooms',
 'num_rooms',
 'building_type_Bestaande bouw',
 'building_type_Nieuwbouw',
 'building_type_na',
 'tag_k.k.',
 'tag_v.o.n.',
 'house_category_Appartement',
 'house_category_Bungalow',
 'house_category_Eengezinswoning',
 'house_category_Grachtenpand',
 'house_category_Herenhuis',
 'house_category_Landhuis',
 'house_category_Other',
 'house_category_Unknown',
 'house_category_Villa',
 'house_category_Woonboerderij',
 'house_category_Woonboot',
 'energy_label_encoded',
 'size_scaled',
 'longitude_scaled',
 'latitude_scaled']

In [2]:
# Remove the 'Unnamed: 0' column (the first column of the loaded CSV; presumably
# an index saved by an earlier export -- TODO confirm).
# It is dropped by name, and its absence is ignored, so this cell is idempotent:
# the positional form `df.columns[0]` would silently remove the next real column
# ('ada_embedding_eng') if the cell were executed a second time.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Target variable: the 'numerical_price' column.
y = df['numerical_price']

# Feature matrix: every remaining column except the target itself and
# the 'ada_embedding_eng' column.
z = df.drop(columns=["ada_embedding_eng", "numerical_price"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    z,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:


from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
import numpy as np

def train_and_evaluate(x_train, y_train, x_test, y_test, best_params):
    """
    Fit a Random Forest regressor on a standardized target and score it on the test set.

    The target is standardized with a StandardScaler fitted on y_train only. Test
    predictions are mapped back to the original target scale before any metric
    is computed.

    Parameters:
    x_train (list/array): Training features
    y_train (list/array): Training target variable
    x_test (list/array): Test features
    y_test (list/array): Test target variable
    best_params (dict): Keyword arguments forwarded to RandomForestRegressor
        (random_state=42 and n_jobs=-1 are always passed in addition)

    Returns:
    dict: Metrics keyed "R2", "MSE", "RMSE", "MAE", "MAPE" (in percent) and "explained"
    """
    # Build the forest from the supplied hyperparameters
    forest = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)

    # Standardize the target using statistics from the training data only
    target_scaler = StandardScaler()
    train_target = np.array(y_train).reshape(-1, 1)
    train_target_scaled = target_scaler.fit_transform(train_target)

    # Fit on the whole training set (the estimator expects a 1-D target)
    forest.fit(x_train, train_target_scaled.ravel())

    # Predict in scaled space, then undo the scaling
    scaled_predictions = forest.predict(x_test).reshape(-1, 1)
    y_pred = target_scaler.inverse_transform(scaled_predictions).flatten()

    # RMSE is derived from MSE, so compute MSE once up front
    mse = mean_squared_error(y_test, y_pred)
    # Element-wise absolute relative error, averaged below into MAPE
    relative_errors = np.abs((y_test - y_pred) / y_test)

    return {
        "R2": r2_score(y_test, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "MAE": mean_absolute_error(y_test, y_pred),
        "MAPE": np.mean(relative_errors) * 100,
        "explained": explained_variance_score(y_test, y_pred),
    }


# Fixed Random Forest hyperparameters (presumably the outcome of an earlier search -- TODO confirm).
best_params = {
    'max_depth': 22,
    'n_estimators': 300,
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'min_samples_split': 8,
    'bootstrap': False,
}

# Train on the training split and report the metrics on the held-out test split.
performance_metrics = train_and_evaluate(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    best_params=best_params,
)
print("Final Model Performance on Test Set:", performance_metrics)

Final Model Performance on Test Set: {'R2': 0.7623871847600269, 'MSE': 38234334087.846054, 'RMSE': 195536.01736725142, 'MAE': 91330.68687623629, 'MAPE': 16.00861377761797, 'explained': 0.7623964660475206}


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
import numpy as np

def train_and_evaluate_model(x_train, y_train, x_test, y_test, params):
    """
    Train a dense neural network with three hidden layers and evaluate it on the test set.

    The target is standardized with a StandardScaler fitted on y_train only. Test
    predictions are mapped back to the original target scale before the metrics
    are computed.

    Parameters:
    - x_train: Training features (DataFrame or array-like; converted to float32)
    - y_train: Training target values (Series or array-like; converted to float32)
    - x_test: Test features (DataFrame or array-like; converted to float32)
    - y_test: Test target values (Series or array-like; converted to float32)
    - params: Dictionary with the keys 'neurons_layer_1', 'neurons_layer_2',
      'neurons_layer_3', 'learning_rate', 'epochs', 'batch_size' and
      'regularization'. 'regularization' may be 'l1', 'l2' or 'l1_l2'; any
      other value disables kernel regularization. 'l1_reg' is required for
      'l1' and 'l1_l2'. 'l2_reg' is optional and defaults to 0, which keeps
      the previous behaviour of a zero-strength L2 term.

    Returns:
    - dict with the keys "R2", "MSE", "RMSE", "MAE", "MAPE" (in percent) and
      "explained" (explained variance), all computed on the test set.
    """
    # np.asarray accepts DataFrames/Series as well as plain arrays or lists,
    # so the function no longer requires pandas inputs.
    x_train = np.asarray(x_train).astype('float32')
    x_test = np.asarray(x_test).astype('float32')
    y_train = np.ravel(np.asarray(y_train).astype('float32'))
    y_test = np.ravel(np.asarray(y_test).astype('float32'))

    # Select the kernel regularizer. The L2 strength used to be hard-coded to 0,
    # which made the 'l2' option a no-op; it is now read from params['l2_reg'].
    reg_kind = params['regularization']
    l2_strength = params.get('l2_reg', 0)
    if reg_kind == 'l1':
        reg = l1(params['l1_reg'])
    elif reg_kind == 'l2':
        reg = l2(l2_strength)
    elif reg_kind == 'l1_l2':
        reg = l1_l2(l1=params['l1_reg'], l2=l2_strength)
    else:
        reg = None

    # Three ReLU hidden layers sharing the same regularizer, then one linear output unit
    model = Sequential()
    model.add(Dense(params['neurons_layer_1'], activation='relu', input_shape=(x_train.shape[1],), kernel_regularizer=reg))
    model.add(Dense(params['neurons_layer_2'], activation='relu', kernel_regularizer=reg))
    model.add(Dense(params['neurons_layer_3'], activation='relu', kernel_regularizer=reg))
    model.add(Dense(1, activation='linear'))

    # Compile the model
    optimizer = Adam(learning_rate=params['learning_rate'])
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    # Standardize the target using statistics from the training data only
    scaler = StandardScaler()
    y_train_scaled = scaler.fit_transform(y_train.reshape(-1, 1)).flatten()

    # Fit the model
    model.fit(x_train, y_train_scaled, epochs=params['epochs'], batch_size=params['batch_size'], verbose=1)

    # Predict in scaled space and map the predictions back to the original scale
    y_pred_scaled = model.predict(x_test)
    y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Mean Squared Error
    mse = mean_squared_error(y_test, y_pred)
    # Root Mean Squared Error
    rmse = np.sqrt(mse)
    # R-squared Score
    r2_score_value = r2_score(y_test, y_pred)
    # Mean Absolute Error
    mae = mean_absolute_error(y_test, y_pred)
    # Mean Absolute Percentage Error (in percent)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    # Explained Variance Score
    explained_variance = explained_variance_score(y_test, y_pred)

    return {"R2": r2_score_value, "MSE": mse, "RMSE": rmse, "MAE": mae, "MAPE": mape,"explained":explained_variance}


# Fixed network hyperparameters (presumably the outcome of an earlier search -- TODO confirm).
optimal_params = {
    'learning_rate': 0.0001481458629411212,
    'neurons_layer_1': 416,
    'neurons_layer_2': 208,
    'neurons_layer_3': 80,
    'batch_size': 128,
    'epochs': 72,
    'regularization': 'l1',
    'l1_reg': 9.650241971828681e-05,
}

# Train on the training split and report the metrics on the held-out test split.
performance_metrics = train_and_evaluate_model(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    params=optimal_params,
)
print("Final Model Performance on Test Set:", performance_metrics)

Epoch 1/72
Epoch 2/72
Epoch 3/72
Epoch 4/72
Epoch 5/72
Epoch 6/72
Epoch 7/72
Epoch 8/72
Epoch 9/72
Epoch 10/72
Epoch 11/72
Epoch 12/72
Epoch 13/72
Epoch 14/72
Epoch 15/72
Epoch 16/72
Epoch 17/72
Epoch 18/72
Epoch 19/72
Epoch 20/72
Epoch 21/72
Epoch 22/72
Epoch 23/72
Epoch 24/72
Epoch 25/72
Epoch 26/72
Epoch 27/72
Epoch 28/72
Epoch 29/72
Epoch 30/72
Epoch 31/72
Epoch 32/72
Epoch 33/72
Epoch 34/72
Epoch 35/72
Epoch 36/72
Epoch 37/72
Epoch 38/72
Epoch 39/72
Epoch 40/72
Epoch 41/72
Epoch 42/72
Epoch 43/72
Epoch 44/72
Epoch 45/72
Epoch 46/72
Epoch 47/72
Epoch 48/72
Epoch 49/72
Epoch 50/72
Epoch 51/72
Epoch 52/72
Epoch 53/72
Epoch 54/72
Epoch 55/72
Epoch 56/72
Epoch 57/72
Epoch 58/72
Epoch 59/72
Epoch 60/72
Epoch 61/72
Epoch 62/72
Epoch 63/72
Epoch 64/72
Epoch 65/72
Epoch 66/72
Epoch 67/72
Epoch 68/72
Epoch 69/72
Epoch 70/72
Epoch 71/72
Epoch 72/72
Final Model Performance on Test Set: {'R2': 0.7075435960668552, 'MSE': 47059227000.0, 'RMSE': 216931.39, 'MAE': 110939.84, 'MAPE': 18.21815520524

In [7]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np


def run_svr_and_evaluate(x_train, x_test, y_train, y_test, params=None):
    """
    Fit a LinearSVR on a standardized target and evaluate it on the test set.

    The target is standardized with a StandardScaler fitted on y_train only. Test
    predictions are mapped back to the original target scale before the metrics
    are computed.

    Parameters:
    - x_train: Training features
    - x_test: Test features
    - y_train: Training target values (Series or array-like)
    - y_test: Test target values
    - params: Optional dict of keyword arguments for LinearSVR. When omitted,
      the notebook-level `linear_svr_params` dict is used, which is what the
      function always did before this argument existed. random_state=0 is
      always passed in addition.

    Returns:
    - dict with the keys "R2", "MSE", "RMSE", "MAE", "MAPE" (in percent) and
      "explained" (explained variance), all computed on the test set.
    """
    # This cell's import line only brings in mean_squared_error and r2_score.
    # Importing the remaining metrics here removes the hidden dependency on an
    # earlier cell having imported them.
    from sklearn.metrics import explained_variance_score, mean_absolute_error

    # Fall back to the global hyperparameters so existing calls keep working
    if params is None:
        params = linear_svr_params

    # Initialize the estimator with the chosen parameters
    clf_svr_opt = LinearSVR(**params, random_state=0)

    # Standardize the target using statistics from the training data only.
    # np.asarray accepts a pandas Series as well as plain arrays or lists.
    scaler = StandardScaler()
    y_train_scaled = scaler.fit_transform(np.asarray(y_train).reshape(-1, 1)).ravel()

    # Fitting the model
    clf_svr_opt.fit(x_train, y_train_scaled)

    # Predict in scaled space and map the predictions back to the original scale
    y_pred_scaled = clf_svr_opt.predict(x_test)
    y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Mean Squared Error
    mse = mean_squared_error(y_test, y_pred)
    # Root Mean Squared Error
    rmse = np.sqrt(mse)
    # R-squared Score
    r2_score_value = r2_score(y_test, y_pred)
    # Mean Absolute Error
    mae = mean_absolute_error(y_test, y_pred)
    # Mean Absolute Percentage Error (in percent)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    # Explained Variance Score
    explained_variance = explained_variance_score(y_test, y_pred)

    return {"R2": r2_score_value, "MSE": mse, "RMSE": rmse, "MAE": mae, "MAPE": mape,"explained":explained_variance}


# Fixed LinearSVR hyperparameters (presumably the outcome of an earlier search -- TODO confirm).
# run_svr_and_evaluate reads this dict from the notebook's global namespace.
linear_svr_params = {
    'C': 0.012413439869308911,
    'epsilon': 0.0029686327476415772,
    'tol': 9.433742678639591e-05,
    'loss': 'squared_epsilon_insensitive',
    'dual': False,
    'fit_intercept': True,
    'intercept_scaling': 5.864212196204296,
    'max_iter': 5428,
}

# Train on the training split and report the metrics on the held-out test split.
performance_metrics = run_svr_and_evaluate(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)
print("Final Model Performance on Test Set:", performance_metrics)


Final Model Performance on Test Set: {'R2': 0.5645547838746261, 'MSE': 70067592328.62732, 'RMSE': 264702.83777970215, 'MAE': 140921.2718668702, 'MAPE': 26.338464998406103, 'explained': 0.5645726272085992}
