In [2]:
from google.colab import drive
import pandas as pd

# Mount Google Drive inside the Colab runtime (triggers an authorization prompt)
drive.mount('/content/drive')

# Path of the CSV inside the mounted Drive; adjust it to wherever the file was uploaded
file_path = '/content/drive/My Drive/Thesis/df-BERT_cat.csv'

# Load the dataset and discard its first column
# (presumably an index column written out when the CSV was saved — verify)
df = pd.read_csv(file_path)
df = df.drop(columns=df.columns[0])

# Show the remaining column names as the cell output
list(df.columns)


Mounted at /content/drive


['numerical_price',
 'embeddings',
 'num_bedrooms',
 'num_rooms',
 'building_type_Bestaande bouw',
 'building_type_Nieuwbouw',
 'building_type_na',
 'tag_k.k.',
 'tag_v.o.n.',
 'house_category_Appartement',
 'house_category_Bungalow',
 'house_category_Eengezinswoning',
 'house_category_Grachtenpand',
 'house_category_Herenhuis',
 'house_category_Landhuis',
 'house_category_Other',
 'house_category_Unknown',
 'house_category_Villa',
 'house_category_Woonboerderij',
 'house_category_Woonboot',
 'energy_label_encoded',
 'size_scaled',
 'longitude_scaled',
 'latitude_scaled']

In [3]:
import pandas as pd
import numpy as np
from ast import literal_eval



# Parse the "embeddings" column: each cell is evaluated with literal_eval
# (Python literals only, no arbitrary code) and wrapped in a NumPy array.
df["embeddings"] = df["embeddings"].apply(literal_eval).apply(np.array)

# Separate the embeddings (x) and the target variable (y)
x = df["embeddings"]
y = df['numerical_price']

# Remaining features: every column except the embeddings and the target
z = df.drop(["embeddings", "numerical_price"], axis=1)

# Deleting original dataframe for memory purpose
del df

# Expand the embedding arrays into one column per dimension.
# Building the frame from a list of rows gives the same result as
# `x.apply(pd.Series)` (integer column labels, original row index) without
# constructing a separate Series object for every row, which is very slow
# for high-dimensional embeddings.
x = pd.DataFrame(x.tolist(), index=x.index)

# Join embedding columns and the other features side by side, then renumber the rows
concatenated_df = pd.concat([x, z], axis=1).reset_index(drop=True)

In [4]:
from sklearn.model_selection import train_test_split
# The expanded embedding columns carry integer labels while the other columns
# are named with strings; cast every label to str so they share one type
# (presumably required by the estimators used later — confirm).
concatenated_df.columns = concatenated_df.columns.astype(str)

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    concatenated_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:


from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
import numpy as np

def train_and_evaluate(x_train, y_train, x_test, y_test, best_params):
    """
    Fit a Random Forest regressor on a standardized target and score it on the test set.

    The target is standardized with a scaler fitted on ``y_train`` only; the
    predictions are mapped back to the original scale before any metric is
    computed.

    Parameters:
    x_train (list/array): Training features
    y_train (list/array): Training target variable
    x_test (list/array): Test features
    y_test (list/array): Test target variable
    best_params (dict): Keyword arguments forwarded to RandomForestRegressor

    Returns:
    dict: Test-set metrics under the keys "R2", "MSE", "RMSE", "MAE",
          "MAPE" (in percent) and "explained" (explained variance score)
    """
    # Model configured with the supplied hyperparameters, a fixed seed and all CPU cores
    forest = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)

    # Standardize the training target as a single-column 2-D array
    target_scaler = StandardScaler()
    train_targets = np.array(y_train).reshape(-1, 1)
    scaled_train_targets = target_scaler.fit_transform(train_targets)

    # Fit on the full training set (estimator expects a 1-D target)
    forest.fit(x_train, scaled_train_targets.ravel())

    # Predict in scaled space, then undo the scaling
    scaled_predictions = forest.predict(x_test).reshape(-1, 1)
    y_pred = target_scaler.inverse_transform(scaled_predictions).flatten()

    # MSE is computed once because RMSE is derived from it
    mse = mean_squared_error(y_test, y_pred)

    return {
        "R2": r2_score(y_test, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "MAE": mean_absolute_error(y_test, y_pred),
        # Mean absolute percentage error, expressed in percent
        "MAPE": np.mean(np.abs((y_test - y_pred) / y_test)) * 100,
        "explained": explained_variance_score(y_test, y_pred),
    }


# Random Forest hyperparameters passed to train_and_evaluate
# (presumably the result of an earlier hyperparameter search — not shown here)
best_params = {
    'max_depth': 24,
    'n_estimators': 550,
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'min_samples_split': 8,
    'bootstrap': False,
}

# Train on the training split and report the metrics on the held-out test split
performance_metrics = train_and_evaluate(x_train, y_train, x_test, y_test, best_params)
print("Final Model Performance on Test Set:", performance_metrics)

Final Model Performance on Test Set: {'R2': 0.5795396175647093, 'MSE': 67656379208.75064, 'RMSE': 260108.39895849314, 'MAE': 125823.94117539897, 'MAPE': 25.48885406788416, 'explained': 0.5798537296891237}


In [6]:
import numpy as np
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR


def run_svr_and_evaluate(x_train, x_test, y_train, y_test, params=None):
    """
    Fit a LinearSVR on a standardized target and evaluate it on the test set.

    Note the argument order: both feature sets come first, then both targets.

    Parameters:
    x_train (DataFrame/array): Training features
    x_test (DataFrame/array): Test features
    y_train (Series/list/array): Training target variable
    y_test (Series/list/array): Test target variable
    params (dict, optional): Keyword arguments forwarded to LinearSVR. When
        omitted, the notebook-level ``linear_svr_params`` dictionary is used,
        so existing four-argument calls keep working.

    Returns:
    dict: Test-set metrics under the keys "R2", "MSE", "RMSE", "MAE",
          "MAPE" (in percent) and "explained" (explained variance score)
    """
    if params is None:
        # Backward-compatible fallback to the global defined in the notebook
        params = linear_svr_params

    # Initialize LinearSVR with the chosen parameters and a fixed seed
    clf_svr_opt = LinearSVR(**params, random_state=0)

    # Standardize the target; np.asarray accepts Series, lists and arrays alike
    scaler = StandardScaler()
    y_train_scaled = scaler.fit_transform(np.asarray(y_train).reshape(-1, 1)).ravel()

    # Fitting the model
    clf_svr_opt.fit(x_train, y_train_scaled)

    # Predicting and inverse transformation for the test set
    y_pred_scaled = clf_svr_opt.predict(x_test)
    y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Plain array view of the true values for the element-wise MAPE formula
    y_true = np.asarray(y_test)

    # Mean Squared Error
    mse = mean_squared_error(y_true, y_pred)
    # Root Mean Squared Error
    rmse = np.sqrt(mse)
    # R-squared Score
    r2_score_value = r2_score(y_true, y_pred)
    # Mean Absolute Error
    mae = mean_absolute_error(y_true, y_pred)
    # Mean Absolute Percentage Error (in percent)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    # Explained Variance Score
    explained_variance = explained_variance_score(y_true, y_pred)

    return {"R2": r2_score_value, "MSE": mse, "RMSE": rmse, "MAE": mae, "MAPE": mape, "explained": explained_variance}


# LinearSVR hyperparameters; run_svr_and_evaluate reads this dictionary by its global name
# (presumably the result of an earlier hyperparameter search — not shown here)
linear_svr_params = {
    'C': 0.22348106317685834,
    'epsilon': 0.0010146073912052189,
    'tol': 0.003974565015315399,
    'loss': 'squared_epsilon_insensitive',
    'dual': True,
    'fit_intercept': True,
    'intercept_scaling': 1.560470019225011,
    'max_iter': 8343,
}

# Train on the training split and report the metrics on the held-out test split
performance_metrics = run_svr_and_evaluate(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)
print("Final Model Performance on Test Set:", performance_metrics)


Final Model Performance on Test Set: {'R2': 0.6420403839908111, 'MSE': 57599366156.367096, 'RMSE': 239998.67948879863, 'MAE': 133614.8637361109, 'MAPE': 25.950019264182412, 'explained': 0.6423670018064684}


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
import numpy as np

def train_and_evaluate_model(x_train, y_train, x_test, y_test, params):
    """
    Train a three-hidden-layer dense network and evaluate it on the test set.

    The target is standardized with a scaler fitted on ``y_train`` only; the
    predictions are mapped back to the original scale before any metric is
    computed.

    Parameters:
    - x_train: Training features (DataFrame or array-like)
    - y_train: Training target values (Series or array-like)
    - x_test: Test features (DataFrame or array-like)
    - y_test: Test target values (Series or array-like)
    - params: Dictionary with the keys 'learning_rate', 'neurons_layer_1',
      'neurons_layer_2', 'neurons_layer_3', 'batch_size', 'epochs' and
      'regularization' ('l1', 'l2', 'l1_l2', or anything else for none).
      'l1_reg' is required for 'l1' and 'l1_l2'. 'l2_reg' is optional for
      'l2' and 'l1_l2' and defaults to 0.0, which reproduces the earlier
      hard-coded behaviour.

    Returns:
    - dict with the test-set metrics "R2", "MSE", "RMSE", "MAE",
      "MAPE" (in percent) and "explained" (explained variance score).
    """
    # Convert everything to float32 NumPy arrays; np.asarray accepts pandas
    # objects as well as plain arrays/lists
    x_train = np.asarray(x_train, dtype='float32')
    x_test = np.asarray(x_test, dtype='float32')
    y_train = np.ravel(np.asarray(y_train, dtype='float32'))
    y_test = np.ravel(np.asarray(y_test, dtype='float32'))

    # Pick the kernel regularizer shared by all hidden layers.
    # The L2 strength used to be fixed at 0, which silently disabled L2
    # regularization; it is now read from params when provided.
    regularization = params['regularization']
    if regularization == 'l1':
        reg = l1(params['l1_reg'])
    elif regularization == 'l2':
        reg = l2(params.get('l2_reg', 0.0))
    elif regularization == 'l1_l2':
        reg = l1_l2(l1=params['l1_reg'], l2=params.get('l2_reg', 0.0))
    else:
        reg = None

    # Three ReLU hidden layers followed by a single linear output unit
    model = Sequential()
    model.add(Dense(params['neurons_layer_1'], activation='relu', input_shape=(x_train.shape[1],), kernel_regularizer=reg))
    model.add(Dense(params['neurons_layer_2'], activation='relu', kernel_regularizer=reg))
    model.add(Dense(params['neurons_layer_3'], activation='relu', kernel_regularizer=reg))
    model.add(Dense(1, activation='linear'))

    # Compile the model
    optimizer = Adam(learning_rate=params['learning_rate'])
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    # Scaling y_train
    scaler = StandardScaler()
    y_train_scaled = scaler.fit_transform(y_train.reshape(-1, 1)).flatten()

    # Fit the model
    model.fit(x_train, y_train_scaled, epochs=params['epochs'], batch_size=params['batch_size'], verbose=1)

    # Predict on the test set and undo the target scaling
    y_pred_scaled = model.predict(x_test)
    y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Mean Squared Error
    mse = mean_squared_error(y_test, y_pred)
    # Root Mean Squared Error
    rmse = np.sqrt(mse)
    # R-squared Score
    r2_score_value = r2_score(y_test, y_pred)
    # Mean Absolute Error
    mae = mean_absolute_error(y_test, y_pred)
    # Mean Absolute Percentage Error (in percent)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    # Explained Variance Score
    explained_variance = explained_variance_score(y_test, y_pred)

    return {"R2": r2_score_value, "MSE": mse, "RMSE": rmse, "MAE": mae, "MAPE": mape, "explained": explained_variance}


# Neural-network hyperparameters passed to train_and_evaluate_model
# (presumably the result of an earlier hyperparameter search — not shown here)
optimal_params = {
    'learning_rate': 0.00011723245844623722,
    'neurons_layer_1': 624,
    'neurons_layer_2': 32,
    'neurons_layer_3': 256,
    'batch_size': 64,
    'epochs': 78,
    'regularization': 'l1',
    'l1_reg': 1.802535830576335e-05,
}

# Train on the training split and report the metrics on the held-out test split
performance_metrics = train_and_evaluate_model(x_train, y_train, x_test, y_test, optimal_params)
print("Final Model Performance on Test Set:", performance_metrics)

Epoch 1/78
Epoch 2/78
Epoch 3/78
Epoch 4/78
Epoch 5/78
Epoch 6/78
Epoch 7/78
Epoch 8/78
Epoch 9/78
Epoch 10/78
Epoch 11/78
Epoch 12/78
Epoch 13/78
Epoch 14/78
Epoch 15/78
Epoch 16/78
Epoch 17/78
Epoch 18/78
Epoch 19/78
Epoch 20/78
Epoch 21/78
Epoch 22/78
Epoch 23/78
Epoch 24/78
Epoch 25/78
Epoch 26/78
Epoch 27/78
Epoch 28/78
Epoch 29/78
Epoch 30/78
Epoch 31/78
Epoch 32/78
Epoch 33/78
Epoch 34/78
Epoch 35/78
Epoch 36/78
Epoch 37/78
Epoch 38/78
Epoch 39/78
Epoch 40/78
Epoch 41/78
Epoch 42/78
Epoch 43/78
Epoch 44/78
Epoch 45/78
Epoch 46/78
Epoch 47/78
Epoch 48/78
Epoch 49/78
Epoch 50/78
Epoch 51/78
Epoch 52/78
Epoch 53/78
Epoch 54/78
Epoch 55/78
Epoch 56/78
Epoch 57/78
Epoch 58/78
Epoch 59/78
Epoch 60/78
Epoch 61/78
Epoch 62/78
Epoch 63/78
Epoch 64/78
Epoch 65/78
Epoch 66/78
Epoch 67/78
Epoch 68/78
Epoch 69/78
Epoch 70/78
Epoch 71/78
Epoch 72/78
Epoch 73/78
Epoch 74/78
Epoch 75/78
Epoch 76/78
Epoch 77/78
Epoch 78/78
Final Model Performance on Test Set: {'R2': 0.7787071716354658, 'MSE': 35