In [3]:
from google.colab import drive
import pandas as pd

# Location of the CSV inside the mounted Drive; adjust it to wherever the file was uploaded.
file_path = '/content/drive/My Drive/Thesis/df-BERT.csv'

# Mounting prompts for authorization to access Google Drive.
drive.mount('/content/drive')

# Load the CSV into a DataFrame.
df = pd.read_csv(file_path)

# Cell output: the column names as a plain list.
list(df.columns)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['url',
 'price',
 'address',
 'descrip',
 'listed_since',
 'zip_code',
 'size',
 'year',
 'living_area',
 'kind_of_house',
 'building_type',
 'num_of_rooms',
 'num_of_bathrooms',
 'layout',
 'energy_label',
 'insulation',
 'heating',
 'ownership',
 'exteriors',
 'parking',
 'date_list',
 'last_ask_price',
 'last_ask_price_m2',
 'city',
 'log_id',
 'num of tokens per descrip',
 'descrip_en',
 'numerical_price',
 'numerical_price_per_m2',
 'tag',
 'house_category',
 'living_area_float',
 'size_float',
 'zip_code_4_digits',
 'postcode',
 'latitude',
 'longitude',
 'embeddings']

In [4]:
# Assuming df is your original DataFrame
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from ast import literal_eval


# Each entry of the 'embeddings' column is parsed with literal_eval and wrapped
# in a NumPy array. The parsed Series is bound straight to x instead of being
# written back into df first, since df is deleted at the end of this cell.
x = df["embeddings"].apply(lambda cell: np.array(literal_eval(cell)))

# Regression target.
y = df['numerical_price']

# Drop the original DataFrame to free memory; only x and y are kept.
del df

In [5]:
from sklearn.model_selection import train_test_split
# Expand the Series of per-row embedding vectors into a 2-D feature table
# (one row per sample, one column per embedding dimension). Stacking all rows
# in a single NumPy call replaces `x.apply(pd.Series)`, which builds a separate
# pandas Series for every row. The original index is kept so rows stay aligned
# with y.
# NOTE(review): assumes every embedding has the same length -- np.vstack raises
# on ragged input, whereas apply(pd.Series) would have padded with NaN.
x = pd.DataFrame(np.vstack(x.to_numpy()), index=x.index)

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
import numpy as np

def train_and_evaluate(x_train, y_train, x_test, y_test, best_params):
    """
    Trains a Random Forest regressor on a standardized target and evaluates it on the test set.

    The target is standardized with a StandardScaler fitted on y_train only.
    Predictions are transformed back to the original target units before any
    metric is computed.

    Parameters:
    x_train (array-like): Training features
    y_train (array-like): Training target variable
    x_test (array-like): Test features
    y_test (array-like): Test target variable
    best_params (dict): Keyword arguments for RandomForestRegressor. If it does
        not contain ``random_state`` / ``n_jobs``, they default to 42 / -1.

    Returns:
    dict: Metrics on the test set under the keys "R2", "MSE", "RMSE", "MAE",
        "MAPE" (in percent) and "explained" (explained variance score).
        "MAPE" divides by y_test, so it is inf/nan if y_test contains zeros.
    """

    # Defaults go first so that a key present in best_params overrides them
    # instead of raising "got multiple values for keyword argument".
    rf_kwargs = {"random_state": 42, "n_jobs": -1, **best_params}
    clf_rf_best = RandomForestRegressor(**rf_kwargs)

    # Fit the target scaler on the training target only.
    scaler = StandardScaler()
    y_train_scaled = scaler.fit_transform(np.asarray(y_train).reshape(-1, 1))

    # Train the model on the entire training dataset.
    clf_rf_best.fit(x_train, y_train_scaled.ravel())

    # Predict on the test data and undo the target scaling.
    y_pred_scaled = clf_rf_best.predict(x_test)
    y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Use a flat NumPy view of the true values so every metric (including the
    # hand-written MAPE) is computed the same way for Series, list or ndarray input.
    y_true = np.asarray(y_test).ravel()

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2_score_value = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # Mean Absolute Percentage Error, expressed in percent.
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    explained_variance = explained_variance_score(y_true, y_pred)

    return {"R2": r2_score_value, "MSE": mse, "RMSE": rmse, "MAE": mae, "MAPE": mape,"explained":explained_variance}

# Random Forest hyperparameters chosen by cross-validation.
best_params = dict(
    max_depth=16,
    n_estimators=900,
    max_features='sqrt',
    min_samples_leaf=4,
    min_samples_split=2,
    bootstrap=False,
)

# Fit on the training split, score on the held-out test split, and report.
performance_metrics = train_and_evaluate(
    x_train,
    y_train,
    x_test,
    y_test,
    best_params,
)
print("Final Model Performance on Test Set:", performance_metrics)


Final Model Performance on Test Set: {'R2': 0.3154412647152307, 'MSE': 109373871490.57896, 'RMSE': 330717.2077327985, 'MAE': 176699.8630821869, 'MAPE': 36.94779288046121, 'explained': 0.31557838896551216}


In [9]:
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from sklearn.preprocessing import StandardScaler
import numpy as np


def run_svr_and_evaluate(x_train, x_test, y_train, y_test, params=None):
    """
    Train a LinearSVR on a standardized target and evaluate it on the test set.

    The target is standardized with a StandardScaler fitted on y_train only.
    Predictions are transformed back to the original target units before any
    metric is computed.

    Parameters:
    x_train (array-like): Training features
    x_test (array-like): Test features
    y_train (array-like): Training target variable (Series, list or ndarray)
    y_test (array-like): Test target variable
    params (dict, optional): Keyword arguments for LinearSVR. When omitted, the
        module-level ``linear_svr_params`` is used, as in the original version.

    Returns:
    dict: Metrics on the test set under the keys "R2", "MSE", "RMSE", "MAE",
        "MAPE" (in percent) and "explained" (explained variance score).
        "MAPE" divides by y_test, so it is inf/nan if y_test contains zeros.
    """
    if params is None:
        # Backward-compatible fallback: look the global up at call time, so it
        # only has to exist when the function is called, not when it is defined.
        params = linear_svr_params

    clf_svr_opt = LinearSVR(**params, random_state=0)

    # Fit the target scaler on the training target only. np.asarray accepts a
    # pandas Series as well as an ndarray or list; `.values` would fail once
    # y_train has been converted to a plain ndarray.
    scaler = StandardScaler()
    y_train_scaled = scaler.fit_transform(np.asarray(y_train).reshape(-1, 1)).ravel()

    # Fitting the model
    clf_svr_opt.fit(x_train, y_train_scaled)

    # Predict on the test set and undo the target scaling.
    y_pred_scaled = clf_svr_opt.predict(x_test)
    y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Use a flat NumPy view of the true values so every metric (including the
    # hand-written MAPE) is computed the same way for Series, list or ndarray input.
    y_true = np.asarray(y_test).ravel()

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2_score_value = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # Mean Absolute Percentage Error, expressed in percent.
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    explained_variance = explained_variance_score(y_true, y_pred)

    return {"R2": r2_score_value, "MSE": mse, "RMSE": rmse, "MAE": mae, "MAPE": mape,"explained":explained_variance}



# LinearSVR settings; run_svr_and_evaluate reads this module-level name.
linear_svr_params = dict(
    C=0.32229360417490505,
    epsilon=0.04136883229487408,
    tol=0.00041905031546814767,
    loss='squared_epsilon_insensitive',
    dual=True,
    fit_intercept=False,
    intercept_scaling=2.8419375853277256,
    max_iter=4824,
)

# Arguments are passed by keyword because this function takes them in the
# order (x_train, x_test, y_train, y_test).
performance_metrics = run_svr_and_evaluate(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
)
print(performance_metrics)

{'R2': 0.3722086382746581, 'MSE': 100303988804.8169, 'RMSE': 316708.0497947864, 'MAE': 186467.44341936865, 'MAPE': 37.957581710712766, 'explained': 0.3723538455587867}


In [10]:
import numpy as np


# Turn both feature tables into plain NumPy arrays.
# NOTE: this rebinds x_train / x_test, so later cells see ndarrays.
x_train, x_test = np.array(x_train), np.array(x_test)

# Report the shapes of all four splits (targets are still unflattened here).
print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Flatten the targets to 1-D NumPy arrays.
# NOTE: this rebinds y_train / y_test as well.
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

# Shapes of the targets after flattening.
print("Adjusted y_train shape:", y_train.shape)
print("Adjusted y_test shape:", y_test.shape)


x_train shape: (57611, 768)
x_test shape: (14403, 768)
y_train shape: (57611,)
y_test shape: (14403,)
Adjusted y_train shape: (57611,)
Adjusted y_test shape: (14403,)


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1, l2, l1_l2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
import numpy as np

def train_and_evaluate_model(x_train, y_train, x_test, y_test, params):
    """
    Train and evaluate a neural network model.

    Builds a Sequential network with three ReLU Dense layers and a single
    linear output unit, and trains it with Adam on mean squared error against
    a standardized target. The target scaler is fitted on y_train only, and
    predictions are transformed back to the original target units before any
    metric is computed.

    Parameters:
    - x_train: Training features; must expose ``.shape`` (its second entry sets
      the input width)
    - y_train: Training target values (Series, list or ndarray)
    - x_test: Test features
    - y_test: Test target values
    - params: Dictionary with the keys 'regularization', 'neurons_layer_1',
      'neurons_layer_2', 'neurons_layer_3', 'learning_rate', 'epochs' and
      'batch_size'. 'l1_reg' is also required when 'regularization' is 'l1'
      or 'l1_l2'. Any 'regularization' value other than 'l1', 'l2' or 'l1_l2'
      disables kernel regularization.

    Returns:
    - dict of test-set metrics under the keys "R2", "MSE", "RMSE", "MAE",
      "MAPE" (in percent) and "explained" (explained variance score).
      "MAPE" divides by y_test, so it is inf/nan if y_test contains zeros.
    """

    # Pick the kernel regularizer shared by the three hidden layers.
    # NOTE(review): the L2 factor is hard-coded to 0, so 'l2' adds no penalty
    # and 'l1_l2' reduces to pure L1 -- confirm this is intended.
    model = Sequential()
    if params['regularization'] == 'l1':
        reg = l1(params['l1_reg'])
    elif params['regularization'] == 'l2':
        reg = l2(0)
    elif params['regularization'] == 'l1_l2':
        reg = l1_l2(l1=params['l1_reg'], l2=0)
    else:
        reg = None

    model.add(Dense(params['neurons_layer_1'], activation='relu', input_shape=(x_train.shape[1],), kernel_regularizer=reg))
    model.add(Dense(params['neurons_layer_2'], activation='relu', kernel_regularizer=reg))
    model.add(Dense(params['neurons_layer_3'], activation='relu', kernel_regularizer=reg))
    model.add(Dense(1, activation='linear'))

    # Compile the model
    optimizer = Adam(learning_rate=params['learning_rate'])
    model.compile(optimizer=optimizer, loss='mean_squared_error')

    # Fit the target scaler on the training target only. np.asarray lets the
    # function accept a pandas Series or list as well as an ndarray; calling
    # `.reshape` directly on y_train only works for ndarrays.
    scaler = StandardScaler()
    y_train_scaled = scaler.fit_transform(np.asarray(y_train).reshape(-1, 1)).flatten()

    # Fit the model
    model.fit(x_train, y_train_scaled, epochs=params['epochs'], batch_size=params['batch_size'], verbose=1)

    # Predict on the test set and undo the target scaling.
    y_pred_scaled = model.predict(x_test)
    y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Use a flat NumPy view of the true values so every metric (including the
    # hand-written MAPE) is computed the same way for Series, list or ndarray input.
    y_true = np.asarray(y_test).ravel()

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2_score_value = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    # Mean Absolute Percentage Error, expressed in percent.
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    explained_variance = explained_variance_score(y_true, y_pred)

    return {"R2": r2_score_value, "MSE": mse, "RMSE": rmse, "MAE": mae, "MAPE": mape,"explained":explained_variance}


# Network and training settings passed to train_and_evaluate_model.
# 'regularization' is 'none', so no kernel regularizer is applied.
optimal_params = dict(
    learning_rate=0.00013189685881053736,
    neurons_layer_1=240,
    neurons_layer_2=384,
    neurons_layer_3=48,
    batch_size=128,
    epochs=57,
    regularization='none',
)

# Train on the training split, score on the held-out test split, and report.
performance_metrics = train_and_evaluate_model(
    x_train,
    y_train,
    x_test,
    y_test,
    optimal_params,
)
print("Final Model Performance on Test Set:", performance_metrics)


Epoch 1/57
Epoch 2/57
Epoch 3/57
Epoch 4/57
Epoch 5/57
Epoch 6/57
Epoch 7/57
Epoch 8/57
Epoch 9/57
Epoch 10/57
Epoch 11/57
Epoch 12/57
Epoch 13/57
Epoch 14/57
Epoch 15/57
Epoch 16/57
Epoch 17/57
Epoch 18/57
Epoch 19/57
Epoch 20/57
Epoch 21/57
Epoch 22/57
Epoch 23/57
Epoch 24/57
Epoch 25/57
Epoch 26/57
Epoch 27/57
Epoch 28/57
Epoch 29/57
Epoch 30/57
Epoch 31/57
Epoch 32/57
Epoch 33/57
Epoch 34/57
Epoch 35/57
Epoch 36/57
Epoch 37/57
Epoch 38/57
Epoch 39/57
Epoch 40/57
Epoch 41/57
Epoch 42/57
Epoch 43/57
Epoch 44/57
Epoch 45/57
Epoch 46/57
Epoch 47/57
Epoch 48/57
Epoch 49/57
Epoch 50/57
Epoch 51/57
Epoch 52/57
Epoch 53/57
Epoch 54/57
Epoch 55/57
Epoch 56/57
Epoch 57/57
Final Model Performance on Test Set: {'R2': 0.46847468882226373, 'MSE': 84923291577.8369, 'RMSE': 291416.01118990855, 'MAE': 162853.65418771264, 'MAPE': 30.432387157378926, 'explained': 0.46972876688157195}
