In [1]:
from google.colab import drive
import pandas as pd

# Location of the CSV inside Google Drive; adjust it to wherever the file was uploaded
file_path = '/content/drive/My Drive/Thesis/df-BERT.csv'

# Mounting asks for authorization to access Google Drive
drive.mount('/content/drive')

# Load the dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer=file_path)


Mounted at /content/drive


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from ast import literal_eval



# Parse the stringified embedding lists into NumPy arrays.
# Values that are already ndarrays are passed through unchanged, so re-running
# this cell does not fail (literal_eval raises ValueError on a non-string
# ndarray). Any other unexpected value still reaches literal_eval and raises.
df["embeddings"] = df["embeddings"].apply(
    lambda value: value if isinstance(value, np.ndarray) else np.array(literal_eval(value))
)

# Features: one embedding array per row; target: the numeric price column
x = df["embeddings"]
y = df['numerical_price']
# Note: the embedding arrays are exploded into separate columns in the next cell


In [3]:
# Explode the embedding arrays into one numeric column per dimension.
# np.vstack stacks all rows into a single 2-D matrix in one vectorized step,
# instead of constructing a pd.Series per row as x.apply(pd.Series) does.
# Reading from df["embeddings"] (not from x itself) keeps the cell idempotent.
# Unlike apply(pd.Series), vstack raises if the arrays differ in length
# rather than padding the shorter ones with NaN.
x = pd.DataFrame(np.vstack(df["embeddings"].to_numpy()), index=df.index)

# First split: separate out a test set
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, explained_variance_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import numpy as np

def train_and_evaluate_nn(X_train, y_train, learning_rate=0.001,
                          hidden_units=(1024, 512, 256), batch_size=32,
                          epochs=10, n_splits=5):
    """Cross-validate a dense regression network and return the mean fold metrics.

    For every fold of an unshuffled K-fold split a fresh fully connected
    network (ReLU hidden layers, one linear output unit) is trained with Adam
    on mean squared error. The target is standardised using statistics of the
    training fold only, and predictions are transformed back to the original
    target units before they are scored.

    Args:
        X_train: 2-D array-like of shape (n_samples, n_features). NumPy arrays
            and pandas DataFrames are both accepted.
        y_train: Array-like holding one target per row of ``X_train``; it is
            flattened to 1-D. NumPy arrays and pandas Series are both accepted.
        learning_rate: Learning rate passed to the Adam optimizer.
        hidden_units: Non-empty sequence with the width of each hidden layer.
        batch_size: Mini-batch size used during training.
        epochs: Number of training epochs per fold.
        n_splits: Number of cross-validation folds.

    Returns:
        Tuple ``(avg_r2, avg_mse, avg_rmse, avg_mape, avg_ev)`` with each
        metric averaged over the folds. MAPE is the value returned by
        ``sklearn.metrics.mean_absolute_percentage_error``.

    Raises:
        ValueError: If ``X_train`` is not 2-D, if ``X_train`` and ``y_train``
            differ in length, or if ``hidden_units`` is empty.
    """
    # Work on plain NumPy arrays so that the positional fold indices below are
    # never interpreted as pandas labels / column selectors.
    features = np.asarray(X_train)
    targets = np.asarray(y_train).ravel()
    layer_widths = tuple(hidden_units)

    if features.ndim != 2:
        raise ValueError(
            "X_train must be 2-D (n_samples, n_features); got %d dimension(s)" % features.ndim
        )
    if features.shape[0] != targets.shape[0]:
        raise ValueError(
            "X_train and y_train must have the same number of samples; got %d and %d"
            % (features.shape[0], targets.shape[0])
        )
    if not layer_widths:
        raise ValueError("hidden_units must contain at least one layer width")

    # Imported locally: only needed here, to release Keras global state between folds.
    from tensorflow.keras import backend as keras_backend

    n_features = features.shape[1]

    # KFold Cross-validation
    kf = KFold(n_splits=n_splits)
    r2_scores, mse_scores, rmse_scores, mape_scores, ev_scores = [], [], [], [], []

    for train_index, test_index in kf.split(features):
        X_train_fold, X_test_fold = features[train_index], features[test_index]
        y_train_fold, y_test_fold = targets[train_index], targets[test_index]

        # Models created in a loop accumulate global Keras state; drop the
        # previous fold's state before building the next model.
        keras_backend.clear_session()

        # Build model
        model = Sequential()
        model.add(Dense(layer_widths[0], activation='relu', input_shape=(n_features,)))
        for width in layer_widths[1:]:
            model.add(Dense(width, activation='relu'))
        model.add(Dense(1, activation='linear'))

        # Compile model
        optimizer = Adam(learning_rate=learning_rate)
        model.compile(optimizer=optimizer, loss='mean_squared_error')

        # Scale the target with training-fold statistics only (no leakage
        # from the held-out fold)
        scaler = StandardScaler()
        y_train_scaled = scaler.fit_transform(y_train_fold.reshape(-1, 1)).ravel()

        # Train model
        model.fit(X_train_fold, y_train_scaled, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predict on the held-out fold and map back to the original target units
        y_pred_scaled = model.predict(X_test_fold, verbose=0)
        y_pred = scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

        # Metrics calculation
        mse = mean_squared_error(y_test_fold, y_pred)
        r2_scores.append(r2_score(y_test_fold, y_pred))
        mse_scores.append(mse)
        rmse_scores.append(np.sqrt(mse))
        mape_scores.append(mean_absolute_percentage_error(y_test_fold, y_pred))
        ev_scores.append(explained_variance_score(y_test_fold, y_pred))

    # Calculate average scores
    avg_r2 = np.mean(r2_scores)
    avg_mse = np.mean(mse_scores)
    avg_rmse = np.mean(rmse_scores)
    avg_mape = np.mean(mape_scores)
    avg_ev = np.mean(ev_scores)

    return avg_r2, avg_mse, avg_rmse, avg_mape, avg_ev


In [5]:
import numpy as np

# Materialise both feature splits as plain NumPy arrays
x_train, x_test = np.array(x_train), np.array(x_test)

# Report the dimensions of every split before the targets are flattened
print(
    *(
        f"{label} {split.shape!s}"
        for label, split in (
            ("x_train shape:", x_train),
            ("x_test shape:", x_test),
            ("y_train shape:", y_train),
            ("y_test shape:", y_test),
        )
    ),
    sep="\n",
)

# Flatten the targets into 1-D arrays
y_train, y_test = np.ravel(y_train), np.ravel(y_test)

# Report the target dimensions again after flattening
print(
    *(
        f"{label} {split.shape!s}"
        for label, split in (
            ("Adjusted y_train shape:", y_train),
            ("Adjusted y_test shape:", y_test),
        )
    ),
    sep="\n",
)


x_train shape: (57611, 768)
x_test shape: (14403, 768)
y_train shape: (57611,)
y_test shape: (14403,)
Adjusted y_train shape: (57611,)
Adjusted y_test shape: (14403,)


In [6]:
# Requires x_train and y_train to have been prepared by the earlier cells

# Run the cross-validated training and unpack the averaged metrics
(
    average_r2_score,
    average_mse,
    average_rmse,
    average_mape,
    average_ev,
) = train_and_evaluate_nn(x_train, y_train)

# Report each averaged metric on its own line
print(
    *(
        f"{label} {value!s}"
        for label, value in (
            ("Average R2 Score:", average_r2_score),
            ("Average MSE:", average_mse),
            ("Average RMSE:", average_rmse),
            ("Average MAPE:", average_mape),
            ("Average Explained Variance:", average_ev),
        )
    ),
    sep="\n",
)


Average R2 Score: 0.3022031852965418
Average MSE: 120607192680.6989
Average RMSE: 347048.601563112
Average MAPE: 0.39107777594214826
Average Explained Variance: 0.3467659411986953
