In [1]:
from google.colab import drive
import pandas as pd

# Mount Google Drive into the Colab filesystem (asks for authorization)
drive.mount('/content/drive')

# Path of the CSV on the mounted Drive -- adjust to wherever the file was uploaded
file_path = '/content/drive/My Drive/Thesis/df-BERT_cat.csv'

# Load the CSV into a DataFrame and show its column names as the cell output
df = pd.read_csv(filepath_or_buffer=file_path)
df.columns.to_list()


Mounted at /content/drive


['Unnamed: 0',
 'numerical_price',
 'embeddings',
 'num_bedrooms',
 'num_rooms',
 'building_type_Bestaande bouw',
 'building_type_Nieuwbouw',
 'building_type_na',
 'tag_k.k.',
 'tag_v.o.n.',
 'house_category_Appartement',
 'house_category_Bungalow',
 'house_category_Eengezinswoning',
 'house_category_Grachtenpand',
 'house_category_Herenhuis',
 'house_category_Landhuis',
 'house_category_Other',
 'house_category_Unknown',
 'house_category_Villa',
 'house_category_Woonboerderij',
 'house_category_Woonboot',
 'energy_label_encoded',
 'size_scaled',
 'longitude_scaled',
 'latitude_scaled']

In [2]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from ast import literal_eval

# Drop the index column that was written into the CSV. Dropping it by name
# (rather than by position) cannot remove a real feature/target column if the
# index column is absent.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# The embeddings are stored as stringified Python lists. Parse them and stack
# them into a single 2-D float matrix (one row per sample). This raises a
# ValueError if the embeddings do not all have the same length.
embedding_matrix = np.array(
    df["embeddings"].apply(literal_eval).to_list(), dtype=np.float64
)

# Target variable
y = df['numerical_price']

# All remaining (non-embedding, non-target) feature columns
z = df.drop(columns=["embeddings", "numerical_price"])

# One column per embedding dimension, aligned on the original row index.
# Building the frame from the matrix in one go avoids the very slow
# row-by-row `Series.apply(pd.Series)` expansion.
x = pd.DataFrame(embedding_matrix, index=df.index)

# Deleting original dataframe and the intermediate matrix for memory purpose
del df, embedding_matrix

# Concatenating the embedding columns with the rest of the features
concatenated_df = pd.concat([x, z], axis=1).reset_index(drop=True)
# Column labels are cast to str so the integer embedding labels and the
# string feature labels share one type.
concatenated_df.columns = concatenated_df.columns.astype(str)

# Features and target must still describe the same rows before splitting
assert len(concatenated_df) == len(y), "feature/target row count mismatch"

# Hold out 20% of the rows as the test set (fixed seed for reproducibility)
x_train, x_test, y_train, y_test = train_test_split(concatenated_df, y, test_size=0.2, random_state=42)

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error, explained_variance_score
from sklearn.preprocessing import StandardScaler

# Function to evaluate the model
def evaluate_model(model, X_train, X_test, y_train, y_test, y_scaler=None):
    """Fit ``model`` on a standardized target and score it on held-out data.

    The training target is standardized, the model is fitted on the scaled
    target, and its predictions are mapped back to the original target units
    before the metrics are computed against ``y_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, X_test : feature matrices for fitting and for evaluation.
    y_train, y_test : 1-D targets matching ``X_train`` / ``X_test``.
    y_scaler : optional transformer with ``fit_transform`` / ``inverse_transform``
        Scaler used for the target. When omitted, a fresh ``StandardScaler``
        is created for this call, so the function does not depend on (or
        modify) any notebook-level scaler object.

    Returns
    -------
    tuple
        ``(r2, mse, rmse, mape, explained_variance)`` computed on ``y_test``
        versus the inverse-transformed predictions.
    """
    if y_scaler is None:
        y_scaler = StandardScaler()

    # Scaling y_train (the scaler is fitted on the training target only)
    y_train_scaled = y_scaler.fit_transform(np.array(y_train).reshape(-1, 1))

    # Fitting the model
    model.fit(X_train, y_train_scaled.ravel())

    # Predicting and inverse transformation back to the original target units
    y_pred_scaled = model.predict(X_test)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

    # Calculate metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    ev = explained_variance_score(y_test, y_pred)

    return r2, mse, rmse, mape, ev

# Baseline random forest: 100 trees of depth <= 5, fixed seed
clf_rf_base = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42, n_jobs=-1)

# StandardScaler instance for the target (y)
scaler = StandardScaler()

# 5-fold cross-validation over the training split
kf = KFold(n_splits=5)
r2_scores, mse_scores, rmse_scores, mape_scores, ev_scores = [], [], [], [], []
metric_lists = (r2_scores, mse_scores, rmse_scores, mape_scores, ev_scores)

for fit_rows, val_rows in kf.split(x_train):
    # Evaluate the model on this fold; metrics come back in the same order as metric_lists
    fold_metrics = evaluate_model(
        clf_rf_base,
        x_train.iloc[fit_rows],
        x_train.iloc[val_rows],
        y_train.iloc[fit_rows],
        y_train.iloc[val_rows],
    )
    for collected, value in zip(metric_lists, fold_metrics):
        collected.append(value)

# Average each metric across the folds
avg_r2, avg_mse, avg_rmse, avg_mape, avg_ev = [np.mean(collected) for collected in metric_lists]

print(
    "Average R2:", avg_r2,
    "Average MSE:", avg_mse,
    "Average RMSE:", avg_rmse,
    "Average MAPE:", avg_mape,
    "Average Explained Variance:", avg_ev,
)


Average R2: 0.60535888723891 Average MSE: 68417494729.56673 Average RMSE: 260966.8228788158 Average MAPE: 0.2528686175162323 Average Explained Variance: 0.6054583127930141
