# Preparing Environment

In [36]:
import pandas as pd 
import numpy as np 
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import root_mean_squared_error

import warnings
warnings.filterwarnings('ignore')


# Reading files and checking records' details

In [3]:
# Load both long-format tables (one row per id/preference pair, as the printed
# preview shows): workspace ratings first, then user weights.
workspace_prefs, user_prefs = (
    pd.read_csv(csv_path)
    for csv_path in ('workspace_preferences.csv', 'user_preferences.csv')
)

# Preview the first two rows of each table to confirm the column layout.
print(workspace_prefs.head(2), user_prefs.head(2), sep='\n')

  workspace_id preference_id  rating
0           w1         speed       4
1           w1         clean       3
   user_id preference_id  weight
0        1         speed       2
1        1         clean       0


In [4]:
# Report how many distinct users, workspaces and preference dimensions were loaded.
# The f-strings use double quotes on the outside so that the single-quoted column
# names inside the braces also parse on Python versions older than 3.12.
# The printed text is intentionally left unchanged.
print(f"Number of users in user_prefereces: {len(user_prefs['user_id'].unique())}")
print(f"Number of workspaces: {len(workspace_prefs['workspace_id'].unique())}")
print(f"Number of prefereces: {len(user_prefs['preference_id'].unique())}")


Number of users in user_prefereces: 4269
Number of workspaces: 10
Number of prefereces: 6


# Pivoting dataframes to compare preferences

In [5]:
# Reshape both tables from long to wide: one row per user / workspace and one
# column per preference, so the two frames can be compared column by column.
# NOTE(review): pivot_table aggregates duplicate (id, preference) pairs with its
# default aggregation and leaves missing pairs as NaN — confirm the inputs have
# exactly one row per pair.
users_df = user_prefs.pivot_table(
    index='user_id',
    columns='preference_id',
    values='weight',
)
workspaces_df = workspace_prefs.pivot_table(
    index='workspace_id',
    columns='preference_id',
    values='rating',
)

# Show the first two rows of each wide frame.
print(users_df.head(2), workspaces_df.head(2), sep='\n')

preference_id  calm  capacity  clean  comfortable  service  speed
user_id                                                          
1               1.0       1.0    0.0          5.0      4.0    2.0
2               3.0       5.0    2.0          0.0      2.0    5.0
preference_id  calm  capacity  clean  comfortable  service  speed
workspace_id                                                     
w1              4.0       4.0    3.0          1.0      0.0    4.0
w10             2.0       1.0    4.0          3.0      1.0    1.0


# The Recommender
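Euclidean distance is used instead of cosine similarity because it suits numerical data where the magnitude of the differences matters (cosine similarity only compares direction). The similarity score is computed as `1 - distance`, so it can be negative; only the ordering of the scores is meaningful.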

In [65]:
def generate_recommendations(user_id):
    """Rank every workspace by how close its ratings are to one user's weights.

    Parameters
    ----------
    user_id
        Index label of the user's row in the notebook-level ``users_df``.

    Returns
    -------
    tuple
        ``(top_workspaces, user_preferences)`` where ``top_workspaces`` is a
        DataFrame with columns ``workspace_id`` and ``sim_score`` sorted from
        highest to lowest score, and ``user_preferences`` is the user's row
        taken from ``users_df``.

    Notes
    -----
    Reads the notebook-level ``users_df`` and ``workspaces_df``.
    Assumes both frames have the same preference columns in the same order —
    TODO confirm.
    """
    user_preferences = users_df.loc[user_id]

    # Distance from every workspace row to the single user row: shape (n_workspaces, 1).
    distances = euclidean_distances(workspaces_df, [user_preferences])
    # Flip the sign so that a smaller distance yields a larger score.
    sim_column = (1 - distances).reshape(-1)

    ranking = pd.DataFrame({
        "workspace_id": workspaces_df.index.values.reshape(-1),
        "sim_score": sim_column,
    })
    ranking = ranking.sort_values(by='sim_score', ascending=False)

    return ranking, user_preferences


# Validating Function
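RMSE is used because it is more interpretable than MSE: it is expressed in the same units as the ratings being compared. Note that the RMSE across the preference columns equals the Euclidean distance divided by the square root of the number of preferences, so this step is a consistency check on the ranking rather than an independent evaluation.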

In [155]:
def validate_recommendations(top_workspaces, user_preferences, n=1):
    """Check the first ``n`` recommendations against an RMSE-based ranking.

    For every workspace listed in ``top_workspaces`` the RMSE between the
    user's preference vector and the workspace's ratings is computed (rounded
    to 3 decimals). The ``n`` workspaces with the lowest RMSE are then compared,
    position by position, with the first ``n`` rows of ``top_workspaces``.

    Parameters
    ----------
    top_workspaces : pandas.DataFrame
        Ranked recommendations; the first column must hold workspace ids that
        are index labels of the notebook-level ``workspaces_df``.
    user_preferences : pandas.Series
        The user's preference vector, in the same column order as
        ``workspaces_df``.
    n : int, default 1
        Number of leading recommendations to check. Values larger than the
        number of scored workspaces are clamped instead of raising.

    Returns
    -------
    None
        One line per checked position is printed to stdout.
    """
    # Score the workspaces in recommendation order: combined with the stable
    # sort below, workspaces whose rounded RMSE ties keep that same order.
    scores = {}
    for workspace_id in top_workspaces.iloc[:, 0]:
        workspace_ratings = workspaces_df.loc[workspace_id]
        rmse = root_mean_squared_error(user_preferences, workspace_ratings)
        scores[f'{workspace_ratings.name}'] = round(rmse, 3)

    # Never look past the workspaces that were actually scored.
    n = min(n, len(scores))

    # A Series keyed by workspace id is all that is needed for the ranking.
    least_n_rmses = pd.Series(scores, dtype=float).sort_values(kind='stable').iloc[:n]

    for i in range(n):
        if least_n_rmses.index[i] == top_workspaces.iloc[i, 0]:
            print(f'Recommendation number {i+1} is valid')
        else:
            print(f'Recommendation number {i+1} is NOT valid')

# TEST

In [156]:
# Build the ranked workspace list and fetch the preference vector for user_id 8.
top_workspaces, user_preferences = generate_recommendations(8)

In [158]:
# Check the first 3 recommendations against the RMSE-based ranking (n=3).
validate_recommendations(top_workspaces, user_preferences,3)

Recommendation number 1 is valid
Recommendation number 2 is valid
Recommendation number 3 is valid
